Line data Source code
1 : #include "fd_wksp_private.h"
2 : #include <sys/mman.h>
3 : #include <errno.h>
4 :
5 : int
6 545652576 : fd_wksp_private_lock( fd_wksp_t * wksp ) {
7 : # if FD_WKSP_LOCK_RECLAIM
8 : int warning = 0;
9 : #endif
10 545652576 : ulong me = fd_log_group_id();
11 :
12 545652576 : ulong * _owner = &wksp->owner;
13 545652576 : for(;;) {
14 :
15 : /* Note that we emulate CAS on platforms without FD_HAS_ATOMIC
16 : to minimize the amount of code differences we have to test. On
17 : platforms without FD_HAS_ATOMIC, a workspace should not be used
18 : concurrently though. */
19 :
20 545652576 : FD_COMPILER_MFENCE();
21 545652576 : # if FD_HAS_ATOMIC
22 545652576 : ulong pid = FD_ATOMIC_CAS( _owner, ULONG_MAX, me );
23 : # else
24 : ulong pid = FD_VOLATILE_CONST( *_owner );
25 : if( pid==ULONG_MAX ) FD_VOLATILE( *_owner ) = me;
26 : # endif
27 545652576 : FD_COMPILER_MFENCE();
28 :
29 545652576 : if( FD_LIKELY( pid==ULONG_MAX ) ) return FD_WKSP_SUCCESS;
30 :
31 : # if FD_WKSP_LOCK_RECLAIM
32 : int status = fd_log_group_id_query( pid );
33 : if( FD_UNLIKELY( status==FD_LOG_GROUP_ID_QUERY_DEAD ) ) { /* A process died while holding the lock, try to recover the lock */
34 :
35 : FD_COMPILER_MFENCE();
36 : # if FD_HAS_ATOMIC
37 : ulong cur = FD_ATOMIC_CAS( _owner, pid, me );
38 : # else
39 : ulong cur = FD_VOLATILE_CONST( *_owner );
40 : if( cur==pid ) FD_VOLATILE( *_owner ) = me;
41 : # endif
42 : FD_COMPILER_MFENCE();
43 :
44 : if( FD_LIKELY( cur==pid ) ) { /* We recovered the lock from the dead pid, try to fix up incomplete ops */
45 :
46 : FD_LOG_WARNING(( "Process %lu died in an operation on wksp %s; verifying", pid, wksp->name ));
47 : if( FD_LIKELY( !fd_wksp_verify( wksp ) ) ) { /* logs details of issues detected */
48 : FD_LOG_NOTICE(( "wksp verified" ));
49 : return FD_WKSP_SUCCESS;
50 : }
51 :
52 : FD_LOG_WARNING(( "Issues detected; rebuilding" ));
53 : if( FD_UNLIKELY( fd_wksp_rebuild( wksp, wksp->seed ) ) ) { /* Rebuild failed (logs details of issues detected) */
54 : /* Return control of the lock to the previous owner */
55 : FD_COMPILER_MFENCE();
56 : FD_VOLATILE( *_owner ) = pid;
57 : FD_COMPILER_MFENCE();
58 : FD_LOG_WARNING(( "corrupt wksp detected" ));
59 : return FD_WKSP_ERR_CORRUPT;
60 : }
61 :
62 : FD_LOG_NOTICE(( "wksp rebuilt" ));
63 : return FD_WKSP_SUCCESS;
64 :
65 : }
66 :
67 : /* Somebody beat us to recovering the lock ... try again */
68 :
69 : } else if( FD_UNLIKELY( status!=FD_LOG_GROUP_ID_QUERY_LIVE ) ) { /* Unclear pid status ... issue a warning and try again */
70 :
71 : if( FD_UNLIKELY( !warning ) ) {
72 : FD_LOG_WARNING(( "wksp %s is owned by unknown pid %li; attempting to recover", wksp->name, pid ));
73 : warning = 1;
74 : }
75 :
76 : }
77 :
78 : /* At this point, either another thread in this process has the
79 : lock, another active thread in another process has the lock,
80 : another unknown status thread in other process has the lock or
81 : another thread beat us to reclaim the lock from a dead process.
82 : In any case, we don't have the lock. Wait a while to limit O/S
83 : contention and try again. */
84 :
85 : FD_YIELD();
86 : # else
87 :
88 : /* If we are running without FD_WKSP_LOCK_RECLAIM then it is assumed
89 : that the contention is caused by a tile pinned to another core,
90 : and that this core is itself pinned so spin locking is best. */
91 0 : FD_SPIN_PAUSE();
92 :
93 0 : #endif
94 0 : }
95 :
96 : /* never get here */
97 545652576 : }
98 :
99 : /* Public APIs ********************************************************/
100 :
101 : ulong
102 : fd_wksp_part_max_est( ulong footprint,
103 135 : ulong sz_typical ) {
104 135 : footprint = fd_ulong_align_dn( footprint, FD_WKSP_ALIGN );
105 135 : ulong data_end = footprint - 1UL;
106 135 : ulong pinfo_off = fd_wksp_private_pinfo_off();
107 135 : ulong consumed = sizeof(fd_wksp_private_pinfo_t) + sz_typical;
108 135 : ulong part_max = (data_end - pinfo_off) / (consumed + (ulong)!consumed); /* avoid div-by-zero */
109 135 : if( FD_UNLIKELY( (!footprint) | (!sz_typical) | (sz_typical>consumed) | (pinfo_off>data_end) ) ) return 0UL;
110 126 : return fd_ulong_min( part_max, FD_WKSP_PRIVATE_PINFO_IDX_NULL );
111 135 : }
112 :
113 : ulong
114 : fd_wksp_data_max_est( ulong footprint,
115 144 : ulong part_max ) {
116 144 : footprint = fd_ulong_align_dn( footprint, FD_WKSP_ALIGN );
117 144 : ulong data_end = footprint - 1UL;
118 144 : ulong data_off = fd_wksp_private_data_off( part_max );
119 144 : if( FD_UNLIKELY( (!part_max) | (part_max>FD_WKSP_PRIVATE_PINFO_IDX_NULL) |
120 144 : (part_max > ((ULONG_MAX - fd_wksp_private_pinfo_off())/sizeof(fd_wksp_private_pinfo_t))) | /* covered above */
121 144 : (!footprint) | (data_off>=data_end) ) ) return 0UL;
122 126 : return data_end - data_off;
123 144 : }
124 :
125 : ulong
126 3 : fd_wksp_align( void ) {
127 3 : return FD_WKSP_ALIGN;
128 3 : }
129 :
130 : ulong
131 : fd_wksp_footprint( ulong part_max,
132 1596 : ulong data_max ) {
133 1596 : ulong data_off = fd_wksp_private_data_off( part_max );
134 1596 : if( FD_UNLIKELY( (!part_max) | (part_max>FD_WKSP_PRIVATE_PINFO_IDX_NULL) | (!data_max) |
135 1596 : (part_max > ((ULONG_MAX - fd_wksp_private_pinfo_off())/sizeof(fd_wksp_private_pinfo_t))) | /* Covered above */
136 1596 : (data_max > (ULONG_MAX - FD_WKSP_ALIGN + 1UL - data_off - 1UL) ) ) ) return 0UL;
137 1527 : return fd_ulong_align_up( data_off + data_max + 1UL, FD_WKSP_ALIGN );
138 1596 : }
139 :
140 : void *
141 : fd_wksp_new( void * shmem,
142 : char const * name,
143 : uint seed,
144 : ulong part_max,
145 129 : ulong data_max ) {
146 129 : fd_wksp_t * wksp = (fd_wksp_t *)shmem;
147 :
148 129 : if( FD_UNLIKELY( !wksp ) ) {
149 3 : FD_LOG_WARNING(( "NULL shmem" ));
150 3 : return NULL;
151 3 : }
152 :
153 126 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
154 3 : FD_LOG_WARNING(( "bad align" ));
155 3 : return NULL;
156 3 : }
157 :
158 123 : ulong name_len = fd_shmem_name_len( name );
159 123 : if( FD_UNLIKELY( !name_len ) ) {
160 3 : FD_LOG_WARNING(( "bad name" ));
161 3 : return NULL;
162 3 : }
163 :
164 120 : ulong footprint = fd_wksp_footprint( part_max, data_max );
165 120 : if( FD_UNLIKELY( !footprint ) ) {
166 12 : FD_LOG_WARNING(( "bad part_max and/or data_max" ));
167 12 : return NULL;
168 12 : }
169 :
170 108 : fd_memset( wksp, 0, fd_wksp_footprint( part_max, 1UL ) );
171 :
172 108 : wksp->part_max = part_max;
173 108 : wksp->data_max = data_max;
174 108 : wksp->gaddr_lo = fd_wksp_private_data_off( part_max );
175 108 : wksp->gaddr_hi = wksp->gaddr_lo + data_max;
176 108 : fd_memcpy( wksp->name, name, name_len+1UL );
177 108 : wksp->seed = seed;
178 108 : wksp->idle_top_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
179 108 : wksp->part_head_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
180 108 : wksp->part_tail_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
181 108 : wksp->part_used_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
182 108 : wksp->part_free_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
183 108 : wksp->cycle_tag = 4UL; /* Verify uses tags 0-3 */
184 108 : wksp->owner = 0UL; /* Mark as locked and in construction */
185 :
186 : /* Note that wksp->owner was set to zero above, "locking" the wksp by
187 : group_id 0. And the memset above set all the partition tags to
188 : zero such that there are no allocated partitions. So once we set
189 : magic below, we can finish the initialization by rebuilding and
190 : unlocking. Since fd_log_group_id is non-zero, the zero owner
191 : indicates to any remote observer of the shared memory region that
192 : the wksp is being built for the first time. */
193 :
194 108 : FD_COMPILER_MFENCE();
195 108 : FD_VOLATILE( wksp->magic ) = FD_WKSP_MAGIC;
196 108 : FD_COMPILER_MFENCE();
197 :
198 108 : int err = fd_wksp_rebuild( wksp, seed );
199 108 : if( FD_UNLIKELY( err ) ) { /* Should be impossible at this point */
200 :
201 0 : FD_COMPILER_MFENCE();
202 0 : FD_VOLATILE( wksp->magic ) = 0UL;
203 0 : FD_COMPILER_MFENCE();
204 :
205 0 : FD_LOG_WARNING(( "fd_wksp_rebuild failed (%i-%s)", err, fd_wksp_strerror( err ) ));
206 0 : return NULL;
207 0 : }
208 :
209 : #if FD_HAS_DEEPASAN
210 : /* Poison entire workspace except wksp header and the pinfo array. */
211 : void * wksp_data = (void*)((ulong)wksp + fd_wksp_private_pinfo_off());
212 : fd_asan_poison( wksp_data, footprint - fd_wksp_private_pinfo_off() );
213 : fd_wksp_private_pinfo_t * pinfo = fd_wksp_private_pinfo( wksp );
214 : fd_asan_unpoison( pinfo, FD_WKSP_PRIVATE_PINFO_FOOTPRINT * part_max );
215 : #endif
216 :
217 108 : fd_wksp_private_unlock( wksp );
218 :
219 108 : return wksp;
220 108 : }
221 :
222 : fd_wksp_t *
223 1923 : fd_wksp_join( void * shwksp ) {
224 1923 : fd_wksp_t * wksp = (fd_wksp_t *)shwksp;
225 :
226 1923 : if( FD_UNLIKELY( !wksp ) ) {
227 3 : FD_LOG_WARNING(( "NULL shwksp" ));
228 3 : return NULL;
229 3 : }
230 :
231 1920 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
232 3 : FD_LOG_WARNING(( "bad align" ));
233 3 : return NULL;
234 3 : }
235 :
236 1917 : if( FD_UNLIKELY( wksp->magic!=FD_WKSP_MAGIC ) ) {
237 3 : FD_LOG_WARNING(( "bad magic" ));
238 3 : return NULL;
239 3 : }
240 :
241 1914 : return wksp;
242 1917 : }
243 :
244 : void *
245 1815 : fd_wksp_leave( fd_wksp_t * wksp ) {
246 1815 : if( FD_UNLIKELY( !wksp ) ) {
247 3 : FD_LOG_WARNING(( "NULL wksp" ));
248 3 : return NULL;
249 3 : }
250 :
251 1812 : return (void *)wksp;
252 1815 : }
253 :
254 : void *
255 105 : fd_wksp_delete( void * shwksp ) {
256 105 : fd_wksp_t * wksp = (fd_wksp_t *)shwksp;
257 :
258 105 : if( FD_UNLIKELY( !wksp ) ) {
259 3 : FD_LOG_WARNING(( "NULL shwksp" ));
260 3 : return NULL;
261 3 : }
262 :
263 102 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
264 3 : FD_LOG_WARNING(( "bad align" ));
265 3 : return NULL;
266 3 : }
267 :
268 99 : if( FD_UNLIKELY( wksp->magic!=FD_WKSP_MAGIC ) ) {
269 3 : FD_LOG_WARNING(( "bad magic" ));
270 3 : return NULL;
271 3 : }
272 :
273 : /* TODO: consider testing owner */
274 :
275 96 : FD_COMPILER_MFENCE();
276 96 : FD_VOLATILE( wksp->magic ) = 0UL;
277 96 : FD_COMPILER_MFENCE();
278 :
279 : # if FD_HAS_DEEPASAN
280 : /* Unpoison entire wksp region. */
281 : ulong footprint = fd_wksp_footprint( wksp->part_max, wksp->data_max );
282 : void * wksp_data = (void*)((ulong)wksp + fd_wksp_private_pinfo_off());
283 : fd_asan_unpoison( wksp_data, footprint - fd_wksp_private_pinfo_off());
284 : # endif
285 :
286 96 : return wksp;
287 99 : }
288 :
289 3 : char const * fd_wksp_name ( fd_wksp_t const * wksp ) { return wksp->name; }
290 111 : uint fd_wksp_seed ( fd_wksp_t const * wksp ) { return wksp->seed; }
291 3 : ulong fd_wksp_part_max( fd_wksp_t const * wksp ) { return wksp->part_max; }
292 3 : ulong fd_wksp_data_max( fd_wksp_t const * wksp ) { return wksp->data_max; }
293 :
294 : ulong
295 0 : fd_wksp_owner( fd_wksp_t const * wksp ) {
296 0 : FD_COMPILER_MFENCE();
297 0 : ulong owner = FD_VOLATILE_CONST( wksp->owner );
298 0 : FD_COMPILER_MFENCE();
299 0 : return owner;
300 0 : }
301 :
302 : char const *
303 48 : fd_wksp_strerror( int err ) {
304 48 : switch( err ) {
305 3 : case FD_WKSP_SUCCESS: return "success";
306 12 : case FD_WKSP_ERR_INVAL: return "inval";
307 30 : case FD_WKSP_ERR_FAIL: return "fail";
308 3 : case FD_WKSP_ERR_CORRUPT: return "corrupt";
309 0 : default: break;
310 48 : }
311 0 : return "unknown";
312 48 : }
313 :
314 : int
315 51 : fd_wksp_verify( fd_wksp_t * wksp ) {
316 :
317 430491 : # define TEST(c) do { \
318 430491 : if( FD_UNLIKELY( !(c) ) ) { FD_LOG_WARNING(( "FAIL: %s", #c )); return FD_WKSP_ERR_CORRUPT; } \
319 430491 : } while(0)
320 :
321 : /* Validate metadata */
322 :
323 51 : TEST( wksp );
324 51 : TEST( wksp->magic==FD_WKSP_MAGIC );
325 :
326 51 : ulong part_max = wksp->part_max;
327 51 : ulong data_max = wksp->data_max;
328 51 : TEST( fd_wksp_footprint( part_max, data_max ) );
329 :
330 51 : ulong gaddr_lo = wksp->gaddr_lo; TEST( gaddr_lo==fd_wksp_private_data_off( part_max ) );
331 51 : ulong gaddr_hi = wksp->gaddr_hi; TEST( gaddr_hi==gaddr_lo+data_max );
332 :
333 51 : TEST( fd_shmem_name_len( wksp->name ) );
334 :
335 : /* seed is arbitrary */
336 :
337 51 : TEST( wksp->cycle_tag >= 4UL );
338 :
339 : /* TODO: consider verifying owner */
340 :
341 51 : fd_wksp_private_pinfo_t * pinfo = fd_wksp_private_pinfo( wksp );
342 :
343 : /* Clear out cycle tags */
344 :
345 201381 : for( ulong i=0UL; i<part_max; i++ ) pinfo[ i ].cycle_tag = 0UL;
346 :
347 : /* Verify the idle stack */
348 :
349 51 : ulong idle_cnt = 0UL;
350 :
351 51 : do {
352 51 : ulong i = fd_wksp_private_pinfo_idx( wksp->idle_top_cidx );
353 199071 : while( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
354 :
355 : /* Visit i. Note that i has not been validated yet. */
356 :
357 199020 : TEST( i<part_max ); /* Validate i */
358 199020 : TEST( !pinfo[ i ].cycle_tag ); /* Make sure not visited before */
359 199020 : pinfo[ i ].cycle_tag = 1UL; /* Mark as visited in idle stack */
360 199020 : idle_cnt++; /* Update the idle cnt */
361 :
362 : /* Advance to the next idle */
363 :
364 199020 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx );
365 199020 : }
366 51 : } while(0);
367 :
368 : /* Idle stack looks intact, verify partitioning */
369 :
370 51 : ulong free_cnt = 0UL;
371 51 : ulong used_cnt = 0UL;
372 :
373 51 : do {
374 51 : ulong j = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
375 51 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_head_cidx );
376 51 : ulong g = gaddr_lo;
377 :
378 51 : int last_free = 0;
379 :
380 2361 : while( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
381 :
382 : /* At this point, we last visited j. Visit i. Note that j has
383 : been validated but i has not. */
384 :
385 2310 : TEST( i<part_max ); /* Validate i */
386 2310 : TEST( pinfo[ i ].gaddr_lo==g ); /* Make sure partition is tightly adjacent to previous */
387 2310 : TEST( pinfo[ i ].gaddr_hi> g ); /* Make sure partition size is non-zero */
388 2310 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].prev_cidx )==j ); /* Make sure correct prev partition */
389 2310 : TEST( !pinfo[ i ].cycle_tag ); /* Make sure not visited before */
390 2310 : pinfo[ i ].cycle_tag = 2UL; /* Mark as visited in partitioning */
391 :
392 2310 : g = pinfo[ i ].gaddr_hi; /* Extract where the next partition should start */
393 2310 : int is_free = !pinfo[ i ].tag; /* Determine if this partition is free or used */
394 2310 : TEST( !(last_free & is_free) ); /* Make sure no adjacent free partitions */
395 2310 : free_cnt += (ulong) is_free; /* Update the free cnt */
396 2310 : used_cnt += (ulong)!is_free; /* Update the used cnt */
397 :
398 : /* Advance to the next partition */
399 :
400 2310 : last_free = is_free;
401 :
402 2310 : j = i;
403 2310 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].next_cidx );
404 2310 : }
405 :
406 51 : TEST( fd_wksp_private_pinfo_idx( wksp->part_tail_cidx )==j ); /* Make sure correct partition tail */
407 51 : TEST( g==gaddr_hi ); /* Make sure complete partitioning */
408 51 : TEST( (idle_cnt + free_cnt + used_cnt)==part_max ); /* Make sure no lost idle partitions */
409 51 : } while(0);
410 :
411 : /* Idle stack and partitioning look intact, validate used treap */
412 :
413 51 : do {
414 51 : ulong visit_cnt = 0UL;
415 :
416 51 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_used_cidx );
417 51 : ulong s = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
418 51 : ulong g = gaddr_lo;
419 :
420 51 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
421 39 : TEST( i<part_max ); /* Validate i */
422 39 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
423 39 : }
424 :
425 2691 : for(;;) {
426 :
427 : /* At this point i is and everything on stack is validated */
428 :
429 2691 : if( fd_wksp_private_pinfo_idx_is_null( i ) ) {
430 1371 : if( fd_wksp_private_pinfo_idx_is_null( s ) ) break; /* Done */
431 :
432 : /* Pop stack */
433 :
434 1320 : i = s;
435 1320 : s = fd_wksp_private_pinfo_idx( pinfo[ i ].stack_cidx );
436 :
437 : /* Visit i */
438 :
439 1320 : ulong p = fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx ); /* Extract the parent */
440 :
441 1320 : TEST( pinfo[ i ].gaddr_lo>=g ); /* Make sure this starts after last visited */
442 1320 : TEST( pinfo[ i ].tag ); /* Make sure tagged as a used partition */
443 1320 : TEST( pinfo[ i ].cycle_tag==2UL ); /* Make sure in partitioning and not visited yet this traversal */
444 1320 : if( !fd_wksp_private_pinfo_idx_is_null( p ) ) /* Make sure heap property satisfied */
445 1281 : TEST( pinfo[ p ].heap_prio >= pinfo[ i ].heap_prio );
446 :
447 1320 : TEST( !pinfo[ i ].in_same ); /* Make sure unique */
448 1320 : TEST( fd_wksp_private_pinfo_idx_is_null( fd_wksp_private_pinfo_idx( pinfo[ i ].same_cidx ) ) ); /* " */
449 :
450 1320 : pinfo[ i ].cycle_tag = 3UL; /* Mark as visited this traversal */
451 1320 : visit_cnt++; /* Update the visit cnt */
452 1320 : g = pinfo[ i ].gaddr_hi; /* Get minimum start for next partition */
453 :
454 : /* Traverse the right subtree */
455 :
456 1320 : p = i;
457 1320 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].right_cidx );
458 1320 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
459 672 : TEST( i<part_max ); /* Validate i */
460 672 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==p ); /* Validate parent */
461 672 : }
462 :
463 1320 : } else {
464 :
465 : /* At this point i and everything on the stack is validated.
466 : Push i to the stack and recurse on the left subtree. */
467 :
468 1320 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( s );
469 1320 : s = i;
470 1320 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].left_cidx );
471 1320 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
472 609 : TEST( i<part_max ); /* Validate i */
473 609 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
474 609 : }
475 :
476 1320 : }
477 2691 : }
478 :
479 51 : TEST( visit_cnt==used_cnt ); /* Make sure all used partitions in used treap */
480 51 : } while(0);
481 :
482 : /* Idle stack, partitioning and used treap look intact, validate the
483 : free treap. */
484 :
485 51 : do {
486 51 : ulong visit_cnt = 0UL;
487 :
488 51 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_free_cidx );
489 51 : ulong s = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
490 51 : ulong sz = 0UL;
491 :
492 51 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
493 51 : TEST( i<part_max ); /* Validate i */
494 51 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
495 51 : }
496 :
497 1209 : for(;;) {
498 :
499 : /* At this point i and everything on the stack is validated */
500 :
501 1209 : if( fd_wksp_private_pinfo_idx_is_null( i ) ) {
502 630 : if( fd_wksp_private_pinfo_idx_is_null( s ) ) break; /* Done */
503 :
504 : /* Pop stack */
505 :
506 579 : i = s;
507 579 : s = fd_wksp_private_pinfo_idx( pinfo[ i ].stack_cidx );
508 :
509 : /* Visit i */
510 :
511 579 : ulong p = fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx ); /* Extract the parent */
512 579 : ulong isz = fd_wksp_private_pinfo_sz( pinfo + i ); /* Extract the size */
513 :
514 579 : TEST( isz>sz ); /* Make sure this partition i larger than previous */
515 579 : TEST( !pinfo[ i ].tag ); /* Make sure tagged as a free partition */
516 579 : TEST( !pinfo[ i ].in_same ); /* Make sure marked as not in same */
517 :
518 579 : if( !fd_wksp_private_pinfo_idx_is_null( p ) ) { /* Make sure heap property satisfied */
519 528 : TEST( pinfo[ p ].heap_prio >= pinfo[ i ].heap_prio );
520 528 : }
521 :
522 579 : sz = isz; /* Update largest size partition seen so far */
523 :
524 : /* Traverse all same sized partitions */
525 :
526 579 : ulong j = i;
527 990 : for(;;) {
528 :
529 : /* At this point, j is validated */
530 :
531 990 : TEST( pinfo[ j ].cycle_tag==2UL ); /* Make sure in partitioning and not visited yet this traversal */
532 990 : pinfo[ j ].cycle_tag = 3UL; /* Mark as visited this traversal */
533 990 : visit_cnt++;
534 :
535 990 : ulong k = fd_wksp_private_pinfo_idx( pinfo[ j ].same_cidx ); /* Get the next same sized */
536 990 : if( fd_wksp_private_pinfo_idx_is_null( k ) ) break; /* If no more, we are done with this node */
537 411 : TEST( k<part_max ); /* Make sure valid index */
538 411 : TEST( fd_wksp_private_pinfo_sz( pinfo + k )==sz ); /* Make sure same size */
539 411 : TEST( pinfo[ k ].in_same ); /* Make sure marked as in same */
540 411 : TEST( fd_wksp_private_pinfo_idx_is_null( fd_wksp_private_pinfo_idx( pinfo[ k ].left_cidx ) ) );
541 411 : TEST( fd_wksp_private_pinfo_idx_is_null( fd_wksp_private_pinfo_idx( pinfo[ k ].right_cidx ) ) );
542 411 : TEST( fd_wksp_private_pinfo_idx( pinfo[ k ].parent_cidx )==j ); /* Make sure correct parent */
543 411 : j = k;
544 411 : }
545 :
546 : /* Recurse on the right subtree */
547 :
548 579 : p = i;
549 579 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].right_cidx );
550 579 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
551 270 : TEST( i<part_max ); /* Validate i */
552 270 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==p ); /* Validate parent */
553 270 : }
554 :
555 579 : } else {
556 :
557 579 : TEST( i<part_max ); /* Validate i */
558 :
559 : /* At this point i and everything on the stack is validated.
560 : Push i to the stack and recurse on the left subtree. */
561 :
562 579 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( s );
563 579 : s = i;
564 579 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].left_cidx );
565 579 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
566 258 : TEST( i<part_max ); /* Validate i */
567 258 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
568 258 : }
569 579 : }
570 1209 : }
571 :
572 51 : TEST( visit_cnt==free_cnt ); /* Make sure all free partitions in free treap */
573 :
574 51 : } while(0);
575 :
576 51 : # undef TEST
577 :
578 51 : return FD_WKSP_SUCCESS;
579 51 : }
580 :
581 : int
582 : fd_wksp_rebuild( fd_wksp_t * wksp,
583 792 : uint seed ) {
584 :
585 : /* Load the wksp metadata, don't rebuild if any of it looks even
586 : slightly off. */
587 :
588 792 : if( FD_UNLIKELY( !wksp ) ) {
589 0 : FD_LOG_WARNING(( "NULL wksp" ));
590 0 : return FD_WKSP_ERR_CORRUPT;
591 0 : }
592 :
593 792 : ulong magic = wksp->magic;
594 792 : ulong part_max = wksp->part_max;
595 792 : ulong data_max = wksp->data_max;
596 792 : ulong gaddr_lo = wksp->gaddr_lo;
597 792 : ulong gaddr_hi = wksp->gaddr_hi;
598 792 : ulong cycle_tag = wksp->cycle_tag;
599 :
600 : /* TODO: consider verifying owner */
601 :
602 792 : ulong footprint = fd_wksp_footprint( part_max, data_max );
603 792 : ulong gaddr_lo_exp = fd_wksp_private_data_off( part_max );
604 792 : ulong gaddr_hi_exp = gaddr_lo_exp + data_max;
605 792 : if( FD_UNLIKELY( (magic!=FD_WKSP_MAGIC) | (!footprint) | (!fd_shmem_name_len( wksp->name )) |
606 792 : (gaddr_lo!=gaddr_lo_exp) | (gaddr_hi!=gaddr_hi_exp) | (cycle_tag<4UL) ) ) {
607 0 : FD_LOG_WARNING(( "bad metadata\n\t"
608 0 : "magic %016lx (exp %016lx)\n\t"
609 0 : "part_max %lu data_max %lu (footprint %lu)\n\t"
610 0 : "gaddr_lo %lu (exp %lu)\n\t"
611 0 : "gaddr_hi %lu (exp %lu)\n\t"
612 0 : "cycle_tag %lu (exp>=4)",
613 0 : magic, FD_WKSP_MAGIC, part_max, data_max, footprint,
614 0 : gaddr_lo, gaddr_lo_exp, gaddr_hi, gaddr_hi_exp, cycle_tag ));
615 0 : return FD_WKSP_ERR_CORRUPT;
616 0 : }
617 :
618 : /* Scan the wksp pinfo and insert any used partitions into the used
619 : treap and put the rest on the idle stack. If there is any sign of
620 : corruption (empty, bad range or overlap between used partitions),
621 : we abort the rebuild (this is almost certainly data corruption of
622 : some form and we don't have enough info to resolve a conflict
623 : without potentially making the situation worse). We do the scan in
624 : reverse order to rebuild the idle stack in forward order.
625 :
626 : Note that we don't ever change the gaddr_lo,gaddr_hi of any tagged
627 : partitions such that operation is guaranteed to never change the
628 : single source of truth. As such, this operation can be interrupted
629 : and restarted arbitrarily safely.*/
630 :
631 792 : fd_wksp_private_pinfo_t * pinfo = fd_wksp_private_pinfo( wksp );
632 :
633 792 : do {
634 792 : wksp->seed = seed;
635 792 : wksp->idle_top_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL ); /* Flush idle stack */
636 792 : wksp->part_used_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL ); /* Flush used treap */
637 792 : wksp->part_free_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL ); /* Flush free treap */
638 :
639 792 : ulong i = part_max;
640 4931139 : while( i ) {
641 4930347 : i--;
642 :
643 : /* Ideally, heap priorities should just be a shuffling of the
644 : integers [0,part_max). fd_uint_hash will generate such a
645 : shuffling for part_max = 2^32. Using the lower 30 bits
646 : (reserving bit 31 for bulk operations) will yield something
647 : very close. We use seed to mix it up some more. */
648 :
649 4930347 : pinfo[ i ].in_same = 0U;
650 4930347 : pinfo[ i ].heap_prio = fd_uint_hash( seed ^ (uint)i ) & ((1U<<30)-1U);
651 4930347 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
652 4930347 : pinfo[ i ].cycle_tag = 0U;
653 :
654 4930347 : ulong tag = pinfo[ i ].tag;
655 4930347 : if( !tag ) { /* Not used ... make it available for reuse below */
656 4921479 : fd_wksp_private_idle_stack_push( i, wksp, pinfo );
657 4921479 : continue;
658 4921479 : }
659 :
660 8868 : pinfo[ i ].prev_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
661 8868 : pinfo[ i ].next_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
662 :
663 8868 : if( FD_UNLIKELY( fd_wksp_private_used_treap_insert( i, wksp, pinfo ) ) ) return FD_WKSP_ERR_CORRUPT; /* Logs details */
664 8868 : }
665 792 : } while(0);
666 :
667 : /* At this point, a partition is either in the idle stack or used
668 : treap. Further, we have:
669 :
670 : | used | idle
671 : ----------+----------------------------+--------
672 : gaddr_* | non-empty range | 0
673 : | no overlap with other used | 0
674 : tag | non-zero | 0
675 : in_same | 0 | 0
676 : heap_prio | randomized | randomized
677 : prev | NULL | NULL
678 : next | NULL | NULL
679 : left | used treap managed | NULL
680 : right | used treap managed | NULL
681 : same | used treap managed (NULL) | NULL
682 : parent | used treap managed | idle stack managed
683 : stack | wksp managed | wksp managed
684 : cycle_tag | wksp managed | wksp managed
685 :
686 : In-order traverse the used treap to rebuild the partitioning and
687 : the free treap. */
688 :
689 792 : do {
690 792 : uint * j_next_cidx_ptr = &wksp->part_head_cidx; /* Location of most recently added partition next link */
691 :
692 792 : ulong j = FD_WKSP_PRIVATE_PINFO_IDX_NULL; /* Most recently added partition */
693 792 : ulong g0 = gaddr_lo; /* Most recently added partition end */
694 :
695 792 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_used_cidx );
696 792 : ulong s = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
697 18528 : for(;;) {
698 18528 : if( fd_wksp_private_pinfo_idx_is_null( i ) ) {
699 9660 : if( fd_wksp_private_pinfo_idx_is_null( s ) ) break; /* Done */
700 :
701 : /* Pop traversal stack */
702 :
703 8868 : i = s;
704 8868 : s = fd_wksp_private_pinfo_idx( pinfo[ i ].stack_cidx );
705 :
706 : /* Visit i */
707 :
708 8868 : ulong g1 = pinfo[ i ].gaddr_lo;
709 8868 : if( g1 > g0 ) { /* There's a gap between i and the most recently added partition */
710 :
711 : /* Acquire an idle partition to hold the gap */
712 :
713 6231 : if( FD_UNLIKELY( fd_wksp_private_idle_stack_is_empty( wksp ) ) ) {
714 0 : FD_LOG_WARNING(( "part_max (%lu) too small to fill gap before partition %lu (tag %lu gaddr_lo %lu gaddr_hi %lu)",
715 0 : part_max, i, pinfo[i].tag, pinfo[i].gaddr_lo, pinfo[i].gaddr_hi ));
716 0 : return FD_WKSP_ERR_CORRUPT;
717 0 : }
718 6231 : ulong k = fd_wksp_private_idle_stack_pop( wksp, pinfo );
719 :
720 : /* Populate the acquired partition with the gap details,
721 : append it to the wksp partitioning and insert it into the
722 : free treap. Note that stack_push/pop reset gaddr_lo,
723 : gaddr_hi, tag, in_same, {prev, next, left, right, same,
724 : parent}_cidx. It preserved heap_prio from its original
725 : assignment and didn't touch stack_cidx or cycle_tag. */
726 :
727 6231 : pinfo[ k ].gaddr_lo = g0;
728 6231 : pinfo[ k ].gaddr_hi = g1;
729 6231 : pinfo[ k ].prev_cidx = fd_wksp_private_pinfo_cidx( j );
730 6231 : *j_next_cidx_ptr = fd_wksp_private_pinfo_cidx( k );
731 6231 : j_next_cidx_ptr = &pinfo[ k ].next_cidx;
732 6231 : j = k;
733 6231 : g0 = g1;
734 :
735 6231 : fd_wksp_private_free_treap_insert( j, wksp, pinfo );
736 6231 : }
737 :
738 : /* Add i to the partitioning. */
739 :
740 8868 : pinfo[ i ].prev_cidx = fd_wksp_private_pinfo_cidx( j );
741 8868 : *j_next_cidx_ptr = fd_wksp_private_pinfo_cidx( i );
742 8868 : j_next_cidx_ptr = &pinfo[ i ].next_cidx;
743 8868 : j = i;
744 8868 : g0 = pinfo[ i ].gaddr_hi;
745 :
746 : /* Traverse the right subtree */
747 :
748 8868 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].right_cidx );
749 :
750 8868 : } else {
751 :
752 : /* Push i to the stack and recurse on the left subtree. */
753 :
754 8868 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( s );
755 8868 : s = i;
756 8868 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].left_cidx );
757 :
758 8868 : }
759 18528 : }
760 :
761 792 : if( g0 < gaddr_hi ) { /* Have final gap to fill */
762 :
763 : /* This works the same as the above */
764 :
765 792 : if( FD_UNLIKELY( fd_wksp_private_idle_stack_is_empty( wksp ) ) ) {
766 0 : FD_LOG_WARNING(( "part_max (%lu) too small to complete partitioning", part_max ));
767 0 : return FD_WKSP_ERR_CORRUPT;
768 0 : }
769 792 : ulong k = fd_wksp_private_idle_stack_pop( wksp, pinfo );
770 :
771 792 : pinfo[ k ].gaddr_lo = g0;
772 792 : pinfo[ k ].gaddr_hi = gaddr_hi;
773 792 : pinfo[ k ].prev_cidx = fd_wksp_private_pinfo_cidx( j );
774 792 : *j_next_cidx_ptr = fd_wksp_private_pinfo_cidx( k );
775 792 : j_next_cidx_ptr = &pinfo[ k ].next_cidx;
776 792 : j = k;
777 : //g0 = gaddr_hi;
778 :
779 792 : fd_wksp_private_free_treap_insert( j, wksp, pinfo );
780 792 : }
781 :
782 792 : wksp->part_tail_cidx = fd_wksp_private_pinfo_cidx( j );
783 :
784 792 : } while(0);
785 :
786 792 : return FD_WKSP_SUCCESS;
787 792 : }
788 :
789 : void
790 0 : fd_wksp_mprotect( fd_wksp_t * wksp, int flag ) {
791 0 : if( FD_UNLIKELY( !wksp ) ) {
792 0 : FD_LOG_WARNING(( "NULL wksp" ));
793 0 : return;
794 0 : }
795 :
796 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
797 0 : FD_LOG_WARNING(( "bad align" ));
798 0 : return;
799 0 : }
800 :
801 0 : if( FD_UNLIKELY( wksp->magic!=FD_WKSP_MAGIC ) ) {
802 0 : FD_LOG_WARNING(( "bad magic" ));
803 0 : return;
804 0 : }
805 :
806 0 : ulong wksp_footprint = fd_wksp_footprint( wksp->part_max, wksp->data_max );
807 0 : if( FD_UNLIKELY( mprotect( (void*)wksp, wksp_footprint, ( flag ? PROT_READ : (PROT_READ|PROT_WRITE) ) ) ) ) {
808 0 : FD_LOG_WARNING(( "mprotect failed (%i-%s)", errno, fd_io_strerror( errno ) ));
809 0 : return;
810 0 : }
811 0 : }
|