Line data Source code
1 : #include "fd_wksp_private.h"
2 :
3 : int
4 311315901 : fd_wksp_private_lock( fd_wksp_t * wksp ) {
5 : # if FD_WKSP_LOCK_RECLAIM
6 : int warning = 0;
7 : #endif
8 311315901 : ulong me = fd_log_group_id();
9 :
10 311315901 : ulong * _owner = &wksp->owner;
11 311315901 : for(;;) {
12 :
13 : /* Note that we emulate CAS on platforms without FD_HAS_ATOMIC
14 : to minimize the amount of code differences we have to test. On
15 : platforms without FD_HAS_ATOMIC, a workspace should not be used
16 : concurrently though. */
17 :
18 311315901 : FD_COMPILER_MFENCE();
19 311315901 : # if FD_HAS_ATOMIC
20 311315901 : ulong pid = FD_ATOMIC_CAS( _owner, ULONG_MAX, me );
21 : # else
22 : ulong pid = FD_VOLATILE_CONST( *_owner );
23 : if( pid==ULONG_MAX ) FD_VOLATILE( *_owner ) = me;
24 : # endif
25 311315901 : FD_COMPILER_MFENCE();
26 :
27 311315901 : if( FD_LIKELY( pid==ULONG_MAX ) ) return FD_WKSP_SUCCESS;
28 :
29 : # if FD_WKSP_LOCK_RECLAIM
30 : int status = fd_log_group_id_query( pid );
31 : if( FD_UNLIKELY( status==FD_LOG_GROUP_ID_QUERY_DEAD ) ) { /* A process died while holding the lock, try to recover the lock */
32 :
33 : FD_COMPILER_MFENCE();
34 : # if FD_HAS_ATOMIC
35 : ulong cur = FD_ATOMIC_CAS( _owner, pid, me );
36 : # else
37 : ulong cur = FD_VOLATILE_CONST( *_owner );
38 : if( cur==pid ) FD_VOLATILE( *_owner ) = me;
39 : # endif
40 : FD_COMPILER_MFENCE();
41 :
42 : if( FD_LIKELY( cur==pid ) ) { /* We recovered the lock from the dead pid, try to fix up incomplete ops */
43 :
44 : FD_LOG_WARNING(( "Process %lu died in an operation on wksp %s; verifying", pid, wksp->name ));
45 : if( FD_LIKELY( !fd_wksp_verify( wksp ) ) ) { /* logs details of issues detected */
46 : FD_LOG_NOTICE(( "wksp verified" ));
47 : return FD_WKSP_SUCCESS;
48 : }
49 :
50 : FD_LOG_WARNING(( "Issues detected; rebuilding" ));
51 : if( FD_UNLIKELY( fd_wksp_rebuild( wksp, wksp->seed ) ) ) { /* Rebuild failed (logs details of issues detected) */
52 : /* Return control of the lock to the previous owner */
53 : FD_COMPILER_MFENCE();
54 : FD_VOLATILE( *_owner ) = pid;
55 : FD_COMPILER_MFENCE();
56 : FD_LOG_WARNING(( "corrupt wksp detected" ));
57 : return FD_WKSP_ERR_CORRUPT;
58 : }
59 :
60 : FD_LOG_NOTICE(( "wksp rebuilt" ));
61 : return FD_WKSP_SUCCESS;
62 :
63 : }
64 :
65 : /* Somebody beat us to recovering the lock ... try again */
66 :
67 : } else if( FD_UNLIKELY( status!=FD_LOG_GROUP_ID_QUERY_LIVE ) ) { /* Unclear pid status ... issue a warning and try again */
68 :
69 : if( FD_UNLIKELY( !warning ) ) {
70 : FD_LOG_WARNING(( "wksp %s is owned by unknown pid %li; attempting to recover", wksp->name, pid ));
71 : warning = 1;
72 : }
73 :
74 : }
75 :
76 : /* At this point, either another thread in this process has the
77 : lock, another active thread in another process has the lock,
78 : another unknown status thread in other process has the lock or
79 : another thread beat us to reclaim the lock from a dead process.
80 : In any case, we don't have the lock. Wait a while to limit O/S
81 : contention and try again. */
82 :
83 : FD_YIELD();
84 : # else
85 :
86 : /* If we are running without FD_WKSP_LOCK_RECLAIM then it is assumed
87 : that the contention is caused by a tile pinned to another core,
88 : and that this core is itself pinned so spin locking is best. */
89 0 : FD_SPIN_PAUSE();
90 :
91 0 : #endif
92 0 : }
93 :
94 : /* never get here */
95 311315901 : }
96 :
97 : /* Public APIs ********************************************************/
98 :
99 : ulong
100 : fd_wksp_part_max_est( ulong footprint,
101 213 : ulong sz_typical ) {
102 213 : footprint = fd_ulong_align_dn( footprint, FD_WKSP_ALIGN );
103 213 : ulong data_end = footprint - 1UL;
104 213 : ulong pinfo_off = fd_wksp_private_pinfo_off();
105 213 : ulong consumed = sizeof(fd_wksp_private_pinfo_t) + sz_typical;
106 213 : ulong part_max = (data_end - pinfo_off) / (consumed + (ulong)!consumed); /* avoid div-by-zero */
107 213 : if( FD_UNLIKELY( (!footprint) | (!sz_typical) | (sz_typical>consumed) | (pinfo_off>data_end) ) ) return 0UL;
108 204 : return fd_ulong_min( part_max, FD_WKSP_PRIVATE_PINFO_IDX_NULL );
109 213 : }
110 :
111 : ulong
112 : fd_wksp_data_max_est( ulong footprint,
113 231 : ulong part_max ) {
114 231 : footprint = fd_ulong_align_dn( footprint, FD_WKSP_ALIGN );
115 231 : ulong data_end = footprint - 1UL;
116 231 : ulong data_off = fd_wksp_private_data_off( part_max );
117 231 : if( FD_UNLIKELY( (!part_max) | (part_max>FD_WKSP_PRIVATE_PINFO_IDX_NULL) |
118 231 : (part_max > ((ULONG_MAX - fd_wksp_private_pinfo_off())/sizeof(fd_wksp_private_pinfo_t))) | /* covered above */
119 231 : (!footprint) | (data_off>=data_end) ) ) return 0UL;
120 213 : return data_end - data_off;
121 231 : }
122 :
123 : ulong
124 3 : fd_wksp_align( void ) {
125 3 : return FD_WKSP_ALIGN;
126 3 : }
127 :
128 : ulong
129 : fd_wksp_footprint( ulong part_max,
130 1824 : ulong data_max ) {
131 1824 : ulong data_off = fd_wksp_private_data_off( part_max );
132 1824 : if( FD_UNLIKELY( (!part_max) | (part_max>FD_WKSP_PRIVATE_PINFO_IDX_NULL) | (!data_max) |
133 1824 : (part_max > ((ULONG_MAX - fd_wksp_private_pinfo_off())/sizeof(fd_wksp_private_pinfo_t))) | /* Covered above */
134 1824 : (data_max > (ULONG_MAX - FD_WKSP_ALIGN + 1UL - data_off - 1UL) ) ) ) return 0UL;
135 1755 : return fd_ulong_align_up( data_off + data_max + 1UL, FD_WKSP_ALIGN );
136 1824 : }
137 :
138 : void *
139 : fd_wksp_new( void * shmem,
140 : char const * name,
141 : uint seed,
142 : ulong part_max,
143 216 : ulong data_max ) {
144 216 : fd_wksp_t * wksp = (fd_wksp_t *)shmem;
145 :
146 216 : if( FD_UNLIKELY( !wksp ) ) {
147 3 : FD_LOG_WARNING(( "NULL shmem" ));
148 3 : return NULL;
149 3 : }
150 :
151 213 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
152 3 : FD_LOG_WARNING(( "bad align" ));
153 3 : return NULL;
154 3 : }
155 :
156 210 : ulong name_len = fd_shmem_name_len( name );
157 210 : if( FD_UNLIKELY( !name_len ) ) {
158 3 : FD_LOG_WARNING(( "bad name" ));
159 3 : return NULL;
160 3 : }
161 :
162 207 : ulong footprint = fd_wksp_footprint( part_max, data_max );
163 207 : if( FD_UNLIKELY( !footprint ) ) {
164 12 : FD_LOG_WARNING(( "bad part_max and/or data_max" ));
165 12 : return NULL;
166 12 : }
167 :
168 195 : fd_memset( wksp, 0, fd_wksp_footprint( part_max, 1UL ) );
169 :
170 195 : wksp->part_max = part_max;
171 195 : wksp->data_max = data_max;
172 195 : wksp->gaddr_lo = fd_wksp_private_data_off( part_max );
173 195 : wksp->gaddr_hi = wksp->gaddr_lo + data_max;
174 195 : fd_memcpy( wksp->name, name, name_len+1UL );
175 195 : wksp->seed = seed;
176 195 : wksp->idle_top_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
177 195 : wksp->part_head_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
178 195 : wksp->part_tail_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
179 195 : wksp->part_used_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
180 195 : wksp->part_free_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
181 195 : wksp->cycle_tag = 4UL; /* Verify uses tags 0-3 */
182 195 : wksp->owner = 0UL; /* Mark as locked and in construction */
183 :
184 : /* Note that wksp->owner was set to zero above, "locking" the wksp by
185 : group_id 0. And the memset above set all the partition tags to
186 : zero such that there are no allocated partitions. So once we set
187 : magic below, we can finish the initialization by rebuilding and
188 : unlocking. Since fd_log_group_id is non-zero, the zero owner
189 : indicates to any remote observer of the shared memory region that
190 : the wksp is being built for the first time. */
191 :
192 195 : FD_COMPILER_MFENCE();
193 195 : FD_VOLATILE( wksp->magic ) = FD_WKSP_MAGIC;
194 195 : FD_COMPILER_MFENCE();
195 :
196 195 : int err = fd_wksp_rebuild( wksp, seed );
197 195 : if( FD_UNLIKELY( err ) ) { /* Should be impossible at this point */
198 :
199 0 : FD_COMPILER_MFENCE();
200 0 : FD_VOLATILE( wksp->magic ) = 0UL;
201 0 : FD_COMPILER_MFENCE();
202 :
203 0 : FD_LOG_WARNING(( "fd_wksp_rebuild failed (%i-%s)", err, fd_wksp_strerror( err ) ));
204 0 : return NULL;
205 0 : }
206 :
207 : #if FD_HAS_DEEPASAN
208 : /* Poison entire workspace except wksp header and the pinfo array. */
209 : void * wksp_data = (void*)((ulong)wksp + fd_wksp_private_pinfo_off());
210 : fd_asan_poison( wksp_data, footprint - fd_wksp_private_pinfo_off() );
211 : fd_wksp_private_pinfo_t * pinfo = fd_wksp_private_pinfo( wksp );
212 : fd_asan_unpoison( pinfo, FD_WKSP_PRIVATE_PINFO_FOOTPRINT * part_max );
213 : #endif
214 :
215 195 : fd_wksp_private_unlock( wksp );
216 :
217 195 : return wksp;
218 195 : }
219 :
220 : fd_wksp_t *
221 2010 : fd_wksp_join( void * shwksp ) {
222 2010 : fd_wksp_t * wksp = (fd_wksp_t *)shwksp;
223 :
224 2010 : if( FD_UNLIKELY( !wksp ) ) {
225 3 : FD_LOG_WARNING(( "NULL shwksp" ));
226 3 : return NULL;
227 3 : }
228 :
229 2007 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
230 3 : FD_LOG_WARNING(( "bad align" ));
231 3 : return NULL;
232 3 : }
233 :
234 2004 : if( FD_UNLIKELY( wksp->magic!=FD_WKSP_MAGIC ) ) {
235 3 : FD_LOG_WARNING(( "bad magic" ));
236 3 : return NULL;
237 3 : }
238 :
239 2001 : return wksp;
240 2004 : }
241 :
242 : void *
243 1848 : fd_wksp_leave( fd_wksp_t * wksp ) {
244 1848 : if( FD_UNLIKELY( !wksp ) ) {
245 3 : FD_LOG_WARNING(( "NULL wksp" ));
246 3 : return NULL;
247 3 : }
248 :
249 1845 : return (void *)wksp;
250 1848 : }
251 :
252 : void *
253 138 : fd_wksp_delete( void * shwksp ) {
254 138 : fd_wksp_t * wksp = (fd_wksp_t *)shwksp;
255 :
256 138 : if( FD_UNLIKELY( !wksp ) ) {
257 3 : FD_LOG_WARNING(( "NULL shwksp" ));
258 3 : return NULL;
259 3 : }
260 :
261 135 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
262 3 : FD_LOG_WARNING(( "bad align" ));
263 3 : return NULL;
264 3 : }
265 :
266 132 : if( FD_UNLIKELY( wksp->magic!=FD_WKSP_MAGIC ) ) {
267 3 : FD_LOG_WARNING(( "bad magic" ));
268 3 : return NULL;
269 3 : }
270 :
271 : /* TODO: consider testing owner */
272 :
273 129 : FD_COMPILER_MFENCE();
274 129 : FD_VOLATILE( wksp->magic ) = 0UL;
275 129 : FD_COMPILER_MFENCE();
276 :
277 : # if FD_HAS_DEEPASAN
278 : /* Unpoison entire wksp region. */
279 : ulong footprint = fd_wksp_footprint( wksp->part_max, wksp->data_max );
280 : void * wksp_data = (void*)((ulong)wksp + fd_wksp_private_pinfo_off());
281 : fd_asan_unpoison( wksp_data, footprint - fd_wksp_private_pinfo_off());
282 : # endif
283 :
284 129 : return wksp;
285 132 : }
286 :
287 3 : char const * fd_wksp_name ( fd_wksp_t const * wksp ) { return wksp->name; }
288 111 : uint fd_wksp_seed ( fd_wksp_t const * wksp ) { return wksp->seed; }
289 3 : ulong fd_wksp_part_max( fd_wksp_t const * wksp ) { return wksp->part_max; }
290 3 : ulong fd_wksp_data_max( fd_wksp_t const * wksp ) { return wksp->data_max; }
291 :
292 : ulong
293 0 : fd_wksp_owner( fd_wksp_t const * wksp ) {
294 0 : FD_COMPILER_MFENCE();
295 0 : ulong owner = FD_VOLATILE_CONST( wksp->owner );
296 0 : FD_COMPILER_MFENCE();
297 0 : return owner;
298 0 : }
299 :
300 : char const *
301 48 : fd_wksp_strerror( int err ) {
302 48 : switch( err ) {
303 3 : case FD_WKSP_SUCCESS: return "success";
304 12 : case FD_WKSP_ERR_INVAL: return "inval";
305 30 : case FD_WKSP_ERR_FAIL: return "fail";
306 3 : case FD_WKSP_ERR_CORRUPT: return "corrupt";
307 0 : default: break;
308 48 : }
309 0 : return "unknown";
310 48 : }
311 :
312 : int
313 51 : fd_wksp_verify( fd_wksp_t * wksp ) {
314 :
315 430491 : # define TEST(c) do { \
316 430491 : if( FD_UNLIKELY( !(c) ) ) { FD_LOG_WARNING(( "FAIL: %s", #c )); return FD_WKSP_ERR_CORRUPT; } \
317 430491 : } while(0)
318 :
319 : /* Validate metadata */
320 :
321 51 : TEST( wksp );
322 51 : TEST( wksp->magic==FD_WKSP_MAGIC );
323 :
324 51 : ulong part_max = wksp->part_max;
325 51 : ulong data_max = wksp->data_max;
326 51 : TEST( fd_wksp_footprint( part_max, data_max ) );
327 :
328 51 : ulong gaddr_lo = wksp->gaddr_lo; TEST( gaddr_lo==fd_wksp_private_data_off( part_max ) );
329 51 : ulong gaddr_hi = wksp->gaddr_hi; TEST( gaddr_hi==gaddr_lo+data_max );
330 :
331 51 : TEST( fd_shmem_name_len( wksp->name ) );
332 :
333 : /* seed is arbitrary */
334 :
335 51 : TEST( wksp->cycle_tag >= 4UL );
336 :
337 : /* TODO: consider verifying owner */
338 :
339 51 : fd_wksp_private_pinfo_t * pinfo = fd_wksp_private_pinfo( wksp );
340 :
341 : /* Clear out cycle tags */
342 :
343 201381 : for( ulong i=0UL; i<part_max; i++ ) pinfo[ i ].cycle_tag = 0UL;
344 :
345 : /* Verify the idle stack */
346 :
347 51 : ulong idle_cnt = 0UL;
348 :
349 51 : do {
350 51 : ulong i = fd_wksp_private_pinfo_idx( wksp->idle_top_cidx );
351 199071 : while( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
352 :
353 : /* Visit i. Note that i has not been validated yet. */
354 :
355 199020 : TEST( i<part_max ); /* Validate i */
356 199020 : TEST( !pinfo[ i ].cycle_tag ); /* Make sure not visited before */
357 199020 : pinfo[ i ].cycle_tag = 1UL; /* Mark as visited in idle stack */
358 199020 : idle_cnt++; /* Update the idle cnt */
359 :
360 : /* Advance to the next idle */
361 :
362 199020 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx );
363 199020 : }
364 51 : } while(0);
365 :
366 : /* Idle stack looks intact, verify partitioning */
367 :
368 51 : ulong free_cnt = 0UL;
369 51 : ulong used_cnt = 0UL;
370 :
371 51 : do {
372 51 : ulong j = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
373 51 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_head_cidx );
374 51 : ulong g = gaddr_lo;
375 :
376 51 : int last_free = 0;
377 :
378 2361 : while( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
379 :
380 : /* At this point, we last visited j. Visit i. Note that j has
381 : been validated but i has not. */
382 :
383 2310 : TEST( i<part_max ); /* Validate i */
384 2310 : TEST( pinfo[ i ].gaddr_lo==g ); /* Make sure partition is tightly adjacent to previous */
385 2310 : TEST( pinfo[ i ].gaddr_hi> g ); /* Make sure partition size is non-zero */
386 2310 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].prev_cidx )==j ); /* Make sure correct prev partition */
387 2310 : TEST( !pinfo[ i ].cycle_tag ); /* Make sure not visited before */
388 2310 : pinfo[ i ].cycle_tag = 2UL; /* Mark as visited in partitioning */
389 :
390 2310 : g = pinfo[ i ].gaddr_hi; /* Extract where the next partition should start */
391 2310 : int is_free = !pinfo[ i ].tag; /* Determine if this partition is free or used */
392 2310 : TEST( !(last_free & is_free) ); /* Make sure no adjacent free partitions */
393 2310 : free_cnt += (ulong) is_free; /* Update the free cnt */
394 2310 : used_cnt += (ulong)!is_free; /* Update the used cnt */
395 :
396 : /* Advance to the next partition */
397 :
398 2310 : last_free = is_free;
399 :
400 2310 : j = i;
401 2310 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].next_cidx );
402 2310 : }
403 :
404 51 : TEST( fd_wksp_private_pinfo_idx( wksp->part_tail_cidx )==j ); /* Make sure correct partition tail */
405 51 : TEST( g==gaddr_hi ); /* Make sure complete partitioning */
406 51 : TEST( (idle_cnt + free_cnt + used_cnt)==part_max ); /* Make sure no lost idle partitions */
407 51 : } while(0);
408 :
409 : /* Idle stack and partitioning look intact, validate used treap */
410 :
411 51 : do {
412 51 : ulong visit_cnt = 0UL;
413 :
414 51 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_used_cidx );
415 51 : ulong s = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
416 51 : ulong g = gaddr_lo;
417 :
418 51 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
419 39 : TEST( i<part_max ); /* Validate i */
420 39 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
421 39 : }
422 :
423 2691 : for(;;) {
424 :
425 : /* At this point i is and everything on stack is validated */
426 :
427 2691 : if( fd_wksp_private_pinfo_idx_is_null( i ) ) {
428 1371 : if( fd_wksp_private_pinfo_idx_is_null( s ) ) break; /* Done */
429 :
430 : /* Pop stack */
431 :
432 1320 : i = s;
433 1320 : s = fd_wksp_private_pinfo_idx( pinfo[ i ].stack_cidx );
434 :
435 : /* Visit i */
436 :
437 1320 : ulong p = fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx ); /* Extract the parent */
438 :
439 1320 : TEST( pinfo[ i ].gaddr_lo>=g ); /* Make sure this starts after last visited */
440 1320 : TEST( pinfo[ i ].tag ); /* Make sure tagged as a used partition */
441 1320 : TEST( pinfo[ i ].cycle_tag==2UL ); /* Make sure in partitioning and not visited yet this traversal */
442 1320 : if( !fd_wksp_private_pinfo_idx_is_null( p ) ) /* Make sure heap property satisfied */
443 1281 : TEST( pinfo[ p ].heap_prio >= pinfo[ i ].heap_prio );
444 :
445 1320 : TEST( !pinfo[ i ].in_same ); /* Make sure unique */
446 1320 : TEST( fd_wksp_private_pinfo_idx_is_null( fd_wksp_private_pinfo_idx( pinfo[ i ].same_cidx ) ) ); /* " */
447 :
448 1320 : pinfo[ i ].cycle_tag = 3UL; /* Mark as visited this traversal */
449 1320 : visit_cnt++; /* Update the visit cnt */
450 1320 : g = pinfo[ i ].gaddr_hi; /* Get minimum start for next partition */
451 :
452 : /* Traverse the right subtree */
453 :
454 1320 : p = i;
455 1320 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].right_cidx );
456 1320 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
457 672 : TEST( i<part_max ); /* Validate i */
458 672 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==p ); /* Validate parent */
459 672 : }
460 :
461 1320 : } else {
462 :
463 : /* At this point i and everything on the stack is validated.
464 : Push i to the stack and recurse on the left subtree. */
465 :
466 1320 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( s );
467 1320 : s = i;
468 1320 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].left_cidx );
469 1320 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
470 609 : TEST( i<part_max ); /* Validate i */
471 609 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
472 609 : }
473 :
474 1320 : }
475 2691 : }
476 :
477 51 : TEST( visit_cnt==used_cnt ); /* Make sure all used partitions in used treap */
478 51 : } while(0);
479 :
480 : /* Idle stack, partitioning and used treap look intact, validate the
481 : free treap. */
482 :
483 51 : do {
484 51 : ulong visit_cnt = 0UL;
485 :
486 51 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_free_cidx );
487 51 : ulong s = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
488 51 : ulong sz = 0UL;
489 :
490 51 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
491 51 : TEST( i<part_max ); /* Validate i */
492 51 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
493 51 : }
494 :
495 1209 : for(;;) {
496 :
497 : /* At this point i and everything on the stack is validated */
498 :
499 1209 : if( fd_wksp_private_pinfo_idx_is_null( i ) ) {
500 630 : if( fd_wksp_private_pinfo_idx_is_null( s ) ) break; /* Done */
501 :
502 : /* Pop stack */
503 :
504 579 : i = s;
505 579 : s = fd_wksp_private_pinfo_idx( pinfo[ i ].stack_cidx );
506 :
507 : /* Visit i */
508 :
509 579 : ulong p = fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx ); /* Extract the parent */
510 579 : ulong isz = fd_wksp_private_pinfo_sz( pinfo + i ); /* Extract the size */
511 :
512 579 : TEST( isz>sz ); /* Make sure this partition i larger than previous */
513 579 : TEST( !pinfo[ i ].tag ); /* Make sure tagged as a free partition */
514 579 : TEST( !pinfo[ i ].in_same ); /* Make sure marked as not in same */
515 :
516 579 : if( !fd_wksp_private_pinfo_idx_is_null( p ) ) { /* Make sure heap property satisfied */
517 528 : TEST( pinfo[ p ].heap_prio >= pinfo[ i ].heap_prio );
518 528 : }
519 :
520 579 : sz = isz; /* Update largest size partition seen so far */
521 :
522 : /* Traverse all same sized partitions */
523 :
524 579 : ulong j = i;
525 990 : for(;;) {
526 :
527 : /* At this point, j is validated */
528 :
529 990 : TEST( pinfo[ j ].cycle_tag==2UL ); /* Make sure in partitioning and not visited yet this traversal */
530 990 : pinfo[ j ].cycle_tag = 3UL; /* Mark as visited this traversal */
531 990 : visit_cnt++;
532 :
533 990 : ulong k = fd_wksp_private_pinfo_idx( pinfo[ j ].same_cidx ); /* Get the next same sized */
534 990 : if( fd_wksp_private_pinfo_idx_is_null( k ) ) break; /* If no more, we are done with this node */
535 411 : TEST( k<part_max ); /* Make sure valid index */
536 411 : TEST( fd_wksp_private_pinfo_sz( pinfo + k )==sz ); /* Make sure same size */
537 411 : TEST( pinfo[ k ].in_same ); /* Make sure marked as in same */
538 411 : TEST( fd_wksp_private_pinfo_idx_is_null( fd_wksp_private_pinfo_idx( pinfo[ k ].left_cidx ) ) );
539 411 : TEST( fd_wksp_private_pinfo_idx_is_null( fd_wksp_private_pinfo_idx( pinfo[ k ].right_cidx ) ) );
540 411 : TEST( fd_wksp_private_pinfo_idx( pinfo[ k ].parent_cidx )==j ); /* Make sure correct parent */
541 411 : j = k;
542 411 : }
543 :
544 : /* Recurse on the right subtree */
545 :
546 579 : p = i;
547 579 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].right_cidx );
548 579 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
549 270 : TEST( i<part_max ); /* Validate i */
550 270 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==p ); /* Validate parent */
551 270 : }
552 :
553 579 : } else {
554 :
555 579 : TEST( i<part_max ); /* Validate i */
556 :
557 : /* At this point i and everything on the stack is validated.
558 : Push i to the stack and recurse on the left subtree. */
559 :
560 579 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( s );
561 579 : s = i;
562 579 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].left_cidx );
563 579 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
564 258 : TEST( i<part_max ); /* Validate i */
565 258 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
566 258 : }
567 579 : }
568 1209 : }
569 :
570 51 : TEST( visit_cnt==free_cnt ); /* Make sure all free partitions in free treap */
571 :
572 51 : } while(0);
573 :
574 51 : # undef TEST
575 :
576 51 : return FD_WKSP_SUCCESS;
577 51 : }
578 :
579 : int
580 : fd_wksp_rebuild( fd_wksp_t * wksp,
581 960 : uint seed ) {
582 :
583 : /* Load the wksp metadata, don't rebuild if any of it looks even
584 : slightly off. */
585 :
586 960 : if( FD_UNLIKELY( !wksp ) ) {
587 0 : FD_LOG_WARNING(( "NULL wksp" ));
588 0 : return FD_WKSP_ERR_CORRUPT;
589 0 : }
590 :
591 960 : ulong magic = wksp->magic;
592 960 : ulong part_max = wksp->part_max;
593 960 : ulong data_max = wksp->data_max;
594 960 : ulong gaddr_lo = wksp->gaddr_lo;
595 960 : ulong gaddr_hi = wksp->gaddr_hi;
596 960 : ulong cycle_tag = wksp->cycle_tag;
597 :
598 : /* TODO: consider verifying owner */
599 :
600 960 : ulong footprint = fd_wksp_footprint( part_max, data_max );
601 960 : ulong gaddr_lo_exp = fd_wksp_private_data_off( part_max );
602 960 : ulong gaddr_hi_exp = gaddr_lo_exp + data_max;
603 960 : if( FD_UNLIKELY( (magic!=FD_WKSP_MAGIC) | (!footprint) | (!fd_shmem_name_len( wksp->name )) |
604 960 : (gaddr_lo!=gaddr_lo_exp) | (gaddr_hi!=gaddr_hi_exp) | (cycle_tag<4UL) ) ) {
605 0 : FD_LOG_WARNING(( "bad metadata\n\t"
606 0 : "magic %016lx (exp %016lx)\n\t"
607 0 : "part_max %lu data_max %lu (footprint %lu)\n\t"
608 0 : "gaddr_lo %lu (exp %lu)\n\t"
609 0 : "gaddr_hi %lu (exp %lu)\n\t"
610 0 : "cycle_tag %lu (exp>=4)",
611 0 : magic, FD_WKSP_MAGIC, part_max, data_max, footprint,
612 0 : gaddr_lo, gaddr_lo_exp, gaddr_hi, gaddr_hi_exp, cycle_tag ));
613 0 : return FD_WKSP_ERR_CORRUPT;
614 0 : }
615 :
616 : /* Scan the wksp pinfo and insert any used partitions into the used
617 : treap and put the rest on the idle stack. If there is any sign of
618 : corruption (empty, bad range or overlap between used partitions),
619 : we abort the rebuild (this is almost certainly data corruption of
620 : some form and we don't have enough info to resolve a conflict
621 : without potentially making the situation worse). We do the scan in
622 : reverse order to rebuild the idle stack in forward order.
623 :
624 : Note that we don't ever change the gaddr_lo,gaddr_hi of any tagged
625 : partitions such that operation is guaranteed to never change the
626 : single source of truth. As such, this operation can be interrupted
627 : and restarted arbitrarily safely.*/
628 :
629 960 : fd_wksp_private_pinfo_t * pinfo = fd_wksp_private_pinfo( wksp );
630 :
631 960 : do {
632 960 : wksp->seed = seed;
633 960 : wksp->idle_top_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL ); /* Flush idle stack */
634 960 : wksp->part_used_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL ); /* Flush used treap */
635 960 : wksp->part_free_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL ); /* Flush free treap */
636 :
637 960 : ulong i = part_max;
638 21761976 : while( i ) {
639 21761016 : i--;
640 :
641 : /* Ideally, heap priorities should just be a shuffling of the
642 : integers [0,part_max). fd_uint_hash will generate such a
643 : shuffling for part_max = 2^32. Using the lower 30 bits
644 : (reserving bit 31 for bulk operations) will yield something
645 : very close. We use seed to mix it up some more. */
646 :
647 21761016 : pinfo[ i ].in_same = 0U;
648 21761016 : pinfo[ i ].heap_prio = fd_uint_hash( seed ^ (uint)i ) & ((1U<<30)-1U);
649 21761016 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
650 21761016 : pinfo[ i ].cycle_tag = 0U;
651 :
652 21761016 : ulong tag = pinfo[ i ].tag;
653 21761016 : if( !tag ) { /* Not used ... make it available for reuse below */
654 21752148 : fd_wksp_private_idle_stack_push( i, wksp, pinfo );
655 21752148 : continue;
656 21752148 : }
657 :
658 8868 : pinfo[ i ].prev_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
659 8868 : pinfo[ i ].next_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
660 :
661 8868 : if( FD_UNLIKELY( fd_wksp_private_used_treap_insert( i, wksp, pinfo ) ) ) return FD_WKSP_ERR_CORRUPT; /* Logs details */
662 8868 : }
663 960 : } while(0);
664 :
665 : /* At this point, a partition is either in the idle stack or used
666 : treap. Further, we have:
667 :
668 : | used | idle
669 : ----------+----------------------------+--------
670 : gaddr_* | non-empty range | 0
671 : | no overlap with other used | 0
672 : tag | non-zero | 0
673 : in_same | 0 | 0
674 : heap_prio | randomized | randomized
675 : prev | NULL | NULL
676 : next | NULL | NULL
677 : left | used treap managed | NULL
678 : right | used treap managed | NULL
679 : same | used treap managed (NULL) | NULL
680 : parent | used treap managed | idle stack managed
681 : stack | wksp managed | wksp managed
682 : cycle_tag | wksp managed | wksp managed
683 :
684 : In-order traverse the used treap to rebuild the partitioning and
685 : the free treap. */
686 :
687 960 : do {
688 960 : uint * j_next_cidx_ptr = &wksp->part_head_cidx; /* Location of most recently added partition next link */
689 :
690 960 : ulong j = FD_WKSP_PRIVATE_PINFO_IDX_NULL; /* Most recently added partition */
691 960 : ulong g0 = gaddr_lo; /* Most recently added partition end */
692 :
693 960 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_used_cidx );
694 960 : ulong s = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
695 18696 : for(;;) {
696 18696 : if( fd_wksp_private_pinfo_idx_is_null( i ) ) {
697 9828 : if( fd_wksp_private_pinfo_idx_is_null( s ) ) break; /* Done */
698 :
699 : /* Pop traversal stack */
700 :
701 8868 : i = s;
702 8868 : s = fd_wksp_private_pinfo_idx( pinfo[ i ].stack_cidx );
703 :
704 : /* Visit i */
705 :
706 8868 : ulong g1 = pinfo[ i ].gaddr_lo;
707 8868 : if( g1 > g0 ) { /* There's a gap between i and the most recently added partition */
708 :
709 : /* Acquire an idle partition to hold the gap */
710 :
711 6231 : if( FD_UNLIKELY( fd_wksp_private_idle_stack_is_empty( wksp ) ) ) {
712 0 : FD_LOG_WARNING(( "part_max (%lu) too small to fill gap before partition %lu (tag %lu gaddr_lo %lu gaddr_hi %lu)",
713 0 : part_max, i, pinfo[i].tag, pinfo[i].gaddr_lo, pinfo[i].gaddr_hi ));
714 0 : return FD_WKSP_ERR_CORRUPT;
715 0 : }
716 6231 : ulong k = fd_wksp_private_idle_stack_pop( wksp, pinfo );
717 :
718 : /* Populate the acquired partition with the gap details,
719 : append it to the wksp partitioning and insert it into the
720 : free treap. Note that stack_push/pop reset gaddr_lo,
721 : gaddr_hi, tag, in_same, {prev, next, left, right, same,
722 : parent}_cidx. It preserved heap_prio from its original
723 : assignment and didn't touch stack_cidx or cycle_tag. */
724 :
725 6231 : pinfo[ k ].gaddr_lo = g0;
726 6231 : pinfo[ k ].gaddr_hi = g1;
727 6231 : pinfo[ k ].prev_cidx = fd_wksp_private_pinfo_cidx( j );
728 6231 : *j_next_cidx_ptr = fd_wksp_private_pinfo_cidx( k );
729 6231 : j_next_cidx_ptr = &pinfo[ k ].next_cidx;
730 6231 : j = k;
731 6231 : g0 = g1;
732 :
733 6231 : fd_wksp_private_free_treap_insert( j, wksp, pinfo );
734 6231 : }
735 :
736 : /* Add i to the partitioning. */
737 :
738 8868 : pinfo[ i ].prev_cidx = fd_wksp_private_pinfo_cidx( j );
739 8868 : *j_next_cidx_ptr = fd_wksp_private_pinfo_cidx( i );
740 8868 : j_next_cidx_ptr = &pinfo[ i ].next_cidx;
741 8868 : j = i;
742 8868 : g0 = pinfo[ i ].gaddr_hi;
743 :
744 : /* Traverse the right subtree */
745 :
746 8868 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].right_cidx );
747 :
748 8868 : } else {
749 :
750 : /* Push i to the stack and recurse on the left subtree. */
751 :
752 8868 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( s );
753 8868 : s = i;
754 8868 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].left_cidx );
755 :
756 8868 : }
757 18696 : }
758 :
759 960 : if( g0 < gaddr_hi ) { /* Have final gap to fill */
760 :
761 : /* This works the same as the above */
762 :
763 960 : if( FD_UNLIKELY( fd_wksp_private_idle_stack_is_empty( wksp ) ) ) {
764 0 : FD_LOG_WARNING(( "part_max (%lu) too small to complete partitioning", part_max ));
765 0 : return FD_WKSP_ERR_CORRUPT;
766 0 : }
767 960 : ulong k = fd_wksp_private_idle_stack_pop( wksp, pinfo );
768 :
769 960 : pinfo[ k ].gaddr_lo = g0;
770 960 : pinfo[ k ].gaddr_hi = gaddr_hi;
771 960 : pinfo[ k ].prev_cidx = fd_wksp_private_pinfo_cidx( j );
772 960 : *j_next_cidx_ptr = fd_wksp_private_pinfo_cidx( k );
773 960 : j_next_cidx_ptr = &pinfo[ k ].next_cidx;
774 960 : j = k;
775 : //g0 = gaddr_hi;
776 :
777 960 : fd_wksp_private_free_treap_insert( j, wksp, pinfo );
778 960 : }
779 :
780 960 : wksp->part_tail_cidx = fd_wksp_private_pinfo_cidx( j );
781 :
782 960 : } while(0);
783 :
784 960 : return FD_WKSP_SUCCESS;
785 960 : }
786 :
787 : #if defined(__linux__)
788 :
789 : #include <sys/mman.h>
790 : #include <errno.h>
791 :
792 : void
793 0 : fd_wksp_mprotect( fd_wksp_t * wksp, int flag ) {
794 0 : if( FD_UNLIKELY( !wksp ) ) {
795 0 : FD_LOG_WARNING(( "NULL wksp" ));
796 0 : return;
797 0 : }
798 :
799 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
800 0 : FD_LOG_WARNING(( "bad align" ));
801 0 : return;
802 0 : }
803 :
804 0 : if( FD_UNLIKELY( wksp->magic!=FD_WKSP_MAGIC ) ) {
805 0 : FD_LOG_WARNING(( "bad magic" ));
806 0 : return;
807 0 : }
808 :
809 0 : ulong wksp_footprint = fd_wksp_footprint( wksp->part_max, wksp->data_max );
810 0 : if( FD_UNLIKELY( mprotect( (void*)wksp, wksp_footprint, ( flag ? PROT_READ : (PROT_READ|PROT_WRITE) ) ) ) ) {
811 0 : FD_LOG_WARNING(( "mprotect failed (%i-%s)", errno, fd_io_strerror( errno ) ));
812 0 : return;
813 0 : }
814 0 : }
815 :
816 : #endif
|