Line data Source code
1 : #include "fd_repair_metrics.h" 2 : #include <stdio.h> 3 : #include <stdlib.h> 4 : 5 : #include "../../disco/metrics/fd_metrics.h" 6 : 7 : void * 8 0 : fd_repair_metrics_new( void * mem ) { 9 0 : fd_repair_metrics_t * repair_metrics = (fd_repair_metrics_t *)mem; 10 0 : repair_metrics->st = UINT_MAX; 11 0 : repair_metrics->en = UINT_MAX; 12 0 : repair_metrics->turbine_slot0 = 0; 13 : 14 0 : return repair_metrics; 15 0 : } 16 : 17 : fd_repair_metrics_t * 18 0 : fd_repair_metrics_join( void * repair_metrics ) { 19 0 : return (fd_repair_metrics_t *)repair_metrics; 20 0 : } 21 : 22 : void 23 0 : fd_repair_metrics_set_turbine_slot0( fd_repair_metrics_t * repair_metrics, ulong turbine_slot0 ) { 24 0 : repair_metrics->turbine_slot0 = turbine_slot0; 25 0 : } 26 : 27 : void 28 : fd_repair_metrics_add_slot( fd_repair_metrics_t * repair_metrics, 29 : ulong slot, 30 : long first_shred_ts, 31 : long slot_complete_ts, 32 : uint repair_cnt, 33 0 : uint turbine_cnt ) { 34 0 : uint next_en = (repair_metrics->en + 1) % FD_CATCHUP_METRICS_MAX; 35 0 : if( FD_UNLIKELY( next_en == repair_metrics->st || repair_metrics->st == UINT_MAX ) ) { 36 0 : repair_metrics->st = (repair_metrics->st + 1) % FD_CATCHUP_METRICS_MAX; 37 0 : } 38 0 : repair_metrics->slots[ next_en ].slot = slot; 39 0 : repair_metrics->slots[ next_en ].first_shred_ts = first_shred_ts; 40 0 : repair_metrics->slots[ next_en ].slot_complete_ts = slot_complete_ts; 41 0 : repair_metrics->slots[ next_en ].repair_cnt = repair_cnt; 42 0 : repair_metrics->slots[ next_en ].turbine_cnt = turbine_cnt; 43 0 : repair_metrics->en = next_en; 44 0 : } 45 : 46 0 : #define MAX_WIDTH 120 47 : static char dashes[MAX_WIDTH + 1] = "========================================================================================================================"; 48 : static char spaces[MAX_WIDTH + 1] = " "; 49 : 50 : #define print_slot_interval_bar( slot_metrics, tick_sz, min_ts, verbose ) \ 51 0 : long duration = slot_metrics->slot_complete_ts - slot_metrics->first_shred_ts; \ 52 0 : int width = (int)((double)(duration) / tick_sz); \ 53 0 : int start = (int)((double)(slot_metrics->first_shred_ts - min_ts) / tick_sz); \ 54 0 : if( FD_UNLIKELY( verbose ) ) { \ 55 0 : printf( "%lu [repaired: %u/%u]%.*s|%.*s| (%.2f ms)", \ 56 0 : slot_metrics->slot, \ 57 0 : slot_metrics->repair_cnt, slot_metrics->turbine_cnt + slot_metrics->repair_cnt, \ 58 0 : start, spaces, width, dashes, \ 59 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)duration) / 1e6 ); \ 60 0 : } else { \ 61 0 : printf( "%lu %.*s|%.*s| (%.2f ms)", \ 62 0 : slot_metrics->slot, \ 63 0 : start, spaces, width, dashes, \ 64 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)duration) / 1e6 ); \ 65 0 : } \ 66 0 : if( FD_UNLIKELY( slot_metrics->slot == repair_metrics->turbine_slot0 ) ) { \ 67 0 : printf( " <--- (first turbine shred received)" ); \ 68 0 : } \ 69 0 : printf( "\n" ); 70 : 71 : static int 72 0 : compare_slots( const void * a, const void * b ) { 73 0 : const fd_slot_metrics_t * slot_a = (const fd_slot_metrics_t *)a; 74 0 : const fd_slot_metrics_t * slot_b = (const fd_slot_metrics_t *)b; 75 : 76 0 : if( slot_a->slot < slot_b->slot ) return -1; 77 0 : if( slot_a->slot > slot_b->slot ) return 1; 78 0 : return 0; 79 0 : } 80 : 81 : /* Should be typically only called once, at the end of a catchup. Other 82 : wise the information is not meaningful (during regular turbine, the 83 : slots are pretty much sequentially arriving in order anyway) */ 84 : void 85 0 : fd_repair_metrics_print_sorted( fd_repair_metrics_t * repair_metrics, int verbose, fd_slot_metrics_t * temp_slots ) { 86 0 : if( repair_metrics->st == UINT_MAX ) return; // no data to sort 87 : 88 0 : uint temp_idx = 0; 89 0 : long min_ts = repair_metrics->slots[ repair_metrics->st ].first_shred_ts; 90 0 : long max_ts = repair_metrics->slots[ repair_metrics->en ].slot_complete_ts; 91 0 : long repair_kickoff_ts = 0; /* When we receive the first turbine shred, is when we begin orphans. */ 92 0 : long finish_catchup_ts = 0; /* the max of all slot < turbine slot0 completion times */ 93 0 : int num_catchup_slots = 0; 94 0 : uint total_slots = 0; 95 0 : long slot_durations_sum = 0; 96 0 : long incremental_cmpl_sum = 0; 97 0 : long prev_slot_ts = repair_metrics->slots[ repair_metrics->st ].slot_complete_ts; 98 : 99 0 : for( uint i = repair_metrics->st;; i = (i + 1) % FD_CATCHUP_METRICS_MAX ) { 100 0 : fd_slot_metrics_t * slot_data = &repair_metrics->slots[ i ]; 101 0 : temp_slots[ temp_idx++ ] = *slot_data; 102 0 : total_slots++; 103 0 : min_ts = fd_min( min_ts, slot_data->first_shred_ts ); 104 0 : max_ts = fd_max( max_ts, slot_data->slot_complete_ts ); 105 0 : if( FD_UNLIKELY( slot_data->slot == repair_metrics->turbine_slot0 ) ) repair_kickoff_ts = slot_data->first_shred_ts; 106 0 : if( FD_UNLIKELY( slot_data->slot <= repair_metrics->turbine_slot0 ) ) finish_catchup_ts = fd_max( finish_catchup_ts, slot_data->slot_complete_ts ); 107 0 : if( FD_UNLIKELY( slot_data->slot <= repair_metrics->turbine_slot0 ) ) num_catchup_slots++; 108 0 : slot_durations_sum += slot_data->slot_complete_ts - slot_data->first_shred_ts; 109 : 110 0 : incremental_cmpl_sum += slot_data->slot_complete_ts - prev_slot_ts; 111 0 : prev_slot_ts = slot_data->slot_complete_ts; 112 : 113 0 : if( i == repair_metrics->en ) break; 114 0 : } 115 : 116 : /* Sort temp array by slot */ 117 0 : qsort( temp_slots, total_slots, sizeof(fd_slot_metrics_t), compare_slots ); 118 : 119 : /* prints a stacked depth chart of the catchup metrics: 120 : slot |===============| (duration in ms) 121 : slot |================| 122 : etc. */ 123 : 124 0 : double tick_sz = (double)(max_ts - min_ts) / (double)MAX_WIDTH; 125 0 : long orphan_cmpl_ts = temp_slots[0].first_shred_ts; /* When we make the request for the snapshot slot, this is about when the full tree is connected. */ 126 0 : int orphans_cmpl_cnt = 0; /* Count of slots completed by the time orphan requests are done */ 127 0 : for( uint i = 0; i < total_slots; i++ ) { 128 0 : fd_slot_metrics_t * slot_metrics = &temp_slots[ i ]; 129 0 : if( FD_UNLIKELY( slot_metrics->slot_complete_ts < orphan_cmpl_ts ) ) orphans_cmpl_cnt++; 130 0 : print_slot_interval_bar( slot_metrics, tick_sz, min_ts, verbose ); 131 0 : } 132 0 : fflush( stdout ); 133 : 134 : 135 0 : double pipelined_time = (double)(max_ts - min_ts); 136 0 : double non_pipelined_time = (double)slot_durations_sum; 137 0 : FD_LOG_NOTICE(( "\n" 138 0 : "Completed %u slots in %.2f seconds total. \n" 139 0 : "Average slot duration (time from first shred/rq to all shreds received): %.2f ms\n" 140 0 : "Average time between slot completions: %.2f ms\n" 141 0 : "Average slots per second: %.2f\n" 142 0 : "Pipeline factor (sum duration of all slots / total time): %.2f\n", 143 0 : total_slots, 144 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)pipelined_time) / 1e9, 145 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)slot_durations_sum) / (double)total_slots / 1e6, 146 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)incremental_cmpl_sum) / (double)total_slots / 1e6, 147 0 : (double)total_slots / (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)pipelined_time) * 1e9, 148 0 : non_pipelined_time / pipelined_time )); 149 : 150 0 : FD_LOG_NOTICE(( "\n" 151 0 : "Caught up %d slots in %.2f ms total. \n" 152 0 : "Time to repair orphans: %.2f ms \n" 153 0 : "Total time from connected orphan to done: %.2f ms \n" 154 0 : "Slots completed by orphans connected: %d\n", 155 0 : num_catchup_slots, 156 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)(finish_catchup_ts - repair_kickoff_ts)) / 1e6, 157 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)(orphan_cmpl_ts - repair_kickoff_ts)) / 1e6, 158 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)(finish_catchup_ts - orphan_cmpl_ts )) / 1e6, 159 0 : orphans_cmpl_cnt )); 160 0 : } 161 : 162 : void 163 0 : fd_repair_metrics_print( fd_repair_metrics_t * repair_metrics, int verbose ) { 164 0 : if( repair_metrics->st == UINT_MAX ) return; // no data to print 165 0 : long min_ts = repair_metrics->slots[ repair_metrics->st ].first_shred_ts; 166 0 : long max_ts = repair_metrics->slots[ repair_metrics->en ].slot_complete_ts; 167 0 : uint total_slots = 0; 168 0 : long prev_slot_ts = repair_metrics->slots[ repair_metrics->st ].slot_complete_ts; 169 0 : long incremental_cmpl_sum = 0; 170 : 171 0 : long slot_durations_sum = 0; 172 0 : for( uint i = repair_metrics->st;; i = (i + 1) % FD_CATCHUP_METRICS_MAX ) { 173 0 : fd_slot_metrics_t * slot_data = &repair_metrics->slots[ i ]; 174 0 : slot_durations_sum += (slot_data->slot_complete_ts - slot_data->first_shred_ts); 175 0 : total_slots++; 176 0 : min_ts = fd_min( min_ts, slot_data->first_shred_ts ); 177 0 : max_ts = fd_max( max_ts, slot_data->slot_complete_ts ); 178 : 179 : /* st -> en are already ordered by completion time. so this-prev 180 : is guaranteed to be > 0 */ 181 0 : incremental_cmpl_sum += slot_data->slot_complete_ts - prev_slot_ts; 182 0 : prev_slot_ts = slot_data->slot_complete_ts; 183 : 184 0 : if( FD_UNLIKELY( i == repair_metrics->en ) ) break; 185 0 : } 186 : 187 : /* prints a stacked depth chart of the catchup metrics: 188 : slot |===============| (duration in ms) 189 : slot |================| 190 : etc. */ 191 : 192 0 : double tick_sz = (double)(max_ts - min_ts) / (double)MAX_WIDTH; 193 0 : for( uint i = repair_metrics->st;; i = (i + 1) % FD_CATCHUP_METRICS_MAX ) { 194 0 : fd_slot_metrics_t * slot_metrics = &repair_metrics->slots[ i ]; 195 0 : print_slot_interval_bar( slot_metrics, tick_sz, min_ts, verbose ); 196 0 : if( i == repair_metrics->en ) break; 197 0 : } 198 0 : fflush( stdout ); 199 : 200 0 : double pipelined_time = (double)(max_ts - min_ts); 201 0 : double non_pipelined_time = (double)slot_durations_sum; 202 0 : FD_LOG_NOTICE(( "\n" 203 0 : "Over past %u completed slots: \n" 204 0 : "Average slot duration (time from first shred/rq to all shreds received): %.2f ms\n" 205 0 : "Average time between slot completions: %.2f ms\n" 206 0 : "Average slots per second: %.2f\n" 207 0 : "Pipeline factor (sum duration of all slots / total time): %.2f\n", 208 0 : total_slots, 209 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)slot_durations_sum) / (double)total_slots / 1e6, 210 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)incremental_cmpl_sum) / (double)total_slots / 1e6, 211 0 : (double)total_slots / (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)pipelined_time) * 1e9, 212 0 : non_pipelined_time / pipelined_time )); 213 0 : } 214 : 215 : #undef MAX_WIDTH