Line data Source code
1 : #include "fd_repair_metrics.h" 2 : #include <stdio.h> 3 : #include <stdlib.h> 4 : 5 : #include "../../disco/metrics/fd_metrics.h" 6 : 7 : void * 8 0 : fd_repair_metrics_new( void * mem ) { 9 0 : fd_repair_metrics_t * repair_metrics = (fd_repair_metrics_t *)mem; 10 0 : repair_metrics->st = UINT_MAX; 11 0 : repair_metrics->en = UINT_MAX; 12 0 : repair_metrics->turbine_slot0 = 0; 13 : 14 0 : return repair_metrics; 15 0 : } 16 : 17 : fd_repair_metrics_t * 18 0 : fd_repair_metrics_join( void * repair_metrics ) { 19 0 : return (fd_repair_metrics_t *)repair_metrics; 20 0 : } 21 : 22 : void 23 0 : fd_repair_metrics_set_turbine_slot0( fd_repair_metrics_t * repair_metrics, ulong turbine_slot0 ) { 24 0 : repair_metrics->turbine_slot0 = turbine_slot0; 25 0 : } 26 : 27 : void 28 : fd_repair_metrics_add_slot( fd_repair_metrics_t * repair_metrics, 29 : ulong slot, 30 : long first_shred_ts, 31 : long slot_complete_ts, 32 : uint repair_cnt, 33 0 : uint turbine_cnt ) { 34 0 : uint next_en = (repair_metrics->en + 1) % FD_CATCHUP_METRICS_MAX; 35 0 : if( FD_UNLIKELY( next_en == repair_metrics->st || repair_metrics->st == UINT_MAX ) ) { 36 0 : repair_metrics->st = (repair_metrics->st + 1) % FD_CATCHUP_METRICS_MAX; 37 0 : } 38 0 : repair_metrics->slots[ next_en ].slot = slot; 39 0 : repair_metrics->slots[ next_en ].first_shred_ts = first_shred_ts; 40 0 : repair_metrics->slots[ next_en ].slot_complete_ts = slot_complete_ts; 41 0 : repair_metrics->slots[ next_en ].repair_cnt = repair_cnt; 42 0 : repair_metrics->slots[ next_en ].turbine_cnt = turbine_cnt; 43 0 : repair_metrics->en = next_en; 44 : 45 : # if DEBUG_LOGGING 46 : if( FD_UNLIKELY( slot == repair_metrics->turbine_slot0 ) ) { 47 : fd_repair_metrics_print( repair_metrics ); 48 : } 49 : # endif 50 0 : } 51 : 52 0 : #define MAX_WIDTH 120 53 : static char dashes[MAX_WIDTH + 1] = "========================================================================================================================"; 54 : static char spaces[MAX_WIDTH + 1] = " "; 55 : 56 : #define print_slot_interval_bar( slot_metrics, tick_sz, min_ts, verbose ) \ 57 0 : long duration = slot_metrics->slot_complete_ts - slot_metrics->first_shred_ts; \ 58 0 : int width = (int)((double)(duration) / tick_sz); \ 59 0 : int start = (int)((double)(slot_metrics->first_shred_ts - min_ts) / tick_sz); \ 60 0 : if( FD_UNLIKELY( verbose ) ) { \ 61 0 : printf( "%lu [repaired: %u/%u]%.*s|%.*s| (%.2f ms)", \ 62 0 : slot_metrics->slot, \ 63 0 : slot_metrics->repair_cnt, slot_metrics->turbine_cnt + slot_metrics->repair_cnt, \ 64 0 : start, spaces, width, dashes, \ 65 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)duration) / 1e6 ); \ 66 0 : } else { \ 67 0 : printf( "%lu %.*s|%.*s| (%.2f ms)", \ 68 0 : slot_metrics->slot, \ 69 0 : start, spaces, width, dashes, \ 70 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)duration) / 1e6 ); \ 71 0 : } \ 72 0 : if( FD_UNLIKELY( slot_metrics->slot == repair_metrics->turbine_slot0 ) ) { \ 73 0 : printf( " <--- (first turbine shred received)" ); \ 74 0 : } \ 75 0 : printf( "\n" ); 76 : 77 : static int 78 0 : compare_slots( const void * a, const void * b ) { 79 0 : const fd_slot_metrics_t * slot_a = (const fd_slot_metrics_t *)a; 80 0 : const fd_slot_metrics_t * slot_b = (const fd_slot_metrics_t *)b; 81 : 82 0 : if( slot_a->slot < slot_b->slot ) return -1; 83 0 : if( slot_a->slot > slot_b->slot ) return 1; 84 0 : return 0; 85 0 : } 86 : 87 : /* Should be typically only called once, at the end of a catchup. Other 88 : wise the information is not meaningful (during regular turbine, the 89 : slots are pretty much sequentially arriving in order anyway) */ 90 : void 91 0 : fd_repair_metrics_print_sorted( fd_repair_metrics_t * repair_metrics, int verbose, fd_slot_metrics_t * temp_slots ) { 92 0 : if( repair_metrics->st == UINT_MAX ) return; // no data to sort 93 : 94 0 : uint temp_idx = 0; 95 0 : long min_ts = repair_metrics->slots[ repair_metrics->st ].first_shred_ts; 96 0 : long max_ts = repair_metrics->slots[ repair_metrics->en ].slot_complete_ts; 97 0 : long repair_kickoff_ts = 0; /* When we receive the first turbine shred, is when we begin orphans. */ 98 0 : long finish_catchup_ts = 0; /* the max of all slot < turbine slot0 completion times */ 99 0 : int num_catchup_slots = 0; 100 0 : uint total_slots = 0; 101 0 : long slot_durations_sum = 0; 102 0 : long incremental_cmpl_sum = 0; 103 0 : long prev_slot_ts = repair_metrics->slots[ repair_metrics->st ].slot_complete_ts; 104 : 105 0 : for( uint i = repair_metrics->st;; i = (i + 1) % FD_CATCHUP_METRICS_MAX ) { 106 0 : fd_slot_metrics_t * slot_data = &repair_metrics->slots[ i ]; 107 0 : temp_slots[ temp_idx++ ] = *slot_data; 108 0 : total_slots++; 109 0 : min_ts = fd_min( min_ts, slot_data->first_shred_ts ); 110 0 : max_ts = fd_max( max_ts, slot_data->slot_complete_ts ); 111 0 : if( FD_UNLIKELY( slot_data->slot == repair_metrics->turbine_slot0 ) ) repair_kickoff_ts = slot_data->first_shred_ts; 112 0 : if( FD_UNLIKELY( slot_data->slot <= repair_metrics->turbine_slot0 ) ) finish_catchup_ts = fd_max( finish_catchup_ts, slot_data->slot_complete_ts ); 113 0 : if( FD_UNLIKELY( slot_data->slot <= repair_metrics->turbine_slot0 ) ) num_catchup_slots++; 114 0 : slot_durations_sum += slot_data->slot_complete_ts - slot_data->first_shred_ts; 115 : 116 0 : incremental_cmpl_sum += slot_data->slot_complete_ts - prev_slot_ts; 117 0 : prev_slot_ts = slot_data->slot_complete_ts; 118 : 119 0 : if( i == repair_metrics->en ) break; 120 0 : } 121 : 122 : /* Sort temp array by slot */ 123 0 : qsort( temp_slots, total_slots, sizeof(fd_slot_metrics_t), compare_slots ); 124 : 125 : /* prints a stacked depth chart of the catchup metrics: 126 : slot |===============| (duration in ms) 127 : slot |================| 128 : etc. */ 129 : 130 0 : double tick_sz = (double)(max_ts - min_ts) / (double)MAX_WIDTH; 131 0 : long orphan_cmpl_ts = temp_slots[0].first_shred_ts; /* When we make the request for the snapshot slot, this is about when the full tree is connected. */ 132 0 : int orphans_cmpl_cnt = 0; /* Count of slots completed by the time orphan requests are done */ 133 0 : for( uint i = 0; i < total_slots; i++ ) { 134 0 : fd_slot_metrics_t * slot_metrics = &temp_slots[ i ]; 135 0 : if( FD_UNLIKELY( slot_metrics->slot_complete_ts < orphan_cmpl_ts ) ) orphans_cmpl_cnt++; 136 0 : print_slot_interval_bar( slot_metrics, tick_sz, min_ts, verbose ); 137 0 : } 138 0 : fflush( stdout ); 139 : 140 : 141 0 : double pipelined_time = (double)(max_ts - min_ts); 142 0 : double non_pipelined_time = (double)slot_durations_sum; 143 0 : FD_LOG_NOTICE(( "\n" 144 0 : "Completed %u slots in %.2f seconds total. \n" 145 0 : "Average slot duration (time from first shred/rq to all shreds received): %.2f ms\n" 146 0 : "Average time between slot completions: %.2f ms\n" 147 0 : "Average slots per second: %.2f\n" 148 0 : "Pipeline factor (sum duration of all slots / total time): %.2f\n", 149 0 : total_slots, 150 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)pipelined_time) / 1e9, 151 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)slot_durations_sum) / (double)total_slots / 1e6, 152 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)incremental_cmpl_sum) / (double)total_slots / 1e6, 153 0 : (double)total_slots / (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)pipelined_time) * 1e9, 154 0 : non_pipelined_time / pipelined_time )); 155 : 156 0 : FD_LOG_NOTICE(( "\n" 157 0 : "Caught up %d slots in %.2f ms total. \n" 158 0 : "Time to repair orphans: %.2f ms \n" 159 0 : "Total time from connected orphan to done: %.2f ms \n" 160 0 : "Slots completed by orphans connected: %d\n", 161 0 : num_catchup_slots, 162 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)(finish_catchup_ts - repair_kickoff_ts)) / 1e6, 163 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)(orphan_cmpl_ts - repair_kickoff_ts)) / 1e6, 164 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)(finish_catchup_ts - orphan_cmpl_ts )) / 1e6, 165 0 : orphans_cmpl_cnt )); 166 0 : } 167 : 168 : void 169 0 : fd_repair_metrics_print( fd_repair_metrics_t * repair_metrics, int verbose ) { 170 0 : long min_ts = repair_metrics->slots[ repair_metrics->st ].first_shred_ts; 171 0 : long max_ts = repair_metrics->slots[ repair_metrics->en ].slot_complete_ts; 172 0 : uint total_slots = 0; 173 0 : long prev_slot_ts = repair_metrics->slots[ repair_metrics->st ].slot_complete_ts; 174 0 : long incremental_cmpl_sum = 0; 175 : 176 0 : long slot_durations_sum = 0; 177 0 : for( uint i = repair_metrics->st;; i = (i + 1) % FD_CATCHUP_METRICS_MAX ) { 178 0 : fd_slot_metrics_t * slot_data = &repair_metrics->slots[ i ]; 179 0 : slot_durations_sum += (slot_data->slot_complete_ts - slot_data->first_shred_ts); 180 0 : total_slots++; 181 0 : min_ts = fd_min( min_ts, slot_data->first_shred_ts ); 182 0 : max_ts = fd_max( max_ts, slot_data->slot_complete_ts ); 183 : 184 : /* st -> en are already ordered by completion time. so this-prev 185 : is guaranteed to be > 0 */ 186 0 : incremental_cmpl_sum += slot_data->slot_complete_ts - prev_slot_ts; 187 0 : prev_slot_ts = slot_data->slot_complete_ts; 188 : 189 0 : if( FD_UNLIKELY( i == repair_metrics->en ) ) break; 190 0 : } 191 : 192 : /* prints a stacked depth chart of the catchup metrics: 193 : slot |===============| (duration in ms) 194 : slot |================| 195 : etc. */ 196 : 197 0 : double tick_sz = (double)(max_ts - min_ts) / (double)MAX_WIDTH; 198 0 : for( uint i = repair_metrics->st;; i = (i + 1) % FD_CATCHUP_METRICS_MAX ) { 199 0 : fd_slot_metrics_t * slot_metrics = &repair_metrics->slots[ i ]; 200 0 : print_slot_interval_bar( slot_metrics, tick_sz, min_ts, verbose ); 201 0 : if( i == repair_metrics->en ) break; 202 0 : } 203 0 : fflush( stdout ); 204 : 205 0 : double pipelined_time = (double)(max_ts - min_ts); 206 0 : double non_pipelined_time = (double)slot_durations_sum; 207 0 : FD_LOG_NOTICE(( "\n" 208 0 : "Over past %u completed slots: \n" 209 0 : "Average slot duration (time from first shred/rq to all shreds received): %.2f ms\n" 210 0 : "Average time between slot completions: %.2f ms\n" 211 0 : "Average slots per second: %.2f\n" 212 0 : "Pipeline factor (sum duration of all slots / total time): %.2f\n", 213 0 : total_slots, 214 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)slot_durations_sum) / (double)total_slots / 1e6, 215 0 : (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)incremental_cmpl_sum) / (double)total_slots / 1e6, 216 0 : (double)total_slots / (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)pipelined_time) * 1e9, 217 0 : non_pipelined_time / pipelined_time )); 218 0 : } 219 : 220 : #undef MAX_WIDTH