circuitstats.c (60163B)
1 /* Copyright (c) 2001 Matej Pfajfar. 2 * Copyright (c) 2001-2004, Roger Dingledine. 3 * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson. 4 * Copyright (c) 2007-2021, The Tor Project, Inc. */ 5 /* See LICENSE for licensing information */ 6 7 /** 8 * \file circuitstats.c 9 * 10 * \brief Maintains and analyzes statistics about circuit built times, so we 11 * can tell how long we may need to wait for a fast circuit to be constructed. 12 * 13 * By keeping these statistics, a client learns when it should time out a slow 14 * circuit for being too slow, and when it should keep a circuit open in order 15 * to wait for it to complete. 16 * 17 * The information here is kept in a circuit_built_times_t structure, which is 18 * currently a singleton, but doesn't need to be. It's updated by calls to 19 * circuit_build_times_count_timeout() from circuituse.c, 20 * circuit_build_times_count_close() from circuituse.c, and 21 * circuit_build_times_add_time() from circuitbuild.c, and inspected by other 22 * calls into this module, mostly from circuitlist.c. Observations are 23 * persisted to disk via the or_state_t-related calls. 24 */ 25 26 #define CIRCUITSTATS_PRIVATE 27 28 #include "core/or/or.h" 29 #include "core/or/circuitbuild.h" 30 #include "core/or/circuitstats.h" 31 #include "app/config/config.h" 32 #include "lib/confmgt/confmgt.h" 33 #include "feature/control/control_events.h" 34 #include "lib/crypt_ops/crypto_rand.h" 35 #include "core/mainloop/mainloop.h" 36 #include "feature/nodelist/networkstatus.h" 37 #include "feature/relay/router.h" 38 #include "app/config/statefile.h" 39 #include "core/or/circuitlist.h" 40 #include "core/or/circuituse.h" 41 #include "lib/math/fp.h" 42 #include "lib/time/tvdiff.h" 43 #include "lib/encoding/confline.h" 44 #include "feature/dirauth/authmode.h" 45 #include "feature/hs/hs_service.h" 46 #include "feature/relay/relay_periodic.h" 47 48 #include "core/or/crypt_path_st.h" 49 #include "core/or/origin_circuit_st.h" 50 #include "app/config/or_state_st.h" 51 52 #undef log 53 #include <math.h> 54 55 static void circuit_build_times_scale_circ_counts(circuit_build_times_t *cbt); 56 57 #define CBT_BIN_TO_MS(bin) ((bin)*CBT_BIN_WIDTH + (CBT_BIN_WIDTH/2)) 58 59 /** Global list of circuit build times */ 60 // XXXX: Add this as a member for entry_guard_t instead of global? 61 // Then we could do per-guard statistics, as guards are likely to 62 // vary in their own latency. The downside of this is that guards 63 // can change frequently, so we'd be building a lot more circuits 64 // most likely. 65 static circuit_build_times_t circ_times; 66 67 #ifdef TOR_UNIT_TESTS 68 /** If set, we're running the unit tests: we should avoid clobbering 69 * our state file or accessing get_options() or get_or_state() */ 70 static int unit_tests = 0; 71 #else 72 #define unit_tests 0 73 #endif /* defined(TOR_UNIT_TESTS) */ 74 75 /** Return a pointer to the data structure describing our current circuit 76 * build time history and computations. */ 77 const circuit_build_times_t * 78 get_circuit_build_times(void) 79 { 80 return &circ_times; 81 } 82 83 /** As get_circuit_build_times, but return a mutable pointer. */ 84 circuit_build_times_t * 85 get_circuit_build_times_mutable(void) 86 { 87 return &circ_times; 88 } 89 90 /** Return the time to wait before actually closing an under-construction, in 91 * milliseconds. */ 92 double 93 get_circuit_build_close_time_ms(void) 94 { 95 return circ_times.close_ms; 96 } 97 98 /** Return the time to wait before giving up on an under-construction circuit, 99 * in milliseconds. */ 100 double 101 get_circuit_build_timeout_ms(void) 102 { 103 return circ_times.timeout_ms; 104 } 105 106 /** 107 * This function decides if CBT learning should be disabled. It returns 108 * true if one or more of the following conditions are met: 109 * 110 * 1. If the cbtdisabled consensus parameter is set. 111 * 2. If the torrc option LearnCircuitBuildTimeout is false. 112 * 3. If we are a directory authority 113 * 4. If we fail to write circuit build time history to our state file. 114 * 5. If we are configured in Single Onion mode 115 */ 116 int 117 circuit_build_times_disabled(const or_options_t *options) 118 { 119 return circuit_build_times_disabled_(options, 0); 120 } 121 122 /** As circuit_build_times_disabled, but take options as an argument. */ 123 int 124 circuit_build_times_disabled_(const or_options_t *options, 125 int ignore_consensus) 126 { 127 if (unit_tests) { 128 return 0; 129 } else { 130 int consensus_disabled = 131 ignore_consensus ? 0 : networkstatus_get_param(NULL, "cbtdisabled", 132 0, 0, 1); 133 int config_disabled = !options->LearnCircuitBuildTimeout; 134 int dirauth_disabled = authdir_mode(options); 135 int state_disabled = did_last_state_file_write_fail() ? 1 : 0; 136 /* LearnCircuitBuildTimeout and Single Onion Services are 137 * incompatible in two ways: 138 * 139 * - LearnCircuitBuildTimeout results in a low CBT, which 140 * Single Onion use of one-hop intro and rendezvous circuits lowers 141 * much further, producing *far* too many timeouts. 142 * 143 * - The adaptive CBT code does not update its timeout estimate 144 * using build times for single-hop circuits. 145 * 146 * If we fix both of these issues someday, we should test 147 * these modes with LearnCircuitBuildTimeout on again. */ 148 int single_onion_disabled = hs_service_allow_non_anonymous_connection( 149 options); 150 151 if (consensus_disabled || config_disabled || dirauth_disabled || 152 state_disabled || single_onion_disabled) { 153 #if 0 154 log_debug(LD_CIRC, 155 "CircuitBuildTime learning is disabled. " 156 "Consensus=%d, Config=%d, AuthDir=%d, StateFile=%d", 157 consensus_disabled, config_disabled, dirauth_disabled, 158 state_disabled); 159 #endif /* 0 */ 160 return 1; 161 } else { 162 #if 0 163 log_debug(LD_CIRC, 164 "CircuitBuildTime learning is not disabled. " 165 "Consensus=%d, Config=%d, AuthDir=%d, StateFile=%d", 166 consensus_disabled, config_disabled, dirauth_disabled, 167 state_disabled); 168 #endif /* 0 */ 169 return 0; 170 } 171 } 172 } 173 174 /** 175 * Retrieve and bounds-check the cbtmaxtimeouts consensus parameter. 176 * 177 * Effect: When this many timeouts happen in the last 'cbtrecentcount' 178 * circuit attempts, the client should discard all of its history and 179 * begin learning a fresh timeout value. 180 */ 181 static int32_t 182 circuit_build_times_max_timeouts(void) 183 { 184 int32_t cbt_maxtimeouts; 185 186 cbt_maxtimeouts = networkstatus_get_param(NULL, "cbtmaxtimeouts", 187 CBT_DEFAULT_MAX_RECENT_TIMEOUT_COUNT, 188 CBT_MIN_MAX_RECENT_TIMEOUT_COUNT, 189 CBT_MAX_MAX_RECENT_TIMEOUT_COUNT); 190 191 if (!(get_options()->LearnCircuitBuildTimeout)) { 192 log_debug(LD_BUG, 193 "circuit_build_times_max_timeouts() called, cbtmaxtimeouts is" 194 " %d", 195 cbt_maxtimeouts); 196 } 197 198 return cbt_maxtimeouts; 199 } 200 201 /** 202 * Retrieve and bounds-check the cbtnummodes consensus parameter. 203 * 204 * Effect: This value governs how many modes to use in the weighted 205 * average calculation of Pareto parameter Xm. Analysis of pairs of 206 * geographically near, far, and mixed guaeds has shown that a value of 207 * 10 introduces some allows for the actual timeout rate to be within 208 * 2-7% of the cutoff quantile, for quantiles between 60-80%. 209 */ 210 static int32_t 211 circuit_build_times_default_num_xm_modes(void) 212 { 213 int32_t num = networkstatus_get_param(NULL, "cbtnummodes", 214 CBT_DEFAULT_NUM_XM_MODES, 215 CBT_MIN_NUM_XM_MODES, 216 CBT_MAX_NUM_XM_MODES); 217 218 if (!(get_options()->LearnCircuitBuildTimeout)) { 219 log_debug(LD_BUG, 220 "circuit_build_times_default_num_xm_modes() called, cbtnummodes" 221 " is %d", 222 num); 223 } 224 225 return num; 226 } 227 228 /** 229 * Retrieve and bounds-check the cbtmincircs consensus parameter. 230 * 231 * Effect: This is the minimum number of circuits to build before 232 * computing a timeout. 233 */ 234 static int32_t 235 circuit_build_times_min_circs_to_observe(void) 236 { 237 int32_t num = networkstatus_get_param(NULL, "cbtmincircs", 238 CBT_DEFAULT_MIN_CIRCUITS_TO_OBSERVE, 239 CBT_MIN_MIN_CIRCUITS_TO_OBSERVE, 240 CBT_MAX_MIN_CIRCUITS_TO_OBSERVE); 241 242 if (!(get_options()->LearnCircuitBuildTimeout)) { 243 log_debug(LD_BUG, 244 "circuit_build_times_min_circs_to_observe() called, cbtmincircs" 245 " is %d", 246 num); 247 } 248 249 return num; 250 } 251 252 /** Return true iff <b>cbt</b> has recorded enough build times that we 253 * want to start acting on the timeout it implies. */ 254 int 255 circuit_build_times_enough_to_compute(const circuit_build_times_t *cbt) 256 { 257 return cbt->total_build_times >= circuit_build_times_min_circs_to_observe(); 258 } 259 260 /** 261 * Retrieve and bounds-check the cbtquantile consensus parameter. 262 * 263 * Effect: This is the position on the quantile curve to use to set the 264 * timeout value. It is a percent (10-99). 265 */ 266 double 267 circuit_build_times_quantile_cutoff(void) 268 { 269 int32_t num = networkstatus_get_param(NULL, "cbtquantile", 270 CBT_DEFAULT_QUANTILE_CUTOFF, 271 CBT_MIN_QUANTILE_CUTOFF, 272 CBT_MAX_QUANTILE_CUTOFF); 273 274 if (!(get_options()->LearnCircuitBuildTimeout)) { 275 log_debug(LD_BUG, 276 "circuit_build_times_quantile_cutoff() called, cbtquantile" 277 " is %d", 278 num); 279 } 280 281 return num/100.0; 282 } 283 284 /** 285 * Retrieve and bounds-check the cbtclosequantile consensus parameter. 286 * 287 * Effect: This is the position on the quantile curve to use to set the 288 * timeout value to use to actually close circuits. It is a percent 289 * (0-99). 290 */ 291 static double 292 circuit_build_times_close_quantile(void) 293 { 294 int32_t param; 295 /* Cast is safe - circuit_build_times_quantile_cutoff() is capped */ 296 int32_t min = (int)tor_lround(100*circuit_build_times_quantile_cutoff()); 297 param = networkstatus_get_param(NULL, "cbtclosequantile", 298 CBT_DEFAULT_CLOSE_QUANTILE, 299 CBT_MIN_CLOSE_QUANTILE, 300 CBT_MAX_CLOSE_QUANTILE); 301 302 if (!(get_options()->LearnCircuitBuildTimeout)) { 303 log_debug(LD_BUG, 304 "circuit_build_times_close_quantile() called, cbtclosequantile" 305 " is %d", param); 306 } 307 308 if (param < min) { 309 log_warn(LD_DIR, "Consensus parameter cbtclosequantile is " 310 "too small, raising to %d", min); 311 param = min; 312 } 313 return param / 100.0; 314 } 315 316 /** 317 * Retrieve and bounds-check the cbttestfreq consensus parameter. 318 * 319 * Effect: Describes how often in seconds to build a test circuit to 320 * gather timeout values. Only applies if less than 'cbtmincircs' 321 * have been recorded. 322 */ 323 static int32_t 324 circuit_build_times_test_frequency(void) 325 { 326 int32_t num = networkstatus_get_param(NULL, "cbttestfreq", 327 CBT_DEFAULT_TEST_FREQUENCY, 328 CBT_MIN_TEST_FREQUENCY, 329 CBT_MAX_TEST_FREQUENCY); 330 331 if (!(get_options()->LearnCircuitBuildTimeout)) { 332 log_debug(LD_BUG, 333 "circuit_build_times_test_frequency() called, cbttestfreq is %d", 334 num); 335 } 336 337 return num; 338 } 339 340 /** 341 * Retrieve and bounds-check the cbtmintimeout consensus parameter. 342 * 343 * Effect: This is the minimum allowed timeout value in milliseconds. 344 * The minimum is to prevent rounding to 0 (we only check once 345 * per second). 346 */ 347 static int32_t 348 circuit_build_times_min_timeout(void) 349 { 350 int32_t num = networkstatus_get_param(NULL, "cbtmintimeout", 351 CBT_DEFAULT_TIMEOUT_MIN_VALUE, 352 CBT_MIN_TIMEOUT_MIN_VALUE, 353 CBT_MAX_TIMEOUT_MIN_VALUE); 354 355 if (!(get_options()->LearnCircuitBuildTimeout)) { 356 log_debug(LD_BUG, 357 "circuit_build_times_min_timeout() called, cbtmintimeout is %d", 358 num); 359 } 360 return num; 361 } 362 363 /** 364 * Retrieve and bounds-check the cbtinitialtimeout consensus parameter. 365 * 366 * Effect: This is the timeout value to use before computing a timeout, 367 * in milliseconds. 368 */ 369 int32_t 370 circuit_build_times_initial_timeout(void) 371 { 372 int32_t min = circuit_build_times_min_timeout(); 373 int32_t param = networkstatus_get_param(NULL, "cbtinitialtimeout", 374 CBT_DEFAULT_TIMEOUT_INITIAL_VALUE, 375 CBT_MIN_TIMEOUT_INITIAL_VALUE, 376 CBT_MAX_TIMEOUT_INITIAL_VALUE); 377 378 if (!(get_options()->LearnCircuitBuildTimeout)) { 379 log_debug(LD_BUG, 380 "circuit_build_times_initial_timeout() called, " 381 "cbtinitialtimeout is %d", 382 param); 383 } 384 385 if (param < min) { 386 log_warn(LD_DIR, "Consensus parameter cbtinitialtimeout is too small, " 387 "raising to %d", min); 388 param = min; 389 } 390 return param; 391 } 392 393 /** 394 * Retrieve and bounds-check the cbtrecentcount consensus parameter. 395 * 396 * Effect: This is the number of circuit build times to keep track of 397 * for deciding if we hit cbtmaxtimeouts and need to reset our state 398 * and learn a new timeout. 399 */ 400 static int32_t 401 circuit_build_times_recent_circuit_count(const networkstatus_t *ns) 402 { 403 int32_t num; 404 num = networkstatus_get_param(ns, "cbtrecentcount", 405 CBT_DEFAULT_RECENT_CIRCUITS, 406 CBT_MIN_RECENT_CIRCUITS, 407 CBT_MAX_RECENT_CIRCUITS); 408 409 if (!(get_options()->LearnCircuitBuildTimeout)) { 410 log_debug(LD_BUG, 411 "circuit_build_times_recent_circuit_count() called, " 412 "cbtrecentcount is %d", 413 num); 414 } 415 416 return num; 417 } 418 419 /** 420 * This function is called when we get a consensus update. 421 * 422 * It checks to see if we have changed any consensus parameters 423 * that require reallocation or discard of previous stats. 424 */ 425 void 426 circuit_build_times_new_consensus_params(circuit_build_times_t *cbt, 427 const networkstatus_t *ns) 428 { 429 int32_t num; 430 431 /* 432 * First check if we're doing adaptive timeouts at all; nothing to 433 * update if we aren't. 434 */ 435 436 if (!circuit_build_times_disabled(get_options())) { 437 num = circuit_build_times_recent_circuit_count(ns); 438 439 if (num > 0) { 440 if (num != cbt->liveness.num_recent_circs) { 441 int8_t *recent_circs; 442 if (cbt->liveness.num_recent_circs > 0) { 443 log_notice(LD_CIRC, "The Tor Directory Consensus has changed how " 444 "many circuits we must track to detect network failures " 445 "from %d to %d.", cbt->liveness.num_recent_circs, num); 446 } else { 447 log_notice(LD_CIRC, "Upon receiving a consensus directory, " 448 "re-enabling circuit-based network failure detection."); 449 } 450 451 tor_assert(cbt->liveness.timeouts_after_firsthop || 452 cbt->liveness.num_recent_circs == 0); 453 454 /* 455 * Technically this is a circular array that we are reallocating 456 * and memcopying. However, since it only consists of either 1s 457 * or 0s, and is only used in a statistical test to determine when 458 * we should discard our history after a sufficient number of 1's 459 * have been reached, it is fine if order is not preserved or 460 * elements are lost. 461 * 462 * cbtrecentcount should only be changing in cases of severe network 463 * distress anyway, so memory correctness here is paramount over 464 * doing acrobatics to preserve the array. 465 */ 466 recent_circs = tor_calloc(num, sizeof(int8_t)); 467 if (cbt->liveness.timeouts_after_firsthop && 468 cbt->liveness.num_recent_circs > 0) { 469 memcpy(recent_circs, cbt->liveness.timeouts_after_firsthop, 470 sizeof(int8_t)*MIN(num, cbt->liveness.num_recent_circs)); 471 } 472 473 // Adjust the index if it needs it. 474 if (num < cbt->liveness.num_recent_circs) { 475 cbt->liveness.after_firsthop_idx = MIN(num-1, 476 cbt->liveness.after_firsthop_idx); 477 } 478 479 tor_free(cbt->liveness.timeouts_after_firsthop); 480 cbt->liveness.timeouts_after_firsthop = recent_circs; 481 cbt->liveness.num_recent_circs = num; 482 } 483 /* else no change, nothing to do */ 484 } else { /* num == 0 */ 485 /* 486 * Weird. This probably shouldn't happen, so log a warning, but try 487 * to do something sensible anyway. 488 */ 489 490 log_warn(LD_CIRC, 491 "The cbtrecentcircs consensus parameter came back zero! " 492 "This disables adaptive timeouts since we can't keep track of " 493 "any recent circuits."); 494 495 circuit_build_times_free_timeouts(cbt); 496 } 497 } else { 498 /* 499 * Adaptive timeouts are disabled; this might be because of the 500 * LearnCircuitBuildTimes config parameter, and hence permanent, or 501 * the cbtdisabled consensus parameter, so it may be a new condition. 502 * Treat it like getting num == 0 above and free the circuit history 503 * if we have any. 504 */ 505 506 circuit_build_times_free_timeouts(cbt); 507 } 508 } 509 510 /** 511 * Return the initial default or configured timeout in milliseconds 512 */ 513 static double 514 circuit_build_times_get_initial_timeout(void) 515 { 516 double timeout; 517 const or_options_t *options = get_options(); 518 519 /* 520 * Check if we have LearnCircuitBuildTimeout, and if we don't, 521 * always use CircuitBuildTimeout, no questions asked. 522 */ 523 if (!unit_tests && options->CircuitBuildTimeout) { 524 timeout = options->CircuitBuildTimeout*1000; 525 if (!circuit_build_times_disabled(options) && 526 timeout < circuit_build_times_min_timeout()) { 527 log_warn(LD_CIRC, "Config CircuitBuildTimeout too low. Setting to %ds", 528 circuit_build_times_min_timeout()/1000); 529 timeout = circuit_build_times_min_timeout(); 530 } 531 } else { 532 timeout = circuit_build_times_initial_timeout(); 533 } 534 535 return timeout; 536 } 537 538 /** 539 * Reset the build time state. 540 * 541 * Leave estimated parameters, timeout and network liveness intact 542 * for future use. 543 */ 544 void 545 circuit_build_times_reset(circuit_build_times_t *cbt) 546 { 547 memset(cbt->circuit_build_times, 0, sizeof(cbt->circuit_build_times)); 548 cbt->total_build_times = 0; 549 cbt->build_times_idx = 0; 550 cbt->have_computed_timeout = 0; 551 552 // Reset timeout and close counts 553 cbt->num_circ_succeeded = 0; 554 cbt->num_circ_closed = 0; 555 cbt->num_circ_timeouts = 0; 556 } 557 558 /** 559 * Initialize the buildtimes structure for first use. 560 * 561 * Sets the initial timeout values based on either the config setting, 562 * the consensus param, or the default (CBT_DEFAULT_TIMEOUT_INITIAL_VALUE). 563 */ 564 void 565 circuit_build_times_init(circuit_build_times_t *cbt) 566 { 567 memset(cbt, 0, sizeof(*cbt)); 568 /* 569 * Check if we really are using adaptive timeouts, and don't keep 570 * track of this stuff if not. 571 */ 572 if (!circuit_build_times_disabled(get_options())) { 573 cbt->liveness.num_recent_circs = 574 circuit_build_times_recent_circuit_count(NULL); 575 cbt->liveness.timeouts_after_firsthop = 576 tor_calloc(cbt->liveness.num_recent_circs, sizeof(int8_t)); 577 } else { 578 cbt->liveness.num_recent_circs = 0; 579 cbt->liveness.timeouts_after_firsthop = NULL; 580 } 581 cbt->close_ms = cbt->timeout_ms = circuit_build_times_get_initial_timeout(); 582 cbt_control_event_buildtimeout_set(cbt, BUILDTIMEOUT_SET_EVENT_RESET); 583 } 584 585 /** 586 * Free the saved timeouts, if the cbtdisabled consensus parameter got turned 587 * on or something. 588 */ 589 590 void 591 circuit_build_times_free_timeouts(circuit_build_times_t *cbt) 592 { 593 if (!cbt) return; 594 595 if (cbt->liveness.timeouts_after_firsthop) { 596 tor_free(cbt->liveness.timeouts_after_firsthop); 597 } 598 599 cbt->liveness.num_recent_circs = 0; 600 } 601 602 #if 0 603 /** 604 * Rewind our build time history by n positions. 605 */ 606 static void 607 circuit_build_times_rewind_history(circuit_build_times_t *cbt, int n) 608 { 609 int i = 0; 610 611 cbt->build_times_idx -= n; 612 cbt->build_times_idx %= CBT_NCIRCUITS_TO_OBSERVE; 613 614 for (i = 0; i < n; i++) { 615 cbt->circuit_build_times[(i+cbt->build_times_idx) 616 %CBT_NCIRCUITS_TO_OBSERVE]=0; 617 } 618 619 if (cbt->total_build_times > n) { 620 cbt->total_build_times -= n; 621 } else { 622 cbt->total_build_times = 0; 623 } 624 625 log_info(LD_CIRC, 626 "Rewound history by %d places. Current index: %d. " 627 "Total: %d", n, cbt->build_times_idx, cbt->total_build_times); 628 } 629 #endif /* 0 */ 630 631 /** 632 * Mark this circuit as timed out, but change its purpose 633 * so that it continues to build, allowing us to measure 634 * its full build time. 635 */ 636 void 637 circuit_build_times_mark_circ_as_measurement_only(origin_circuit_t *circ) 638 { 639 circuit_event_status(circ, 640 CIRC_EVENT_FAILED, 641 END_CIRC_REASON_TIMEOUT); 642 circuit_change_purpose(TO_CIRCUIT(circ), 643 CIRCUIT_PURPOSE_C_MEASURE_TIMEOUT); 644 /* Record this event to check for too many timeouts 645 * in a row. This function does not record a time value yet 646 * (we do that later); it only counts the fact that we did 647 * have a timeout. We also want to avoid double-counting 648 * already "relaxed" circuits, which are counted in 649 * circuit_expire_building(). */ 650 if (!circ->relaxed_timeout) { 651 int first_hop_succeeded = circ->cpath && 652 circ->cpath->state == CPATH_STATE_OPEN; 653 654 circuit_build_times_count_timeout( 655 get_circuit_build_times_mutable(), 656 first_hop_succeeded); 657 } 658 } 659 660 /** 661 * Perform the build time work that needs to be done when a circuit 662 * completes a hop. 663 * 664 * This function decides if we should record a circuit's build time 665 * in our histogram data and other statistics, and if so, records it. 666 * It also will mark circuits that have already timed out as 667 * measurement-only circuits, so they can continue to build but 668 * not get used. 669 * 670 * For this, we want to consider circuits that will eventually make 671 * it to the third hop. For circuits longer than 3 hops, we want to 672 * record their build time when they reach the third hop, but let 673 * them continue (and not count them later). For circuits that are 674 * exactly 3 hops, this will count them when they are completed. We 675 * do this so that CBT is always gathering statistics on circuits 676 * of the same length, regardless of their type. 677 */ 678 void 679 circuit_build_times_handle_completed_hop(origin_circuit_t *circ) 680 { 681 struct timeval end; 682 long timediff; 683 684 /* If circuit build times are disabled, let circuit_expire_building() 685 * handle it.. */ 686 if (circuit_build_times_disabled(get_options())) { 687 return; 688 } 689 690 /* Is this a circuit for which the timeout applies in a straight-forward 691 * way? If so, handle it below. If not, just return (and let 692 * circuit_expire_building() eventually take care of it). 693 */ 694 if (!circuit_timeout_want_to_count_circ(circ)) { 695 return; 696 } 697 698 tor_gettimeofday(&end); 699 timediff = tv_mdiff(&circ->base_.timestamp_began, &end); 700 701 /* Check if we would have timed out already. If so, change the 702 * purpose here. But don't do any timeout handling here if there 703 * are no circuits opened yet. Save it for circuit_expire_building() 704 * (to allow it to handle timeout "relaxing" over there). */ 705 if (timediff > get_circuit_build_timeout_ms() && 706 circuit_any_opened_circuits_cached()) { 707 708 /* Circuits are allowed to last longer for measurement. 709 * Switch their purpose and wait. */ 710 if (circ->base_.purpose != CIRCUIT_PURPOSE_C_MEASURE_TIMEOUT) { 711 log_info(LD_CIRC, 712 "Deciding to timeout circuit %"PRIu32, 713 (circ->global_identifier)); 714 circuit_build_times_mark_circ_as_measurement_only(circ); 715 } 716 } 717 718 /* If the circuit is built to exactly the DEFAULT_ROUTE_LEN, 719 * add it to our buildtimes. */ 720 if (circuit_get_cpath_opened_len(circ) == DEFAULT_ROUTE_LEN) { 721 /* If the circuit build time is much greater than we would have cut 722 * it off at, we probably had a suspend event along this codepath, 723 * and we should discard the value. 724 */ 725 if (timediff < 0 || 726 timediff > 2*get_circuit_build_close_time_ms()+1000) { 727 log_notice(LD_CIRC, "Strange value for circuit build time: %ldmsec. " 728 "Assuming clock jump. Purpose %d (%s)", timediff, 729 circ->base_.purpose, 730 circuit_purpose_to_string(circ->base_.purpose)); 731 } else { 732 /* Only count circuit times if the network is live */ 733 if (circuit_build_times_network_check_live( 734 get_circuit_build_times())) { 735 circuit_build_times_add_time(get_circuit_build_times_mutable(), 736 (build_time_t)timediff); 737 circuit_build_times_set_timeout(get_circuit_build_times_mutable()); 738 } 739 740 if (circ->base_.purpose != CIRCUIT_PURPOSE_C_MEASURE_TIMEOUT) { 741 circuit_build_times_network_circ_success( 742 get_circuit_build_times_mutable()); 743 } 744 } 745 } 746 } 747 748 /** 749 * Add a new build time value <b>time</b> to the set of build times. Time 750 * units are milliseconds. 751 * 752 * circuit_build_times <b>cbt</b> is a circular array, so loop around when 753 * array is full. 754 */ 755 int 756 circuit_build_times_add_time(circuit_build_times_t *cbt, build_time_t btime) 757 { 758 if (btime <= 0 || btime > CBT_BUILD_TIME_MAX) { 759 log_warn(LD_BUG, "Circuit build time is too large (%u)." 760 "This is probably a bug.", btime); 761 tor_fragile_assert(); 762 return -1; 763 } 764 765 log_debug(LD_CIRC, "Adding circuit build time %u", btime); 766 767 cbt->circuit_build_times[cbt->build_times_idx] = btime; 768 cbt->build_times_idx = (cbt->build_times_idx + 1) % CBT_NCIRCUITS_TO_OBSERVE; 769 if (cbt->total_build_times < CBT_NCIRCUITS_TO_OBSERVE) 770 cbt->total_build_times++; 771 772 if ((cbt->total_build_times % CBT_SAVE_STATE_EVERY) == 0) { 773 /* Save state every n circuit builds */ 774 if (!unit_tests && !get_options()->AvoidDiskWrites) 775 or_state_mark_dirty(get_or_state(), 0); 776 } 777 778 return 0; 779 } 780 781 /** 782 * Return maximum circuit build time 783 */ 784 static build_time_t 785 circuit_build_times_max(const circuit_build_times_t *cbt) 786 { 787 int i = 0; 788 build_time_t max_build_time = 0; 789 for (i = 0; i < CBT_NCIRCUITS_TO_OBSERVE; i++) { 790 if (cbt->circuit_build_times[i] > max_build_time 791 && cbt->circuit_build_times[i] != CBT_BUILD_ABANDONED) 792 max_build_time = cbt->circuit_build_times[i]; 793 } 794 return max_build_time; 795 } 796 797 #if 0 798 /** Return minimum circuit build time */ 799 build_time_t 800 circuit_build_times_min(circuit_build_times_t *cbt) 801 { 802 int i = 0; 803 build_time_t min_build_time = CBT_BUILD_TIME_MAX; 804 for (i = 0; i < CBT_NCIRCUITS_TO_OBSERVE; i++) { 805 if (cbt->circuit_build_times[i] && /* 0 <-> uninitialized */ 806 cbt->circuit_build_times[i] < min_build_time) 807 min_build_time = cbt->circuit_build_times[i]; 808 } 809 if (min_build_time == CBT_BUILD_TIME_MAX) { 810 log_warn(LD_CIRC, "No build times less than CBT_BUILD_TIME_MAX!"); 811 } 812 return min_build_time; 813 } 814 #endif /* 0 */ 815 816 /** 817 * Calculate and return a histogram for the set of build times. 818 * 819 * Returns an allocated array of histrogram bins representing 820 * the frequency of index*CBT_BIN_WIDTH millisecond 821 * build times. Also outputs the number of bins in nbins. 822 * 823 * The return value must be freed by the caller. 824 */ 825 static uint32_t * 826 circuit_build_times_create_histogram(const circuit_build_times_t *cbt, 827 build_time_t *nbins) 828 { 829 uint32_t *histogram; 830 build_time_t max_build_time = circuit_build_times_max(cbt); 831 int i, c; 832 833 *nbins = 1 + (max_build_time / CBT_BIN_WIDTH); 834 histogram = tor_calloc(*nbins, sizeof(build_time_t)); 835 836 // calculate histogram 837 for (i = 0; i < CBT_NCIRCUITS_TO_OBSERVE; i++) { 838 if (cbt->circuit_build_times[i] == 0 839 || cbt->circuit_build_times[i] == CBT_BUILD_ABANDONED) 840 continue; /* 0 <-> uninitialized */ 841 842 c = (cbt->circuit_build_times[i] / CBT_BIN_WIDTH); 843 histogram[c]++; 844 } 845 846 return histogram; 847 } 848 849 /** 850 * Return the Pareto start-of-curve parameter Xm. 851 * 852 * Because we are not a true Pareto curve, we compute this as the 853 * weighted average of the 10 most frequent build time bins. This 854 * heuristic allowed for the actual timeout rate to be closest 855 * to the chosen quantile cutoff, for quantiles 60-80%, out of 856 * many variant approaches (see #40157 for analysis). 857 */ 858 STATIC build_time_t 859 circuit_build_times_get_xm(circuit_build_times_t *cbt) 860 { 861 build_time_t nbins = 0; 862 build_time_t *nth_max_bin; 863 build_time_t xm_total = 0; 864 build_time_t Xm = 0; 865 int32_t xm_counts=0; 866 int num_modes = circuit_build_times_default_num_xm_modes(); 867 uint32_t *histogram = circuit_build_times_create_histogram(cbt, &nbins); 868 869 tor_assert(nbins > 0); 870 tor_assert(num_modes > 0); 871 872 nth_max_bin = tor_calloc(num_modes, sizeof(build_time_t)); 873 874 /* Determine the N most common build times, by selecting the 875 * nth largest mode, counting it, and removing it from the histogram. */ 876 for (int n = 0; n < num_modes; n++) { 877 /* Get nth mode */ 878 for (build_time_t i = 0; i < nbins; i++) { 879 if (histogram[i] > histogram[nth_max_bin[n]]) { 880 nth_max_bin[n] = i; 881 } 882 } 883 884 /* Update average */ 885 xm_counts += histogram[nth_max_bin[n]]; 886 xm_total += CBT_BIN_TO_MS(nth_max_bin[n])*histogram[nth_max_bin[n]]; 887 888 /* Prevent from re-counting this value */ 889 histogram[nth_max_bin[n]] = 0; 890 } 891 892 /* xm_counts can become zero if all of our last CBT_NCIRCUITS_TO_OBSERVE 893 * circuits were abandoned before they completed. This shouldn't happen, 894 * though. We should have reset/re-learned a lower timeout first. */ 895 if (xm_counts == 0) { 896 log_warn(LD_CIRC, 897 "No valid circuit build time data out of %d times, %u modes, " 898 "have_timeout=%d, %lfms", cbt->total_build_times, num_modes, 899 cbt->have_computed_timeout, cbt->timeout_ms); 900 goto done; 901 } 902 903 Xm = xm_total / xm_counts; 904 905 done: 906 tor_free(histogram); 907 tor_free(nth_max_bin); 908 909 return Xm; 910 } 911 912 /** 913 * Output a histogram of current circuit build times to 914 * the or_state_t state structure. 915 */ 916 void 917 circuit_build_times_update_state(const circuit_build_times_t *cbt, 918 or_state_t *state) 919 { 920 uint32_t *histogram; 921 build_time_t i = 0; 922 build_time_t nbins = 0; 923 config_line_t **next, *line; 924 925 histogram = circuit_build_times_create_histogram(cbt, &nbins); 926 // write to state 927 config_free_lines(state->BuildtimeHistogram); 928 next = &state->BuildtimeHistogram; 929 *next = NULL; 930 931 state->TotalBuildTimes = cbt->total_build_times; 932 state->CircuitBuildAbandonedCount = 0; 933 934 for (i = 0; i < CBT_NCIRCUITS_TO_OBSERVE; i++) { 935 if (cbt->circuit_build_times[i] == CBT_BUILD_ABANDONED) 936 state->CircuitBuildAbandonedCount++; 937 } 938 939 for (i = 0; i < nbins; i++) { 940 // compress the histogram by skipping the blanks 941 if (histogram[i] == 0) continue; 942 *next = line = tor_malloc_zero(sizeof(config_line_t)); 943 line->key = tor_strdup("CircuitBuildTimeBin"); 944 tor_asprintf(&line->value, "%d %d", 945 CBT_BIN_TO_MS(i), histogram[i]); 946 next = &(line->next); 947 } 948 949 if (!unit_tests) { 950 if (!get_options()->AvoidDiskWrites) 951 or_state_mark_dirty(get_or_state(), 0); 952 } 953 954 tor_free(histogram); 955 } 956 957 /** 958 * Shuffle the build times array. 959 * 960 * Adapted from https://en.wikipedia.org/wiki/Fisher-Yates_shuffle 961 */ 962 static void 963 circuit_build_times_shuffle_and_store_array(circuit_build_times_t *cbt, 964 build_time_t *raw_times, 965 uint32_t num_times) 966 { 967 uint32_t n = num_times; 968 if (num_times > CBT_NCIRCUITS_TO_OBSERVE) { 969 log_notice(LD_CIRC, "The number of circuit times that this Tor version " 970 "uses to calculate build times is less than the number stored " 971 "in your state file. Decreasing the circuit time history from " 972 "%lu to %d.", (unsigned long)num_times, 973 CBT_NCIRCUITS_TO_OBSERVE); 974 } 975 976 if (n > INT_MAX-1) { 977 log_warn(LD_CIRC, "For some insane reasons, you had %lu circuit build " 978 "observations in your state file. That's far too many; probably " 979 "there's a bug here.", (unsigned long)n); 980 n = INT_MAX-1; 981 } 982 983 /* This code can only be run on a compact array */ 984 while (n-- > 1) { 985 int k = crypto_rand_int(n + 1); /* 0 <= k <= n. */ 986 build_time_t tmp = raw_times[k]; 987 raw_times[k] = raw_times[n]; 988 raw_times[n] = tmp; 989 } 990 991 /* Since the times are now shuffled, take a random CBT_NCIRCUITS_TO_OBSERVE 992 * subset (ie the first CBT_NCIRCUITS_TO_OBSERVE values) */ 993 for (n = 0; n < MIN(num_times, CBT_NCIRCUITS_TO_OBSERVE); n++) { 994 circuit_build_times_add_time(cbt, raw_times[n]); 995 } 996 } 997 998 /** 999 * Load histogram from <b>state</b>, shuffling the resulting array 1000 * after we do so. Use this result to estimate parameters and 1001 * calculate the timeout. 1002 * 1003 * Return -1 on error. 1004 */ 1005 int 1006 circuit_build_times_parse_state(circuit_build_times_t *cbt, 1007 or_state_t *state) 1008 { 1009 int tot_values = 0; 1010 uint32_t loaded_cnt = 0, N = 0; 1011 config_line_t *line; 1012 int i; 1013 build_time_t *loaded_times; 1014 int err = 0; 1015 circuit_build_times_init(cbt); 1016 1017 if (circuit_build_times_disabled(get_options())) { 1018 return 0; 1019 } 1020 1021 /* We had a case where someone removed their TotalBuildTimes from the state 1022 * files while having CircuitBuildAbandonedCount above 0 leading to a 1023 * segfault (#40437). Simply bug on it and return an error so at least the 1024 * user will learn that they broke the state file. */ 1025 if (BUG(state->TotalBuildTimes <= 0 && 1026 state->CircuitBuildAbandonedCount > 0)) { 1027 log_warn(LD_GENERAL, "CircuitBuildAbandonedCount count is above 0 but " 1028 "no TotalBuildTimes have been found. Unable to " 1029 "parse broken state file"); 1030 return -1; 1031 } 1032 1033 /* build_time_t 0 means uninitialized */ 1034 loaded_times = tor_calloc(state->TotalBuildTimes, sizeof(build_time_t)); 1035 1036 for (line = state->BuildtimeHistogram; line; line = line->next) { 1037 smartlist_t *args = smartlist_new(); 1038 smartlist_split_string(args, line->value, " ", 1039 SPLIT_SKIP_SPACE|SPLIT_IGNORE_BLANK, 0); 1040 if (smartlist_len(args) < 2) { 1041 log_warn(LD_GENERAL, "Unable to parse circuit build times: " 1042 "Too few arguments to CircuitBuildTime"); 1043 err = 1; 1044 SMARTLIST_FOREACH(args, char*, cp, tor_free(cp)); 1045 smartlist_free(args); 1046 break; 1047 } else { 1048 const char *ms_str = smartlist_get(args,0); 1049 const char *count_str = smartlist_get(args,1); 1050 uint32_t count, k; 1051 build_time_t ms; 1052 int ok; 1053 ms = (build_time_t)tor_parse_ulong(ms_str, 10, 0, 1054 CBT_BUILD_TIME_MAX, &ok, NULL); 1055 if (!ok) { 1056 log_warn(LD_GENERAL, "Unable to parse circuit build times: " 1057 "Unparsable bin number"); 1058 err = 1; 1059 SMARTLIST_FOREACH(args, char*, cp, tor_free(cp)); 1060 smartlist_free(args); 1061 break; 1062 } 1063 count = (uint32_t)tor_parse_ulong(count_str, 10, 0, 1064 UINT32_MAX, &ok, NULL); 1065 if (!ok) { 1066 log_warn(LD_GENERAL, "Unable to parse circuit build times: " 1067 "Unparsable bin count"); 1068 err = 1; 1069 SMARTLIST_FOREACH(args, char*, cp, tor_free(cp)); 1070 smartlist_free(args); 1071 break; 1072 } 1073 1074 if (loaded_cnt+count+ (unsigned)state->CircuitBuildAbandonedCount 1075 > (unsigned) state->TotalBuildTimes) { 1076 log_warn(LD_CIRC, 1077 "Too many build times in state file. " 1078 "Stopping short before %d", 1079 loaded_cnt+count); 1080 SMARTLIST_FOREACH(args, char*, cp, tor_free(cp)); 1081 smartlist_free(args); 1082 break; 1083 } 1084 1085 for (k = 0; k < count; k++) { 1086 loaded_times[loaded_cnt++] = ms; 1087 } 1088 N++; 1089 SMARTLIST_FOREACH(args, char*, cp, tor_free(cp)); 1090 smartlist_free(args); 1091 } 1092 } 1093 1094 log_info(LD_CIRC, 1095 "Adding %d timeouts.", state->CircuitBuildAbandonedCount); 1096 for (i=0; i < state->CircuitBuildAbandonedCount; i++) { 1097 loaded_times[loaded_cnt++] = CBT_BUILD_ABANDONED; 1098 } 1099 1100 if (loaded_cnt != (unsigned)state->TotalBuildTimes) { 1101 log_warn(LD_CIRC, 1102 "Corrupt state file? Build times count mismatch. " 1103 "Read %d times, but file says %d", loaded_cnt, 1104 state->TotalBuildTimes); 1105 err = 1; 1106 circuit_build_times_reset(cbt); 1107 goto done; 1108 } 1109 1110 circuit_build_times_shuffle_and_store_array(cbt, loaded_times, loaded_cnt); 1111 1112 /* Verify that we didn't overwrite any indexes */ 1113 for (i=0; i < CBT_NCIRCUITS_TO_OBSERVE; i++) { 1114 if (!cbt->circuit_build_times[i]) 1115 break; 1116 tot_values++; 1117 } 1118 log_info(LD_CIRC, 1119 "Loaded %d/%d values from %d lines in circuit time histogram", 1120 tot_values, cbt->total_build_times, N); 1121 1122 if (cbt->total_build_times != tot_values 1123 || cbt->total_build_times > CBT_NCIRCUITS_TO_OBSERVE) { 1124 log_warn(LD_CIRC, 1125 "Corrupt state file? Shuffled build times mismatch. " 1126 "Read %d times, but file says %d", tot_values, 1127 state->TotalBuildTimes); 1128 err = 1; 1129 circuit_build_times_reset(cbt); 1130 goto done; 1131 } 1132 1133 circuit_build_times_set_timeout(cbt); 1134 1135 done: 1136 tor_free(loaded_times); 1137 return err ? -1 : 0; 1138 } 1139 1140 /** 1141 * Estimates the Xm and Alpha parameters using 1142 * https://en.wikipedia.org/wiki/Pareto_distribution#Parameter_estimation 1143 * 1144 * The notable difference is that we use mode instead of min to estimate Xm. 1145 * This is because our distribution is frechet-like. We claim this is 1146 * an acceptable approximation because we are only concerned with the 1147 * accuracy of the CDF of the tail. 1148 */ 1149 STATIC int 1150 circuit_build_times_update_alpha(circuit_build_times_t *cbt) 1151 { 1152 build_time_t *x=cbt->circuit_build_times; 1153 double a = 0; 1154 int n=0,i=0,abandoned_count=0; 1155 1156 /* https://en.wikipedia.org/wiki/Pareto_distribution#Parameter_estimation */ 1157 /* We sort of cheat here and make our samples slightly more pareto-like 1158 * and less frechet-like. */ 1159 cbt->Xm = circuit_build_times_get_xm(cbt); 1160 1161 /* If Xm came back 0, then too many circuits were abandoned. */ 1162 if (cbt->Xm == 0) 1163 return 0; 1164 1165 tor_assert(cbt->Xm > 0); 1166 1167 for (i=0; i< CBT_NCIRCUITS_TO_OBSERVE; i++) { 1168 if (!x[i]) { 1169 continue; 1170 } 1171 1172 if (x[i] < cbt->Xm) { 1173 a += tor_mathlog(cbt->Xm); 1174 n++; 1175 } else if (x[i] == CBT_BUILD_ABANDONED) { 1176 abandoned_count++; 1177 } else { 1178 a += tor_mathlog(x[i]); 1179 n++; 1180 } 1181 } 1182 1183 /* 1184 * We are erring and asserting here because this can only happen 1185 * in codepaths other than startup. The startup state parsing code 1186 * performs this same check, and resets state if it hits it. If we 1187 * hit it at runtime, something serious has gone wrong. 1188 */ 1189 if (n!=cbt->total_build_times-abandoned_count) { 1190 log_err(LD_CIRC, "Discrepancy in build times count: %d vs %d", n, 1191 cbt->total_build_times); 1192 } 1193 tor_assert_nonfatal(n==cbt->total_build_times-abandoned_count); 1194 1195 /* This is the "Maximum Likelihood Estimator" for parameter alpha of a Pareto 1196 * Distribution. See: 1197 * https://en.wikipedia.org/wiki/Pareto_distribution#Estimation_of_parameters 1198 * 1199 * The division in the estimator is done with subtraction outside the ln(), 1200 * with the sum occurring in the for loop above. 1201 * 1202 * This done is to avoid the precision issues of logs of small values. 1203 */ 1204 a -= n*tor_mathlog(cbt->Xm); 1205 a = n/a; 1206 1207 cbt->alpha = a; 1208 1209 return 1; 1210 } 1211 1212 /** 1213 * This is the Pareto Quantile Function. It calculates the point x 1214 * in the distribution such that F(x) = quantile (ie quantile*100% 1215 * of the mass of the density function is below x on the curve). 1216 * 1217 * We use it to calculate the timeout and also to generate synthetic 1218 * values of time for circuits that timeout before completion. 1219 * 1220 * See https://en.wikipedia.org/wiki/Quantile_function, 1221 * https://en.wikipedia.org/wiki/Inverse_transform_sampling and 1222 * https://en.wikipedia.org/wiki/Pareto_distribution#Generating_a_ 1223 * random_sample_from_Pareto_distribution 1224 * That's right. I'll cite wikipedia all day long. 1225 * 1226 * Return value is in milliseconds, clamped to INT32_MAX. 1227 */ 1228 STATIC double 1229 circuit_build_times_calculate_timeout(circuit_build_times_t *cbt, 1230 double quantile) 1231 { 1232 double ret; 1233 tor_assert(quantile >= 0); 1234 tor_assert(1.0-quantile > 0); 1235 tor_assert(cbt->Xm > 0); 1236 1237 /* If either alpha or p are 0, we would divide by zero, yielding an 1238 * infinite (double) result; which would be clamped to INT32_MAX. 1239 * Instead, initialise ret to INT32_MAX, and skip over these 1240 * potentially illegal/trapping divides by zero. 1241 */ 1242 ret = INT32_MAX; 1243 1244 if (cbt->alpha > 0) { 1245 double p; 1246 p = pow(1.0-quantile,1.0/cbt->alpha); 1247 if (p > 0) { 1248 ret = cbt->Xm/p; 1249 } 1250 } 1251 1252 if (ret > INT32_MAX) { 1253 ret = INT32_MAX; 1254 } 1255 tor_assert(ret > 0); 1256 return ret; 1257 } 1258 1259 #ifdef TOR_UNIT_TESTS 1260 /** Pareto CDF */ 1261 double 1262 circuit_build_times_cdf(circuit_build_times_t *cbt, double x) 1263 { 1264 double ret; 1265 tor_assert(cbt->Xm > 0); 1266 ret = 1.0-pow(cbt->Xm/x,cbt->alpha); 1267 tor_assert(0 <= ret && ret <= 1.0); 1268 return ret; 1269 } 1270 #endif /* defined(TOR_UNIT_TESTS) */ 1271 1272 #ifdef TOR_UNIT_TESTS 1273 /** 1274 * Generate a synthetic time using our distribution parameters. 1275 * 1276 * The return value will be within the [q_lo, q_hi) quantile points 1277 * on the CDF. 1278 */ 1279 build_time_t 1280 circuit_build_times_generate_sample(circuit_build_times_t *cbt, 1281 double q_lo, double q_hi) 1282 { 1283 double randval = crypto_rand_double(); 1284 build_time_t ret; 1285 double u; 1286 1287 /* Generate between [q_lo, q_hi) */ 1288 /*XXXX This is what nextafter is supposed to be for; we should use it on the 1289 * platforms that support it. */ 1290 q_hi -= 1.0/(INT32_MAX); 1291 1292 tor_assert(q_lo >= 0); 1293 tor_assert(q_hi < 1); 1294 tor_assert(q_lo < q_hi); 1295 1296 u = q_lo + (q_hi-q_lo)*randval; 1297 1298 tor_assert(0 <= u && u < 1.0); 1299 /* circuit_build_times_calculate_timeout returns <= INT32_MAX */ 1300 ret = (build_time_t) 1301 tor_lround(circuit_build_times_calculate_timeout(cbt, u)); 1302 tor_assert(ret > 0); 1303 return ret; 1304 } 1305 #endif /* defined(TOR_UNIT_TESTS) */ 1306 1307 #ifdef TOR_UNIT_TESTS 1308 /** 1309 * Estimate an initial alpha parameter by solving the quantile 1310 * function with a quantile point and a specific timeout value. 1311 */ 1312 void 1313 circuit_build_times_initial_alpha(circuit_build_times_t *cbt, 1314 double quantile, double timeout_ms) 1315 { 1316 // Q(u) = Xm/((1-u)^(1/a)) 1317 // Q(0.8) = Xm/((1-0.8))^(1/a)) = CircBuildTimeout 1318 // CircBuildTimeout = Xm/((1-0.8))^(1/a)) 1319 // CircBuildTimeout = Xm*((1-0.8))^(-1/a)) 1320 // ln(CircBuildTimeout) = ln(Xm)+ln(((1-0.8)))*(-1/a) 1321 // -ln(1-0.8)/(ln(CircBuildTimeout)-ln(Xm))=a 1322 tor_assert(quantile >= 0); 1323 tor_assert(cbt->Xm > 0); 1324 cbt->alpha = tor_mathlog(1.0-quantile)/ 1325 (tor_mathlog(cbt->Xm)-tor_mathlog(timeout_ms)); 1326 tor_assert(cbt->alpha > 0); 1327 } 1328 #endif /* defined(TOR_UNIT_TESTS) */ 1329 1330 /** 1331 * Returns true if we need circuits to be built 1332 */ 1333 int 1334 circuit_build_times_needs_circuits(const circuit_build_times_t *cbt) 1335 { 1336 /* Return true if < MIN_CIRCUITS_TO_OBSERVE */ 1337 return !circuit_build_times_enough_to_compute(cbt); 1338 } 1339 1340 /** 1341 * Returns true if we should build a timeout test circuit 1342 * right now. 1343 */ 1344 int 1345 circuit_build_times_needs_circuits_now(const circuit_build_times_t *cbt) 1346 { 1347 return circuit_build_times_needs_circuits(cbt) && 1348 approx_time()-cbt->last_circ_at > circuit_build_times_test_frequency(); 1349 } 1350 1351 /** 1352 * How long should we be unreachable before we think we need to check if 1353 * our published IP address has changed. 1354 */ 1355 #define CIRCUIT_TIMEOUT_BEFORE_RECHECK_IP (60*3) 1356 1357 /** 1358 * Called to indicate that the network showed some signs of liveness, 1359 * i.e. we received a cell. 1360 * 1361 * This is used by circuit_build_times_network_check_live() to decide 1362 * if we should record the circuit build timeout or not. 1363 * 1364 * This function is called every time we receive a cell. Avoid 1365 * syscalls, events, and other high-intensity work. 1366 */ 1367 void 1368 circuit_build_times_network_is_live(circuit_build_times_t *cbt) 1369 { 1370 time_t now = approx_time(); 1371 // XXXX this should use pubsub 1372 if (cbt->liveness.nonlive_timeouts > 0) { 1373 time_t time_since_live = now - cbt->liveness.network_last_live; 1374 log_notice(LD_CIRC, 1375 "Tor now sees network activity. Restoring circuit build " 1376 "timeout recording. Network was down for %d seconds " 1377 "during %d circuit attempts.", 1378 (int)time_since_live, 1379 cbt->liveness.nonlive_timeouts); 1380 if (time_since_live > CIRCUIT_TIMEOUT_BEFORE_RECHECK_IP) 1381 reschedule_descriptor_update_check(); 1382 } 1383 cbt->liveness.network_last_live = now; 1384 cbt->liveness.nonlive_timeouts = 0; 1385 1386 /* Tell control.c */ 1387 control_event_network_liveness_update(1); 1388 } 1389 1390 /** 1391 * Non-destructively scale all of our circuit success, timeout, and close 1392 * counts down by a factor of two. Scaling in this way preserves the 1393 * ratios between succeeded vs timed out vs closed circuits, so that 1394 * our statistics don't change when we scale. 1395 * 1396 * This is used only in the rare event that we build more than 1397 * INT32_MAX circuits. Since the num_circ_* variables are 1398 * uint32_t, we won't even be close to overflowing them. 1399 */ 1400 void 1401 circuit_build_times_scale_circ_counts(circuit_build_times_t *cbt) 1402 { 1403 cbt->num_circ_succeeded /= 2; 1404 cbt->num_circ_timeouts /= 2; 1405 cbt->num_circ_closed /= 2; 1406 } 1407 1408 /** 1409 * Called to indicate that we "completed" a circuit. Because this circuit 1410 * succeeded, it doesn't count as a timeout-after-the-first-hop. 1411 * 1412 * (For the purposes of the cbt code, we consider a circuit "completed" if 1413 * it has 3 hops, regardless of its final hop count. We do this because 1414 * we're trying to answer the question, "how long should a circuit take to 1415 * reach the 3-hop count".) 1416 * 1417 * This is used by circuit_build_times_network_check_changed() to determine 1418 * if we had too many recent timeouts and need to reset our learned timeout 1419 * to something higher. 1420 */ 1421 void 1422 circuit_build_times_network_circ_success(circuit_build_times_t *cbt) 1423 { 1424 // Count circuit success 1425 cbt->num_circ_succeeded++; 1426 1427 // If we're going to wrap int32, scale everything 1428 if (cbt->num_circ_succeeded >= INT32_MAX) { 1429 circuit_build_times_scale_circ_counts(cbt); 1430 } 1431 1432 /* Check for NULLness because we might not be using adaptive timeouts */ 1433 if (cbt->liveness.timeouts_after_firsthop && 1434 cbt->liveness.num_recent_circs > 0) { 1435 cbt->liveness.timeouts_after_firsthop[cbt->liveness.after_firsthop_idx] 1436 = 0; 1437 cbt->liveness.after_firsthop_idx++; 1438 cbt->liveness.after_firsthop_idx %= cbt->liveness.num_recent_circs; 1439 } 1440 } 1441 1442 /** 1443 * A circuit just timed out. If it failed after the first hop, record it 1444 * in our history for later deciding if the network speed has changed. 1445 * 1446 * This is used by circuit_build_times_network_check_changed() to determine 1447 * if we had too many recent timeouts and need to reset our learned timeout 1448 * to something higher. 1449 */ 1450 static void 1451 circuit_build_times_network_timeout(circuit_build_times_t *cbt, 1452 int did_onehop) 1453 { 1454 // Count circuit timeout 1455 cbt->num_circ_timeouts++; 1456 1457 // If we're going to wrap int32, scale everything 1458 if (cbt->num_circ_timeouts >= INT32_MAX) { 1459 circuit_build_times_scale_circ_counts(cbt); 1460 } 1461 1462 /* Check for NULLness because we might not be using adaptive timeouts */ 1463 if (cbt->liveness.timeouts_after_firsthop && 1464 cbt->liveness.num_recent_circs > 0) { 1465 if (did_onehop) { 1466 cbt->liveness.timeouts_after_firsthop[cbt->liveness.after_firsthop_idx] 1467 = 1; 1468 cbt->liveness.after_firsthop_idx++; 1469 cbt->liveness.after_firsthop_idx %= cbt->liveness.num_recent_circs; 1470 } 1471 } 1472 } 1473 1474 /** 1475 * A circuit was just forcibly closed. If there has been no recent network 1476 * activity at all, but this circuit was launched back when we thought the 1477 * network was live, increment the number of "nonlive" circuit timeouts. 1478 * 1479 * This is used by circuit_build_times_network_check_live() to decide 1480 * if we should record the circuit build timeout or not. 1481 */ 1482 static void 1483 circuit_build_times_network_close(circuit_build_times_t *cbt, 1484 int did_onehop, time_t start_time) 1485 { 1486 time_t now = time(NULL); 1487 1488 // Count circuit close 1489 cbt->num_circ_closed++; 1490 1491 // If we're going to wrap int32, scale everything 1492 if (cbt->num_circ_closed >= INT32_MAX) { 1493 circuit_build_times_scale_circ_counts(cbt); 1494 } 1495 1496 /* 1497 * Check if this is a timeout that was for a circuit that spent its 1498 * entire existence during a time where we have had no network activity. 1499 */ 1500 if (cbt->liveness.network_last_live < start_time) { 1501 if (did_onehop) { 1502 char last_live_buf[ISO_TIME_LEN+1]; 1503 char start_time_buf[ISO_TIME_LEN+1]; 1504 char now_buf[ISO_TIME_LEN+1]; 1505 format_local_iso_time(last_live_buf, cbt->liveness.network_last_live); 1506 format_local_iso_time(start_time_buf, start_time); 1507 format_local_iso_time(now_buf, now); 1508 log_notice(LD_CIRC, 1509 "A circuit somehow completed a hop while the network was " 1510 "not live. The network was last live at %s, but the circuit " 1511 "launched at %s. It's now %s. This could mean your clock " 1512 "changed.", last_live_buf, start_time_buf, now_buf); 1513 } 1514 cbt->liveness.nonlive_timeouts++; 1515 if (cbt->liveness.nonlive_timeouts == 1) { 1516 log_notice(LD_CIRC, 1517 "Tor has not observed any network activity for the past %d " 1518 "seconds. Disabling circuit build timeout recording.", 1519 (int)(now - cbt->liveness.network_last_live)); 1520 1521 /* Tell control.c */ 1522 control_event_network_liveness_update(0); 1523 } else { 1524 log_info(LD_CIRC, 1525 "Got non-live timeout. Current count is: %d", 1526 cbt->liveness.nonlive_timeouts); 1527 } 1528 } 1529 } 1530 1531 /** 1532 * When the network is not live, we do not record circuit build times. 1533 * 1534 * The network is considered not live if there has been at least one 1535 * circuit build that began and ended (had its close_ms measurement 1536 * period expire) since we last received a cell. 1537 * 1538 * Also has the side effect of rewinding the circuit time history 1539 * in the case of recent liveness changes. 1540 */ 1541 int 1542 circuit_build_times_network_check_live(const circuit_build_times_t *cbt) 1543 { 1544 if (cbt->liveness.nonlive_timeouts > 0) { 1545 return 0; 1546 } 1547 1548 return 1; 1549 } 1550 1551 /** 1552 * Returns true if we have seen more than MAX_RECENT_TIMEOUT_COUNT of 1553 * the past RECENT_CIRCUITS time out after the first hop. Used to detect 1554 * if the network connection has changed significantly, and if so, 1555 * resets our circuit build timeout to the default. 1556 * 1557 * Also resets the entire timeout history in this case and causes us 1558 * to restart the process of building test circuits and estimating a 1559 * new timeout. 1560 */ 1561 STATIC int 1562 circuit_build_times_network_check_changed(circuit_build_times_t *cbt) 1563 { 1564 int total_build_times = cbt->total_build_times; 1565 int timeout_count=0; 1566 int i; 1567 1568 if (cbt->liveness.timeouts_after_firsthop && 1569 cbt->liveness.num_recent_circs > 0) { 1570 /* how many of our recent circuits made it to the first hop but then 1571 * timed out? */ 1572 for (i = 0; i < cbt->liveness.num_recent_circs; i++) { 1573 timeout_count += cbt->liveness.timeouts_after_firsthop[i]; 1574 } 1575 } 1576 1577 /* If 80% of our recent circuits are timing out after the first hop, 1578 * we need to re-estimate a new initial alpha and timeout. */ 1579 if (timeout_count < circuit_build_times_max_timeouts()) { 1580 return 0; 1581 } 1582 1583 circuit_build_times_reset(cbt); 1584 if (cbt->liveness.timeouts_after_firsthop && 1585 cbt->liveness.num_recent_circs > 0) { 1586 memset(cbt->liveness.timeouts_after_firsthop, 0, 1587 sizeof(*cbt->liveness.timeouts_after_firsthop)* 1588 cbt->liveness.num_recent_circs); 1589 } 1590 cbt->liveness.after_firsthop_idx = 0; 1591 1592 #define MAX_TIMEOUT ((int32_t) (INT32_MAX/2)) 1593 /* Check to see if this has happened before. If so, double the timeout 1594 * to give clients on abysmally bad network connections a shot at access */ 1595 if (cbt->timeout_ms >= circuit_build_times_get_initial_timeout()) { 1596 if (cbt->timeout_ms > MAX_TIMEOUT || cbt->close_ms > MAX_TIMEOUT) { 1597 log_warn(LD_CIRC, "Insanely large circuit build timeout value. " 1598 "(timeout = %fmsec, close = %fmsec)", 1599 cbt->timeout_ms, cbt->close_ms); 1600 } else { 1601 cbt->timeout_ms *= 2; 1602 cbt->close_ms *= 2; 1603 } 1604 } else { 1605 cbt->close_ms = cbt->timeout_ms 1606 = circuit_build_times_get_initial_timeout(); 1607 } 1608 #undef MAX_TIMEOUT 1609 1610 cbt_control_event_buildtimeout_set(cbt, BUILDTIMEOUT_SET_EVENT_RESET); 1611 1612 log_notice(LD_CIRC, 1613 "Your network connection speed appears to have changed. Resetting " 1614 "timeout to %ldms after %d timeouts and %d buildtimes.", 1615 tor_lround(cbt->timeout_ms), timeout_count, total_build_times); 1616 1617 return 1; 1618 } 1619 1620 /** 1621 * Count the number of timeouts in a set of cbt data. 1622 */ 1623 double 1624 circuit_build_times_timeout_rate(const circuit_build_times_t *cbt) 1625 { 1626 int i=0,timeouts=0; 1627 for (i = 0; i < CBT_NCIRCUITS_TO_OBSERVE; i++) { 1628 if (cbt->circuit_build_times[i] >= cbt->timeout_ms) { 1629 timeouts++; 1630 } 1631 } 1632 1633 if (!cbt->total_build_times) 1634 return 0; 1635 1636 return ((double)timeouts)/cbt->total_build_times; 1637 } 1638 1639 /** 1640 * Count the number of closed circuits in a set of cbt data. 1641 */ 1642 double 1643 circuit_build_times_close_rate(const circuit_build_times_t *cbt) 1644 { 1645 int i=0,closed=0; 1646 for (i = 0; i < CBT_NCIRCUITS_TO_OBSERVE; i++) { 1647 if (cbt->circuit_build_times[i] == CBT_BUILD_ABANDONED) { 1648 closed++; 1649 } 1650 } 1651 1652 if (!cbt->total_build_times) 1653 return 0; 1654 1655 return ((double)closed)/cbt->total_build_times; 1656 } 1657 1658 /** 1659 * Store a timeout as a synthetic value. 1660 * 1661 * Returns true if the store was successful and we should possibly 1662 * update our timeout estimate. 1663 */ 1664 int 1665 circuit_build_times_count_close(circuit_build_times_t *cbt, 1666 int did_onehop, 1667 time_t start_time) 1668 { 1669 if (circuit_build_times_disabled(get_options())) { 1670 cbt->close_ms = cbt->timeout_ms 1671 = circuit_build_times_get_initial_timeout(); 1672 return 0; 1673 } 1674 1675 /* Record this force-close to help determine if the network is dead */ 1676 circuit_build_times_network_close(cbt, did_onehop, start_time); 1677 1678 /* Only count timeouts if network is live.. */ 1679 if (!circuit_build_times_network_check_live(cbt)) { 1680 return 0; 1681 } 1682 1683 circuit_build_times_add_time(cbt, CBT_BUILD_ABANDONED); 1684 return 1; 1685 } 1686 1687 /** 1688 * Update timeout counts to determine if we need to expire 1689 * our build time history due to excessive timeouts. 1690 * 1691 * We do not record any actual time values at this stage; 1692 * we are only interested in recording the fact that a timeout 1693 * happened. We record the time values via 1694 * circuit_build_times_count_close() and circuit_build_times_add_time(). 1695 */ 1696 void 1697 circuit_build_times_count_timeout(circuit_build_times_t *cbt, 1698 int did_onehop) 1699 { 1700 if (circuit_build_times_disabled(get_options())) { 1701 cbt->close_ms = cbt->timeout_ms 1702 = circuit_build_times_get_initial_timeout(); 1703 return; 1704 } 1705 1706 /* Register the fact that a timeout just occurred. */ 1707 circuit_build_times_network_timeout(cbt, did_onehop); 1708 1709 /* If there are a ton of timeouts, we should reset 1710 * the circuit build timeout. */ 1711 circuit_build_times_network_check_changed(cbt); 1712 } 1713 1714 /** 1715 * Estimate a new timeout based on history and set our timeout 1716 * variable accordingly. 1717 */ 1718 static int 1719 circuit_build_times_set_timeout_worker(circuit_build_times_t *cbt) 1720 { 1721 build_time_t max_time; 1722 if (!circuit_build_times_enough_to_compute(cbt)) 1723 return 0; 1724 1725 if (!circuit_build_times_update_alpha(cbt)) 1726 return 0; 1727 1728 cbt->timeout_ms = circuit_build_times_calculate_timeout(cbt, 1729 circuit_build_times_quantile_cutoff()); 1730 1731 cbt->close_ms = circuit_build_times_calculate_timeout(cbt, 1732 circuit_build_times_close_quantile()); 1733 1734 max_time = circuit_build_times_max(cbt); 1735 1736 if (cbt->timeout_ms > max_time) { 1737 log_info(LD_CIRC, 1738 "Circuit build timeout of %dms is beyond the maximum build " 1739 "time we have ever observed. Capping it to %dms.", 1740 (int)cbt->timeout_ms, max_time); 1741 cbt->timeout_ms = max_time; 1742 } 1743 1744 if (max_time < INT32_MAX/2 && cbt->close_ms > 2*max_time) { 1745 log_info(LD_CIRC, 1746 "Circuit build measurement period of %dms is more than twice " 1747 "the maximum build time we have ever observed. Capping it to " 1748 "%dms.", (int)cbt->close_ms, 2*max_time); 1749 cbt->close_ms = 2*max_time; 1750 } 1751 1752 /* Sometimes really fast guard nodes give us such a steep curve 1753 * that this ends up being not that much greater than timeout_ms. 1754 * Make it be at least 1 min to handle this case. */ 1755 cbt->close_ms = MAX(cbt->close_ms, circuit_build_times_initial_timeout()); 1756 1757 cbt->have_computed_timeout = 1; 1758 return 1; 1759 } 1760 1761 /** 1762 * Exposed function to compute a new timeout. Dispatches events and 1763 * also filters out extremely high timeout values. 1764 */ 1765 void 1766 circuit_build_times_set_timeout(circuit_build_times_t *cbt) 1767 { 1768 long prev_timeout = tor_lround(cbt->timeout_ms/1000); 1769 double timeout_rate; 1770 1771 /* 1772 * Just return if we aren't using adaptive timeouts 1773 */ 1774 if (circuit_build_times_disabled(get_options())) 1775 return; 1776 1777 if (!circuit_build_times_set_timeout_worker(cbt)) 1778 return; 1779 1780 if (cbt->timeout_ms < circuit_build_times_min_timeout()) { 1781 log_notice(LD_CIRC, "Set buildtimeout to low value %fms. Setting to %dms", 1782 cbt->timeout_ms, circuit_build_times_min_timeout()); 1783 cbt->timeout_ms = circuit_build_times_min_timeout(); 1784 if (cbt->close_ms < cbt->timeout_ms) { 1785 /* This shouldn't happen because of MAX() in timeout_worker above, 1786 * but doing it just in case */ 1787 cbt->close_ms = circuit_build_times_initial_timeout(); 1788 } 1789 } 1790 1791 cbt_control_event_buildtimeout_set(cbt, BUILDTIMEOUT_SET_EVENT_COMPUTED); 1792 1793 timeout_rate = circuit_build_times_timeout_rate(cbt); 1794 1795 if (prev_timeout > tor_lround(cbt->timeout_ms/1000)) { 1796 log_info(LD_CIRC, 1797 "Based on %d circuit times, it looks like we don't need to " 1798 "wait so long for circuits to finish. We will now assume a " 1799 "circuit is too slow to use after waiting %ld milliseconds.", 1800 cbt->total_build_times, 1801 tor_lround(cbt->timeout_ms)); 1802 log_info(LD_CIRC, 1803 "Circuit timeout data: %fms, %fms, Xm: %d, a: %f, r: %f", 1804 cbt->timeout_ms, cbt->close_ms, cbt->Xm, cbt->alpha, 1805 timeout_rate); 1806 } else if (prev_timeout < tor_lround(cbt->timeout_ms/1000)) { 1807 log_info(LD_CIRC, 1808 "Based on %d circuit times, it looks like we need to wait " 1809 "longer for circuits to finish. We will now assume a " 1810 "circuit is too slow to use after waiting %ld milliseconds.", 1811 cbt->total_build_times, 1812 tor_lround(cbt->timeout_ms)); 1813 log_info(LD_CIRC, 1814 "Circuit timeout data: %fms, %fms, Xm: %d, a: %f, r: %f", 1815 cbt->timeout_ms, cbt->close_ms, cbt->Xm, cbt->alpha, 1816 timeout_rate); 1817 } else { 1818 log_info(LD_CIRC, 1819 "Set circuit build timeout to %ldms (%fms, %fms, Xm: %d, a: %f," 1820 " r: %f) based on %d circuit times", 1821 tor_lround(cbt->timeout_ms), 1822 cbt->timeout_ms, cbt->close_ms, cbt->Xm, cbt->alpha, timeout_rate, 1823 cbt->total_build_times); 1824 } 1825 } 1826 1827 #ifdef TOR_UNIT_TESTS 1828 /** Make a note that we're running unit tests (rather than running Tor 1829 * itself), so we avoid clobbering our state file. */ 1830 void 1831 circuitbuild_running_unit_tests(void) 1832 { 1833 unit_tests = 1; 1834 } 1835 #endif /* defined(TOR_UNIT_TESTS) */ 1836 1837 void 1838 circuit_build_times_update_last_circ(circuit_build_times_t *cbt) 1839 { 1840 cbt->last_circ_at = approx_time(); 1841 }