congestion_control_common.c (38578B)
1 /* Copyright (c) 2021, The Tor Project, Inc. */ 2 /* See LICENSE for licensing information */ 3 4 /** 5 * \file congestion_control_common.c 6 * \brief Common code used by all congestion control algorithms. 7 */ 8 9 #define TOR_CONGESTION_CONTROL_COMMON_PRIVATE 10 #define TOR_CONGESTION_CONTROL_PRIVATE 11 12 #include "core/or/or.h" 13 14 #include "core/crypto/onion_crypto.h" 15 #include "core/or/circuitlist.h" 16 #include "core/or/crypt_path.h" 17 #include "core/or/or_circuit_st.h" 18 #include "core/or/origin_circuit_st.h" 19 #include "core/or/channel.h" 20 #include "core/mainloop/connection.h" 21 #include "core/or/sendme.h" 22 #include "core/or/congestion_control_st.h" 23 #include "core/or/congestion_control_common.h" 24 #include "core/or/congestion_control_vegas.h" 25 #include "core/or/congestion_control_st.h" 26 #include "core/or/conflux.h" 27 #include "core/or/conflux_util.h" 28 #include "core/or/trace_probes_cc.h" 29 #include "lib/time/compat_time.h" 30 #include "feature/nodelist/networkstatus.h" 31 #include "app/config/config.h" 32 33 #include "trunnel/congestion_control.h" 34 #include "trunnel/extension.h" 35 36 /* Consensus parameter defaults. 37 * 38 * More details for each of the parameters can be found in proposal 324, 39 * section 6.5 including tuning notes. */ 40 #define SENDME_INC_DFLT (TLS_RECORD_MAX_CELLS) 41 #define CIRCWINDOW_INIT (4*SENDME_INC_DFLT) 42 43 #define CC_ALG_DFLT (CC_ALG_VEGAS) 44 #define CC_ALG_DFLT_ALWAYS (CC_ALG_VEGAS) 45 46 #define CWND_INC_DFLT (1) 47 #define CWND_INC_PCT_SS_DFLT (100) 48 #define CWND_INC_RATE_DFLT (SENDME_INC_DFLT) 49 50 #define CWND_MIN_DFLT (CIRCWINDOW_INIT) 51 #define CWND_MAX_DFLT (INT32_MAX) 52 53 #define BWE_SENDME_MIN_DFLT (5) 54 55 #define N_EWMA_CWND_PCT_DFLT (50) 56 #define N_EWMA_MAX_DFLT (10) 57 #define N_EWMA_SS_DFLT (2) 58 59 #define RTT_RESET_PCT_DFLT (100) 60 61 /* BDP algorithms for each congestion control algorithms use the piecewise 62 * estimattor. See section 3.1.4 of proposal 324. */ 63 #define WESTWOOD_BDP_ALG BDP_ALG_PIECEWISE 64 #define VEGAS_BDP_MIX_ALG BDP_ALG_PIECEWISE 65 #define NOLA_BDP_ALG BDP_ALG_PIECEWISE 66 67 /* Indicate OR connection buffer limitations used to stop or start accepting 68 * cells in its outbuf. 69 * 70 * These watermarks are historical to tor in a sense that they've been used 71 * almost from the genesis point. And were likely defined to fit the bounds of 72 * TLS records of 16KB which would be around 32 cells. 73 * 74 * These are defaults of the consensus parameter "orconn_high" and "orconn_low" 75 * values. */ 76 #define OR_CONN_HIGHWATER_DFLT (32*1024) 77 #define OR_CONN_LOWWATER_DFLT (16*1024) 78 79 /* Low and high values of circuit cell queue sizes. They are used to tell when 80 * to start or stop reading on the streams attached on the circuit. 81 * 82 * These are defaults of the consensus parameters "cellq_high" and "cellq_low". 83 */ 84 #define CELL_QUEUE_LOW_DFLT (10) 85 #define CELL_QUEUE_HIGH_DFLT (256) 86 87 static bool congestion_control_update_circuit_bdp(congestion_control_t *, 88 const circuit_t *, 89 uint64_t); 90 /* Number of times the RTT value was reset. For MetricsPort. */ 91 static uint64_t num_rtt_reset; 92 93 /* Number of times the clock was stalled. For MetricsPort. */ 94 static uint64_t num_clock_stalls; 95 96 /* Consensus parameters cached. The non static ones are extern. */ 97 static uint32_t cwnd_max = CWND_MAX_DFLT; 98 int32_t cell_queue_high = CELL_QUEUE_HIGH_DFLT; 99 int32_t cell_queue_low = CELL_QUEUE_LOW_DFLT; 100 uint32_t or_conn_highwater = OR_CONN_HIGHWATER_DFLT; 101 uint32_t or_conn_lowwater = OR_CONN_LOWWATER_DFLT; 102 uint8_t cc_sendme_inc = SENDME_INC_DFLT; 103 STATIC cc_alg_t cc_alg = CC_ALG_DFLT; 104 105 /** 106 * Number of cwnd worth of sendme acks to smooth RTT and BDP with, 107 * using N_EWMA */ 108 static uint8_t n_ewma_cwnd_pct = N_EWMA_CWND_PCT_DFLT; 109 110 /** 111 * Maximum number N for the N-count EWMA averaging of RTT and BDP. 112 */ 113 static uint8_t n_ewma_max = N_EWMA_MAX_DFLT; 114 115 /** 116 * Maximum number N for the N-count EWMA averaging of RTT in Slow Start. 117 */ 118 static uint8_t n_ewma_ss = N_EWMA_SS_DFLT; 119 120 /** 121 * Minimum number of sendmes before we begin BDP estimates 122 */ 123 static uint8_t bwe_sendme_min = BWE_SENDME_MIN_DFLT; 124 125 /** 126 * Percentage of the current RTT to use when resetting the minimum RTT 127 * for a circuit. (RTT is reset when the cwnd hits cwnd_min). 128 */ 129 static uint8_t rtt_reset_pct = RTT_RESET_PCT_DFLT; 130 131 /** Metric to count the number of congestion control circuits **/ 132 uint64_t cc_stats_circs_created = 0; 133 134 /** Return the number of RTT reset that have been done. */ 135 uint64_t 136 congestion_control_get_num_rtt_reset(void) 137 { 138 return num_rtt_reset; 139 } 140 141 /** Return the number of clock stalls that have been done. */ 142 uint64_t 143 congestion_control_get_num_clock_stalls(void) 144 { 145 return num_clock_stalls; 146 } 147 148 /** 149 * Update global congestion control related consensus parameter values, 150 * every consensus update. 151 */ 152 void 153 congestion_control_new_consensus_params(const networkstatus_t *ns) 154 { 155 #define CELL_QUEUE_HIGH_MIN (1) 156 #define CELL_QUEUE_HIGH_MAX (1000) 157 cell_queue_high = networkstatus_get_param(ns, "cellq_high", 158 CELL_QUEUE_HIGH_DFLT, 159 CELL_QUEUE_HIGH_MIN, 160 CELL_QUEUE_HIGH_MAX); 161 162 #define CELL_QUEUE_LOW_MIN (1) 163 #define CELL_QUEUE_LOW_MAX (1000) 164 cell_queue_low = networkstatus_get_param(ns, "cellq_low", 165 CELL_QUEUE_LOW_DFLT, 166 CELL_QUEUE_LOW_MIN, 167 CELL_QUEUE_LOW_MAX); 168 169 #define OR_CONN_HIGHWATER_MIN (CELL_PAYLOAD_SIZE) 170 #define OR_CONN_HIGHWATER_MAX (INT32_MAX) 171 or_conn_highwater = 172 networkstatus_get_param(ns, "orconn_high", 173 OR_CONN_HIGHWATER_DFLT, 174 OR_CONN_HIGHWATER_MIN, 175 OR_CONN_HIGHWATER_MAX); 176 177 #define OR_CONN_LOWWATER_MIN (CELL_PAYLOAD_SIZE) 178 #define OR_CONN_LOWWATER_MAX (INT32_MAX) 179 or_conn_lowwater = 180 networkstatus_get_param(ns, "orconn_low", 181 OR_CONN_LOWWATER_DFLT, 182 OR_CONN_LOWWATER_MIN, 183 OR_CONN_LOWWATER_MAX); 184 185 #define CWND_MAX_MIN 500 186 #define CWND_MAX_MAX (INT32_MAX) 187 cwnd_max = 188 networkstatus_get_param(NULL, "cc_cwnd_max", 189 CWND_MAX_DFLT, 190 CWND_MAX_MIN, 191 CWND_MAX_MAX); 192 193 #define RTT_RESET_PCT_MIN (0) 194 #define RTT_RESET_PCT_MAX (100) 195 rtt_reset_pct = 196 networkstatus_get_param(NULL, "cc_rtt_reset_pct", 197 RTT_RESET_PCT_DFLT, 198 RTT_RESET_PCT_MIN, 199 RTT_RESET_PCT_MAX); 200 201 #define SENDME_INC_MIN 1 202 #define SENDME_INC_MAX (254) 203 cc_sendme_inc = 204 networkstatus_get_param(NULL, "cc_sendme_inc", 205 SENDME_INC_DFLT, 206 SENDME_INC_MIN, 207 SENDME_INC_MAX); 208 209 #define CC_ALG_MIN 0 210 #define CC_ALG_MAX (NUM_CC_ALGS-1) 211 cc_alg = 212 networkstatus_get_param(NULL, "cc_alg", 213 CC_ALG_DFLT, 214 CC_ALG_MIN, 215 CC_ALG_MAX); 216 if (cc_alg != CC_ALG_SENDME && cc_alg != CC_ALG_VEGAS) { 217 // Does not need rate limiting because consensus updates 218 // are at most 1x/hour 219 log_warn(LD_BUG, "Unsupported congestion control algorithm %d", 220 cc_alg); 221 cc_alg = CC_ALG_DFLT; 222 } 223 224 #define BWE_SENDME_MIN_MIN 2 225 #define BWE_SENDME_MIN_MAX (20) 226 bwe_sendme_min = 227 networkstatus_get_param(NULL, "cc_bwe_min", 228 BWE_SENDME_MIN_DFLT, 229 BWE_SENDME_MIN_MIN, 230 BWE_SENDME_MIN_MAX); 231 232 #define N_EWMA_CWND_PCT_MIN 1 233 #define N_EWMA_CWND_PCT_MAX (255) 234 n_ewma_cwnd_pct = 235 networkstatus_get_param(NULL, "cc_ewma_cwnd_pct", 236 N_EWMA_CWND_PCT_DFLT, 237 N_EWMA_CWND_PCT_MIN, 238 N_EWMA_CWND_PCT_MAX); 239 240 #define N_EWMA_MAX_MIN 2 241 #define N_EWMA_MAX_MAX (INT32_MAX) 242 n_ewma_max = 243 networkstatus_get_param(NULL, "cc_ewma_max", 244 N_EWMA_MAX_DFLT, 245 N_EWMA_MAX_MIN, 246 N_EWMA_MAX_MAX); 247 248 #define N_EWMA_SS_MIN 2 249 #define N_EWMA_SS_MAX (INT32_MAX) 250 n_ewma_ss = 251 networkstatus_get_param(NULL, "cc_ewma_ss", 252 N_EWMA_SS_DFLT, 253 N_EWMA_SS_MIN, 254 N_EWMA_SS_MAX); 255 } 256 257 /** 258 * Set congestion control parameters on a circuit's congestion 259 * control object based on values from the consensus. 260 * 261 * cc_alg is the negotiated congestion control algorithm. 262 * 263 * sendme_inc is the number of packaged cells that a sendme cell 264 * acks. This parameter will come from circuit negotiation. 265 */ 266 static void 267 congestion_control_init_params(congestion_control_t *cc, 268 const circuit_params_t *params, 269 cc_path_t path) 270 { 271 const or_options_t *opts = get_options(); 272 cc->sendme_inc = params->sendme_inc_cells; 273 274 #define CWND_INIT_MIN SENDME_INC_DFLT 275 #define CWND_INIT_MAX (10000) 276 cc->cwnd = 277 networkstatus_get_param(NULL, "cc_cwnd_init", 278 CIRCWINDOW_INIT, 279 CWND_INIT_MIN, 280 CWND_INIT_MAX); 281 282 #define CWND_INC_PCT_SS_MIN 1 283 #define CWND_INC_PCT_SS_MAX (500) 284 cc->cwnd_inc_pct_ss = 285 networkstatus_get_param(NULL, "cc_cwnd_inc_pct_ss", 286 CWND_INC_PCT_SS_DFLT, 287 CWND_INC_PCT_SS_MIN, 288 CWND_INC_PCT_SS_MAX); 289 290 #define CWND_INC_MIN 1 291 #define CWND_INC_MAX (1000) 292 cc->cwnd_inc = 293 networkstatus_get_param(NULL, "cc_cwnd_inc", 294 CWND_INC_DFLT, 295 CWND_INC_MIN, 296 CWND_INC_MAX); 297 298 #define CWND_INC_RATE_MIN 1 299 #define CWND_INC_RATE_MAX (250) 300 cc->cwnd_inc_rate = 301 networkstatus_get_param(NULL, "cc_cwnd_inc_rate", 302 CWND_INC_RATE_DFLT, 303 CWND_INC_RATE_MIN, 304 CWND_INC_RATE_MAX); 305 306 #define CWND_MIN_MIN SENDME_INC_DFLT 307 #define CWND_MIN_MAX (1000) 308 cc->cwnd_min = 309 networkstatus_get_param(NULL, "cc_cwnd_min", 310 CWND_MIN_DFLT, 311 CWND_MIN_MIN, 312 CWND_MIN_MAX); 313 314 /* If the consensus says to use OG sendme, but torrc has 315 * always-enabled, use the default "always" alg (vegas), 316 * else use cached conensus alg. */ 317 if (cc_alg == CC_ALG_SENDME && opts->AlwaysCongestionControl) { 318 cc->cc_alg = CC_ALG_DFLT_ALWAYS; 319 } else { 320 cc->cc_alg = cc_alg; 321 } 322 323 /* Algorithm-specific parameters */ 324 if (cc->cc_alg == CC_ALG_VEGAS) { 325 congestion_control_vegas_set_params(cc, path); 326 } else { 327 // This should not happen anymore 328 log_warn(LD_BUG, "Unknown congestion control algorithm %d", 329 cc->cc_alg); 330 } 331 } 332 333 /** Returns true if congestion control is enabled in the most recent 334 * consensus, or if __AlwaysCongestionControl is set to true. 335 * 336 * Note that this function (and many many other functions) should not 337 * be called from the CPU worker threads when handling congestion 338 * control negotiation. Relevant values are marshaled into the 339 * `circuit_params_t` struct, in order to be used in worker threads 340 * without touching global state. Use those values in CPU worker 341 * threads, instead of calling this function. 342 * 343 * The danger is still present, in your time, as it was in ours. 344 */ 345 bool 346 congestion_control_enabled(void) 347 { 348 const or_options_t *opts = NULL; 349 350 tor_assert_nonfatal_once(in_main_thread()); 351 352 opts = get_options(); 353 354 /* If the user has set "__AlwaysCongesttionControl", 355 * then always try to negotiate congestion control, regardless 356 * of consensus param. This is to be used for testing and sbws. 357 * 358 * Note that we do *not* allow disabling congestion control 359 * if the consensus says to use it, as this is bad for queueing 360 * and fairness. */ 361 if (opts->AlwaysCongestionControl) 362 return 1; 363 364 return cc_alg != CC_ALG_SENDME; 365 } 366 367 #ifdef TOR_UNIT_TESTS 368 /** 369 * For unit tests only: set the cached consensus cc alg to 370 * specified value. 371 */ 372 void 373 congestion_control_set_cc_enabled(void) 374 { 375 cc_alg = CC_ALG_VEGAS; 376 } 377 378 /** 379 * For unit tests only: set the cached consensus cc alg to 380 * specified value. 381 */ 382 void 383 congestion_control_set_cc_disabled(void) 384 { 385 cc_alg = CC_ALG_SENDME; 386 } 387 #endif 388 389 /** 390 * Allocate and initialize fields in congestion control object. 391 * 392 * cc_alg is the negotiated congestion control algorithm. 393 * 394 * sendme_inc is the number of packaged cells that a sendme cell 395 * acks. This parameter will come from circuit negotiation. 396 */ 397 static void 398 congestion_control_init(congestion_control_t *cc, 399 const circuit_params_t *params, 400 cc_path_t path) 401 { 402 cc->sendme_pending_timestamps = smartlist_new(); 403 404 cc->in_slow_start = 1; 405 congestion_control_init_params(cc, params, path); 406 407 cc->next_cc_event = CWND_UPDATE_RATE(cc); 408 } 409 410 /** Allocate and initialize a new congestion control object */ 411 congestion_control_t * 412 congestion_control_new(const circuit_params_t *params, cc_path_t path) 413 { 414 congestion_control_t *cc = tor_malloc_zero(sizeof(congestion_control_t)); 415 416 congestion_control_init(cc, params, path); 417 418 cc_stats_circs_created++; 419 420 return cc; 421 } 422 423 /** 424 * Free a congestion control object and its associated state. 425 */ 426 void 427 congestion_control_free_(congestion_control_t *cc) 428 { 429 if (!cc) 430 return; 431 432 SMARTLIST_FOREACH(cc->sendme_pending_timestamps, uint64_t *, t, tor_free(t)); 433 smartlist_free(cc->sendme_pending_timestamps); 434 435 tor_free(cc); 436 } 437 438 /** 439 * Enqueue a u64 timestamp to the end of a queue of timestamps. 440 */ 441 STATIC inline void 442 enqueue_timestamp(smartlist_t *timestamps_u64, uint64_t timestamp_usec) 443 { 444 uint64_t *timestamp_ptr = tor_malloc(sizeof(uint64_t)); 445 *timestamp_ptr = timestamp_usec; 446 447 smartlist_add(timestamps_u64, timestamp_ptr); 448 } 449 450 /** 451 * Dequeue a u64 monotime usec timestamp from the front of a 452 * smartlist of pointers to 64. 453 */ 454 static inline uint64_t 455 dequeue_timestamp(smartlist_t *timestamps_u64_usecs) 456 { 457 uint64_t *timestamp_ptr = smartlist_get(timestamps_u64_usecs, 0); 458 uint64_t timestamp_u64; 459 460 if (BUG(!timestamp_ptr)) { 461 log_err(LD_CIRC, "Congestion control timestamp list became empty!"); 462 return 0; 463 } 464 465 timestamp_u64 = *timestamp_ptr; 466 smartlist_del_keeporder(timestamps_u64_usecs, 0); 467 tor_free(timestamp_ptr); 468 469 return timestamp_u64; 470 } 471 472 /** 473 * Returns the number N of N-count EWMA, for averaging RTT and BDP over 474 * N SENDME acks. 475 * 476 * This N is bracketed between a divisor of the number of acks in a CWND 477 * and a max value. It is always at least 2. 478 */ 479 static inline uint64_t 480 n_ewma_count(const congestion_control_t *cc) 481 { 482 uint64_t ewma_cnt = 0; 483 484 if (cc->in_slow_start) { 485 /* In slow-start, we check the Vegas condition every sendme, 486 * so much lower ewma counts are needed. */ 487 ewma_cnt = n_ewma_ss; 488 } else { 489 /* After slow-start, we check the Vegas condition only once per 490 * CWND, so it is better to average over longer periods. */ 491 ewma_cnt = MIN(CWND_UPDATE_RATE(cc)*n_ewma_cwnd_pct/100, 492 n_ewma_max); 493 } 494 ewma_cnt = MAX(ewma_cnt, 2); 495 return ewma_cnt; 496 } 497 498 /** 499 * Get a package window from either old sendme logic, or congestion control. 500 * 501 * A package window is how many cells you can still send. 502 */ 503 int 504 congestion_control_get_package_window(const circuit_t *circ, 505 const crypt_path_t *cpath) 506 { 507 int package_window; 508 congestion_control_t *cc; 509 510 tor_assert(circ); 511 512 if (cpath) { 513 package_window = cpath->package_window; 514 cc = cpath->ccontrol; 515 } else { 516 package_window = circ->package_window; 517 cc = circ->ccontrol; 518 } 519 520 if (!cc) { 521 return package_window; 522 } else { 523 /* Inflight can be above cwnd if cwnd was just reduced */ 524 if (cc->inflight > cc->cwnd) 525 return 0; 526 /* In the extremely unlikely event that cwnd-inflight is larger than 527 * INT32_MAX, just return that cap, so old code doesn't explode. */ 528 else if (cc->cwnd - cc->inflight > INT32_MAX) 529 return INT32_MAX; 530 else 531 return (int)(cc->cwnd - cc->inflight); 532 } 533 } 534 535 /** 536 * Returns the number of cells that are acked by every sendme. 537 */ 538 int 539 sendme_get_inc_count(const circuit_t *circ, const crypt_path_t *layer_hint) 540 { 541 int sendme_inc = CIRCWINDOW_INCREMENT; 542 congestion_control_t *cc = NULL; 543 544 if (layer_hint) { 545 cc = layer_hint->ccontrol; 546 } else { 547 cc = circ->ccontrol; 548 } 549 550 if (cc) { 551 sendme_inc = cc->sendme_inc; 552 } 553 554 return sendme_inc; 555 } 556 557 /** Return true iff the next cell we send will result in the other endpoint 558 * sending a SENDME. 559 * 560 * We are able to know that because the package or inflight window value minus 561 * one cell (the possible SENDME cell) should be a multiple of the 562 * cells-per-sendme increment value (set via consensus parameter, negotiated 563 * for the circuit, and passed in as sendme_inc). 564 * 565 * This function is used when recording a cell digest and this is done quite 566 * low in the stack when decrypting or encrypting a cell. The window is only 567 * updated once the cell is actually put in the outbuf. 568 */ 569 bool 570 circuit_sent_cell_for_sendme(const circuit_t *circ, 571 const crypt_path_t *layer_hint) 572 { 573 congestion_control_t *cc; 574 int window; 575 576 tor_assert(circ); 577 578 if (layer_hint) { 579 window = layer_hint->package_window; 580 cc = layer_hint->ccontrol; 581 } else { 582 window = circ->package_window; 583 cc = circ->ccontrol; 584 } 585 586 /* If we are using congestion control and the alg is not 587 * old-school 'fixed', then use cc->inflight to determine 588 * when sendmes will be sent */ 589 if (cc) { 590 if (!cc->inflight) 591 return false; 592 593 /* This check must be +1 because this function is called *before* 594 * inflight is incremented for the sent cell */ 595 if ((cc->inflight+1) % cc->sendme_inc != 0) 596 return false; 597 598 return true; 599 } 600 601 /* At the start of the window, no SENDME will be expected. */ 602 if (window == CIRCWINDOW_START) { 603 return false; 604 } 605 606 /* Are we at the limit of the increment and if not, we don't expect next 607 * cell is a SENDME. 608 * 609 * We test against the window minus 1 because when we are looking if the 610 * next cell is a SENDME, the window (either package or deliver) hasn't been 611 * decremented just yet so when this is called, we are currently processing 612 * the "window - 1" cell. 613 */ 614 if (((window - 1) % CIRCWINDOW_INCREMENT) != 0) { 615 return false; 616 } 617 618 /* Next cell is expected to be a SENDME. */ 619 return true; 620 } 621 622 /** 623 * Call-in to tell congestion control code that this circuit sent a cell. 624 * 625 * This updates the 'inflight' counter, and if this is a cell that will 626 * cause the other end to send a SENDME, record the current time in a list 627 * of pending timestamps, so that we can later compute the circuit RTT when 628 * the SENDME comes back. */ 629 void 630 congestion_control_note_cell_sent(congestion_control_t *cc, 631 const circuit_t *circ, 632 const crypt_path_t *cpath) 633 { 634 tor_assert(circ); 635 tor_assert(cc); 636 637 /* Is this the last cell before a SENDME? The idea is that if the 638 * package_window reaches a multiple of the increment, after this cell, we 639 * should expect a SENDME. Note that this function must be called *before* 640 * we account for the sent cell. */ 641 if (!circuit_sent_cell_for_sendme(circ, cpath)) { 642 cc->inflight++; 643 return; 644 } 645 646 cc->inflight++; 647 648 /* Record this cell time for RTT computation when SENDME arrives */ 649 enqueue_timestamp(cc->sendme_pending_timestamps, 650 monotime_absolute_usec()); 651 } 652 653 /** 654 * Upon receipt of a SENDME, pop the oldest timestamp off the timestamp 655 * list, and use this to update RTT. 656 * 657 * Returns true if circuit estimates were successfully updated, false 658 * otherwise. 659 */ 660 bool 661 congestion_control_update_circuit_estimates(congestion_control_t *cc, 662 const circuit_t *circ) 663 { 664 uint64_t now_usec = monotime_absolute_usec(); 665 666 /* Update RTT first, then BDP. BDP needs fresh RTT */ 667 uint64_t curr_rtt_usec = congestion_control_update_circuit_rtt(cc, now_usec); 668 return congestion_control_update_circuit_bdp(cc, circ, curr_rtt_usec); 669 } 670 671 /** 672 * Returns true if we have enough time data to use heuristics 673 * to compare RTT to a baseline. 674 */ 675 static bool 676 time_delta_should_use_heuristics(const congestion_control_t *cc) 677 { 678 /* If we have exited slow start and also have an EWMA RTT, we 679 * should have processed at least a cwnd worth of RTTs */ 680 if (!cc->in_slow_start && cc->ewma_rtt_usec) { 681 return true; 682 } 683 684 /* Not enough data to estimate clock jumps */ 685 return false; 686 } 687 688 STATIC bool is_monotime_clock_broken = false; 689 690 /** 691 * Returns true if the monotime delta is 0, or is significantly 692 * different than the previous delta. Either case indicates 693 * that the monotime time source stalled or jumped. 694 * 695 * Also caches the clock state in the is_monotime_clock_broken flag, 696 * so we can also provide a is_monotime_clock_reliable() function, 697 * used by flow control rate timing. 698 */ 699 STATIC bool 700 time_delta_stalled_or_jumped(const congestion_control_t *cc, 701 uint64_t old_delta, uint64_t new_delta) 702 { 703 #define DELTA_DISCREPENCY_RATIO_MAX 5000 704 /* If we have a 0 new_delta, that is definitely a monotime stall */ 705 if (new_delta == 0) { 706 static ratelim_t stall_info_limit = RATELIM_INIT(60); 707 log_fn_ratelim(&stall_info_limit, LOG_INFO, LD_CIRC, 708 "Congestion control cannot measure RTT due to monotime stall."); 709 710 is_monotime_clock_broken = true; 711 return true; 712 } 713 714 /* 715 * For the heuristic cases, we need at least a few timestamps, 716 * to average out any previous partial stalls or jumps. So until 717 * that point, let's just assume its OK. 718 */ 719 if (!time_delta_should_use_heuristics(cc)) { 720 return false; 721 } 722 723 /* If old_delta is significantly larger than new_delta, then 724 * this means that the monotime clock could have recently 725 * stopped moving forward. However, use the cache for this 726 * value, because it may also be caused by network activity, 727 * or by a previous clock jump that was not detected. 728 * 729 * So if we have not gotten a 0-delta recently, we will 730 * still allow this new low RTT, but just yell about it. */ 731 if (old_delta > new_delta * DELTA_DISCREPENCY_RATIO_MAX) { 732 static ratelim_t dec_notice_limit = RATELIM_INIT(300); 733 log_fn_ratelim(&dec_notice_limit, LOG_NOTICE, LD_CIRC, 734 "Sudden decrease in circuit RTT (%"PRIu64" vs %"PRIu64 735 "), likely due to clock jump.", 736 new_delta/1000, old_delta/1000); 737 738 return is_monotime_clock_broken; 739 } 740 741 /* If new_delta is significantly larger than old_delta, then 742 * this means that the monotime clock suddenly jumped forward. 743 * However, do not cache this value, because it may also be caused 744 * by network activity. 745 */ 746 if (new_delta > old_delta * DELTA_DISCREPENCY_RATIO_MAX) { 747 static ratelim_t dec_notice_limit = RATELIM_INIT(300); 748 log_fn_ratelim(&dec_notice_limit, LOG_PROTOCOL_WARN, LD_CIRC, 749 "Sudden increase in circuit RTT (%"PRIu64" vs %"PRIu64 750 "), likely due to clock jump or suspended remote endpoint.", 751 new_delta/1000, old_delta/1000); 752 753 return true; 754 } 755 756 /* All good! Update cached status, too */ 757 is_monotime_clock_broken = false; 758 759 return false; 760 } 761 762 /** 763 * Is the monotime clock stalled according to any circuits? 764 */ 765 bool 766 is_monotime_clock_reliable(void) 767 { 768 return !is_monotime_clock_broken; 769 } 770 771 /** 772 * Called when we get a SENDME. Updates circuit RTT by pulling off a 773 * timestamp of when we sent the CIRCWINDOW_INCREMENT-th cell from 774 * the queue of such timestamps, and comparing that to current time. 775 * 776 * Also updates min, max, and EWMA of RTT. 777 * 778 * Returns the current circuit RTT in usecs, or 0 if it could not be 779 * measured (due to clock jump, stall, etc). 780 */ 781 STATIC uint64_t 782 congestion_control_update_circuit_rtt(congestion_control_t *cc, 783 uint64_t now_usec) 784 { 785 uint64_t rtt, ewma_cnt; 786 uint64_t sent_at_timestamp; 787 788 tor_assert(cc); 789 790 /* Get the time that we sent the cell that resulted in the other 791 * end sending this sendme. Use this to calculate RTT */ 792 sent_at_timestamp = dequeue_timestamp(cc->sendme_pending_timestamps); 793 794 rtt = now_usec - sent_at_timestamp; 795 796 /* Do not update RTT at all if it looks fishy */ 797 if (time_delta_stalled_or_jumped(cc, cc->ewma_rtt_usec, rtt)) { 798 num_clock_stalls++; /* Accounting */ 799 return 0; 800 } 801 802 ewma_cnt = n_ewma_count(cc); 803 804 cc->ewma_rtt_usec = n_count_ewma(rtt, cc->ewma_rtt_usec, ewma_cnt); 805 806 if (rtt > cc->max_rtt_usec) { 807 cc->max_rtt_usec = rtt; 808 } 809 810 if (cc->min_rtt_usec == 0) { 811 // If we do not have a min_rtt yet, use current ewma 812 cc->min_rtt_usec = cc->ewma_rtt_usec; 813 } else if (cc->cwnd == cc->cwnd_min && !cc->in_slow_start) { 814 // Raise min rtt if cwnd hit cwnd_min. This gets us out of a wedge state 815 // if we hit cwnd_min due to an abnormally low rtt. 816 uint64_t new_rtt = percent_max_mix(cc->ewma_rtt_usec, cc->min_rtt_usec, 817 rtt_reset_pct); 818 819 static ratelim_t rtt_notice_limit = RATELIM_INIT(300); 820 log_fn_ratelim(&rtt_notice_limit, LOG_NOTICE, LD_CIRC, 821 "Resetting circ RTT from %"PRIu64" to %"PRIu64" due to low cwnd", 822 cc->min_rtt_usec/1000, new_rtt/1000); 823 824 cc->min_rtt_usec = new_rtt; 825 num_rtt_reset++; /* Accounting */ 826 } else if (cc->ewma_rtt_usec < cc->min_rtt_usec) { 827 // Using the EWMA for min instead of current RTT helps average out 828 // effects from other conns 829 cc->min_rtt_usec = cc->ewma_rtt_usec; 830 } 831 832 return rtt; 833 } 834 835 /** 836 * Called when we get a SENDME. Updates the bandwidth-delay-product (BDP) 837 * estimates of a circuit. Several methods of computing BDP are used, 838 * depending on scenario. While some congestion control algorithms only 839 * use one of these methods, we update them all because it's quick and easy. 840 * 841 * - now_usec is the current monotime in usecs. 842 * - curr_rtt_usec is the current circuit RTT in usecs. It may be 0 if no 843 * RTT could bemeasured. 844 * 845 * Returns true if we were able to update BDP, false otherwise. 846 */ 847 static bool 848 congestion_control_update_circuit_bdp(congestion_control_t *cc, 849 const circuit_t *circ, 850 uint64_t curr_rtt_usec) 851 { 852 int chan_q = 0; 853 unsigned int blocked_on_chan = 0; 854 855 tor_assert(cc); 856 857 if (CIRCUIT_IS_ORIGIN(circ)) { 858 /* origin circs use n_chan */ 859 chan_q = circ->n_chan_cells.n; 860 blocked_on_chan = circ->circuit_blocked_on_n_chan; 861 } else { 862 /* Both onion services and exits use or_circuit and p_chan */ 863 chan_q = CONST_TO_OR_CIRCUIT(circ)->p_chan_cells.n; 864 blocked_on_chan = circ->circuit_blocked_on_p_chan; 865 } 866 867 /* If we have no EWMA RTT, it is because monotime has been stalled 868 * or messed up the entire time so far. Set our BDP estimates directly 869 * to current cwnd */ 870 if (!cc->ewma_rtt_usec) { 871 uint64_t cwnd = cc->cwnd; 872 873 tor_assert_nonfatal(cc->cwnd <= cwnd_max); 874 875 /* If the channel is blocked, keep subtracting off the chan_q 876 * until we hit the min cwnd. */ 877 if (blocked_on_chan) { 878 /* Cast is fine because we're less than int32 */ 879 if (chan_q >= (int64_t)cwnd) { 880 log_notice(LD_CIRC, 881 "Clock stall with large chanq: %d %"PRIu64, chan_q, cwnd); 882 cwnd = cc->cwnd_min; 883 } else { 884 cwnd = MAX(cwnd - chan_q, cc->cwnd_min); 885 } 886 cc->blocked_chan = 1; 887 } else { 888 cc->blocked_chan = 0; 889 } 890 891 cc->bdp = cwnd; 892 893 static ratelim_t dec_notice_limit = RATELIM_INIT(300); 894 log_fn_ratelim(&dec_notice_limit, LOG_NOTICE, LD_CIRC, 895 "Our clock has been stalled for the entire lifetime of a circuit. " 896 "Performance may be sub-optimal."); 897 898 return blocked_on_chan; 899 } 900 901 /* Congestion window based BDP will respond to changes in RTT only, and is 902 * relative to cwnd growth. It is useful for correcting for BDP 903 * overestimation, but if BDP is higher than the current cwnd, it will 904 * underestimate it. 905 * 906 * We multiply here first to avoid precision issues from min_RTT being 907 * close to ewma RTT. Since all fields are u64, there is plenty of 908 * room here to multiply first. 909 */ 910 cc->bdp = cc->cwnd*cc->min_rtt_usec/cc->ewma_rtt_usec; 911 912 /* The orconn is blocked; use smaller of inflight vs SENDME */ 913 if (blocked_on_chan) { 914 log_info(LD_CIRC, "CC: Streams blocked on circ channel. Chanq: %d", 915 chan_q); 916 917 /* A blocked channel is an immediate congestion signal, but it still 918 * happens only once per cwnd */ 919 if (!cc->blocked_chan) { 920 cc->next_cc_event = 0; 921 cc->blocked_chan = 1; 922 } 923 } else { 924 /* If we were previously blocked, emit a new congestion event 925 * now that we are unblocked, to re-evaluate cwnd */ 926 if (cc->blocked_chan) { 927 cc->blocked_chan = 0; 928 cc->next_cc_event = 0; 929 log_info(LD_CIRC, "CC: Streams un-blocked on circ channel. Chanq: %d", 930 chan_q); 931 } 932 } 933 934 if (cc->next_cc_event == 0) { 935 if (CIRCUIT_IS_ORIGIN(circ)) { 936 log_info(LD_CIRC, 937 "CC: Circuit %d " 938 "SENDME RTT: %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", " 939 "BDP estimate: %"PRIu64, 940 CONST_TO_ORIGIN_CIRCUIT(circ)->global_identifier, 941 cc->min_rtt_usec/1000, 942 curr_rtt_usec/1000, 943 cc->ewma_rtt_usec/1000, 944 cc->max_rtt_usec/1000, 945 cc->bdp); 946 } else { 947 log_info(LD_CIRC, 948 "CC: Circuit %"PRIu64":%d " 949 "SENDME RTT: %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", " 950 "%"PRIu64, 951 CONST_TO_OR_CIRCUIT(circ)->p_chan->global_identifier, 952 CONST_TO_OR_CIRCUIT(circ)->p_circ_id, 953 cc->min_rtt_usec/1000, 954 curr_rtt_usec/1000, 955 cc->ewma_rtt_usec/1000, 956 cc->max_rtt_usec/1000, 957 cc->bdp); 958 } 959 } 960 961 /* We updated BDP this round if either we had a blocked channel, or 962 * the curr_rtt_usec was not 0. */ 963 bool ret = (blocked_on_chan || curr_rtt_usec != 0); 964 if (ret) { 965 tor_trace(TR_SUBSYS(cc), TR_EV(bdp_update), circ, cc, curr_rtt_usec); 966 } 967 return ret; 968 } 969 970 /** 971 * Dispatch the sendme to the appropriate congestion control algorithm. 972 */ 973 int 974 congestion_control_dispatch_cc_alg(congestion_control_t *cc, 975 circuit_t *circ) 976 { 977 int ret = -END_CIRC_REASON_INTERNAL; 978 979 tor_assert_nonfatal_once(cc->cc_alg == CC_ALG_VEGAS); 980 ret = congestion_control_vegas_process_sendme(cc, circ); 981 982 if (cc->cwnd > cwnd_max) { 983 static ratelim_t cwnd_limit = RATELIM_INIT(60); 984 log_fn_ratelim(&cwnd_limit, LOG_NOTICE, LD_CIRC, 985 "Congestion control cwnd %"PRIu64" exceeds max %d, clamping.", 986 cc->cwnd, cwnd_max); 987 cc->cwnd = cwnd_max; 988 } 989 990 /* If we have a non-zero RTT measurement, update conflux. */ 991 if (circ->conflux && cc->ewma_rtt_usec) 992 conflux_update_rtt(circ->conflux, circ, cc->ewma_rtt_usec); 993 994 return ret; 995 } 996 997 /** 998 * Build an extension field request to negotiate congestion control. 999 * 1000 * If congestion control is enabled, field TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST 1001 * added to ext. It is a single 0-length field that signifies that we 1002 * want to use congestion control. 1003 * 1004 * If congestion control is not enabled, no extension is added. 1005 * 1006 * If there is a failure building the request, -1 is returned, else 0. 1007 */ 1008 int 1009 congestion_control_build_ext_request(trn_extension_t *ext) 1010 { 1011 trn_extension_field_t *field = NULL; 1012 1013 /* With congestion control enabled, add the request, else it is an empty 1014 * request in the payload. */ 1015 1016 if (congestion_control_enabled()) { 1017 /* Build the extension field that will hold the CC field. */ 1018 field = trn_extension_field_new(); 1019 trn_extension_field_set_field_type(field, 1020 TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST); 1021 1022 /* No payload indicating a request to use congestion control. */ 1023 trn_extension_field_set_field_len(field, 0); 1024 1025 /* Build final extension. */ 1026 trn_extension_add_fields(ext, field); 1027 } 1028 1029 return 0; 1030 } 1031 1032 /** 1033 * Parse a congestion control ntorv3 request payload for extensions. 1034 * 1035 * On parsing failure, -1 is returned. 1036 * 1037 * If congestion control request is present, return 1. If it is not present, 1038 * return 0. 1039 * 1040 * WARNING: Called from CPU worker! Must not access any global state. 1041 */ 1042 int 1043 congestion_control_parse_ext_request(const trn_extension_t *ext) 1044 { 1045 ssize_t ret = 0; 1046 1047 if (trn_extension_find(ext, TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST) == NULL) { 1048 /* No extension implies no support for congestion control. In this case, we 1049 * simply return 0 to indicate CC is disabled. */ 1050 ret = 0; 1051 } else { 1052 /* For congestion control to be enabled, we only need the field type. */ 1053 ret = 1; 1054 } 1055 1056 return (int)ret; 1057 } 1058 1059 /** 1060 * Given our observed parameters for circuits and congestion control, 1061 * as well as the parameters for the resulting circuit, build a response 1062 * payload using extension fields into *msg_out, with length specified in 1063 * *msg_out_len. 1064 * 1065 * If congestion control will be enabled, the extension field for 1066 * TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE will contain the sendme_inc value. 1067 * 1068 * If congestion control won't be enabled, an extension payload with 0 1069 * fields will be created. 1070 * 1071 * Return 0 if an extension payload was created in *msg_out, and -1 on 1072 * error. 1073 * 1074 * *msg_out must be freed if the return value is 0. 1075 * 1076 * WARNING: Called from CPU worker! Must not access any global state. 1077 */ 1078 int 1079 congestion_control_build_ext_response(const circuit_params_t *our_params, 1080 const circuit_params_t *circ_params, 1081 uint8_t **msg_out, size_t *msg_len_out) 1082 { 1083 ssize_t ret; 1084 uint8_t *request = NULL; 1085 trn_extension_t *ext = NULL; 1086 trn_extension_field_t *field = NULL; 1087 trn_extension_field_cc_t *cc_field = NULL; 1088 1089 tor_assert(our_params); 1090 tor_assert(circ_params); 1091 tor_assert(msg_out); 1092 tor_assert(msg_len_out); 1093 1094 ext = trn_extension_new(); 1095 1096 if (circ_params->cc_enabled) { 1097 /* Build the extension field that will hold the CC field. */ 1098 field = trn_extension_field_new(); 1099 trn_extension_field_set_field_type(field, 1100 TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE); 1101 1102 /* Build the congestion control field response. */ 1103 cc_field = trn_extension_field_cc_new(); 1104 trn_extension_field_cc_set_sendme_inc(cc_field, 1105 our_params->sendme_inc_cells); 1106 1107 ret = trn_extension_field_cc_encoded_len(cc_field); 1108 if (BUG(ret <= 0)) { 1109 trn_extension_field_free(field); 1110 goto err; 1111 } 1112 size_t field_len = ret; 1113 trn_extension_field_set_field_len(field, field_len); 1114 trn_extension_field_setlen_field(field, field_len); 1115 1116 uint8_t *field_array = trn_extension_field_getarray_field(field); 1117 ret = trn_extension_field_cc_encode(field_array, 1118 trn_extension_field_getlen_field(field), cc_field); 1119 if (BUG(ret <= 0)) { 1120 trn_extension_field_free(field); 1121 goto err; 1122 } 1123 1124 /* Build final extension. */ 1125 trn_extension_add_fields(ext, field); 1126 trn_extension_set_num(ext, 1); 1127 } 1128 1129 /* Encode extension. */ 1130 ret = trn_extension_encoded_len(ext); 1131 if (BUG(ret < 0)) { 1132 goto err; 1133 } 1134 size_t request_len = ret; 1135 request = tor_malloc_zero(request_len); 1136 ret = trn_extension_encode(request, request_len, ext); 1137 if (BUG(ret < 0)) { 1138 tor_free(request); 1139 goto err; 1140 } 1141 *msg_out = request; 1142 *msg_len_out = request_len; 1143 1144 /* We've just encoded the extension, clean everything. */ 1145 ret = 0; 1146 1147 err: 1148 trn_extension_free(ext); 1149 trn_extension_field_cc_free(cc_field); 1150 return (int)ret; 1151 } 1152 1153 /** Return true iff the given sendme increment is within the acceptable 1154 * margins. */ 1155 bool 1156 congestion_control_validate_sendme_increment(uint8_t sendme_inc) 1157 { 1158 /* We will only accept this response (and this circuit) if sendme_inc 1159 * is within +/- 1 of the current consensus value. We should not need 1160 * to change cc_sendme_inc much, and if we do, we can spread out those 1161 * changes over smaller increments once every 4 hours. Exits that 1162 * violate this range should just not be used. */ 1163 1164 if (sendme_inc == 0) 1165 return false; 1166 1167 if (sendme_inc > (congestion_control_sendme_inc() + 1) || 1168 sendme_inc < (congestion_control_sendme_inc() - 1)) { 1169 return false; 1170 } 1171 return true; 1172 } 1173 1174 /** Return 1 if CC is enabled which also will set the SENDME increment into our 1175 * params_out. Return 0 if CC is disabled. Else, return -1 on error. */ 1176 int 1177 congestion_control_parse_ext_response(const trn_extension_t *ext, 1178 circuit_params_t *params_out) 1179 { 1180 ssize_t ret = 0; 1181 const trn_extension_field_t *field = NULL; 1182 trn_extension_field_cc_t *cc_field = NULL; 1183 1184 /* We will only accept this response (and this circuit) if sendme_inc 1185 * is within a factor of 2 of our consensus value. We should not need 1186 * to change cc_sendme_inc much, and if we do, we can spread out those 1187 * changes over smaller increments once every 4 hours. Exits that 1188 * violate this range should just not be used. */ 1189 #define MAX_SENDME_INC_NEGOTIATE_FACTOR 2 1190 1191 field = trn_extension_find(ext, TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE); 1192 1193 if (field == 0) { 1194 ret = 0; 1195 } else { 1196 /* Parse the field into the congestion control field. */ 1197 ret = trn_extension_field_cc_parse(&cc_field, 1198 trn_extension_field_getconstarray_field(field), 1199 trn_extension_field_getlen_field(field)); 1200 if (ret < 0) { 1201 goto end; 1202 } 1203 1204 uint8_t sendme_inc_cells = 1205 trn_extension_field_cc_get_sendme_inc(cc_field); 1206 if (!congestion_control_validate_sendme_increment(sendme_inc_cells)) { 1207 ret = -1; 1208 goto end; 1209 } 1210 1211 /* All good. Get value and break */ 1212 params_out->sendme_inc_cells = sendme_inc_cells; 1213 ret = 1; 1214 } 1215 1216 end: 1217 trn_extension_field_cc_free(cc_field); 1218 1219 return (int)ret; 1220 } 1221 1222 /** 1223 * Returns a formatted string of fields containing congestion 1224 * control information, for the CIRC_BW control port event. 1225 * 1226 * An origin circuit can have a ccontrol object directly on it, 1227 * if it is an onion service, or onion client. Exit-bound clients 1228 * will have the ccontrol on the cpath associated with their exit 1229 * (the last one in the cpath list). 1230 * 1231 * WARNING: This function does not support leaky-pipe topology. It 1232 * is to be used for control port information only. 1233 */ 1234 char * 1235 congestion_control_get_control_port_fields(const origin_circuit_t *circ) 1236 { 1237 const congestion_control_t *ccontrol = NULL; 1238 char *ret = NULL; 1239 int len; 1240 1241 if (TO_CIRCUIT(circ)->ccontrol) { 1242 ccontrol = TO_CIRCUIT(circ)->ccontrol; 1243 } else if (circ->cpath && circ->cpath->prev->ccontrol) { 1244 /* Get ccontrol for last hop (exit) if it exists */ 1245 ccontrol = circ->cpath->prev->ccontrol; 1246 } 1247 1248 if (!ccontrol) 1249 return NULL; 1250 1251 len = tor_asprintf(&ret, 1252 " SS=%d CWND=%"PRIu64" RTT=%"PRIu64" MIN_RTT=%"PRIu64, 1253 ccontrol->in_slow_start, ccontrol->cwnd, 1254 ccontrol->ewma_rtt_usec/1000, 1255 ccontrol->min_rtt_usec/1000); 1256 if (len < 0) { 1257 log_warn(LD_BUG, "Unable to format event for controller."); 1258 return NULL; 1259 } 1260 1261 return ret; 1262 }