scheduler_kist.c (33200B)
1 /* Copyright (c) 2017-2021, The Tor Project, Inc. */ 2 /* See LICENSE for licensing information */ 3 4 /** 5 * @file scheduler_kist.c 6 * @brief Implements the KIST cell scheduler. 7 **/ 8 9 #define SCHEDULER_KIST_PRIVATE 10 11 #include "core/or/or.h" 12 #include "lib/buf/buffers.h" 13 #include "app/config/config.h" 14 #include "core/mainloop/connection.h" 15 #include "feature/nodelist/networkstatus.h" 16 #include "feature/relay/routermode.h" 17 #define CHANNEL_OBJECT_PRIVATE 18 #include "core/or/channel.h" 19 #include "core/or/channeltls.h" 20 #define SCHEDULER_PRIVATE 21 #include "core/or/scheduler.h" 22 #include "lib/math/fp.h" 23 24 #include "core/or/or_connection_st.h" 25 26 #ifdef HAVE_SYS_IOCTL_H 27 #include <sys/ioctl.h> 28 #endif 29 30 #ifdef HAVE_KIST_SUPPORT 31 /* Kernel interface needed for KIST. */ 32 #include <netinet/tcp.h> 33 #include <linux/sockios.h> 34 #endif /* HAVE_KIST_SUPPORT */ 35 36 /***************************************************************************** 37 * Data structures and supporting functions 38 *****************************************************************************/ 39 40 /* Socket_table hash table stuff. The socket_table keeps track of per-socket 41 * limit information imposed by kist and used by kist. */ 42 43 static uint32_t 44 socket_table_ent_hash(const socket_table_ent_t *ent) 45 { 46 return (uint32_t)ent->chan->global_identifier; 47 } 48 49 static unsigned 50 socket_table_ent_eq(const socket_table_ent_t *a, const socket_table_ent_t *b) 51 { 52 return a->chan == b->chan; 53 } 54 55 typedef HT_HEAD(socket_table_s, socket_table_ent_t) socket_table_t; 56 57 static socket_table_t socket_table = HT_INITIALIZER(); 58 59 HT_PROTOTYPE(socket_table_s, socket_table_ent_t, node, socket_table_ent_hash, 60 socket_table_ent_eq); 61 HT_GENERATE2(socket_table_s, socket_table_ent_t, node, socket_table_ent_hash, 62 socket_table_ent_eq, 0.6, tor_reallocarray, tor_free_); 63 64 /* outbuf_table hash table stuff. The outbuf_table keeps track of which 65 * channels have data sitting in their outbuf so the kist scheduler can force 66 * a write from outbuf to kernel periodically during a run and at the end of a 67 * run. */ 68 69 typedef struct outbuf_table_ent_t { 70 HT_ENTRY(outbuf_table_ent_t) node; 71 channel_t *chan; 72 } outbuf_table_ent_t; 73 74 static uint32_t 75 outbuf_table_ent_hash(const outbuf_table_ent_t *ent) 76 { 77 return (uint32_t)ent->chan->global_identifier; 78 } 79 80 static unsigned 81 outbuf_table_ent_eq(const outbuf_table_ent_t *a, const outbuf_table_ent_t *b) 82 { 83 return a->chan->global_identifier == b->chan->global_identifier; 84 } 85 86 HT_PROTOTYPE(outbuf_table_s, outbuf_table_ent_t, node, outbuf_table_ent_hash, 87 outbuf_table_ent_eq); 88 HT_GENERATE2(outbuf_table_s, outbuf_table_ent_t, node, outbuf_table_ent_hash, 89 outbuf_table_ent_eq, 0.6, tor_reallocarray, tor_free_); 90 91 /***************************************************************************** 92 * Other internal data 93 *****************************************************************************/ 94 95 /* Store the last time the scheduler was run so we can decide when to next run 96 * the scheduler based on it. */ 97 static monotime_t scheduler_last_run; 98 /* This is a factor for the extra_space calculation in kist per-socket limits. 99 * It is the number of extra congestion windows we want to write to the kernel. 100 */ 101 static double sock_buf_size_factor = 1.0; 102 /* How often the scheduler runs. */ 103 STATIC int sched_run_interval = KIST_SCHED_RUN_INTERVAL_DEFAULT; 104 105 #ifdef HAVE_KIST_SUPPORT 106 /* Indicate if KIST lite mode is on or off. We can disable it at runtime. 107 * Important to have because of the KISTLite -> KIST possible transition. */ 108 static unsigned int kist_lite_mode = 0; 109 /* Indicate if we don't have the kernel support. This can happen if the kernel 110 * changed and it doesn't recognized the values passed to the syscalls needed 111 * by KIST. In that case, fallback to the naive approach. */ 112 static unsigned int kist_no_kernel_support = 0; 113 #else /* !defined(HAVE_KIST_SUPPORT) */ 114 static unsigned int kist_lite_mode = 1; 115 #endif /* defined(HAVE_KIST_SUPPORT) */ 116 117 /***************************************************************************** 118 * Internally called function implementations 119 *****************************************************************************/ 120 121 /* Little helper function to get the length of a channel's output buffer */ 122 static inline size_t 123 channel_outbuf_length(channel_t *chan) 124 { 125 tor_assert(chan); 126 /* In theory, this can not happen because we can not scheduler a channel 127 * without a connection that has its outbuf initialized. Just in case, bug 128 * on this so we can understand a bit more why it happened. */ 129 if (SCHED_BUG(BASE_CHAN_TO_TLS(chan)->conn == NULL, chan)) { 130 return 0; 131 } 132 return buf_datalen(TO_CONN(BASE_CHAN_TO_TLS(chan)->conn)->outbuf); 133 } 134 135 /* Little helper function for HT_FOREACH_FN. */ 136 static int 137 each_channel_write_to_kernel(outbuf_table_ent_t *ent, void *data) 138 { 139 (void) data; /* Make compiler happy. */ 140 channel_write_to_kernel(ent->chan); 141 return 0; /* Returning non-zero removes the element from the table. */ 142 } 143 144 /* Free the given outbuf table entry ent. */ 145 static int 146 free_outbuf_info_by_ent(outbuf_table_ent_t *ent, void *data) 147 { 148 (void) data; /* Make compiler happy. */ 149 log_debug(LD_SCHED, "Freeing outbuf table entry from chan=%" PRIu64, 150 ent->chan->global_identifier); 151 tor_free(ent); 152 return 1; /* So HT_FOREACH_FN will remove the element */ 153 } 154 155 /* Free the given socket table entry ent. */ 156 static int 157 free_socket_info_by_ent(socket_table_ent_t *ent, void *data) 158 { 159 (void) data; /* Make compiler happy. */ 160 log_debug(LD_SCHED, "Freeing socket table entry from chan=%" PRIu64, 161 ent->chan->global_identifier); 162 tor_free(ent); 163 return 1; /* So HT_FOREACH_FN will remove the element */ 164 } 165 166 /* Clean up socket_table. Probably because the KIST sched impl is going away */ 167 static void 168 free_all_socket_info(void) 169 { 170 HT_FOREACH_FN(socket_table_s, &socket_table, free_socket_info_by_ent, NULL); 171 HT_CLEAR(socket_table_s, &socket_table); 172 } 173 174 static socket_table_ent_t * 175 socket_table_search(socket_table_t *table, const channel_t *chan) 176 { 177 socket_table_ent_t search, *ent = NULL; 178 search.chan = chan; 179 ent = HT_FIND(socket_table_s, table, &search); 180 return ent; 181 } 182 183 /* Free a socket entry in table for the given chan. */ 184 static void 185 free_socket_info_by_chan(socket_table_t *table, const channel_t *chan) 186 { 187 socket_table_ent_t *ent = NULL; 188 ent = socket_table_search(table, chan); 189 if (!ent) 190 return; 191 log_debug(LD_SCHED, "scheduler free socket info for chan=%" PRIu64, 192 chan->global_identifier); 193 HT_REMOVE(socket_table_s, table, ent); 194 free_socket_info_by_ent(ent, NULL); 195 } 196 197 /* Perform system calls for the given socket in order to calculate kist's 198 * per-socket limit as documented in the function body. */ 199 MOCK_IMPL(void, 200 update_socket_info_impl, (socket_table_ent_t *ent)) 201 { 202 #ifdef HAVE_KIST_SUPPORT 203 int64_t tcp_space, extra_space; 204 tor_assert(ent); 205 tor_assert(ent->chan); 206 const tor_socket_t sock = 207 TO_CONN(CONST_BASE_CHAN_TO_TLS(ent->chan)->conn)->s; 208 struct tcp_info tcp; 209 socklen_t tcp_info_len = sizeof(tcp); 210 211 if (kist_no_kernel_support || kist_lite_mode) { 212 goto fallback; 213 } 214 215 /* Gather information */ 216 if (getsockopt(sock, SOL_TCP, TCP_INFO, (void *)&(tcp), &tcp_info_len) < 0) { 217 if (errno == EINVAL) { 218 /* Oops, this option is not provided by the kernel, we'll have to 219 * disable KIST entirely. This can happen if tor was built on a machine 220 * with the support previously or if the kernel was updated and lost the 221 * support. */ 222 log_notice(LD_SCHED, "Looks like our kernel doesn't have the support " 223 "for KIST anymore. We will fallback to the naive " 224 "approach. Remove KIST from the Schedulers list " 225 "to disable."); 226 kist_no_kernel_support = 1; 227 } 228 goto fallback; 229 } 230 if (ioctl(sock, SIOCOUTQNSD, &(ent->notsent)) < 0) { 231 if (errno == EINVAL) { 232 log_notice(LD_SCHED, "Looks like our kernel doesn't have the support " 233 "for KIST anymore. We will fallback to the naive " 234 "approach. Remove KIST from the Schedulers list " 235 "to disable."); 236 /* Same reason as the above. */ 237 kist_no_kernel_support = 1; 238 } 239 goto fallback; 240 } 241 ent->cwnd = tcp.tcpi_snd_cwnd; 242 ent->unacked = tcp.tcpi_unacked; 243 ent->mss = tcp.tcpi_snd_mss; 244 245 /* In order to reduce outbound kernel queuing delays and thus improve Tor's 246 * ability to prioritize circuits, KIST wants to set a socket write limit 247 * that is near the amount that the socket would be able to immediately send 248 * into the Internet. 249 * 250 * We first calculate how much the socket could send immediately (assuming 251 * completely full packets) according to the congestion window and the number 252 * of unacked packets. 253 * 254 * Then we add a little extra space in a controlled way. We do this so any 255 * when the kernel gets ACKs back for data currently sitting in the "TCP 256 * space", it will already have some more data to send immediately. It will 257 * not have to wait for the scheduler to run again. The amount of extra space 258 * is a factor of the current congestion window. With the suggested 259 * sock_buf_size_factor value of 1.0, we allow at most 2*cwnd bytes to sit in 260 * the kernel: 1 cwnd on the wire waiting for ACKs and 1 cwnd ready and 261 * waiting to be sent when those ACKs finally come. 262 * 263 * In the below diagram, we see some bytes in the TCP-space (denoted by '*') 264 * that have be sent onto the wire and are waiting for ACKs. We have a little 265 * more room in "TCP space" that we can fill with data that will be 266 * immediately sent. We also see the "extra space" KIST calculates. The sum 267 * of the empty "TCP space" and the "extra space" is the kist-imposed write 268 * limit for this socket. 269 * 270 * <----------------kernel-outbound-socket-queue----------------| 271 * <*********---------------------------------------------------| 272 * |----TCP-space-----|----extra-space-----| 273 * |------------------| 274 * ^ ((cwnd - unacked) * mss) bytes 275 * |--------------------| 276 * ^ ((cwnd * mss) * factor) bytes 277 */ 278 279 /* These values from the kernel are uint32_t, they will always fit into a 280 * int64_t tcp_space variable but if the congestion window cwnd is smaller 281 * than the unacked packets, the remaining TCP space is set to 0. */ 282 if (ent->cwnd >= ent->unacked) { 283 tcp_space = (ent->cwnd - ent->unacked) * (int64_t)(ent->mss); 284 } else { 285 tcp_space = 0; 286 } 287 288 /* The clamp_double_to_int64 makes sure the first part fits into an int64_t. 289 * In fact, if sock_buf_size_factor is still forced to be >= 0 in config.c, 290 * then it will be positive for sure. Then we subtract a uint32_t. Getting a 291 * negative value is OK, see after how it is being handled. */ 292 extra_space = 293 clamp_double_to_int64( 294 (ent->cwnd * (int64_t)ent->mss) * sock_buf_size_factor) - 295 ent->notsent - (int64_t)channel_outbuf_length((channel_t *) ent->chan); 296 if ((tcp_space + extra_space) < 0) { 297 /* This means that the "notsent" queue is just too big so we shouldn't put 298 * more in the kernel for now. */ 299 ent->limit = 0; 300 } else { 301 /* The positive sum of two int64_t will always fit into an uint64_t. 302 * And we know this will always be positive, since we checked above. */ 303 ent->limit = (uint64_t)tcp_space + (uint64_t)extra_space; 304 } 305 return; 306 307 #else /* !defined(HAVE_KIST_SUPPORT) */ 308 goto fallback; 309 #endif /* defined(HAVE_KIST_SUPPORT) */ 310 311 fallback: 312 /* If all of a sudden we don't have kist support, we just zero out all the 313 * variables for this socket since we don't know what they should be. We 314 * also allow the socket to write as much as it can from the estimated 315 * number of cells the lower layer can accept, effectively returning it to 316 * Vanilla scheduler behavior. */ 317 ent->cwnd = ent->unacked = ent->mss = ent->notsent = 0; 318 /* This function calls the specialized channel object (currently channeltls) 319 * and ask how many cells it can write on the outbuf which we then multiply 320 * by the size of the cells for this channel. The cast is because this 321 * function requires a non-const channel object, meh. */ 322 ent->limit = channel_num_cells_writeable((channel_t *) ent->chan) * 323 (get_cell_network_size(ent->chan->wide_circ_ids) + 324 TLS_PER_CELL_OVERHEAD); 325 } 326 327 /* Given a socket that isn't in the table, add it. 328 * Given a socket that is in the table, re-init values that need init-ing 329 * every scheduling run 330 */ 331 static void 332 init_socket_info(socket_table_t *table, const channel_t *chan) 333 { 334 socket_table_ent_t *ent = NULL; 335 ent = socket_table_search(table, chan); 336 if (!ent) { 337 log_debug(LD_SCHED, "scheduler init socket info for chan=%" PRIu64, 338 chan->global_identifier); 339 ent = tor_malloc_zero(sizeof(*ent)); 340 ent->chan = chan; 341 HT_INSERT(socket_table_s, table, ent); 342 } 343 ent->written = 0; 344 } 345 346 /* Add chan to the outbuf table if it isn't already in it. If it is, then don't 347 * do anything */ 348 static void 349 outbuf_table_add(outbuf_table_t *table, channel_t *chan) 350 { 351 outbuf_table_ent_t search, *ent; 352 search.chan = chan; 353 ent = HT_FIND(outbuf_table_s, table, &search); 354 if (!ent) { 355 log_debug(LD_SCHED, "scheduler init outbuf info for chan=%" PRIu64, 356 chan->global_identifier); 357 ent = tor_malloc_zero(sizeof(*ent)); 358 ent->chan = chan; 359 HT_INSERT(outbuf_table_s, table, ent); 360 } 361 } 362 363 static void 364 outbuf_table_remove(outbuf_table_t *table, channel_t *chan) 365 { 366 outbuf_table_ent_t search, *ent; 367 search.chan = chan; 368 ent = HT_FIND(outbuf_table_s, table, &search); 369 if (ent) { 370 HT_REMOVE(outbuf_table_s, table, ent); 371 free_outbuf_info_by_ent(ent, NULL); 372 } 373 } 374 375 /* Set the scheduler running interval. */ 376 static void 377 set_scheduler_run_interval(void) 378 { 379 int old_sched_run_interval = sched_run_interval; 380 sched_run_interval = kist_scheduler_run_interval(); 381 if (old_sched_run_interval != sched_run_interval) { 382 log_info(LD_SCHED, "Scheduler KIST changing its running interval " 383 "from %" PRId32 " to %" PRId32, 384 old_sched_run_interval, sched_run_interval); 385 } 386 } 387 388 /* Return true iff the channel hasn't hit its kist-imposed write limit yet */ 389 static int 390 socket_can_write(socket_table_t *table, const channel_t *chan) 391 { 392 socket_table_ent_t *ent = NULL; 393 ent = socket_table_search(table, chan); 394 if (SCHED_BUG(!ent, chan)) { 395 return 1; // Just return true, saying that kist wouldn't limit the socket 396 } 397 398 /* We previously calculated a write limit for this socket. In the below 399 * calculation, first determine how much room is left in bytes. Then divide 400 * that by the amount of space a cell takes. If there's room for at least 1 401 * cell, then KIST will allow the socket to write. */ 402 int64_t kist_limit_space = 403 (int64_t) (ent->limit - ent->written) / 404 (CELL_MAX_NETWORK_SIZE + TLS_PER_CELL_OVERHEAD); 405 return kist_limit_space > 0; 406 } 407 408 /* Update the channel's socket kernel information. */ 409 static void 410 update_socket_info(socket_table_t *table, const channel_t *chan) 411 { 412 socket_table_ent_t *ent = NULL; 413 ent = socket_table_search(table, chan); 414 if (SCHED_BUG(!ent, chan)) { 415 return; // Whelp. Entry didn't exist for some reason so nothing to do. 416 } 417 update_socket_info_impl(ent); 418 log_debug(LD_SCHED, "chan=%" PRIu64 " updated socket info, limit: %" PRIu64 419 ", cwnd: %" PRIu32 ", unacked: %" PRIu32 420 ", notsent: %" PRIu32 ", mss: %" PRIu32, 421 ent->chan->global_identifier, ent->limit, ent->cwnd, ent->unacked, 422 ent->notsent, ent->mss); 423 } 424 425 /* Increment the channel's socket written value by the number of bytes. */ 426 static void 427 update_socket_written(socket_table_t *table, channel_t *chan, size_t bytes) 428 { 429 socket_table_ent_t *ent = NULL; 430 ent = socket_table_search(table, chan); 431 if (SCHED_BUG(!ent, chan)) { 432 return; // Whelp. Entry didn't exist so nothing to do. 433 } 434 435 log_debug(LD_SCHED, "chan=%" PRIu64 " wrote %lu bytes, old was %" PRIi64, 436 chan->global_identifier, (unsigned long) bytes, ent->written); 437 438 ent->written += bytes; 439 } 440 441 /* 442 * A naive KIST impl would write every single cell all the way to the kernel. 443 * That would take a lot of system calls. A less bad KIST impl would write a 444 * channel's outbuf to the kernel only when we are switching to a different 445 * channel. But if we have two channels with equal priority, we end up writing 446 * one cell for each and bouncing back and forth. This KIST impl avoids that 447 * by only writing a channel's outbuf to the kernel if it has 8 cells or more 448 * in it. 449 * 450 * Note: The number 8 was picked so that, when using 512-byte cells, it 451 * would produce 4096 bytes: a common number for buffering. A TLS 452 * record can hold up to 16KiB; thus, using 8 512-byte cells means that 453 * a relay will at most send a TLS record of 4KiB or 1/4 of the maximum 454 * capacity of a TLS record. 455 * 456 * Of course, the above calculation became incorrect when we moved to 457 * 514-byte cells in order to accommodate a 4-byte circuit ID; we may 458 * want to consider profiling with '7' to see if it produces better 459 * results. (TODO) 460 */ 461 MOCK_IMPL(int, channel_should_write_to_kernel, 462 (outbuf_table_t *table, channel_t *chan)) 463 { 464 outbuf_table_add(table, chan); 465 /* CELL_MAX_NETWORK_SIZE * 8 because we only want to write the outbuf to the 466 * kernel if there's 8 or more cells waiting */ 467 return channel_outbuf_length(chan) > (CELL_MAX_NETWORK_SIZE * 8); 468 } 469 470 /* Little helper function to write a channel's outbuf all the way to the 471 * kernel */ 472 MOCK_IMPL(void, channel_write_to_kernel, (channel_t *chan)) 473 { 474 tor_assert(chan); 475 476 /* This is possible because a channel might have an outbuf table entry even 477 * though it has no more cells in its outbuf. Just move on. */ 478 size_t outbuf_len = channel_outbuf_length(chan); 479 if (outbuf_len == 0) { 480 return; 481 } 482 483 log_debug(LD_SCHED, "Writing %lu bytes to kernel for chan %" PRIu64, 484 (unsigned long) outbuf_len, chan->global_identifier); 485 486 /* Note that 'connection_handle_write()' may change the scheduler state of 487 * the channel during the scheduling loop with 488 * 'connection_or_flushed_some()' -> 'scheduler_channel_wants_writes()'. 489 * This side-effect will only occur if the channel is currently in the 490 * 'SCHED_CHAN_WAITING_TO_WRITE' or 'SCHED_CHAN_IDLE' states, which KIST 491 * rarely uses, so it should be fine unless KIST begins using these states 492 * in the future. */ 493 connection_handle_write(TO_CONN(BASE_CHAN_TO_TLS(chan)->conn), 0); 494 } 495 496 /* Return true iff the scheduler has work to perform. */ 497 static int 498 have_work(void) 499 { 500 smartlist_t *cp = get_channels_pending(); 501 IF_BUG_ONCE(!cp) { 502 return 0; // channels_pending doesn't exist so... no work? 503 } 504 return smartlist_len(cp) > 0; 505 } 506 507 /* Function of the scheduler interface: free_all() */ 508 static void 509 kist_free_all(void) 510 { 511 free_all_socket_info(); 512 } 513 514 /* Function of the scheduler interface: on_channel_free() */ 515 static void 516 kist_on_channel_free_fn(const channel_t *chan) 517 { 518 free_socket_info_by_chan(&socket_table, chan); 519 } 520 521 /* Function of the scheduler interface: on_new_consensus() */ 522 static void 523 kist_scheduler_on_new_consensus(void) 524 { 525 set_scheduler_run_interval(); 526 } 527 528 /* Function of the scheduler interface: on_new_options() */ 529 static void 530 kist_scheduler_on_new_options(void) 531 { 532 sock_buf_size_factor = get_options()->KISTSockBufSizeFactor; 533 534 /* Calls kist_scheduler_run_interval which calls get_options(). */ 535 set_scheduler_run_interval(); 536 } 537 538 /* Function of the scheduler interface: init() */ 539 static void 540 kist_scheduler_init(void) 541 { 542 /* When initializing the scheduler, the last run could be 0 because it is 543 * declared static or a value in the past that was set when it was last 544 * used. In both cases, we want to initialize it to now so we don't risk 545 * using the value 0 which doesn't play well with our monotonic time 546 * interface. 547 * 548 * One side effect is that the first scheduler run will be at the next tick 549 * that is in now + 10 msec (KIST_SCHED_RUN_INTERVAL_DEFAULT) by default. */ 550 monotime_get(&scheduler_last_run); 551 552 kist_scheduler_on_new_options(); 553 IF_BUG_ONCE(sched_run_interval == 0) { 554 log_warn(LD_SCHED, "We are initing the KIST scheduler and noticed the " 555 "KISTSchedRunInterval is telling us to not use KIST. That's " 556 "weird! We'll continue using KIST, but at %" PRId32 "ms.", 557 KIST_SCHED_RUN_INTERVAL_DEFAULT); 558 sched_run_interval = KIST_SCHED_RUN_INTERVAL_DEFAULT; 559 } 560 } 561 562 /* Function of the scheduler interface: schedule() */ 563 static void 564 kist_scheduler_schedule(void) 565 { 566 struct monotime_t now; 567 struct timeval next_run; 568 int64_t diff; 569 570 if (!have_work()) { 571 return; 572 } 573 monotime_get(&now); 574 575 /* If time is really monotonic, we can never have now being smaller than the 576 * last scheduler run. The scheduler_last_run at first is set to 0. 577 * Unfortunately, not all platforms guarantee monotonic time so we log at 578 * info level but don't make it more noisy. */ 579 diff = monotime_diff_msec(&scheduler_last_run, &now); 580 if (diff < 0) { 581 log_info(LD_SCHED, "Monotonic time between now and last run of scheduler " 582 "is negative: %" PRId64 ". Setting diff to 0.", diff); 583 diff = 0; 584 } 585 if (diff < sched_run_interval) { 586 next_run.tv_sec = 0; 587 /* Takes 1000 ms -> us. This will always be valid because diff can NOT be 588 * negative and can NOT be bigger than sched_run_interval so values can 589 * only go from 1000 usec (diff set to interval - 1) to 100000 usec (diff 590 * set to 0) for the maximum allowed run interval (100ms). */ 591 next_run.tv_usec = (int) ((sched_run_interval - diff) * 1000); 592 /* Re-adding an event reschedules it. It does not duplicate it. */ 593 scheduler_ev_add(&next_run); 594 } else { 595 scheduler_ev_active(); 596 } 597 } 598 599 /* Function of the scheduler interface: run() */ 600 static void 601 kist_scheduler_run(void) 602 { 603 /* Define variables */ 604 channel_t *chan = NULL; // current working channel 605 /* The last distinct chan served in a sched loop. */ 606 channel_t *prev_chan = NULL; 607 int flush_result; // temporarily store results from flush calls 608 /* Channels to be re-adding to pending at the end */ 609 smartlist_t *to_readd = NULL; 610 smartlist_t *cp = get_channels_pending(); 611 612 outbuf_table_t outbuf_table = HT_INITIALIZER(); 613 614 /* For each pending channel, collect new kernel information */ 615 SMARTLIST_FOREACH_BEGIN(cp, const channel_t *, pchan) { 616 init_socket_info(&socket_table, pchan); 617 update_socket_info(&socket_table, pchan); 618 } SMARTLIST_FOREACH_END(pchan); 619 620 log_debug(LD_SCHED, "Running the scheduler. %d channels pending", 621 smartlist_len(cp)); 622 623 /* The main scheduling loop. Loop until there are no more pending channels */ 624 while (smartlist_len(cp) > 0) { 625 /* get best channel */ 626 chan = smartlist_pqueue_pop(cp, scheduler_compare_channels, 627 offsetof(channel_t, sched_heap_idx)); 628 if (SCHED_BUG(!chan, NULL)) { 629 /* Some-freaking-how a NULL got into the channels_pending. That should 630 * never happen, but it should be harmless to ignore it and keep looping. 631 */ 632 continue; 633 } 634 outbuf_table_add(&outbuf_table, chan); 635 636 /* if we have switched to a new channel, consider writing the previous 637 * channel's outbuf to the kernel. */ 638 if (!prev_chan) { 639 prev_chan = chan; 640 } 641 if (prev_chan != chan) { 642 if (channel_should_write_to_kernel(&outbuf_table, prev_chan)) { 643 channel_write_to_kernel(prev_chan); 644 outbuf_table_remove(&outbuf_table, prev_chan); 645 } 646 prev_chan = chan; 647 } 648 649 /* Only flush and write if the per-socket limit hasn't been hit */ 650 if (socket_can_write(&socket_table, chan)) { 651 /* flush to channel queue/outbuf */ 652 flush_result = (int)channel_flush_some_cells(chan, 1); // 1 for num cells 653 /* XXX: While flushing cells, it is possible that the connection write 654 * fails leading to the channel to be closed which triggers a release 655 * and free its entry in the socket table. And because of a engineering 656 * design issue, the error is not propagated back so we don't get an 657 * error at this point. So before we continue, make sure the channel is 658 * open and if not just ignore it. See #23751. */ 659 if (!CHANNEL_IS_OPEN(chan)) { 660 /* Channel isn't open so we put it back in IDLE mode. It is either 661 * renegotiating its TLS session or about to be released. */ 662 scheduler_set_channel_state(chan, SCHED_CHAN_IDLE); 663 continue; 664 } 665 /* flush_result has the # cells flushed */ 666 if (flush_result > 0) { 667 update_socket_written(&socket_table, chan, flush_result * 668 (CELL_MAX_NETWORK_SIZE + TLS_PER_CELL_OVERHEAD)); 669 } else { 670 /* XXX: This can happen because tor sometimes does flush in an 671 * opportunistic way cells from the circuit to the outbuf so the 672 * channel can end up here without having anything to flush nor needed 673 * to write to the kernel. Hopefully we'll fix that soon but for now 674 * we have to handle this case which happens kind of often. */ 675 log_debug(LD_SCHED, 676 "We didn't flush anything on a chan that we think " 677 "can write and wants to write. The channel's state is '%s' " 678 "and in scheduler state '%s'. We're going to mark it as " 679 "waiting_for_cells (as that's most likely the issue) and " 680 "stop scheduling it this round.", 681 channel_state_to_string(chan->state), 682 get_scheduler_state_string(chan->scheduler_state)); 683 scheduler_set_channel_state(chan, SCHED_CHAN_WAITING_FOR_CELLS); 684 continue; 685 } 686 } 687 688 /* Decide what to do with the channel now */ 689 690 if (!channel_more_to_flush(chan) && 691 !socket_can_write(&socket_table, chan)) { 692 693 /* Case 1: no more cells to send, and cannot write */ 694 695 /* 696 * You might think we should put the channel in SCHED_CHAN_IDLE. And 697 * you're probably correct. While implementing KIST, we found that the 698 * scheduling system would sometimes lose track of channels when we did 699 * that. We suspect it has to do with the difference between "can't 700 * write because socket/outbuf is full" and KIST's "can't write because 701 * we've arbitrarily decided that that's enough for now." Sometimes 702 * channels run out of cells at the same time they hit their 703 * kist-imposed write limit and maybe the rest of Tor doesn't put the 704 * channel back in pending when it is supposed to. 705 * 706 * This should be investigated again. It is as simple as changing 707 * SCHED_CHAN_WAITING_FOR_CELLS to SCHED_CHAN_IDLE and seeing if Tor 708 * starts having serious throughput issues. Best done in shadow/chutney. 709 */ 710 scheduler_set_channel_state(chan, SCHED_CHAN_WAITING_FOR_CELLS); 711 } else if (!channel_more_to_flush(chan)) { 712 713 /* Case 2: no more cells to send, but still open for writes */ 714 715 scheduler_set_channel_state(chan, SCHED_CHAN_WAITING_FOR_CELLS); 716 } else if (!socket_can_write(&socket_table, chan)) { 717 718 /* Case 3: cells to send, but cannot write */ 719 720 /* 721 * We want to write, but can't. If we left the channel in 722 * channels_pending, we would never exit the scheduling loop. We need to 723 * add it to a temporary list of channels to be added to channels_pending 724 * after the scheduling loop is over. They can hopefully be taken care of 725 * in the next scheduling round. 726 */ 727 if (!to_readd) { 728 to_readd = smartlist_new(); 729 } 730 smartlist_add(to_readd, chan); 731 } else { 732 733 /* Case 4: cells to send, and still open for writes */ 734 735 scheduler_set_channel_state(chan, SCHED_CHAN_PENDING); 736 if (!SCHED_BUG(chan->sched_heap_idx != -1, chan)) { 737 smartlist_pqueue_add(cp, scheduler_compare_channels, 738 offsetof(channel_t, sched_heap_idx), chan); 739 } 740 } 741 } /* End of main scheduling loop */ 742 743 /* Write the outbuf of any channels that still have data */ 744 HT_FOREACH_FN(outbuf_table_s, &outbuf_table, each_channel_write_to_kernel, 745 NULL); 746 /* We are done with it. */ 747 HT_FOREACH_FN(outbuf_table_s, &outbuf_table, free_outbuf_info_by_ent, NULL); 748 HT_CLEAR(outbuf_table_s, &outbuf_table); 749 750 log_debug(LD_SCHED, "len pending=%d, len to_readd=%d", 751 smartlist_len(cp), 752 (to_readd ? smartlist_len(to_readd) : -1)); 753 754 /* Re-add any channels we need to */ 755 if (to_readd) { 756 SMARTLIST_FOREACH_BEGIN(to_readd, channel_t *, readd_chan) { 757 scheduler_set_channel_state(readd_chan, SCHED_CHAN_PENDING); 758 if (!smartlist_contains(cp, readd_chan)) { 759 if (!SCHED_BUG(readd_chan->sched_heap_idx != -1, readd_chan)) { 760 /* XXXX Note that the check above is in theory redundant with 761 * the smartlist_contains check. But let's make sure we're 762 * not messing anything up, and leave them both for now. */ 763 smartlist_pqueue_add(cp, scheduler_compare_channels, 764 offsetof(channel_t, sched_heap_idx), readd_chan); 765 } 766 } 767 } SMARTLIST_FOREACH_END(readd_chan); 768 smartlist_free(to_readd); 769 } 770 771 monotime_get(&scheduler_last_run); 772 } 773 774 /***************************************************************************** 775 * Externally called function implementations not called through scheduler_t 776 *****************************************************************************/ 777 778 /* Stores the kist scheduler function pointers. */ 779 static scheduler_t kist_scheduler = { 780 .type = SCHEDULER_KIST, 781 .free_all = kist_free_all, 782 .on_channel_free = kist_on_channel_free_fn, 783 .init = kist_scheduler_init, 784 .on_new_consensus = kist_scheduler_on_new_consensus, 785 .schedule = kist_scheduler_schedule, 786 .run = kist_scheduler_run, 787 .on_new_options = kist_scheduler_on_new_options, 788 }; 789 790 /* Return the KIST scheduler object. If it didn't exists, return a newly 791 * allocated one but init() is not called. */ 792 scheduler_t * 793 get_kist_scheduler(void) 794 { 795 return &kist_scheduler; 796 } 797 798 /* Check the torrc (and maybe consensus) for the configured KIST scheduler run 799 * interval. 800 * - If torrc > 0, then return the positive torrc value (should use KIST, and 801 * should use the set value) 802 * - If torrc == 0, then look in the consensus for what the value should be. 803 * - If == 0, then return 0 (don't use KIST) 804 * - If > 0, then return the positive consensus value 805 * - If consensus doesn't say anything, return 10 milliseconds, default. 806 */ 807 int 808 kist_scheduler_run_interval(void) 809 { 810 int run_interval = get_options()->KISTSchedRunInterval; 811 812 if (run_interval != 0) { 813 log_debug(LD_SCHED, "Found KISTSchedRunInterval=%" PRId32 " in torrc. " 814 "Using that.", run_interval); 815 return run_interval; 816 } 817 818 log_debug(LD_SCHED, "KISTSchedRunInterval=0, turning to the consensus."); 819 820 /* Clients and relays have a separate consensus parameter. Clients 821 * need a lower KIST interval, since they have only a couple connections */ 822 if (server_mode(get_options())) { 823 return networkstatus_get_param(NULL, "KISTSchedRunInterval", 824 KIST_SCHED_RUN_INTERVAL_DEFAULT, 825 KIST_SCHED_RUN_INTERVAL_MIN, 826 KIST_SCHED_RUN_INTERVAL_MAX); 827 } else { 828 return networkstatus_get_param(NULL, "KISTSchedRunIntervalClient", 829 KIST_SCHED_RUN_INTERVAL_DEFAULT, 830 KIST_SCHED_RUN_INTERVAL_MIN, 831 KIST_SCHED_RUN_INTERVAL_MAX); 832 } 833 } 834 835 /* Set KISTLite mode that is KIST without kernel support. */ 836 void 837 scheduler_kist_set_lite_mode(void) 838 { 839 kist_lite_mode = 1; 840 kist_scheduler.type = SCHEDULER_KIST_LITE; 841 log_info(LD_SCHED, 842 "Setting KIST scheduler without kernel support (KISTLite mode)"); 843 } 844 845 /* Set KIST mode that is KIST with kernel support. */ 846 void 847 scheduler_kist_set_full_mode(void) 848 { 849 kist_lite_mode = 0; 850 kist_scheduler.type = SCHEDULER_KIST; 851 log_info(LD_SCHED, 852 "Setting KIST scheduler with kernel support (KIST mode)"); 853 } 854 855 #ifdef HAVE_KIST_SUPPORT 856 857 /* Return true iff the scheduler subsystem should use KIST. */ 858 int 859 scheduler_can_use_kist(void) 860 { 861 if (kist_no_kernel_support) { 862 /* We have no kernel support so we can't use KIST. */ 863 return 0; 864 } 865 866 /* We do have the support, time to check if we can get the interval that the 867 * consensus can be disabling. */ 868 int run_interval = kist_scheduler_run_interval(); 869 log_debug(LD_SCHED, "Determined KIST sched_run_interval should be " 870 "%" PRId32 ". Can%s use KIST.", 871 run_interval, (run_interval > 0 ? "" : " not")); 872 return run_interval > 0; 873 } 874 875 #else /* !defined(HAVE_KIST_SUPPORT) */ 876 877 int 878 scheduler_can_use_kist(void) 879 { 880 return 0; 881 } 882 883 #endif /* defined(HAVE_KIST_SUPPORT) */