compress.c (21650B)
1 /* Copyright (c) 2004, Roger Dingledine. 2 * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson. 3 * Copyright (c) 2007-2021, The Tor Project, Inc. */ 4 /* See LICENSE for licensing information */ 5 6 /** 7 * \file compress.c 8 * \brief Common compression API implementation. 9 * 10 * This file provides a unified interface to all the compression libraries Tor 11 * knows how to use. 12 **/ 13 14 #include "orconfig.h" 15 16 #include <stdlib.h> 17 #include <stdio.h> 18 #include <string.h> 19 #include "lib/cc/torint.h" 20 21 #ifdef HAVE_NETINET_IN_H 22 #include <netinet/in.h> 23 #endif 24 25 #include "lib/log/log.h" 26 #include "lib/log/util_bug.h" 27 #include "lib/arch/bytes.h" 28 #include "lib/ctime/di_ops.h" 29 #include "lib/compress/compress.h" 30 #include "lib/compress/compress_lzma.h" 31 #include "lib/compress/compress_none.h" 32 #include "lib/compress/compress_sys.h" 33 #include "lib/compress/compress_zlib.h" 34 #include "lib/compress/compress_zstd.h" 35 #include "lib/intmath/cmp.h" 36 #include "lib/malloc/malloc.h" 37 #include "lib/subsys/subsys.h" 38 #include "lib/thread/threads.h" 39 40 /** Total number of bytes allocated for compression state overhead. */ 41 static atomic_counter_t total_compress_allocation; 42 43 /** @{ */ 44 /* These macros define the maximum allowable compression factor. Anything of 45 * size greater than CHECK_FOR_COMPRESSION_BOMB_AFTER is not allowed to 46 * have an uncompression factor (uncompressed size:compressed size ratio) of 47 * any greater than MAX_UNCOMPRESSION_FACTOR. 48 * 49 * Picking a value for MAX_UNCOMPRESSION_FACTOR is a trade-off: we want it to 50 * be small to limit the attack multiplier, but we also want it to be large 51 * enough so that no legitimate document --even ones we might invent in the 52 * future -- ever compresses by a factor of greater than 53 * MAX_UNCOMPRESSION_FACTOR. Within those parameters, there's a reasonably 54 * large range of possible values. IMO, anything over 8 is probably safe; IMO 55 * anything under 50 is probably sufficient. 56 * 57 * 2025-10-01: (ahf) Bumped to 5 MB to avoid the situation described in 58 * tor#40739. 59 */ 60 #define MAX_UNCOMPRESSION_FACTOR 25 61 #define CHECK_FOR_COMPRESSION_BOMB_AFTER (5 * 1024 * 1024) 62 /** @} */ 63 64 /** Return true if uncompressing an input of size <b>in_size</b> to an input of 65 * size at least <b>size_out</b> looks like a compression bomb. */ 66 MOCK_IMPL(int, 67 tor_compress_is_compression_bomb,(size_t size_in, size_t size_out)) 68 { 69 if (size_in == 0 || size_out < CHECK_FOR_COMPRESSION_BOMB_AFTER) 70 return 0; 71 72 double compression_factor = (double)size_out / size_in; 73 if (compression_factor > MAX_UNCOMPRESSION_FACTOR) { 74 log_warn(LD_GENERAL, 75 "Detected possible compression bomb with " 76 "input size = %"TOR_PRIuSZ" and output size = %"TOR_PRIuSZ" " 77 "(compression factor = %.2f)", 78 size_in, size_out, compression_factor); 79 return 1; 80 } 81 82 return 0; 83 } 84 85 /** Guess the size that <b>in_len</b> will be after compression or 86 * decompression. */ 87 static size_t 88 guess_compress_size(int compress, compress_method_t method, 89 compression_level_t compression_level, 90 size_t in_len) 91 { 92 // ignore these for now. 93 (void)compression_level; 94 if (method == NO_METHOD) { 95 /* Guess that we'll need an extra byte, to avoid a needless realloc 96 * for nul-termination */ 97 return (in_len < SIZE_MAX) ? in_len + 1 : in_len; 98 } 99 100 /* Always guess a factor of 2. */ 101 if (compress) { 102 in_len /= 2; 103 } else { 104 if (in_len < SIZE_T_CEILING/2) 105 in_len *= 2; 106 } 107 return MAX(in_len, 1024); 108 } 109 110 /** Internal function to implement tor_compress/tor_uncompress, depending on 111 * whether <b>compress</b> is set. All arguments are as for tor_compress or 112 * tor_uncompress. */ 113 static int 114 tor_compress_impl(int compress, 115 char **out, size_t *out_len, 116 const char *in, size_t in_len, 117 compress_method_t method, 118 compression_level_t compression_level, 119 int complete_only, 120 int protocol_warn_level) 121 { 122 tor_compress_state_t *stream; 123 int rv; 124 125 stream = tor_compress_new(compress, method, compression_level); 126 127 if (stream == NULL) { 128 log_warn(LD_GENERAL, "NULL stream while %scompressing", 129 compress?"":"de"); 130 log_debug(LD_GENERAL, "method: %d level: %d at len: %lu", 131 method, compression_level, (unsigned long)in_len); 132 return -1; 133 } 134 135 size_t in_len_orig = in_len; 136 size_t out_remaining, out_alloc; 137 char *outptr; 138 139 out_remaining = out_alloc = 140 guess_compress_size(compress, method, compression_level, in_len); 141 *out = outptr = tor_malloc(out_remaining); 142 143 const int finish = complete_only || compress; 144 145 while (1) { 146 switch (tor_compress_process(stream, 147 &outptr, &out_remaining, 148 &in, &in_len, finish)) { 149 case TOR_COMPRESS_DONE: 150 if (in_len == 0 || compress) { 151 goto done; 152 } else { 153 // More data is present, and we're decompressing. So we may need to 154 // reinitialize the stream if we are handling multiple concatenated 155 // inputs. 156 tor_compress_free(stream); 157 stream = tor_compress_new(compress, method, compression_level); 158 if (stream == NULL) { 159 log_warn(LD_GENERAL, "NULL stream while %scompressing", 160 compress?"":"de"); 161 goto err; 162 } 163 } 164 break; 165 case TOR_COMPRESS_OK: 166 if (compress || complete_only) { 167 log_fn(protocol_warn_level, LD_PROTOCOL, 168 "Unexpected %s while %scompressing", 169 complete_only?"end of input":"result", 170 compress?"":"de"); 171 log_debug(LD_GENERAL, "method: %d level: %d at len: %lu", 172 method, compression_level, (unsigned long)in_len); 173 goto err; 174 } else { 175 if (in_len == 0) { 176 goto done; 177 } 178 } 179 break; 180 case TOR_COMPRESS_BUFFER_FULL: { 181 if (!compress && outptr < *out+out_alloc) { 182 // A buffer error in this case means that we have a problem 183 // with our input. 184 log_fn(protocol_warn_level, LD_PROTOCOL, 185 "Possible truncated or corrupt compressed data"); 186 goto err; 187 } 188 if (out_alloc >= SIZE_T_CEILING / 2) { 189 log_warn(LD_GENERAL, "While %scompressing data: ran out of space.", 190 compress?"":"un"); 191 goto err; 192 } 193 if (!compress && 194 tor_compress_is_compression_bomb(in_len_orig, out_alloc)) { 195 // This should already have been caught down in the backend logic. 196 // LCOV_EXCL_START 197 tor_assert_nonfatal_unreached(); 198 goto err; 199 // LCOV_EXCL_STOP 200 } 201 const size_t offset = outptr - *out; 202 out_alloc *= 2; 203 *out = tor_realloc(*out, out_alloc); 204 outptr = *out + offset; 205 out_remaining = out_alloc - offset; 206 break; 207 } 208 case TOR_COMPRESS_ERROR: 209 log_fn(protocol_warn_level, LD_GENERAL, 210 "Error while %scompressing data: bad input?", 211 compress?"":"un"); 212 goto err; // bad data. 213 214 // LCOV_EXCL_START 215 default: 216 tor_assert_nonfatal_unreached(); 217 goto err; 218 // LCOV_EXCL_STOP 219 } 220 } 221 done: 222 *out_len = outptr - *out; 223 if (compress && tor_compress_is_compression_bomb(*out_len, in_len_orig)) { 224 log_warn(LD_BUG, "We compressed something and got an insanely high " 225 "compression factor; other Tors would think this was a " 226 "compression bomb."); 227 goto err; 228 } 229 if (!compress) { 230 // NUL-terminate our output. 231 if (out_alloc == *out_len) 232 *out = tor_realloc(*out, out_alloc + 1); 233 (*out)[*out_len] = '\0'; 234 } 235 rv = 0; 236 goto out; 237 238 err: 239 tor_free(*out); 240 *out_len = 0; 241 rv = -1; 242 goto out; 243 244 out: 245 tor_compress_free(stream); 246 return rv; 247 } 248 249 /** Given <b>in_len</b> bytes at <b>in</b>, compress them into a newly 250 * allocated buffer, using the method described in <b>method</b>. Store the 251 * compressed string in *<b>out</b>, and its length in *<b>out_len</b>. 252 * Return 0 on success, -1 on failure. 253 */ 254 int 255 tor_compress(char **out, size_t *out_len, 256 const char *in, size_t in_len, 257 compress_method_t method) 258 { 259 return tor_compress_impl(1, out, out_len, in, in_len, method, 260 BEST_COMPRESSION, 261 1, LOG_WARN); 262 } 263 264 /** Given zero or more compressed strings of total length <b>in_len</b> bytes 265 * at <b>in</b>, uncompress them into a newly allocated buffer, using the 266 * method described in <b>method</b>. Store the uncompressed string in 267 * *<b>out</b>, and its length in *<b>out_len</b>. Return 0 on success, -1 on 268 * failure. 269 * 270 * If any bytes are written to <b>out</b>, an extra byte NUL is always 271 * written at the end, but not counted in <b>out_len</b>. This is a 272 * safety feature to ensure that the output can be treated as a 273 * NUL-terminated string -- though of course, callers should check 274 * out_len anyway. 275 * 276 * If <b>complete_only</b> is true, we consider a truncated input as a 277 * failure; otherwise we decompress as much as we can. Warn about truncated 278 * or corrupt inputs at <b>protocol_warn_level</b>. 279 */ 280 int 281 tor_uncompress(char **out, size_t *out_len, 282 const char *in, size_t in_len, 283 compress_method_t method, 284 int complete_only, 285 int protocol_warn_level) 286 { 287 return tor_compress_impl(0, out, out_len, in, in_len, method, 288 BEST_COMPRESSION, 289 complete_only, protocol_warn_level); 290 } 291 292 /** Try to tell whether the <b>in_len</b>-byte string in <b>in</b> is likely 293 * to be compressed or not. If it is, return the likeliest compression method. 294 * Otherwise, return UNKNOWN_METHOD. 295 */ 296 compress_method_t 297 detect_compression_method(const char *in, size_t in_len) 298 { 299 if (in_len > 2 && fast_memeq(in, "\x1f\x8b", 2)) { 300 return GZIP_METHOD; 301 } else if (in_len > 2 && (in[0] & 0x0f) == 8 && 302 (tor_ntohs(get_uint16(in)) % 31) == 0) { 303 return ZLIB_METHOD; 304 } else if (in_len > 2 && 305 fast_memeq(in, "\x5d\x00\x00", 3)) { 306 return LZMA_METHOD; 307 } else if (in_len > 3 && 308 fast_memeq(in, "\x28\xb5\x2f\xfd", 4)) { 309 return ZSTD_METHOD; 310 } else { 311 return UNKNOWN_METHOD; 312 } 313 } 314 315 /** Return 1 if a given <b>method</b> is supported; otherwise 0. */ 316 int 317 tor_compress_supports_method(compress_method_t method) 318 { 319 switch (method) { 320 case GZIP_METHOD: 321 case ZLIB_METHOD: 322 return tor_zlib_method_supported(); 323 case LZMA_METHOD: 324 return tor_lzma_method_supported(); 325 case ZSTD_METHOD: 326 return tor_zstd_method_supported(); 327 case NO_METHOD: 328 return 1; 329 case UNKNOWN_METHOD: 330 default: 331 return 0; 332 } 333 } 334 335 /** 336 * Return a bitmask of the supported compression types, where 1<<m is 337 * set in the bitmask if and only if compression with method <b>m</b> is 338 * supported. 339 */ 340 unsigned 341 tor_compress_get_supported_method_bitmask(void) 342 { 343 static unsigned supported = 0; 344 if (supported == 0) { 345 compress_method_t m; 346 for (m = NO_METHOD; m <= UNKNOWN_METHOD; ++m) { 347 if (tor_compress_supports_method(m)) { 348 supported |= (1u << m); 349 } 350 } 351 } 352 return supported; 353 } 354 355 /** Table of compression method names. These should have an "x-" prefix, 356 * if they are not listed in the IANA content coding registry. */ 357 static const struct { 358 const char *name; 359 compress_method_t method; 360 } compression_method_names[] = { 361 { "gzip", GZIP_METHOD }, 362 { "deflate", ZLIB_METHOD }, 363 // We call this "x-tor-lzma" rather than "x-lzma", because we impose a 364 // lower maximum memory usage on the decoding side. 365 { "x-tor-lzma", LZMA_METHOD }, 366 { "x-zstd" , ZSTD_METHOD }, 367 { "identity", NO_METHOD }, 368 369 /* Later entries in this table are not canonical; these are recognized but 370 * not emitted. */ 371 { "x-gzip", GZIP_METHOD }, 372 }; 373 374 /** Return the canonical string representation of the compression method 375 * <b>method</b>, or NULL if the method isn't recognized. */ 376 const char * 377 compression_method_get_name(compress_method_t method) 378 { 379 unsigned i; 380 for (i = 0; i < ARRAY_LENGTH(compression_method_names); ++i) { 381 if (method == compression_method_names[i].method) 382 return compression_method_names[i].name; 383 } 384 return NULL; 385 } 386 387 /** Table of compression human readable method names. */ 388 static const struct { 389 compress_method_t method; 390 const char *name; 391 } compression_method_human_names[] = { 392 { NO_METHOD, "uncompressed" }, 393 { GZIP_METHOD, "gzipped" }, 394 { ZLIB_METHOD, "deflated" }, 395 { LZMA_METHOD, "LZMA compressed" }, 396 { ZSTD_METHOD, "Zstandard compressed" }, 397 { UNKNOWN_METHOD, "unknown encoding" }, 398 }; 399 400 /** Return a human readable string representation of the compression method 401 * <b>method</b>, or NULL if the method isn't recognized. */ 402 const char * 403 compression_method_get_human_name(compress_method_t method) 404 { 405 unsigned i; 406 for (i = 0; i < ARRAY_LENGTH(compression_method_human_names); ++i) { 407 if (method == compression_method_human_names[i].method) 408 return compression_method_human_names[i].name; 409 } 410 return NULL; 411 } 412 413 /** Return the compression method represented by the string <b>name</b>, or 414 * UNKNOWN_METHOD if the string isn't recognized. */ 415 compress_method_t 416 compression_method_get_by_name(const char *name) 417 { 418 unsigned i; 419 for (i = 0; i < ARRAY_LENGTH(compression_method_names); ++i) { 420 if (!strcmp(compression_method_names[i].name, name)) 421 return compression_method_names[i].method; 422 } 423 return UNKNOWN_METHOD; 424 } 425 426 /** Return a string representation of the version of the library providing the 427 * compression method given in <b>method</b>. Returns NULL if <b>method</b> is 428 * unknown or unsupported. */ 429 const char * 430 tor_compress_version_str(compress_method_t method) 431 { 432 switch (method) { 433 case GZIP_METHOD: 434 case ZLIB_METHOD: 435 return tor_zlib_get_version_str(); 436 case LZMA_METHOD: 437 return tor_lzma_get_version_str(); 438 case ZSTD_METHOD: 439 return tor_zstd_get_version_str(); 440 case NO_METHOD: 441 case UNKNOWN_METHOD: 442 default: 443 return NULL; 444 } 445 } 446 447 /** Return a string representation of the version of the library, found at 448 * compile time, providing the compression method given in <b>method</b>. 449 * Returns NULL if <b>method</b> is unknown or unsupported. */ 450 const char * 451 tor_compress_header_version_str(compress_method_t method) 452 { 453 switch (method) { 454 case GZIP_METHOD: 455 case ZLIB_METHOD: 456 return tor_zlib_get_header_version_str(); 457 case LZMA_METHOD: 458 return tor_lzma_get_header_version_str(); 459 case ZSTD_METHOD: 460 return tor_zstd_get_header_version_str(); 461 case NO_METHOD: 462 case UNKNOWN_METHOD: 463 default: 464 return NULL; 465 } 466 } 467 468 /** Return the approximate number of bytes allocated for all 469 * supported compression schemas. */ 470 size_t 471 tor_compress_get_total_allocation(void) 472 { 473 return atomic_counter_get(&total_compress_allocation) + 474 tor_zlib_get_total_allocation() + 475 tor_lzma_get_total_allocation() + 476 tor_zstd_get_total_allocation(); 477 } 478 479 /** Internal state for an incremental compression/decompression. The body of 480 * this struct is not exposed. */ 481 struct tor_compress_state_t { 482 compress_method_t method; /**< The compression method. */ 483 484 union { 485 tor_zlib_compress_state_t *zlib_state; 486 tor_lzma_compress_state_t *lzma_state; 487 tor_zstd_compress_state_t *zstd_state; 488 } u; /**< Compression backend state. */ 489 }; 490 491 /** Construct and return a tor_compress_state_t object using <b>method</b>. If 492 * <b>compress</b>, it's for compression; otherwise it's for decompression. */ 493 tor_compress_state_t * 494 tor_compress_new(int compress, compress_method_t method, 495 compression_level_t compression_level) 496 { 497 tor_compress_state_t *state; 498 499 state = tor_malloc_zero(sizeof(tor_compress_state_t)); 500 state->method = method; 501 502 switch (method) { 503 case GZIP_METHOD: 504 case ZLIB_METHOD: { 505 tor_zlib_compress_state_t *zlib_state = 506 tor_zlib_compress_new(compress, method, compression_level); 507 508 if (zlib_state == NULL) 509 goto err; 510 511 state->u.zlib_state = zlib_state; 512 break; 513 } 514 case LZMA_METHOD: { 515 tor_lzma_compress_state_t *lzma_state = 516 tor_lzma_compress_new(compress, method, compression_level); 517 518 if (lzma_state == NULL) 519 goto err; 520 521 state->u.lzma_state = lzma_state; 522 break; 523 } 524 case ZSTD_METHOD: { 525 tor_zstd_compress_state_t *zstd_state = 526 tor_zstd_compress_new(compress, method, compression_level); 527 528 if (zstd_state == NULL) 529 goto err; 530 531 state->u.zstd_state = zstd_state; 532 break; 533 } 534 case NO_METHOD: { 535 break; 536 } 537 case UNKNOWN_METHOD: 538 goto err; 539 } 540 541 atomic_counter_add(&total_compress_allocation, 542 sizeof(tor_compress_state_t)); 543 return state; 544 545 err: 546 tor_free(state); 547 return NULL; 548 } 549 550 /** Compress/decompress some bytes using <b>state</b>. Read up to 551 * *<b>in_len</b> bytes from *<b>in</b>, and write up to *<b>out_len</b> bytes 552 * to *<b>out</b>, adjusting the values as we go. If <b>finish</b> is true, 553 * we've reached the end of the input. 554 * 555 * Return TOR_COMPRESS_DONE if we've finished the entire 556 * compression/decompression. 557 * Return TOR_COMPRESS_OK if we're processed everything from the input. 558 * Return TOR_COMPRESS_BUFFER_FULL if we're out of space on <b>out</b>. 559 * Return TOR_COMPRESS_ERROR if the stream is corrupt. 560 */ 561 tor_compress_output_t 562 tor_compress_process(tor_compress_state_t *state, 563 char **out, size_t *out_len, 564 const char **in, size_t *in_len, 565 int finish) 566 { 567 tor_assert(state != NULL); 568 const size_t in_len_orig = *in_len; 569 const size_t out_len_orig = *out_len; 570 tor_compress_output_t rv; 571 572 if (*out_len == 0 && (*in_len > 0 || finish)) { 573 // If we still have input data, but no space for output data, we might as 574 // well return early and let the caller do the reallocation of the out 575 // variable. 576 return TOR_COMPRESS_BUFFER_FULL; 577 } 578 579 switch (state->method) { 580 case GZIP_METHOD: 581 case ZLIB_METHOD: 582 rv = tor_zlib_compress_process(state->u.zlib_state, 583 out, out_len, in, in_len, 584 finish); 585 break; 586 case LZMA_METHOD: 587 rv = tor_lzma_compress_process(state->u.lzma_state, 588 out, out_len, in, in_len, 589 finish); 590 break; 591 case ZSTD_METHOD: 592 rv = tor_zstd_compress_process(state->u.zstd_state, 593 out, out_len, in, in_len, 594 finish); 595 break; 596 case NO_METHOD: 597 rv = tor_cnone_compress_process(out, out_len, in, in_len, 598 finish); 599 break; 600 default: 601 case UNKNOWN_METHOD: 602 goto err; 603 } 604 if (BUG((rv == TOR_COMPRESS_OK) && 605 *in_len == in_len_orig && 606 *out_len == out_len_orig)) { 607 log_warn(LD_GENERAL, 608 "More info on the bug: method == %s, finish == %d, " 609 " *in_len == in_len_orig == %lu, " 610 "*out_len == out_len_orig == %lu", 611 compression_method_get_human_name(state->method), finish, 612 (unsigned long)in_len_orig, (unsigned long)out_len_orig); 613 return TOR_COMPRESS_ERROR; 614 } 615 616 return rv; 617 err: 618 return TOR_COMPRESS_ERROR; 619 } 620 621 /** Deallocate <b>state</b>. */ 622 void 623 tor_compress_free_(tor_compress_state_t *state) 624 { 625 if (state == NULL) 626 return; 627 628 switch (state->method) { 629 case GZIP_METHOD: 630 case ZLIB_METHOD: 631 tor_zlib_compress_free(state->u.zlib_state); 632 break; 633 case LZMA_METHOD: 634 tor_lzma_compress_free(state->u.lzma_state); 635 break; 636 case ZSTD_METHOD: 637 tor_zstd_compress_free(state->u.zstd_state); 638 break; 639 case NO_METHOD: 640 break; 641 case UNKNOWN_METHOD: 642 break; 643 } 644 645 atomic_counter_sub(&total_compress_allocation, 646 sizeof(tor_compress_state_t)); 647 tor_free(state); 648 } 649 650 /** Return the approximate number of bytes allocated for <b>state</b>. */ 651 size_t 652 tor_compress_state_size(const tor_compress_state_t *state) 653 { 654 tor_assert(state != NULL); 655 656 size_t size = sizeof(tor_compress_state_t); 657 658 switch (state->method) { 659 case GZIP_METHOD: 660 case ZLIB_METHOD: 661 size += tor_zlib_compress_state_size(state->u.zlib_state); 662 break; 663 case LZMA_METHOD: 664 size += tor_lzma_compress_state_size(state->u.lzma_state); 665 break; 666 case ZSTD_METHOD: 667 size += tor_zstd_compress_state_size(state->u.zstd_state); 668 break; 669 case NO_METHOD: 670 case UNKNOWN_METHOD: 671 break; 672 } 673 674 return size; 675 } 676 677 /** Initialize all compression modules. */ 678 int 679 tor_compress_init(void) 680 { 681 atomic_counter_init(&total_compress_allocation); 682 683 tor_zlib_init(); 684 tor_lzma_init(); 685 tor_zstd_init(); 686 687 return 0; 688 } 689 690 /** Warn if we had any problems while setting up our compression libraries. 691 * 692 * (This isn't part of tor_compress_init, since the logs aren't set up yet.) 693 */ 694 void 695 tor_compress_log_init_warnings(void) 696 { 697 // XXXX can we move this into tor_compress_init() after all? log.c queues 698 // XXXX log messages at startup. 699 tor_zstd_warn_if_version_mismatched(); 700 } 701 702 static int 703 subsys_compress_initialize(void) 704 { 705 return tor_compress_init(); 706 } 707 708 const subsys_fns_t sys_compress = { 709 .name = "compress", 710 SUBSYS_DECLARE_LOCATION(), 711 .supported = true, 712 .level = -55, 713 .initialize = subsys_compress_initialize, 714 };