tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

decode_rust_punycode.cc (10283B)


      1 // Copyright 2024 The Abseil Authors
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //     https://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include "absl/debugging/internal/decode_rust_punycode.h"
     16 
     17 #include <cstddef>
     18 #include <cstdint>
     19 #include <cstring>
     20 
     21 #include "absl/base/config.h"
     22 #include "absl/base/nullability.h"
     23 #include "absl/debugging/internal/bounded_utf8_length_sequence.h"
     24 #include "absl/debugging/internal/utf8_for_code_point.h"
     25 
     26 namespace absl {
     27 ABSL_NAMESPACE_BEGIN
     28 namespace debugging_internal {
     29 
     30 namespace {
     31 
     32 // Decoding Punycode requires repeated random-access insertion into a stream of
     33 // variable-length UTF-8 code-point encodings.  We need this to be tolerably
     34 // fast (no N^2 slowdown for unfortunate inputs), and we can't allocate any data
     35 // structures on the heap (async-signal-safety).
     36 //
     37 // It is pragmatic to impose a moderately low limit on the identifier length and
     38 // bail out if we ever hit it.  Then BoundedUtf8LengthSequence efficiently
     39 // determines where to insert the next code point, and memmove efficiently makes
     40 // room for it.
     41 //
     42 // The chosen limit is a round number several times larger than identifiers
     43 // expected in practice, yet still small enough that a memmove of this many
     44 // UTF-8 characters is not much more expensive than the division and modulus
     45 // operations that Punycode decoding requires.
     46 constexpr uint32_t kMaxChars = 256;
     47 
     48 // Constants from RFC 3492 section 5.
     49 constexpr uint32_t kBase = 36, kTMin = 1, kTMax = 26, kSkew = 38, kDamp = 700;
     50 
     51 constexpr uint32_t kMaxCodePoint = 0x10ffff;
     52 
     53 // Overflow threshold in DecodeRustPunycode's inner loop; see comments there.
     54 constexpr uint32_t kMaxI = 1 << 30;
     55 
     56 // If punycode_begin .. punycode_end begins with a prefix matching the regular
     57 // expression [0-9a-zA-Z_]+_, removes that prefix, copies all but the final
     58 // underscore into out_begin .. out_end, sets num_ascii_chars to the number of
     59 // bytes copied, and returns true.  (A prefix of this sort represents the
     60 // nonempty subsequence of ASCII characters in the corresponding plaintext.)
     61 //
     62 // If punycode_begin .. punycode_end does not contain an underscore, sets
     63 // num_ascii_chars to zero and returns true.  (The encoding of a plaintext
     64 // without any ASCII characters does not carry such a prefix.)
     65 //
     66 // Returns false and zeroes num_ascii_chars on failure (either parse error or
     67 // not enough space in the output buffer).
     68 bool ConsumeOptionalAsciiPrefix(const char*& punycode_begin,
     69                                const char* const punycode_end,
     70                                char* const out_begin,
     71                                char* const out_end,
     72                                uint32_t& num_ascii_chars) {
     73  num_ascii_chars = 0;
     74 
     75  // Remember the last underscore if any.  Also use the same string scan to
     76  // reject any ASCII bytes that do not belong in an identifier, including NUL,
     77  // as well as non-ASCII bytes, which should have been delta-encoded instead.
     78  int last_underscore = -1;
     79  for (int i = 0; i < punycode_end - punycode_begin; ++i) {
     80    const char c = punycode_begin[i];
     81    if (c == '_') {
     82      last_underscore = i;
     83      continue;
     84    }
     85    // We write out the meaning of absl::ascii_isalnum rather than call that
     86    // function because its documentation does not promise it will remain
     87    // async-signal-safe under future development.
     88    if ('a' <= c && c <= 'z') continue;
     89    if ('A' <= c && c <= 'Z') continue;
     90    if ('0' <= c && c <= '9') continue;
     91    return false;
     92  }
     93 
     94  // If there was no underscore, that means there were no ASCII characters in
     95  // the plaintext, so there is no prefix to consume.  Our work is done.
     96  if (last_underscore < 0) return true;
     97 
     98  // Otherwise there will be an underscore delimiter somewhere.  It can't be
     99  // initial because then there would be no ASCII characters to its left, and no
    100  // delimiter would have been added in that case.
    101  if (last_underscore == 0) return false;
    102 
    103  // Any other position is reasonable.  Make sure there's room in the buffer.
    104  if (last_underscore + 1 > out_end - out_begin) return false;
    105 
    106  // Consume and write out the ASCII characters.
    107  num_ascii_chars = static_cast<uint32_t>(last_underscore);
    108  std::memcpy(out_begin, punycode_begin, num_ascii_chars);
    109  out_begin[num_ascii_chars] = '\0';
    110  punycode_begin += num_ascii_chars + 1;
    111  return true;
    112 }
    113 
    114 // Returns the value of `c` as a base-36 digit according to RFC 3492 section 5,
    115 // or -1 if `c` is not such a digit.
    116 int DigitValue(char c) {
    117  if ('0' <= c && c <= '9') return c - '0' + 26;
    118  if ('a' <= c && c <= 'z') return c - 'a';
    119  if ('A' <= c && c <= 'Z') return c - 'A';
    120  return -1;
    121 }
    122 
    123 // Consumes the next delta encoding from punycode_begin .. punycode_end,
    124 // updating i accordingly.  Returns true on success.  Returns false on parse
    125 // failure or arithmetic overflow.
    126 bool ScanNextDelta(const char*& punycode_begin, const char* const punycode_end,
    127                   uint32_t bias, uint32_t& i) {
    128  uint64_t w = 1;  // 64 bits to prevent overflow in w *= kBase - t
    129 
    130  // "for k = base to infinity in steps of base do begin ... end" in RFC 3492
    131  // section 6.2.  Each loop iteration scans one digit of the delta.
    132  for (uint32_t k = kBase; punycode_begin != punycode_end; k += kBase) {
    133    const int digit_value = DigitValue(*punycode_begin++);
    134    if (digit_value < 0) return false;
    135 
    136    // Compute this in 64-bit arithmetic so we can check for overflow afterward.
    137    const uint64_t new_i = i + static_cast<uint64_t>(digit_value) * w;
    138 
    139    // Valid deltas are bounded by (#chars already emitted) * kMaxCodePoint, but
    140    // invalid input could encode an arbitrarily large delta.  Nip that in the
    141    // bud here.
    142    static_assert(
    143        kMaxI >= kMaxChars * kMaxCodePoint,
    144        "kMaxI is too small to prevent spurious failures on good input");
    145    if (new_i > kMaxI) return false;
    146 
    147    static_assert(
    148        kMaxI < (uint64_t{1} << 32),
    149        "Make kMaxI smaller or i 64 bits wide to prevent silent wraparound");
    150    i = static_cast<uint32_t>(new_i);
    151 
    152    // Compute the threshold that determines whether this is the last digit and
    153    // (if not) what the next digit's place value will be.  This logic from RFC
    154    // 3492 section 6.2 is explained in section 3.3.
    155    uint32_t t;
    156    if (k <= bias + kTMin) {
    157      t = kTMin;
    158    } else if (k >= bias + kTMax) {
    159      t = kTMax;
    160    } else {
    161      t = k - bias;
    162    }
    163    if (static_cast<uint32_t>(digit_value) < t) return true;
    164 
    165    // If this gets too large, the range check on new_i in the next iteration
    166    // will catch it.  We know this multiplication will not overwrap because w
    167    // is 64 bits wide.
    168    w *= kBase - t;
    169  }
    170  return false;
    171 }
    172 
    173 }  // namespace
    174 
    175 absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options) {
    176  const char* punycode_begin = options.punycode_begin;
    177  const char* const punycode_end = options.punycode_end;
    178  char* const out_begin = options.out_begin;
    179  char* const out_end = options.out_end;
    180 
    181  // Write a NUL terminator first.  Later memcpy calls will keep bumping it
    182  // along to its new right place.
    183  const size_t out_size = static_cast<size_t>(out_end - out_begin);
    184  if (out_size == 0) return nullptr;
    185  *out_begin = '\0';
    186 
    187  // RFC 3492 section 6.2 begins here.  We retain the names of integer variables
    188  // appearing in that text.
    189  uint32_t n = 128, i = 0, bias = 72, num_chars = 0;
    190 
    191  // If there are any ASCII characters, consume them and their trailing
    192  // underscore delimiter.
    193  if (!ConsumeOptionalAsciiPrefix(punycode_begin, punycode_end,
    194                                  out_begin, out_end, num_chars)) {
    195    return nullptr;
    196  }
    197  uint32_t total_utf8_bytes = num_chars;
    198 
    199  BoundedUtf8LengthSequence<kMaxChars> utf8_lengths;
    200 
    201  // "while the input is not exhausted do begin ... end"
    202  while (punycode_begin != punycode_end) {
    203    if (num_chars >= kMaxChars) return nullptr;
    204 
    205    const uint32_t old_i = i;
    206 
    207    if (!ScanNextDelta(punycode_begin, punycode_end, bias, i)) return nullptr;
    208 
    209    // Update bias as in RFC 3492 section 6.1.  (We have inlined adapt.)
    210    uint32_t delta = i - old_i;
    211    delta /= (old_i == 0 ? kDamp : 2);
    212    delta += delta/(num_chars + 1);
    213    bias = 0;
    214    while (delta > ((kBase - kTMin) * kTMax)/2) {
    215      delta /= kBase - kTMin;
    216      bias += kBase;
    217    }
    218    bias += ((kBase - kTMin + 1) * delta)/(delta + kSkew);
    219 
    220    // Back in section 6.2, compute the new code point and insertion index.
    221    static_assert(
    222        kMaxI + kMaxCodePoint < (uint64_t{1} << 32),
    223        "Make kMaxI smaller or n 64 bits wide to prevent silent wraparound");
    224    n += i/(num_chars + 1);
    225    i %= num_chars + 1;
    226 
    227    // To actually insert, we need to convert the code point n to UTF-8 and the
    228    // character index i to an index into the byte stream emitted so far.  First
    229    // prepare the UTF-8 encoding for n, rejecting surrogates, overlarge values,
    230    // and anything that won't fit into the remaining output storage.
    231    Utf8ForCodePoint utf8_for_code_point(n);
    232    if (!utf8_for_code_point.ok()) return nullptr;
    233    if (total_utf8_bytes + utf8_for_code_point.length + 1 > out_size) {
    234      return nullptr;
    235    }
    236 
    237    // Now insert the new character into both our length map and the output.
    238    uint32_t n_index =
    239        utf8_lengths.InsertAndReturnSumOfPredecessors(
    240            i, utf8_for_code_point.length);
    241    std::memmove(
    242        out_begin + n_index + utf8_for_code_point.length, out_begin + n_index,
    243        total_utf8_bytes + 1 - n_index);
    244    std::memcpy(out_begin + n_index, utf8_for_code_point.bytes,
    245                utf8_for_code_point.length);
    246    total_utf8_bytes += utf8_for_code_point.length;
    247    ++num_chars;
    248 
    249    // Finally, advance to the next state before continuing.
    250    ++i;
    251  }
    252 
    253  return out_begin + total_utf8_bytes;
    254 }
    255 
    256 }  // namespace debugging_internal
    257 ABSL_NAMESPACE_END
    258 }  // namespace absl