decode_rust_punycode.cc (10283B)
1 // Copyright 2024 The Abseil Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include "absl/debugging/internal/decode_rust_punycode.h" 16 17 #include <cstddef> 18 #include <cstdint> 19 #include <cstring> 20 21 #include "absl/base/config.h" 22 #include "absl/base/nullability.h" 23 #include "absl/debugging/internal/bounded_utf8_length_sequence.h" 24 #include "absl/debugging/internal/utf8_for_code_point.h" 25 26 namespace absl { 27 ABSL_NAMESPACE_BEGIN 28 namespace debugging_internal { 29 30 namespace { 31 32 // Decoding Punycode requires repeated random-access insertion into a stream of 33 // variable-length UTF-8 code-point encodings. We need this to be tolerably 34 // fast (no N^2 slowdown for unfortunate inputs), and we can't allocate any data 35 // structures on the heap (async-signal-safety). 36 // 37 // It is pragmatic to impose a moderately low limit on the identifier length and 38 // bail out if we ever hit it. Then BoundedUtf8LengthSequence efficiently 39 // determines where to insert the next code point, and memmove efficiently makes 40 // room for it. 41 // 42 // The chosen limit is a round number several times larger than identifiers 43 // expected in practice, yet still small enough that a memmove of this many 44 // UTF-8 characters is not much more expensive than the division and modulus 45 // operations that Punycode decoding requires. 46 constexpr uint32_t kMaxChars = 256; 47 48 // Constants from RFC 3492 section 5. 49 constexpr uint32_t kBase = 36, kTMin = 1, kTMax = 26, kSkew = 38, kDamp = 700; 50 51 constexpr uint32_t kMaxCodePoint = 0x10ffff; 52 53 // Overflow threshold in DecodeRustPunycode's inner loop; see comments there. 54 constexpr uint32_t kMaxI = 1 << 30; 55 56 // If punycode_begin .. punycode_end begins with a prefix matching the regular 57 // expression [0-9a-zA-Z_]+_, removes that prefix, copies all but the final 58 // underscore into out_begin .. out_end, sets num_ascii_chars to the number of 59 // bytes copied, and returns true. (A prefix of this sort represents the 60 // nonempty subsequence of ASCII characters in the corresponding plaintext.) 61 // 62 // If punycode_begin .. punycode_end does not contain an underscore, sets 63 // num_ascii_chars to zero and returns true. (The encoding of a plaintext 64 // without any ASCII characters does not carry such a prefix.) 65 // 66 // Returns false and zeroes num_ascii_chars on failure (either parse error or 67 // not enough space in the output buffer). 68 bool ConsumeOptionalAsciiPrefix(const char*& punycode_begin, 69 const char* const punycode_end, 70 char* const out_begin, 71 char* const out_end, 72 uint32_t& num_ascii_chars) { 73 num_ascii_chars = 0; 74 75 // Remember the last underscore if any. Also use the same string scan to 76 // reject any ASCII bytes that do not belong in an identifier, including NUL, 77 // as well as non-ASCII bytes, which should have been delta-encoded instead. 78 int last_underscore = -1; 79 for (int i = 0; i < punycode_end - punycode_begin; ++i) { 80 const char c = punycode_begin[i]; 81 if (c == '_') { 82 last_underscore = i; 83 continue; 84 } 85 // We write out the meaning of absl::ascii_isalnum rather than call that 86 // function because its documentation does not promise it will remain 87 // async-signal-safe under future development. 88 if ('a' <= c && c <= 'z') continue; 89 if ('A' <= c && c <= 'Z') continue; 90 if ('0' <= c && c <= '9') continue; 91 return false; 92 } 93 94 // If there was no underscore, that means there were no ASCII characters in 95 // the plaintext, so there is no prefix to consume. Our work is done. 96 if (last_underscore < 0) return true; 97 98 // Otherwise there will be an underscore delimiter somewhere. It can't be 99 // initial because then there would be no ASCII characters to its left, and no 100 // delimiter would have been added in that case. 101 if (last_underscore == 0) return false; 102 103 // Any other position is reasonable. Make sure there's room in the buffer. 104 if (last_underscore + 1 > out_end - out_begin) return false; 105 106 // Consume and write out the ASCII characters. 107 num_ascii_chars = static_cast<uint32_t>(last_underscore); 108 std::memcpy(out_begin, punycode_begin, num_ascii_chars); 109 out_begin[num_ascii_chars] = '\0'; 110 punycode_begin += num_ascii_chars + 1; 111 return true; 112 } 113 114 // Returns the value of `c` as a base-36 digit according to RFC 3492 section 5, 115 // or -1 if `c` is not such a digit. 116 int DigitValue(char c) { 117 if ('0' <= c && c <= '9') return c - '0' + 26; 118 if ('a' <= c && c <= 'z') return c - 'a'; 119 if ('A' <= c && c <= 'Z') return c - 'A'; 120 return -1; 121 } 122 123 // Consumes the next delta encoding from punycode_begin .. punycode_end, 124 // updating i accordingly. Returns true on success. Returns false on parse 125 // failure or arithmetic overflow. 126 bool ScanNextDelta(const char*& punycode_begin, const char* const punycode_end, 127 uint32_t bias, uint32_t& i) { 128 uint64_t w = 1; // 64 bits to prevent overflow in w *= kBase - t 129 130 // "for k = base to infinity in steps of base do begin ... end" in RFC 3492 131 // section 6.2. Each loop iteration scans one digit of the delta. 132 for (uint32_t k = kBase; punycode_begin != punycode_end; k += kBase) { 133 const int digit_value = DigitValue(*punycode_begin++); 134 if (digit_value < 0) return false; 135 136 // Compute this in 64-bit arithmetic so we can check for overflow afterward. 137 const uint64_t new_i = i + static_cast<uint64_t>(digit_value) * w; 138 139 // Valid deltas are bounded by (#chars already emitted) * kMaxCodePoint, but 140 // invalid input could encode an arbitrarily large delta. Nip that in the 141 // bud here. 142 static_assert( 143 kMaxI >= kMaxChars * kMaxCodePoint, 144 "kMaxI is too small to prevent spurious failures on good input"); 145 if (new_i > kMaxI) return false; 146 147 static_assert( 148 kMaxI < (uint64_t{1} << 32), 149 "Make kMaxI smaller or i 64 bits wide to prevent silent wraparound"); 150 i = static_cast<uint32_t>(new_i); 151 152 // Compute the threshold that determines whether this is the last digit and 153 // (if not) what the next digit's place value will be. This logic from RFC 154 // 3492 section 6.2 is explained in section 3.3. 155 uint32_t t; 156 if (k <= bias + kTMin) { 157 t = kTMin; 158 } else if (k >= bias + kTMax) { 159 t = kTMax; 160 } else { 161 t = k - bias; 162 } 163 if (static_cast<uint32_t>(digit_value) < t) return true; 164 165 // If this gets too large, the range check on new_i in the next iteration 166 // will catch it. We know this multiplication will not overwrap because w 167 // is 64 bits wide. 168 w *= kBase - t; 169 } 170 return false; 171 } 172 173 } // namespace 174 175 absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options) { 176 const char* punycode_begin = options.punycode_begin; 177 const char* const punycode_end = options.punycode_end; 178 char* const out_begin = options.out_begin; 179 char* const out_end = options.out_end; 180 181 // Write a NUL terminator first. Later memcpy calls will keep bumping it 182 // along to its new right place. 183 const size_t out_size = static_cast<size_t>(out_end - out_begin); 184 if (out_size == 0) return nullptr; 185 *out_begin = '\0'; 186 187 // RFC 3492 section 6.2 begins here. We retain the names of integer variables 188 // appearing in that text. 189 uint32_t n = 128, i = 0, bias = 72, num_chars = 0; 190 191 // If there are any ASCII characters, consume them and their trailing 192 // underscore delimiter. 193 if (!ConsumeOptionalAsciiPrefix(punycode_begin, punycode_end, 194 out_begin, out_end, num_chars)) { 195 return nullptr; 196 } 197 uint32_t total_utf8_bytes = num_chars; 198 199 BoundedUtf8LengthSequence<kMaxChars> utf8_lengths; 200 201 // "while the input is not exhausted do begin ... end" 202 while (punycode_begin != punycode_end) { 203 if (num_chars >= kMaxChars) return nullptr; 204 205 const uint32_t old_i = i; 206 207 if (!ScanNextDelta(punycode_begin, punycode_end, bias, i)) return nullptr; 208 209 // Update bias as in RFC 3492 section 6.1. (We have inlined adapt.) 210 uint32_t delta = i - old_i; 211 delta /= (old_i == 0 ? kDamp : 2); 212 delta += delta/(num_chars + 1); 213 bias = 0; 214 while (delta > ((kBase - kTMin) * kTMax)/2) { 215 delta /= kBase - kTMin; 216 bias += kBase; 217 } 218 bias += ((kBase - kTMin + 1) * delta)/(delta + kSkew); 219 220 // Back in section 6.2, compute the new code point and insertion index. 221 static_assert( 222 kMaxI + kMaxCodePoint < (uint64_t{1} << 32), 223 "Make kMaxI smaller or n 64 bits wide to prevent silent wraparound"); 224 n += i/(num_chars + 1); 225 i %= num_chars + 1; 226 227 // To actually insert, we need to convert the code point n to UTF-8 and the 228 // character index i to an index into the byte stream emitted so far. First 229 // prepare the UTF-8 encoding for n, rejecting surrogates, overlarge values, 230 // and anything that won't fit into the remaining output storage. 231 Utf8ForCodePoint utf8_for_code_point(n); 232 if (!utf8_for_code_point.ok()) return nullptr; 233 if (total_utf8_bytes + utf8_for_code_point.length + 1 > out_size) { 234 return nullptr; 235 } 236 237 // Now insert the new character into both our length map and the output. 238 uint32_t n_index = 239 utf8_lengths.InsertAndReturnSumOfPredecessors( 240 i, utf8_for_code_point.length); 241 std::memmove( 242 out_begin + n_index + utf8_for_code_point.length, out_begin + n_index, 243 total_utf8_bytes + 1 - n_index); 244 std::memcpy(out_begin + n_index, utf8_for_code_point.bytes, 245 utf8_for_code_point.length); 246 total_utf8_bytes += utf8_for_code_point.length; 247 ++num_chars; 248 249 // Finally, advance to the next state before continuing. 250 ++i; 251 } 252 253 return out_begin + total_utf8_bytes; 254 } 255 256 } // namespace debugging_internal 257 ABSL_NAMESPACE_END 258 } // namespace absl