string_util_impl_helpers.h (22201B)
1 // Copyright 2020 The Chromium Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef BASE_STRINGS_STRING_UTIL_IMPL_HELPERS_H_ 6 #define BASE_STRINGS_STRING_UTIL_IMPL_HELPERS_H_ 7 8 #include <algorithm> 9 10 #include "base/check.h" 11 #include "base/check_op.h" 12 #include "base/logging.h" 13 #include "base/notreached.h" 14 #include "base/ranges/algorithm.h" 15 #include "base/strings/string_piece.h" 16 #include "base/third_party/icu/icu_utf.h" 17 #include "third_party/abseil-cpp/absl/types/optional.h" 18 19 namespace base::internal { 20 21 // Used by ReplaceStringPlaceholders to track the position in the string of 22 // replaced parameters. 23 struct ReplacementOffset { 24 ReplacementOffset(uintptr_t parameter, size_t offset) 25 : parameter(parameter), offset(offset) {} 26 27 // Index of the parameter. 28 size_t parameter; 29 30 // Starting position in the string. 31 size_t offset; 32 }; 33 34 static bool CompareParameter(const ReplacementOffset& elem1, 35 const ReplacementOffset& elem2) { 36 return elem1.parameter < elem2.parameter; 37 } 38 39 // Assuming that a pointer is the size of a "machine word", then 40 // uintptr_t is an integer type that is also a machine word. 41 using MachineWord = uintptr_t; 42 43 inline bool IsMachineWordAligned(const void* pointer) { 44 return !(reinterpret_cast<MachineWord>(pointer) & (sizeof(MachineWord) - 1)); 45 } 46 47 template <typename T, typename CharT = typename T::value_type> 48 std::basic_string<CharT> ToLowerASCIIImpl(T str) { 49 std::basic_string<CharT> ret; 50 ret.reserve(str.size()); 51 for (size_t i = 0; i < str.size(); i++) 52 ret.push_back(ToLowerASCII(str[i])); 53 return ret; 54 } 55 56 template <typename T, typename CharT = typename T::value_type> 57 std::basic_string<CharT> ToUpperASCIIImpl(T str) { 58 std::basic_string<CharT> ret; 59 ret.reserve(str.size()); 60 for (size_t i = 0; i < str.size(); i++) 61 ret.push_back(ToUpperASCII(str[i])); 62 return ret; 63 } 64 65 template <typename T, typename CharT = typename T::value_type> 66 TrimPositions TrimStringT(T input, 67 T trim_chars, 68 TrimPositions positions, 69 std::basic_string<CharT>* output) { 70 // Find the edges of leading/trailing whitespace as desired. Need to use 71 // a StringPiece version of input to be able to call find* on it with the 72 // StringPiece version of trim_chars (normally the trim_chars will be a 73 // constant so avoid making a copy). 74 const size_t last_char = input.length() - 1; 75 const size_t first_good_char = 76 (positions & TRIM_LEADING) ? input.find_first_not_of(trim_chars) : 0; 77 const size_t last_good_char = (positions & TRIM_TRAILING) 78 ? input.find_last_not_of(trim_chars) 79 : last_char; 80 81 // When the string was all trimmed, report that we stripped off characters 82 // from whichever position the caller was interested in. For empty input, we 83 // stripped no characters, but we still need to clear |output|. 84 if (input.empty() || first_good_char == std::basic_string<CharT>::npos || 85 last_good_char == std::basic_string<CharT>::npos) { 86 bool input_was_empty = input.empty(); // in case output == &input 87 output->clear(); 88 return input_was_empty ? TRIM_NONE : positions; 89 } 90 91 // Trim. 92 output->assign(input.data() + first_good_char, 93 last_good_char - first_good_char + 1); 94 95 // Return where we trimmed from. 96 return static_cast<TrimPositions>( 97 (first_good_char == 0 ? TRIM_NONE : TRIM_LEADING) | 98 (last_good_char == last_char ? TRIM_NONE : TRIM_TRAILING)); 99 } 100 101 template <typename T, typename CharT = typename T::value_type> 102 T TrimStringPieceT(T input, T trim_chars, TrimPositions positions) { 103 size_t begin = 104 (positions & TRIM_LEADING) ? input.find_first_not_of(trim_chars) : 0; 105 size_t end = (positions & TRIM_TRAILING) 106 ? input.find_last_not_of(trim_chars) + 1 107 : input.size(); 108 return input.substr(std::min(begin, input.size()), end - begin); 109 } 110 111 template <typename T, typename CharT = typename T::value_type> 112 std::basic_string<CharT> CollapseWhitespaceT( 113 T text, 114 bool trim_sequences_with_line_breaks) { 115 std::basic_string<CharT> result; 116 result.resize(text.size()); 117 118 // Set flags to pretend we're already in a trimmed whitespace sequence, so we 119 // will trim any leading whitespace. 120 bool in_whitespace = true; 121 bool already_trimmed = true; 122 123 size_t chars_written = 0; 124 for (auto c : text) { 125 if (IsWhitespace(c)) { 126 if (!in_whitespace) { 127 // Reduce all whitespace sequences to a single space. 128 in_whitespace = true; 129 result[chars_written++] = L' '; 130 } 131 if (trim_sequences_with_line_breaks && !already_trimmed && 132 ((c == '\n') || (c == '\r'))) { 133 // Whitespace sequences containing CR or LF are eliminated entirely. 134 already_trimmed = true; 135 --chars_written; 136 } 137 } else { 138 // Non-whitespace characters are copied straight across. 139 in_whitespace = false; 140 already_trimmed = false; 141 result[chars_written++] = c; 142 } 143 } 144 145 if (in_whitespace && !already_trimmed) { 146 // Any trailing whitespace is eliminated. 147 --chars_written; 148 } 149 150 result.resize(chars_written); 151 return result; 152 } 153 154 template <class Char> 155 bool DoIsStringASCII(const Char* characters, size_t length) { 156 // Bitmasks to detect non ASCII characters for character sizes of 8, 16 and 32 157 // bits. 158 constexpr MachineWord NonASCIIMasks[] = { 159 0, MachineWord(0x8080808080808080ULL), MachineWord(0xFF80FF80FF80FF80ULL), 160 0, MachineWord(0xFFFFFF80FFFFFF80ULL), 161 }; 162 163 if (!length) 164 return true; 165 constexpr MachineWord non_ascii_bit_mask = NonASCIIMasks[sizeof(Char)]; 166 static_assert(non_ascii_bit_mask, "Error: Invalid Mask"); 167 MachineWord all_char_bits = 0; 168 const Char* end = characters + length; 169 170 // Prologue: align the input. 171 while (!IsMachineWordAligned(characters) && characters < end) 172 all_char_bits |= static_cast<MachineWord>(*characters++); 173 if (all_char_bits & non_ascii_bit_mask) 174 return false; 175 176 // Compare the values of CPU word size. 177 constexpr size_t chars_per_word = sizeof(MachineWord) / sizeof(Char); 178 constexpr int batch_count = 16; 179 while (characters <= end - batch_count * chars_per_word) { 180 all_char_bits = 0; 181 for (int i = 0; i < batch_count; ++i) { 182 all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters)); 183 characters += chars_per_word; 184 } 185 if (all_char_bits & non_ascii_bit_mask) 186 return false; 187 } 188 189 // Process the remaining words. 190 all_char_bits = 0; 191 while (characters <= end - chars_per_word) { 192 all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters)); 193 characters += chars_per_word; 194 } 195 196 // Process the remaining bytes. 197 while (characters < end) 198 all_char_bits |= static_cast<MachineWord>(*characters++); 199 200 return !(all_char_bits & non_ascii_bit_mask); 201 } 202 203 template <bool (*Validator)(base_icu::UChar32)> 204 inline bool DoIsStringUTF8(StringPiece str) { 205 const uint8_t* src = reinterpret_cast<const uint8_t*>(str.data()); 206 size_t src_len = str.length(); 207 size_t char_index = 0; 208 209 while (char_index < src_len) { 210 base_icu::UChar32 code_point; 211 CBU8_NEXT(src, char_index, src_len, code_point); 212 if (!Validator(code_point)) 213 return false; 214 } 215 return true; 216 } 217 218 template <typename T, typename CharT = typename T::value_type> 219 bool StartsWithT(T str, T search_for, CompareCase case_sensitivity) { 220 if (search_for.size() > str.size()) 221 return false; 222 223 BasicStringPiece<CharT> source = str.substr(0, search_for.size()); 224 225 switch (case_sensitivity) { 226 case CompareCase::SENSITIVE: 227 return source == search_for; 228 229 case CompareCase::INSENSITIVE_ASCII: 230 return std::equal(search_for.begin(), search_for.end(), source.begin(), 231 CaseInsensitiveCompareASCII<CharT>()); 232 } 233 234 return false; 235 } 236 237 template <typename T, typename CharT = typename T::value_type> 238 bool EndsWithT(T str, T search_for, CompareCase case_sensitivity) { 239 if (search_for.size() > str.size()) 240 return false; 241 242 BasicStringPiece<CharT> source = 243 str.substr(str.size() - search_for.size(), search_for.size()); 244 245 switch (case_sensitivity) { 246 case CompareCase::SENSITIVE: 247 return source == search_for; 248 249 case CompareCase::INSENSITIVE_ASCII: 250 return std::equal(source.begin(), source.end(), search_for.begin(), 251 CaseInsensitiveCompareASCII<CharT>()); 252 } 253 254 return false; 255 } 256 257 // A Matcher for DoReplaceMatchesAfterOffset() that matches substrings. 258 template <class CharT> 259 struct SubstringMatcher { 260 BasicStringPiece<CharT> find_this; 261 262 size_t Find(const std::basic_string<CharT>& input, size_t pos) { 263 return input.find(find_this.data(), pos, find_this.length()); 264 } 265 size_t MatchSize() { return find_this.length(); } 266 }; 267 268 // Type deduction helper for SubstringMatcher. 269 template <typename T, typename CharT = typename T::value_type> 270 auto MakeSubstringMatcher(T find_this) { 271 return SubstringMatcher<CharT>{find_this}; 272 } 273 274 // A Matcher for DoReplaceMatchesAfterOffset() that matches single characters. 275 template <class CharT> 276 struct CharacterMatcher { 277 BasicStringPiece<CharT> find_any_of_these; 278 279 size_t Find(const std::basic_string<CharT>& input, size_t pos) { 280 return input.find_first_of(find_any_of_these.data(), pos, 281 find_any_of_these.length()); 282 } 283 constexpr size_t MatchSize() { return 1; } 284 }; 285 286 // Type deduction helper for CharacterMatcher. 287 template <typename T, typename CharT = typename T::value_type> 288 auto MakeCharacterMatcher(T find_any_of_these) { 289 return CharacterMatcher<CharT>{find_any_of_these}; 290 } 291 292 enum class ReplaceType { REPLACE_ALL, REPLACE_FIRST }; 293 294 // Runs in O(n) time in the length of |str|, and transforms the string without 295 // reallocating when possible. Returns |true| if any matches were found. 296 // 297 // This is parameterized on a |Matcher| traits type, so that it can be the 298 // implementation for both ReplaceChars() and ReplaceSubstringsAfterOffset(). 299 template <typename Matcher, typename T, typename CharT = typename T::value_type> 300 bool DoReplaceMatchesAfterOffset(std::basic_string<CharT>* str, 301 size_t initial_offset, 302 Matcher matcher, 303 T replace_with, 304 ReplaceType replace_type) { 305 using CharTraits = std::char_traits<CharT>; 306 307 const size_t find_length = matcher.MatchSize(); 308 if (!find_length) 309 return false; 310 311 // If the find string doesn't appear, there's nothing to do. 312 size_t first_match = matcher.Find(*str, initial_offset); 313 if (first_match == std::basic_string<CharT>::npos) 314 return false; 315 316 // If we're only replacing one instance, there's no need to do anything 317 // complicated. 318 const size_t replace_length = replace_with.length(); 319 if (replace_type == ReplaceType::REPLACE_FIRST) { 320 str->replace(first_match, find_length, replace_with.data(), replace_length); 321 return true; 322 } 323 324 // If the find and replace strings are the same length, we can simply use 325 // replace() on each instance, and finish the entire operation in O(n) time. 326 if (find_length == replace_length) { 327 auto* buffer = &((*str)[0]); 328 for (size_t offset = first_match; offset != std::basic_string<CharT>::npos; 329 offset = matcher.Find(*str, offset + replace_length)) { 330 CharTraits::copy(buffer + offset, replace_with.data(), replace_length); 331 } 332 return true; 333 } 334 335 // Since the find and replace strings aren't the same length, a loop like the 336 // one above would be O(n^2) in the worst case, as replace() will shift the 337 // entire remaining string each time. We need to be more clever to keep things 338 // O(n). 339 // 340 // When the string is being shortened, it's possible to just shift the matches 341 // down in one pass while finding, and truncate the length at the end of the 342 // search. 343 // 344 // If the string is being lengthened, more work is required. The strategy used 345 // here is to make two find() passes through the string. The first pass counts 346 // the number of matches to determine the new size. The second pass will 347 // either construct the new string into a new buffer (if the existing buffer 348 // lacked capacity), or else -- if there is room -- create a region of scratch 349 // space after |first_match| by shifting the tail of the string to a higher 350 // index, and doing in-place moves from the tail to lower indices thereafter. 351 size_t str_length = str->length(); 352 size_t expansion = 0; 353 if (replace_length > find_length) { 354 // This operation lengthens the string; determine the new length by counting 355 // matches. 356 const size_t expansion_per_match = (replace_length - find_length); 357 size_t num_matches = 0; 358 for (size_t match = first_match; match != std::basic_string<CharT>::npos; 359 match = matcher.Find(*str, match + find_length)) { 360 expansion += expansion_per_match; 361 ++num_matches; 362 } 363 const size_t final_length = str_length + expansion; 364 365 if (str->capacity() < final_length) { 366 // If we'd have to allocate a new buffer to grow the string, build the 367 // result directly into the new allocation via append(). 368 std::basic_string<CharT> src(str->get_allocator()); 369 str->swap(src); 370 str->reserve(final_length); 371 372 size_t pos = 0; 373 for (size_t match = first_match;; match = matcher.Find(src, pos)) { 374 str->append(src, pos, match - pos); 375 str->append(replace_with.data(), replace_length); 376 pos = match + find_length; 377 378 // A mid-loop test/break enables skipping the final Find() call; the 379 // number of matches is known, so don't search past the last one. 380 if (!--num_matches) 381 break; 382 } 383 384 // Handle substring after the final match. 385 str->append(src, pos, str_length - pos); 386 return true; 387 } 388 389 // Prepare for the copy/move loop below -- expand the string to its final 390 // size by shifting the data after the first match to the end of the resized 391 // string. 392 size_t shift_src = first_match + find_length; 393 size_t shift_dst = shift_src + expansion; 394 395 // Big |expansion| factors (relative to |str_length|) require padding up to 396 // |shift_dst|. 397 if (shift_dst > str_length) 398 str->resize(shift_dst); 399 400 str->replace(shift_dst, str_length - shift_src, *str, shift_src, 401 str_length - shift_src); 402 str_length = final_length; 403 } 404 405 // We can alternate replacement and move operations. This won't overwrite the 406 // unsearched region of the string so long as |write_offset| <= |read_offset|; 407 // that condition is always satisfied because: 408 // 409 // (a) If the string is being shortened, |expansion| is zero and 410 // |write_offset| grows slower than |read_offset|. 411 // 412 // (b) If the string is being lengthened, |write_offset| grows faster than 413 // |read_offset|, but |expansion| is big enough so that |write_offset| 414 // will only catch up to |read_offset| at the point of the last match. 415 auto* buffer = &((*str)[0]); 416 size_t write_offset = first_match; 417 size_t read_offset = first_match + expansion; 418 do { 419 if (replace_length) { 420 CharTraits::copy(buffer + write_offset, replace_with.data(), 421 replace_length); 422 write_offset += replace_length; 423 } 424 read_offset += find_length; 425 426 // min() clamps std::basic_string<CharT>::npos (the largest unsigned value) 427 // to str_length. 428 size_t match = std::min(matcher.Find(*str, read_offset), str_length); 429 430 size_t length = match - read_offset; 431 if (length) { 432 CharTraits::move(buffer + write_offset, buffer + read_offset, length); 433 write_offset += length; 434 read_offset += length; 435 } 436 } while (read_offset < str_length); 437 438 // If we're shortening the string, truncate it now. 439 str->resize(write_offset); 440 return true; 441 } 442 443 template <typename T, typename CharT = typename T::value_type> 444 bool ReplaceCharsT(T input, 445 T find_any_of_these, 446 T replace_with, 447 std::basic_string<CharT>* output) { 448 // Commonly, this is called with output and input being the same string; in 449 // that case, skip the copy. 450 if (input.data() != output->data() || input.size() != output->size()) 451 output->assign(input.data(), input.size()); 452 453 return DoReplaceMatchesAfterOffset(output, 0, 454 MakeCharacterMatcher(find_any_of_these), 455 replace_with, ReplaceType::REPLACE_ALL); 456 } 457 458 template <class string_type> 459 inline typename string_type::value_type* WriteIntoT(string_type* str, 460 size_t length_with_null) { 461 DCHECK_GE(length_with_null, 1u); 462 str->reserve(length_with_null); 463 str->resize(length_with_null - 1); 464 return &((*str)[0]); 465 } 466 467 // Generic version for all JoinString overloads. |list_type| must be a sequence 468 // (base::span or std::initializer_list) of strings/StringPieces (std::string, 469 // std::u16string, StringPiece or StringPiece16). |CharT| is either char or 470 // char16_t. 471 template <typename list_type, 472 typename T, 473 typename CharT = typename T::value_type> 474 static std::basic_string<CharT> JoinStringT(list_type parts, T sep) { 475 if (std::empty(parts)) 476 return std::basic_string<CharT>(); 477 478 // Pre-allocate the eventual size of the string. Start with the size of all of 479 // the separators (note that this *assumes* parts.size() > 0). 480 size_t total_size = (parts.size() - 1) * sep.size(); 481 for (const auto& part : parts) 482 total_size += part.size(); 483 std::basic_string<CharT> result; 484 result.reserve(total_size); 485 486 auto iter = parts.begin(); 487 DCHECK(iter != parts.end()); 488 result.append(*iter); 489 ++iter; 490 491 for (; iter != parts.end(); ++iter) { 492 result.append(sep); 493 result.append(*iter); 494 } 495 496 // Sanity-check that we pre-allocated correctly. 497 DCHECK_EQ(total_size, result.size()); 498 499 return result; 500 } 501 502 // Replaces placeholders in `format_string` with values from `subst`. 503 // * `placeholder_prefix`: Allows using a specific character as the placeholder 504 // prefix. `base::ReplaceStringPlaceholders` uses '$'. 505 // * `should_escape_multiple_placeholder_prefixes`: 506 // * If this parameter is `true`, which is the case with 507 // `base::ReplaceStringPlaceholders`, `placeholder_prefix` characters are 508 // replaced by that number less one. Eg $$->$, $$$->$$, etc. 509 // * If this parameter is `false`, each literal `placeholder_prefix` character 510 // in `format_string` is escaped with another `placeholder_prefix`. For 511 // instance, with `%` as the `placeholder_prefix`: %%->%, %%%%->%%, etc. 512 // * `is_strict_mode`: 513 // * If this parameter is `true`, error handling is stricter. The function 514 // returns `absl::nullopt` if: 515 // * a placeholder %N is encountered where N > substitutions.size(). 516 // * a literal `%` is not escaped with a `%`. 517 template <typename T, typename CharT = typename T::value_type> 518 absl::optional<std::basic_string<CharT>> DoReplaceStringPlaceholders( 519 T format_string, 520 const std::vector<std::basic_string<CharT>>& subst, 521 const CharT placeholder_prefix, 522 const bool should_escape_multiple_placeholder_prefixes, 523 const bool is_strict_mode, 524 std::vector<size_t>* offsets) { 525 size_t substitutions = subst.size(); 526 DCHECK_LT(substitutions, 10U); 527 528 size_t sub_length = 0; 529 for (const auto& cur : subst) { 530 sub_length += cur.length(); 531 } 532 533 std::basic_string<CharT> formatted; 534 formatted.reserve(format_string.length() + sub_length); 535 536 std::vector<ReplacementOffset> r_offsets; 537 for (auto i = format_string.begin(); i != format_string.end(); ++i) { 538 if (placeholder_prefix == *i) { 539 if (i + 1 != format_string.end()) { 540 ++i; 541 if (placeholder_prefix == *i) { 542 do { 543 formatted.push_back(placeholder_prefix); 544 ++i; 545 } while (should_escape_multiple_placeholder_prefixes && 546 i != format_string.end() && placeholder_prefix == *i); 547 --i; 548 } else { 549 if (*i < '1' || *i > '9') { 550 if (is_strict_mode) { 551 DLOG(ERROR) << "Invalid placeholder after placeholder prefix: " 552 << std::basic_string<CharT>(1, placeholder_prefix) 553 << std::basic_string<CharT>(1, *i); 554 return absl::nullopt; 555 } 556 557 continue; 558 } 559 const size_t index = static_cast<size_t>(*i - '1'); 560 if (offsets) { 561 ReplacementOffset r_offset(index, formatted.size()); 562 r_offsets.insert( 563 ranges::upper_bound(r_offsets, r_offset, &CompareParameter), 564 r_offset); 565 } 566 if (index < substitutions) { 567 formatted.append(subst.at(index)); 568 } else if (is_strict_mode) { 569 DLOG(ERROR) << "index out of range: " << index << ": " 570 << substitutions; 571 return absl::nullopt; 572 } 573 } 574 } else if (is_strict_mode) { 575 DLOG(ERROR) << "unexpected placeholder prefix at end of string"; 576 return absl::nullopt; 577 } 578 } else { 579 formatted.push_back(*i); 580 } 581 } 582 if (offsets) { 583 for (const auto& cur : r_offsets) { 584 offsets->push_back(cur.offset); 585 } 586 } 587 return formatted; 588 } 589 590 // The following code is compatible with the OpenBSD lcpy interface. See: 591 // http://www.gratisoft.us/todd/papers/strlcpy.html 592 // ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c 593 594 template <typename CHAR> 595 size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) { 596 for (size_t i = 0; i < dst_size; ++i) { 597 if ((dst[i] = src[i]) == 0) // We hit and copied the terminating NULL. 598 return i; 599 } 600 601 // We were left off at dst_size. We over copied 1 byte. Null terminate. 602 if (dst_size != 0) 603 dst[dst_size - 1] = 0; 604 605 // Count the rest of the |src|, and return it's length in characters. 606 while (src[dst_size]) 607 ++dst_size; 608 return dst_size; 609 } 610 611 } // namespace base::internal 612 613 #endif // BASE_STRINGS_STRING_UTIL_IMPL_HELPERS_H_