regexp-compiler-tonode.cc (80803B)
1 // Copyright 2019 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "irregexp/imported/regexp-compiler.h" 6 #include "irregexp/imported/regexp.h" 7 8 #ifdef V8_INTL_SUPPORT 9 #include "irregexp/imported/special-case.h" 10 #include "unicode/locid.h" 11 #include "unicode/uniset.h" 12 #include "unicode/utypes.h" 13 #endif // V8_INTL_SUPPORT 14 15 namespace v8 { 16 namespace internal { 17 18 using namespace regexp_compiler_constants; // NOLINT(build/namespaces) 19 20 constexpr base::uc32 kMaxCodePoint = 0x10ffff; 21 constexpr int kMaxUtf16CodeUnit = 0xffff; 22 constexpr uint32_t kMaxUtf16CodeUnitU = 0xffff; 23 24 // ------------------------------------------------------------------- 25 // Tree to graph conversion 26 27 RegExpNode* RegExpTree::ToNode(RegExpCompiler* compiler, 28 RegExpNode* on_success) { 29 compiler->ToNodeMaybeCheckForStackOverflow(); 30 return ToNodeImpl(compiler, on_success); 31 } 32 33 RegExpNode* RegExpAtom::ToNodeImpl(RegExpCompiler* compiler, 34 RegExpNode* on_success) { 35 ZoneList<TextElement>* elms = 36 compiler->zone()->New<ZoneList<TextElement>>(1, compiler->zone()); 37 elms->Add(TextElement::Atom(this), compiler->zone()); 38 return compiler->zone()->New<TextNode>(elms, compiler->read_backward(), 39 on_success); 40 } 41 42 RegExpNode* RegExpText::ToNodeImpl(RegExpCompiler* compiler, 43 RegExpNode* on_success) { 44 return compiler->zone()->New<TextNode>(elements(), compiler->read_backward(), 45 on_success); 46 } 47 48 namespace { 49 50 bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 51 const int* special_class, int length) { 52 length--; // Remove final marker. 53 54 DCHECK_EQ(kRangeEndMarker, special_class[length]); 55 DCHECK_NE(0, ranges->length()); 56 DCHECK_NE(0, length); 57 DCHECK_NE(0, special_class[0]); 58 59 if (ranges->length() != (length >> 1) + 1) return false; 60 61 CharacterRange range = ranges->at(0); 62 if (range.from() != 0) return false; 63 64 for (int i = 0; i < length; i += 2) { 65 if (static_cast<base::uc32>(special_class[i]) != (range.to() + 1)) { 66 return false; 67 } 68 range = ranges->at((i >> 1) + 1); 69 if (static_cast<base::uc32>(special_class[i + 1]) != range.from()) { 70 return false; 71 } 72 } 73 74 return range.to() == kMaxCodePoint; 75 } 76 77 bool CompareRanges(ZoneList<CharacterRange>* ranges, const int* special_class, 78 int length) { 79 length--; // Remove final marker. 80 81 DCHECK_EQ(kRangeEndMarker, special_class[length]); 82 if (ranges->length() * 2 != length) return false; 83 84 for (int i = 0; i < length; i += 2) { 85 CharacterRange range = ranges->at(i >> 1); 86 if (range.from() != static_cast<base::uc32>(special_class[i]) || 87 range.to() != static_cast<base::uc32>(special_class[i + 1] - 1)) { 88 return false; 89 } 90 } 91 return true; 92 } 93 94 } // namespace 95 96 bool RegExpClassRanges::is_standard(Zone* zone) { 97 // TODO(lrn): Remove need for this function, by not throwing away information 98 // along the way. 99 if (is_negated()) { 100 return false; 101 } 102 if (set_.is_standard()) { 103 return true; 104 } 105 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 106 set_.set_standard_set_type(StandardCharacterSet::kWhitespace); 107 return true; 108 } 109 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 110 set_.set_standard_set_type(StandardCharacterSet::kNotWhitespace); 111 return true; 112 } 113 if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges, 114 kLineTerminatorRangeCount)) { 115 set_.set_standard_set_type(StandardCharacterSet::kNotLineTerminator); 116 return true; 117 } 118 if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges, 119 kLineTerminatorRangeCount)) { 120 set_.set_standard_set_type(StandardCharacterSet::kLineTerminator); 121 return true; 122 } 123 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 124 set_.set_standard_set_type(StandardCharacterSet::kWord); 125 return true; 126 } 127 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 128 set_.set_standard_set_type(StandardCharacterSet::kNotWord); 129 return true; 130 } 131 return false; 132 } 133 134 UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) { 135 // The unicode range splitter categorizes given character ranges into: 136 // - Code points from the BMP representable by one code unit. 137 // - Code points outside the BMP that need to be split into 138 // surrogate pairs. 139 // - Lone lead surrogates. 140 // - Lone trail surrogates. 141 // Lone surrogates are valid code points, even though no actual characters. 142 // They require special matching to make sure we do not split surrogate pairs. 143 144 for (int i = 0; i < base->length(); i++) AddRange(base->at(i)); 145 } 146 147 void UnicodeRangeSplitter::AddRange(CharacterRange range) { 148 static constexpr base::uc32 kBmp1Start = 0; 149 static constexpr base::uc32 kBmp1End = kLeadSurrogateStart - 1; 150 static constexpr base::uc32 kBmp2Start = kTrailSurrogateEnd + 1; 151 static constexpr base::uc32 kBmp2End = kNonBmpStart - 1; 152 153 // Ends are all inclusive. 154 static_assert(kBmp1Start == 0); 155 static_assert(kBmp1Start < kBmp1End); 156 static_assert(kBmp1End + 1 == kLeadSurrogateStart); 157 static_assert(kLeadSurrogateStart < kLeadSurrogateEnd); 158 static_assert(kLeadSurrogateEnd + 1 == kTrailSurrogateStart); 159 static_assert(kTrailSurrogateStart < kTrailSurrogateEnd); 160 static_assert(kTrailSurrogateEnd + 1 == kBmp2Start); 161 static_assert(kBmp2Start < kBmp2End); 162 static_assert(kBmp2End + 1 == kNonBmpStart); 163 static_assert(kNonBmpStart < kNonBmpEnd); 164 165 static constexpr base::uc32 kStarts[] = { 166 kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart, 167 kBmp2Start, kNonBmpStart, 168 }; 169 170 static constexpr base::uc32 kEnds[] = { 171 kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd, 172 }; 173 174 CharacterRangeVector* const kTargets[] = { 175 &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_, 176 }; 177 178 static constexpr int kCount = arraysize(kStarts); 179 static_assert(kCount == arraysize(kEnds)); 180 static_assert(kCount == arraysize(kTargets)); 181 182 for (int i = 0; i < kCount; i++) { 183 if (kStarts[i] > range.to()) break; 184 const base::uc32 from = std::max(kStarts[i], range.from()); 185 const base::uc32 to = std::min(kEnds[i], range.to()); 186 if (from > to) continue; 187 kTargets[i]->emplace_back(CharacterRange::Range(from, to)); 188 } 189 } 190 191 namespace { 192 193 // Translates between new and old V8-isms (SmallVector, ZoneList). 194 ZoneList<CharacterRange>* ToCanonicalZoneList( 195 const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) { 196 if (v->empty()) return nullptr; 197 198 ZoneList<CharacterRange>* result = 199 zone->New<ZoneList<CharacterRange>>(static_cast<int>(v->size()), zone); 200 for (size_t i = 0; i < v->size(); i++) { 201 result->Add(v->at(i), zone); 202 } 203 204 CharacterRange::Canonicalize(result); 205 return result; 206 } 207 208 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result, 209 RegExpNode* on_success, UnicodeRangeSplitter* splitter) { 210 ZoneList<CharacterRange>* bmp = 211 ToCanonicalZoneList(splitter->bmp(), compiler->zone()); 212 if (bmp == nullptr) return; 213 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges( 214 compiler->zone(), bmp, compiler->read_backward(), on_success))); 215 } 216 217 using UC16Range = uint32_t; // {from, to} packed into one uint32_t. 218 constexpr UC16Range ToUC16Range(base::uc16 from, base::uc16 to) { 219 return (static_cast<uint32_t>(from) << 16) | to; 220 } 221 constexpr base::uc16 ExtractFrom(UC16Range r) { 222 return static_cast<base::uc16>(r >> 16); 223 } 224 constexpr base::uc16 ExtractTo(UC16Range r) { 225 return static_cast<base::uc16>(r); 226 } 227 228 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, 229 RegExpNode* on_success, 230 UnicodeRangeSplitter* splitter) { 231 DCHECK(!compiler->one_byte()); 232 Zone* const zone = compiler->zone(); 233 ZoneList<CharacterRange>* non_bmp = 234 ToCanonicalZoneList(splitter->non_bmp(), zone); 235 if (non_bmp == nullptr) return; 236 237 // Translate each 32-bit code point range into the corresponding 16-bit code 238 // unit representation consisting of the lead- and trail surrogate. 239 // 240 // The generated alternatives are grouped by the leading surrogate to avoid 241 // emitting excessive code. For example, for 242 // 243 // { \ud800[\udc00-\udc01] 244 // , \ud800[\udc05-\udc06] 245 // } 246 // 247 // there's no need to emit matching code for the leading surrogate \ud800 248 // twice. We also create a dedicated grouping for full trailing ranges, i.e. 249 // [dc00-dfff]. 250 ZoneUnorderedMap<UC16Range, ZoneList<CharacterRange>*> grouped_by_leading( 251 zone); 252 ZoneList<CharacterRange>* leading_with_full_trailing_range = 253 zone->New<ZoneList<CharacterRange>>(1, zone); 254 const auto AddRange = [&](base::uc16 from_l, base::uc16 to_l, 255 base::uc16 from_t, base::uc16 to_t) { 256 const UC16Range leading_range = ToUC16Range(from_l, to_l); 257 if (grouped_by_leading.count(leading_range) == 0) { 258 if (from_t == kTrailSurrogateStart && to_t == kTrailSurrogateEnd) { 259 leading_with_full_trailing_range->Add( 260 CharacterRange::Range(from_l, to_l), zone); 261 return; 262 } 263 grouped_by_leading[leading_range] = 264 zone->New<ZoneList<CharacterRange>>(2, zone); 265 } 266 grouped_by_leading[leading_range]->Add(CharacterRange::Range(from_t, to_t), 267 zone); 268 }; 269 270 // First, create the grouped ranges. 271 CharacterRange::Canonicalize(non_bmp); 272 for (int i = 0; i < non_bmp->length(); i++) { 273 // Match surrogate pair. 274 // E.g. [\u10005-\u11005] becomes 275 // \ud800[\udc05-\udfff]| 276 // [\ud801-\ud803][\udc00-\udfff]| 277 // \ud804[\udc00-\udc05] 278 base::uc32 from = non_bmp->at(i).from(); 279 base::uc32 to = non_bmp->at(i).to(); 280 base::uc16 from_l = unibrow::Utf16::LeadSurrogate(from); 281 base::uc16 from_t = unibrow::Utf16::TrailSurrogate(from); 282 base::uc16 to_l = unibrow::Utf16::LeadSurrogate(to); 283 base::uc16 to_t = unibrow::Utf16::TrailSurrogate(to); 284 285 if (from_l == to_l) { 286 // The lead surrogate is the same. 287 AddRange(from_l, to_l, from_t, to_t); 288 continue; 289 } 290 291 if (from_t != kTrailSurrogateStart) { 292 // Add [from_l][from_t-\udfff]. 293 AddRange(from_l, from_l, from_t, kTrailSurrogateEnd); 294 from_l++; 295 } 296 if (to_t != kTrailSurrogateEnd) { 297 // Add [to_l][\udc00-to_t]. 298 AddRange(to_l, to_l, kTrailSurrogateStart, to_t); 299 to_l--; 300 } 301 if (from_l <= to_l) { 302 // Add [from_l-to_l][\udc00-\udfff]. 303 AddRange(from_l, to_l, kTrailSurrogateStart, kTrailSurrogateEnd); 304 } 305 } 306 307 // Create the actual TextNode now that ranges are fully grouped. 308 if (!leading_with_full_trailing_range->is_empty()) { 309 CharacterRange::Canonicalize(leading_with_full_trailing_range); 310 result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair( 311 zone, leading_with_full_trailing_range, 312 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), 313 compiler->read_backward(), on_success))); 314 } 315 for (const auto& it : grouped_by_leading) { 316 CharacterRange leading_range = 317 CharacterRange::Range(ExtractFrom(it.first), ExtractTo(it.first)); 318 ZoneList<CharacterRange>* trailing_ranges = it.second; 319 CharacterRange::Canonicalize(trailing_ranges); 320 result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair( 321 zone, leading_range, trailing_ranges, compiler->read_backward(), 322 on_success))); 323 } 324 } 325 326 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch( 327 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind, 328 ZoneList<CharacterRange>* match, RegExpNode* on_success, 329 bool read_backward) { 330 Zone* zone = compiler->zone(); 331 RegExpNode* match_node = TextNode::CreateForCharacterRanges( 332 zone, match, read_backward, on_success); 333 int stack_register = compiler->UnicodeLookaroundStackRegister(); 334 int position_register = compiler->UnicodeLookaroundPositionRegister(); 335 RegExpLookaround::Builder lookaround(false, match_node, stack_register, 336 position_register); 337 RegExpNode* negative_match = TextNode::CreateForCharacterRanges( 338 zone, lookbehind, !read_backward, lookaround.on_match_success()); 339 return lookaround.ForMatch(negative_match); 340 } 341 342 RegExpNode* MatchAndNegativeLookaroundInReadDirection( 343 RegExpCompiler* compiler, ZoneList<CharacterRange>* match, 344 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success, 345 bool read_backward) { 346 Zone* zone = compiler->zone(); 347 int stack_register = compiler->UnicodeLookaroundStackRegister(); 348 int position_register = compiler->UnicodeLookaroundPositionRegister(); 349 RegExpLookaround::Builder lookaround(false, on_success, stack_register, 350 position_register); 351 RegExpNode* negative_match = TextNode::CreateForCharacterRanges( 352 zone, lookahead, read_backward, lookaround.on_match_success()); 353 return TextNode::CreateForCharacterRanges( 354 zone, match, read_backward, lookaround.ForMatch(negative_match)); 355 } 356 357 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 358 RegExpNode* on_success, 359 UnicodeRangeSplitter* splitter) { 360 ZoneList<CharacterRange>* lead_surrogates = 361 ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone()); 362 if (lead_surrogates == nullptr) return; 363 Zone* zone = compiler->zone(); 364 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]). 365 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( 366 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); 367 368 RegExpNode* match; 369 if (compiler->read_backward()) { 370 // Reading backward. Assert that reading forward, there is no trail 371 // surrogate, and then backward match the lead surrogate. 372 match = NegativeLookaroundAgainstReadDirectionAndMatch( 373 compiler, trail_surrogates, lead_surrogates, on_success, true); 374 } else { 375 // Reading forward. Forward match the lead surrogate and assert that 376 // no trail surrogate follows. 377 match = MatchAndNegativeLookaroundInReadDirection( 378 compiler, lead_surrogates, trail_surrogates, on_success, false); 379 } 380 result->AddAlternative(GuardedAlternative(match)); 381 } 382 383 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 384 RegExpNode* on_success, 385 UnicodeRangeSplitter* splitter) { 386 ZoneList<CharacterRange>* trail_surrogates = 387 ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone()); 388 if (trail_surrogates == nullptr) return; 389 Zone* zone = compiler->zone(); 390 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01 391 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( 392 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); 393 394 RegExpNode* match; 395 if (compiler->read_backward()) { 396 // Reading backward. Backward match the trail surrogate and assert that no 397 // lead surrogate precedes it. 398 match = MatchAndNegativeLookaroundInReadDirection( 399 compiler, trail_surrogates, lead_surrogates, on_success, true); 400 } else { 401 // Reading forward. Assert that reading backward, there is no lead 402 // surrogate, and then forward match the trail surrogate. 403 match = NegativeLookaroundAgainstReadDirectionAndMatch( 404 compiler, lead_surrogates, trail_surrogates, on_success, false); 405 } 406 result->AddAlternative(GuardedAlternative(match)); 407 } 408 409 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, 410 RegExpNode* on_success) { 411 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex. 412 DCHECK(!compiler->read_backward()); 413 Zone* zone = compiler->zone(); 414 // Advance any character. If the character happens to be a lead surrogate and 415 // we advanced into the middle of a surrogate pair, it will work out, as 416 // nothing will match from there. We will have to advance again, consuming 417 // the associated trail surrogate. 418 ZoneList<CharacterRange>* range = 419 CharacterRange::List(zone, CharacterRange::Range(0, kMaxUtf16CodeUnit)); 420 return TextNode::CreateForCharacterRanges(zone, range, false, on_success); 421 } 422 423 } // namespace 424 425 // static 426 // Only for /ui and /vi, not for /i regexps. 427 void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, 428 Zone* zone) { 429 #ifdef V8_INTL_SUPPORT 430 DCHECK(IsCanonical(ranges)); 431 432 // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver. 433 // See also https://crbug.com/v8/6727. 434 // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range, 435 // which we use frequently internally. But large ranges can also easily be 436 // created by the user. We might want to have a more general caching mechanism 437 // for such ranges. 438 if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return; 439 440 // Use ICU to compute the case fold closure over the ranges. 441 icu::UnicodeSet set; 442 for (int i = 0; i < ranges->length(); i++) { 443 set.add(ranges->at(i).from(), ranges->at(i).to()); 444 } 445 // Clear the ranges list without freeing the backing store. 446 ranges->Rewind(0); 447 set.closeOver(USET_SIMPLE_CASE_INSENSITIVE); 448 for (int i = 0; i < set.getRangeCount(); i++) { 449 ranges->Add(Range(set.getRangeStart(i), set.getRangeEnd(i)), zone); 450 } 451 // No errors and everything we collected have been ranges. 452 Canonicalize(ranges); 453 #endif // V8_INTL_SUPPORT 454 } 455 456 RegExpNode* RegExpClassRanges::ToNodeImpl(RegExpCompiler* compiler, 457 RegExpNode* on_success) { 458 set_.Canonicalize(); 459 Zone* const zone = compiler->zone(); 460 ZoneList<CharacterRange>* ranges = this->ranges(zone); 461 462 const bool needs_case_folding = 463 NeedsUnicodeCaseEquivalents(compiler->flags()) && !is_case_folded(); 464 if (needs_case_folding) { 465 CharacterRange::AddUnicodeCaseEquivalents(ranges, zone); 466 } 467 468 if (!IsEitherUnicode(compiler->flags()) || compiler->one_byte() || 469 contains_split_surrogate()) { 470 return zone->New<TextNode>(this, compiler->read_backward(), on_success); 471 } 472 473 if (is_negated()) { 474 // With /v, character classes are never negated. 475 // https://tc39.es/ecma262/#sec-compileatom 476 // Atom :: CharacterClass 477 // 4. Assert: cc.[[Invert]] is false. 478 // Instead the complement is created when evaluating the class set. 479 // The only exception is the "nothing range" (negated everything), which is 480 // internally created for an empty set. 481 DCHECK_IMPLIES( 482 IsUnicodeSets(compiler->flags()), 483 ranges->length() == 1 && ranges->first().IsEverything(kMaxCodePoint)); 484 ZoneList<CharacterRange>* negated = 485 zone->New<ZoneList<CharacterRange>>(2, zone); 486 CharacterRange::Negate(ranges, negated, zone); 487 ranges = negated; 488 } 489 490 if (ranges->length() == 0) { 491 // The empty character class is used as a 'fail' node. 492 RegExpClassRanges* fail = zone->New<RegExpClassRanges>(zone, ranges); 493 return zone->New<TextNode>(fail, compiler->read_backward(), on_success); 494 } 495 496 if (set_.is_standard() && 497 standard_type() == StandardCharacterSet::kEverything) { 498 return UnanchoredAdvance(compiler, on_success); 499 } 500 501 // Split ranges in order to handle surrogates correctly: 502 // - Surrogate pairs: translate the 32-bit code point into two uc16 code 503 // units (irregexp operates only on code units). 504 // - Lone surrogates: these require lookarounds to ensure we don't match in 505 // the middle of a surrogate pair. 506 ChoiceNode* result = zone->New<ChoiceNode>(2, zone); 507 UnicodeRangeSplitter splitter(ranges); 508 AddBmpCharacters(compiler, result, on_success, &splitter); 509 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); 510 AddLoneLeadSurrogates(compiler, result, on_success, &splitter); 511 AddLoneTrailSurrogates(compiler, result, on_success, &splitter); 512 513 static constexpr int kMaxRangesToInline = 32; // Arbitrary. 514 if (ranges->length() > kMaxRangesToInline) result->SetDoNotInline(); 515 516 if (result->alternatives()->length() == 1) { 517 return result->alternatives()->at(0).node(); 518 } 519 520 return result; 521 } 522 523 RegExpNode* RegExpClassSetOperand::ToNodeImpl(RegExpCompiler* compiler, 524 RegExpNode* on_success) { 525 Zone* zone = compiler->zone(); 526 const int size = (has_strings() ? static_cast<int>(strings()->size()) : 0) + 527 (ranges()->is_empty() ? 0 : 1); 528 if (size == 0) { 529 // If neither ranges nor strings are present, the operand is equal to an 530 // empty range (matching nothing). 531 ZoneList<CharacterRange>* empty = 532 zone->template New<ZoneList<CharacterRange>>(0, zone); 533 return zone->template New<RegExpClassRanges>(zone, empty) 534 ->ToNode(compiler, on_success); 535 } 536 ZoneList<RegExpTree*>* alternatives = 537 zone->template New<ZoneList<RegExpTree*>>(size, zone); 538 // Strings are sorted by length first (larger strings before shorter ones). 539 // See the comment on CharacterClassStrings. 540 // Empty strings (if present) are added after character ranges. 541 RegExpTree* empty_string = nullptr; 542 if (has_strings()) { 543 for (auto string : *strings()) { 544 if (string.second->IsEmpty()) { 545 empty_string = string.second; 546 } else { 547 alternatives->Add(string.second, zone); 548 } 549 } 550 } 551 if (!ranges()->is_empty()) { 552 // In unicode sets mode case folding has to be done at precise locations 553 // (e.g. before building complements). 554 // It is therefore the parsers responsibility to case fold (sub-) ranges 555 // before creating ClassSetOperands. 556 alternatives->Add(zone->template New<RegExpClassRanges>( 557 zone, ranges(), RegExpClassRanges::IS_CASE_FOLDED), 558 zone); 559 } 560 if (empty_string != nullptr) { 561 alternatives->Add(empty_string, zone); 562 } 563 564 RegExpTree* node = nullptr; 565 if (size == 1) { 566 DCHECK_EQ(alternatives->length(), 1); 567 node = alternatives->first(); 568 } else { 569 node = zone->template New<RegExpDisjunction>(alternatives); 570 } 571 return node->ToNode(compiler, on_success); 572 } 573 574 RegExpNode* RegExpClassSetExpression::ToNodeImpl(RegExpCompiler* compiler, 575 RegExpNode* on_success) { 576 Zone* zone = compiler->zone(); 577 ZoneList<CharacterRange>* temp_ranges = 578 zone->template New<ZoneList<CharacterRange>>(4, zone); 579 RegExpClassSetOperand* root = ComputeExpression(this, temp_ranges, zone); 580 return root->ToNode(compiler, on_success); 581 } 582 583 void RegExpClassSetOperand::Union(RegExpClassSetOperand* other, Zone* zone) { 584 ranges()->AddAll(*other->ranges(), zone); 585 if (other->has_strings()) { 586 if (strings_ == nullptr) { 587 strings_ = zone->template New<CharacterClassStrings>(zone); 588 } 589 strings()->insert(other->strings()->begin(), other->strings()->end()); 590 } 591 } 592 593 void RegExpClassSetOperand::Intersect(RegExpClassSetOperand* other, 594 ZoneList<CharacterRange>* temp_ranges, 595 Zone* zone) { 596 CharacterRange::Intersect(ranges(), other->ranges(), temp_ranges, zone); 597 std::swap(*ranges(), *temp_ranges); 598 temp_ranges->Rewind(0); 599 if (has_strings()) { 600 if (!other->has_strings()) { 601 strings()->clear(); 602 } else { 603 for (auto iter = strings()->begin(); iter != strings()->end();) { 604 if (other->strings()->find(iter->first) == other->strings()->end()) { 605 iter = strings()->erase(iter); 606 } else { 607 iter++; 608 } 609 } 610 } 611 } 612 } 613 614 void RegExpClassSetOperand::Subtract(RegExpClassSetOperand* other, 615 ZoneList<CharacterRange>* temp_ranges, 616 Zone* zone) { 617 CharacterRange::Subtract(ranges(), other->ranges(), temp_ranges, zone); 618 std::swap(*ranges(), *temp_ranges); 619 temp_ranges->Rewind(0); 620 if (has_strings() && other->has_strings()) { 621 for (auto iter = strings()->begin(); iter != strings()->end();) { 622 if (other->strings()->find(iter->first) != other->strings()->end()) { 623 iter = strings()->erase(iter); 624 } else { 625 iter++; 626 } 627 } 628 } 629 } 630 631 // static 632 RegExpClassSetOperand* RegExpClassSetExpression::ComputeExpression( 633 RegExpTree* root, ZoneList<CharacterRange>* temp_ranges, Zone* zone) { 634 DCHECK(temp_ranges->is_empty()); 635 if (root->IsClassSetOperand()) { 636 return root->AsClassSetOperand(); 637 } 638 DCHECK(root->IsClassSetExpression()); 639 RegExpClassSetExpression* node = root->AsClassSetExpression(); 640 RegExpClassSetOperand* result = 641 ComputeExpression(node->operands()->at(0), temp_ranges, zone); 642 switch (node->operation()) { 643 case OperationType::kUnion: { 644 for (int i = 1; i < node->operands()->length(); i++) { 645 RegExpClassSetOperand* op = 646 ComputeExpression(node->operands()->at(i), temp_ranges, zone); 647 result->Union(op, zone); 648 } 649 CharacterRange::Canonicalize(result->ranges()); 650 break; 651 } 652 case OperationType::kIntersection: { 653 for (int i = 1; i < node->operands()->length(); i++) { 654 RegExpClassSetOperand* op = 655 ComputeExpression(node->operands()->at(i), temp_ranges, zone); 656 result->Intersect(op, temp_ranges, zone); 657 } 658 break; 659 } 660 case OperationType::kSubtraction: { 661 for (int i = 1; i < node->operands()->length(); i++) { 662 RegExpClassSetOperand* op = 663 ComputeExpression(node->operands()->at(i), temp_ranges, zone); 664 result->Subtract(op, temp_ranges, zone); 665 } 666 break; 667 } 668 } 669 if (node->is_negated()) { 670 DCHECK(!result->has_strings()); 671 CharacterRange::Negate(result->ranges(), temp_ranges, zone); 672 std::swap(*result->ranges(), *temp_ranges); 673 temp_ranges->Rewind(0); 674 node->is_negated_ = false; 675 } 676 // Store the result as single operand of the current node. 677 node->operands()->Set(0, result); 678 node->operands()->Rewind(1); 679 680 return result; 681 } 682 683 namespace { 684 685 bool StartsWithAtom(RegExpTree* tree) { 686 if (tree->IsAtom()) return true; 687 return tree->IsText() && tree->AsText()->StartsWithAtom(); 688 } 689 690 RegExpAtom* FirstAtom(RegExpTree* tree) { 691 if (tree->IsAtom()) return tree->AsAtom(); 692 return tree->AsText()->FirstAtom(); 693 } 694 695 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { 696 RegExpAtom* atom1 = FirstAtom(*a); 697 RegExpAtom* atom2 = FirstAtom(*b); 698 base::uc16 character1 = atom1->data().at(0); 699 base::uc16 character2 = atom2->data().at(0); 700 if (character1 < character2) return -1; 701 if (character1 > character2) return 1; 702 return 0; 703 } 704 705 #ifdef V8_INTL_SUPPORT 706 707 int CompareCaseInsensitive(const icu::UnicodeString& a, 708 const icu::UnicodeString& b) { 709 return a.caseCompare(b, U_FOLD_CASE_DEFAULT); 710 } 711 712 int CompareFirstCharCaseInsensitive(RegExpTree* const* a, 713 RegExpTree* const* b) { 714 RegExpAtom* atom1 = FirstAtom(*a); 715 RegExpAtom* atom2 = FirstAtom(*b); 716 return CompareCaseInsensitive(icu::UnicodeString{atom1->data().at(0)}, 717 icu::UnicodeString{atom2->data().at(0)}); 718 } 719 720 bool Equals(bool ignore_case, const icu::UnicodeString& a, 721 const icu::UnicodeString& b) { 722 if (a == b) return true; 723 if (ignore_case) return CompareCaseInsensitive(a, b) == 0; 724 return false; // Case-sensitive equality already checked above. 725 } 726 727 bool CharAtEquals(bool ignore_case, int index, const RegExpAtom* a, 728 const RegExpAtom* b) { 729 return Equals(ignore_case, a->data().at(index), b->data().at(index)); 730 } 731 732 #else 733 734 unibrow::uchar Canonical( 735 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 736 unibrow::uchar c) { 737 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth]; 738 int length = canonicalize->get(c, '\0', chars); 739 DCHECK_LE(length, 1); 740 unibrow::uchar canonical = c; 741 if (length == 1) canonical = chars[0]; 742 return canonical; 743 } 744 745 int CompareCaseInsensitive( 746 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 747 unibrow::uchar a, unibrow::uchar b) { 748 if (a == b) return 0; 749 if (a >= 'a' || b >= 'a') { 750 a = Canonical(canonicalize, a); 751 b = Canonical(canonicalize, b); 752 } 753 return static_cast<int>(a) - static_cast<int>(b); 754 } 755 756 int CompareFirstCharCaseInsensitive( 757 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 758 RegExpTree* const* a, RegExpTree* const* b) { 759 RegExpAtom* atom1 = FirstAtom(*a); 760 RegExpAtom* atom2 = FirstAtom(*b); 761 return CompareCaseInsensitive(canonicalize, atom1->data().at(0), 762 atom2->data().at(0)); 763 } 764 765 bool Equals(bool ignore_case, 766 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 767 unibrow::uchar a, unibrow::uchar b) { 768 if (a == b) return true; 769 if (ignore_case) { 770 return CompareCaseInsensitive(canonicalize, a, b) == 0; 771 } 772 return false; // Case-sensitive equality already checked above. 773 } 774 775 bool CharAtEquals(bool ignore_case, 776 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 777 int index, const RegExpAtom* a, const RegExpAtom* b) { 778 return Equals(ignore_case, canonicalize, a->data().at(index), 779 b->data().at(index)); 780 } 781 782 #endif // V8_INTL_SUPPORT 783 784 } // namespace 785 786 // We can stable sort runs of atoms, since the order does not matter if they 787 // start with different characters. 788 // Returns true if any consecutive atoms were found. 789 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { 790 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 791 int length = alternatives->length(); 792 bool found_consecutive_atoms = false; 793 for (int i = 0; i < length; i++) { 794 while (i < length) { 795 RegExpTree* alternative = alternatives->at(i); 796 if (StartsWithAtom(alternative)) break; 797 i++; 798 } 799 // i is length or it is the index of an atom. 800 if (i == length) break; 801 int first_atom = i; 802 i++; 803 while (i < length) { 804 RegExpTree* alternative = alternatives->at(i); 805 if (!StartsWithAtom(alternative)) break; 806 i++; 807 } 808 // Sort atoms to get ones with common prefixes together. 809 // This step is more tricky if we are in a case-independent regexp, 810 // because it would change /is|I/ to /I|is/, and order matters when 811 // the regexp parts don't match only disjoint starting points. To fix 812 // this we have a version of CompareFirstChar that uses case- 813 // independent character classes for comparison. 814 DCHECK_LT(first_atom, alternatives->length()); 815 DCHECK_LE(i, alternatives->length()); 816 DCHECK_LE(first_atom, i); 817 if (IsIgnoreCase(compiler->flags())) { 818 #ifdef V8_INTL_SUPPORT 819 alternatives->StableSort(CompareFirstCharCaseInsensitive, first_atom, 820 i - first_atom); 821 #else 822 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = 823 compiler->isolate()->regexp_macro_assembler_canonicalize(); 824 auto compare_closure = [canonicalize](RegExpTree* const* a, 825 RegExpTree* const* b) { 826 return CompareFirstCharCaseInsensitive(canonicalize, a, b); 827 }; 828 alternatives->StableSort(compare_closure, first_atom, i - first_atom); 829 #endif // V8_INTL_SUPPORT 830 } else { 831 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); 832 } 833 if (i - first_atom > 1) found_consecutive_atoms = true; 834 } 835 return found_consecutive_atoms; 836 } 837 838 // Optimizes ab|ac|az to a(?:b|c|d). 839 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { 840 Zone* zone = compiler->zone(); 841 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 842 int length = alternatives->length(); 843 const bool ignore_case = IsIgnoreCase(compiler->flags()); 844 845 int write_posn = 0; 846 int i = 0; 847 while (i < length) { 848 RegExpTree* alternative = alternatives->at(i); 849 if (!StartsWithAtom(alternative)) { 850 alternatives->at(write_posn++) = alternatives->at(i); 851 i++; 852 continue; 853 } 854 RegExpAtom* const atom = FirstAtom(alternative); 855 856 #ifdef V8_INTL_SUPPORT 857 icu::UnicodeString common_prefix(atom->data().at(0)); 858 #else 859 unibrow::Mapping<unibrow::Ecma262Canonicalize>* const canonicalize = 860 compiler->isolate()->regexp_macro_assembler_canonicalize(); 861 unibrow::uchar common_prefix = atom->data().at(0); 862 if (ignore_case) { 863 common_prefix = Canonical(canonicalize, common_prefix); 864 } 865 #endif // V8_INTL_SUPPORT 866 int first_with_prefix = i; 867 int prefix_length = atom->length(); 868 i++; 869 while (i < length) { 870 alternative = alternatives->at(i); 871 if (!StartsWithAtom(alternative)) break; 872 RegExpAtom* const alt_atom = FirstAtom(alternative); 873 #ifdef V8_INTL_SUPPORT 874 icu::UnicodeString new_prefix(alt_atom->data().at(0)); 875 if (!Equals(ignore_case, new_prefix, common_prefix)) break; 876 #else 877 unibrow::uchar new_prefix = alt_atom->data().at(0); 878 if (!Equals(ignore_case, canonicalize, new_prefix, common_prefix)) break; 879 #endif // V8_INTL_SUPPORT 880 prefix_length = std::min(prefix_length, alt_atom->length()); 881 i++; 882 } 883 if (i > first_with_prefix + 2) { 884 // Found worthwhile run of alternatives with common prefix of at least one 885 // character. The sorting function above did not sort on more than one 886 // character for reasons of correctness, but there may still be a longer 887 // common prefix if the terms were similar or presorted in the input. 888 // Find out how long the common prefix is. 889 int run_length = i - first_with_prefix; 890 RegExpAtom* const alt_atom = 891 FirstAtom(alternatives->at(first_with_prefix)); 892 alternatives->at(first_with_prefix)->AsAtom(); 893 for (int j = 1; j < run_length && prefix_length > 1; j++) { 894 RegExpAtom* old_atom = 895 FirstAtom(alternatives->at(j + first_with_prefix)); 896 for (int k = 1; k < prefix_length; k++) { 897 #ifdef V8_INTL_SUPPORT 898 if (!CharAtEquals(ignore_case, k, alt_atom, old_atom)) { 899 #else 900 if (!CharAtEquals(ignore_case, canonicalize, k, alt_atom, old_atom)) { 901 #endif // V8_INTL_SUPPORT 902 prefix_length = k; 903 break; 904 } 905 } 906 } 907 RegExpAtom* prefix = 908 zone->New<RegExpAtom>(alt_atom->data().SubVector(0, prefix_length)); 909 ZoneList<RegExpTree*>* pair = zone->New<ZoneList<RegExpTree*>>(2, zone); 910 pair->Add(prefix, zone); 911 ZoneList<RegExpTree*>* suffixes = 912 zone->New<ZoneList<RegExpTree*>>(run_length, zone); 913 for (int j = 0; j < run_length; j++) { 914 if (alternatives->at(j + first_with_prefix)->IsAtom()) { 915 RegExpAtom* old_atom = 916 alternatives->at(j + first_with_prefix)->AsAtom(); 917 int len = old_atom->length(); 918 if (len == prefix_length) { 919 suffixes->Add(zone->New<RegExpEmpty>(), zone); 920 } else { 921 RegExpTree* suffix = zone->New<RegExpAtom>( 922 old_atom->data().SubVector(prefix_length, len)); 923 suffixes->Add(suffix, zone); 924 } 925 } else { 926 RegExpText* new_text = zone->New<RegExpText>(zone); 927 RegExpText* old_text = 928 alternatives->at(j + first_with_prefix)->AsText(); 929 RegExpAtom* old_atom = old_text->FirstAtom(); 930 int len = old_atom->length(); 931 if (len != prefix_length) { 932 RegExpAtom* suffix = zone->New<RegExpAtom>( 933 old_atom->data().SubVector(prefix_length, len)); 934 new_text->AddElement(TextElement::Atom(suffix), zone); 935 } 936 for (int k = 1; k < old_text->elements()->length(); k++) { 937 new_text->AddElement(old_text->elements()->at(k), zone); 938 } 939 if (new_text->elements()->length() != 0) { 940 suffixes->Add(new_text, zone); 941 } else { 942 suffixes->Add(zone->New<RegExpEmpty>(), zone); 943 } 944 } 945 } 946 pair->Add(zone->New<RegExpDisjunction>(suffixes), zone); 947 alternatives->at(write_posn++) = zone->New<RegExpAlternative>(pair); 948 } else { 949 // Just copy any non-worthwhile alternatives. 950 for (int j = first_with_prefix; j < i; j++) { 951 alternatives->at(write_posn++) = alternatives->at(j); 952 } 953 } 954 } 955 alternatives->Rewind(write_posn); // Trim end of array. 956 } 957 958 // Optimizes b|c|z to [bcz]. 959 void RegExpDisjunction::FixSingleCharacterDisjunctions( 960 RegExpCompiler* compiler) { 961 Zone* zone = compiler->zone(); 962 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 963 int length = alternatives->length(); 964 965 int write_posn = 0; 966 int i = 0; 967 while (i < length) { 968 RegExpTree* alternative = alternatives->at(i); 969 if (!alternative->IsAtom()) { 970 alternatives->at(write_posn++) = alternatives->at(i); 971 i++; 972 continue; 973 } 974 RegExpAtom* const atom = alternative->AsAtom(); 975 if (atom->length() != 1) { 976 alternatives->at(write_posn++) = alternatives->at(i); 977 i++; 978 continue; 979 } 980 const RegExpFlags flags = compiler->flags(); 981 DCHECK_IMPLIES(IsEitherUnicode(flags), 982 !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); 983 bool contains_trail_surrogate = 984 unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); 985 int first_in_run = i; 986 i++; 987 // Find a run of single-character atom alternatives that have identical 988 // flags (case independence and unicode-ness). 989 while (i < length) { 990 alternative = alternatives->at(i); 991 if (!alternative->IsAtom()) break; 992 RegExpAtom* const alt_atom = alternative->AsAtom(); 993 if (alt_atom->length() != 1) break; 994 DCHECK_IMPLIES(IsEitherUnicode(flags), 995 !unibrow::Utf16::IsLeadSurrogate(alt_atom->data().at(0))); 996 contains_trail_surrogate |= 997 unibrow::Utf16::IsTrailSurrogate(alt_atom->data().at(0)); 998 i++; 999 } 1000 if (i > first_in_run + 1) { 1001 // Found non-trivial run of single-character alternatives. 1002 int run_length = i - first_in_run; 1003 ZoneList<CharacterRange>* ranges = 1004 zone->New<ZoneList<CharacterRange>>(2, zone); 1005 for (int j = 0; j < run_length; j++) { 1006 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom(); 1007 DCHECK_EQ(old_atom->length(), 1); 1008 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); 1009 } 1010 RegExpClassRanges::ClassRangesFlags class_ranges_flags; 1011 if (IsEitherUnicode(flags) && contains_trail_surrogate) { 1012 class_ranges_flags = RegExpClassRanges::CONTAINS_SPLIT_SURROGATE; 1013 } 1014 alternatives->at(write_posn++) = 1015 zone->New<RegExpClassRanges>(zone, ranges, class_ranges_flags); 1016 } else { 1017 // Just copy any trivial alternatives. 1018 for (int j = first_in_run; j < i; j++) { 1019 alternatives->at(write_posn++) = alternatives->at(j); 1020 } 1021 } 1022 } 1023 alternatives->Rewind(write_posn); // Trim end of array. 1024 } 1025 1026 RegExpNode* RegExpDisjunction::ToNodeImpl(RegExpCompiler* compiler, 1027 RegExpNode* on_success) { 1028 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 1029 1030 if (alternatives->length() > 2) { 1031 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler); 1032 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler); 1033 FixSingleCharacterDisjunctions(compiler); 1034 if (alternatives->length() == 1) { 1035 return alternatives->at(0)->ToNode(compiler, on_success); 1036 } 1037 } 1038 1039 int length = alternatives->length(); 1040 1041 ChoiceNode* result = 1042 compiler->zone()->New<ChoiceNode>(length, compiler->zone()); 1043 for (int i = 0; i < length; i++) { 1044 GuardedAlternative alternative( 1045 alternatives->at(i)->ToNode(compiler, on_success)); 1046 result->AddAlternative(alternative); 1047 } 1048 return result; 1049 } 1050 1051 RegExpNode* RegExpQuantifier::ToNodeImpl(RegExpCompiler* compiler, 1052 RegExpNode* on_success) { 1053 return ToNode(min(), max(), is_greedy(), body(), compiler, on_success); 1054 } 1055 1056 namespace { 1057 // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and 1058 // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W) 1059 RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, 1060 RegExpNode* on_success, 1061 RegExpAssertion::Type type) { 1062 CHECK(NeedsUnicodeCaseEquivalents(compiler->flags())); 1063 Zone* zone = compiler->zone(); 1064 ZoneList<CharacterRange>* word_range = 1065 zone->New<ZoneList<CharacterRange>>(2, zone); 1066 CharacterRange::AddClassEscape(StandardCharacterSet::kWord, word_range, true, 1067 zone); 1068 int stack_register = compiler->UnicodeLookaroundStackRegister(); 1069 int position_register = compiler->UnicodeLookaroundPositionRegister(); 1070 ChoiceNode* result = zone->New<ChoiceNode>(2, zone); 1071 // Add two choices. The (non-)boundary could start with a word or 1072 // a non-word-character. 1073 for (int i = 0; i < 2; i++) { 1074 bool lookbehind_for_word = i == 0; 1075 bool lookahead_for_word = 1076 (type == RegExpAssertion::Type::BOUNDARY) ^ lookbehind_for_word; 1077 // Look to the left. 1078 RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success, 1079 stack_register, position_register); 1080 RegExpNode* backward = TextNode::CreateForCharacterRanges( 1081 zone, word_range, true, lookbehind.on_match_success()); 1082 // Look to the right. 1083 RegExpLookaround::Builder lookahead(lookahead_for_word, 1084 lookbehind.ForMatch(backward), 1085 stack_register, position_register); 1086 RegExpNode* forward = TextNode::CreateForCharacterRanges( 1087 zone, word_range, false, lookahead.on_match_success()); 1088 result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward))); 1089 } 1090 return result; 1091 } 1092 } // anonymous namespace 1093 1094 RegExpNode* RegExpAssertion::ToNodeImpl(RegExpCompiler* compiler, 1095 RegExpNode* on_success) { 1096 NodeInfo info; 1097 Zone* zone = compiler->zone(); 1098 1099 switch (assertion_type()) { 1100 case Type::START_OF_LINE: 1101 return AssertionNode::AfterNewline(on_success); 1102 case Type::START_OF_INPUT: 1103 return AssertionNode::AtStart(on_success); 1104 case Type::BOUNDARY: 1105 return NeedsUnicodeCaseEquivalents(compiler->flags()) 1106 ? BoundaryAssertionAsLookaround(compiler, on_success, 1107 Type::BOUNDARY) 1108 : AssertionNode::AtBoundary(on_success); 1109 case Type::NON_BOUNDARY: 1110 return NeedsUnicodeCaseEquivalents(compiler->flags()) 1111 ? BoundaryAssertionAsLookaround(compiler, on_success, 1112 Type::NON_BOUNDARY) 1113 : AssertionNode::AtNonBoundary(on_success); 1114 case Type::END_OF_INPUT: 1115 return AssertionNode::AtEnd(on_success); 1116 case Type::END_OF_LINE: { 1117 // Compile $ in multiline regexps as an alternation with a positive 1118 // lookahead in one side and an end-of-input on the other side. 1119 // We need two registers for the lookahead. 1120 int stack_pointer_register = compiler->AllocateRegister(); 1121 int position_register = compiler->AllocateRegister(); 1122 // The ChoiceNode to distinguish between a newline and end-of-input. 1123 ChoiceNode* result = zone->New<ChoiceNode>(2, zone); 1124 // Create a newline atom. 1125 ZoneList<CharacterRange>* newline_ranges = 1126 zone->New<ZoneList<CharacterRange>>(3, zone); 1127 CharacterRange::AddClassEscape(StandardCharacterSet::kLineTerminator, 1128 newline_ranges, false, zone); 1129 RegExpClassRanges* newline_atom = 1130 zone->New<RegExpClassRanges>(StandardCharacterSet::kLineTerminator); 1131 ActionNode* submatch_success = ActionNode::PositiveSubmatchSuccess( 1132 stack_pointer_register, position_register, 1133 0, // No captures inside. 1134 -1, // Ignored if no captures. 1135 on_success); 1136 TextNode* newline_matcher = 1137 zone->New<TextNode>(newline_atom, false, submatch_success); 1138 // Create an end-of-input matcher. 1139 RegExpNode* end_of_line = ActionNode::BeginPositiveSubmatch( 1140 stack_pointer_register, position_register, newline_matcher, 1141 submatch_success); 1142 // Add the two alternatives to the ChoiceNode. 1143 GuardedAlternative eol_alternative(end_of_line); 1144 result->AddAlternative(eol_alternative); 1145 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); 1146 result->AddAlternative(end_alternative); 1147 return result; 1148 } 1149 default: 1150 UNREACHABLE(); 1151 } 1152 } 1153 1154 RegExpNode* RegExpBackReference::ToNodeImpl(RegExpCompiler* compiler, 1155 RegExpNode* on_success) { 1156 RegExpNode* backref_node = on_success; 1157 // Only one of the captures in the list can actually match. Since 1158 // back-references to unmatched captures are treated as empty, we can simply 1159 // create back-references to all possible captures. 1160 for (auto capture : *captures()) { 1161 backref_node = compiler->zone()->New<BackReferenceNode>( 1162 RegExpCapture::StartRegister(capture->index()), 1163 RegExpCapture::EndRegister(capture->index()), compiler->read_backward(), 1164 backref_node); 1165 } 1166 return backref_node; 1167 } 1168 1169 RegExpNode* RegExpEmpty::ToNodeImpl(RegExpCompiler* compiler, 1170 RegExpNode* on_success) { 1171 return on_success; 1172 } 1173 1174 namespace { 1175 1176 class V8_NODISCARD ModifiersScope { 1177 public: 1178 ModifiersScope(RegExpCompiler* compiler, RegExpFlags flags) 1179 : compiler_(compiler), previous_flags_(compiler->flags()) { 1180 compiler->set_flags(flags); 1181 } 1182 ~ModifiersScope() { compiler_->set_flags(previous_flags_); } 1183 1184 private: 1185 RegExpCompiler* compiler_; 1186 const RegExpFlags previous_flags_; 1187 }; 1188 1189 } // namespace 1190 1191 RegExpNode* RegExpGroup::ToNodeImpl(RegExpCompiler* compiler, 1192 RegExpNode* on_success) { 1193 // If no flags are modified, simply convert and return the body. 1194 if (flags() == compiler->flags()) { 1195 return body_->ToNode(compiler, on_success); 1196 } 1197 // Reset flags for successor node. 1198 const RegExpFlags old_flags = compiler->flags(); 1199 on_success = ActionNode::ModifyFlags(old_flags, on_success); 1200 1201 // Convert body using modifier. 1202 ModifiersScope modifiers_scope(compiler, flags()); 1203 RegExpNode* body = body_->ToNode(compiler, on_success); 1204 1205 // Wrap body into modifier node. 1206 RegExpNode* modified_body = ActionNode::ModifyFlags(flags(), body); 1207 return modified_body; 1208 } 1209 1210 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success, 1211 int stack_pointer_register, 1212 int position_register, 1213 int capture_register_count, 1214 int capture_register_start) 1215 : is_positive_(is_positive), 1216 on_success_(on_success), 1217 stack_pointer_register_(stack_pointer_register), 1218 position_register_(position_register) { 1219 if (is_positive_) { 1220 on_match_success_ = ActionNode::PositiveSubmatchSuccess( 1221 stack_pointer_register, position_register, capture_register_count, 1222 capture_register_start, on_success_); 1223 } else { 1224 Zone* zone = on_success_->zone(); 1225 on_match_success_ = zone->New<NegativeSubmatchSuccess>( 1226 stack_pointer_register, position_register, capture_register_count, 1227 capture_register_start, zone); 1228 } 1229 } 1230 1231 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) { 1232 if (is_positive_) { 1233 ActionNode* on_match_success = on_match_success_->AsActionNode(); 1234 return ActionNode::BeginPositiveSubmatch( 1235 stack_pointer_register_, position_register_, match, on_match_success); 1236 } else { 1237 Zone* zone = on_success_->zone(); 1238 // We use a ChoiceNode to represent the negative lookaround. The first 1239 // alternative is the negative match. On success, the end node backtracks. 1240 // On failure, the second alternative is tried and leads to success. 1241 // NegativeLookaroundChoiceNode is a special ChoiceNode that ignores the 1242 // first exit when calculating quick checks. 1243 ChoiceNode* choice_node = zone->New<NegativeLookaroundChoiceNode>( 1244 GuardedAlternative(match), GuardedAlternative(on_success_), zone); 1245 return ActionNode::BeginNegativeSubmatch(stack_pointer_register_, 1246 position_register_, choice_node); 1247 } 1248 } 1249 1250 RegExpNode* RegExpLookaround::ToNodeImpl(RegExpCompiler* compiler, 1251 RegExpNode* on_success) { 1252 int stack_pointer_register = compiler->AllocateRegister(); 1253 int position_register = compiler->AllocateRegister(); 1254 1255 const int registers_per_capture = 2; 1256 const int register_of_first_capture = 2; 1257 int register_count = capture_count_ * registers_per_capture; 1258 int register_start = 1259 register_of_first_capture + capture_from_ * registers_per_capture; 1260 1261 RegExpNode* result; 1262 bool was_reading_backward = compiler->read_backward(); 1263 compiler->set_read_backward(type() == LOOKBEHIND); 1264 Builder builder(is_positive(), on_success, stack_pointer_register, 1265 position_register, register_count, register_start); 1266 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success()); 1267 result = builder.ForMatch(match); 1268 compiler->set_read_backward(was_reading_backward); 1269 return result; 1270 } 1271 1272 RegExpNode* RegExpCapture::ToNodeImpl(RegExpCompiler* compiler, 1273 RegExpNode* on_success) { 1274 return ToNode(body(), index(), compiler, on_success); 1275 } 1276 1277 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index, 1278 RegExpCompiler* compiler, 1279 RegExpNode* on_success) { 1280 DCHECK_NOT_NULL(body); 1281 int start_reg = RegExpCapture::StartRegister(index); 1282 int end_reg = RegExpCapture::EndRegister(index); 1283 if (compiler->read_backward()) std::swap(start_reg, end_reg); 1284 RegExpNode* store_end = ActionNode::ClearPosition(end_reg, on_success); 1285 RegExpNode* body_node = body->ToNode(compiler, store_end); 1286 return ActionNode::ClearPosition(start_reg, body_node); 1287 } 1288 1289 namespace { 1290 1291 class AssertionSequenceRewriter final { 1292 public: 1293 // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass 1294 // instead of sprinkling rewrites into the AST->Node conversion process. 1295 static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) { 1296 AssertionSequenceRewriter rewriter(terms, zone); 1297 1298 static constexpr int kNoIndex = -1; 1299 int from = kNoIndex; 1300 1301 for (int i = 0; i < terms->length(); i++) { 1302 RegExpTree* t = terms->at(i); 1303 if (from == kNoIndex && t->IsAssertion()) { 1304 from = i; // Start a sequence. 1305 } else if (from != kNoIndex && !t->IsAssertion()) { 1306 // Terminate and process the sequence. 1307 if (i - from > 1) rewriter.Rewrite(from, i); 1308 from = kNoIndex; 1309 } 1310 } 1311 1312 if (from != kNoIndex && terms->length() - from > 1) { 1313 rewriter.Rewrite(from, terms->length()); 1314 } 1315 } 1316 1317 // All assertions are zero width. A consecutive sequence of assertions is 1318 // order-independent. There's two ways we can optimize here: 1319 // 1. fold all identical assertions. 1320 // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire 1321 // sequence fails. 1322 void Rewrite(int from, int to) { 1323 DCHECK_GT(to, from + 1); 1324 1325 // Bitfield of all seen assertions. 1326 uint32_t seen_assertions = 0; 1327 static_assert(static_cast<int>(RegExpAssertion::Type::LAST_ASSERTION_TYPE) < 1328 kUInt32Size * kBitsPerByte); 1329 1330 for (int i = from; i < to; i++) { 1331 RegExpAssertion* t = terms_->at(i)->AsAssertion(); 1332 const uint32_t bit = 1 << static_cast<int>(t->assertion_type()); 1333 1334 if (seen_assertions & bit) { 1335 // Fold duplicates. 1336 terms_->Set(i, zone_->New<RegExpEmpty>()); 1337 } 1338 1339 seen_assertions |= bit; 1340 } 1341 1342 // Collapse failures. 1343 const uint32_t always_fails_mask = 1344 1 << static_cast<int>(RegExpAssertion::Type::BOUNDARY) | 1345 1 << static_cast<int>(RegExpAssertion::Type::NON_BOUNDARY); 1346 if ((seen_assertions & always_fails_mask) == always_fails_mask) { 1347 ReplaceSequenceWithFailure(from, to); 1348 } 1349 } 1350 1351 void ReplaceSequenceWithFailure(int from, int to) { 1352 // Replace the entire sequence with a single node that always fails. 1353 // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the 1354 // negated '*' (everything) range serves the purpose. 1355 ZoneList<CharacterRange>* ranges = 1356 zone_->New<ZoneList<CharacterRange>>(0, zone_); 1357 RegExpClassRanges* cc = zone_->New<RegExpClassRanges>(zone_, ranges); 1358 terms_->Set(from, cc); 1359 1360 // Zero out the rest. 1361 RegExpEmpty* empty = zone_->New<RegExpEmpty>(); 1362 for (int i = from + 1; i < to; i++) terms_->Set(i, empty); 1363 } 1364 1365 private: 1366 AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone) 1367 : zone_(zone), terms_(terms) {} 1368 1369 Zone* zone_; 1370 ZoneList<RegExpTree*>* terms_; 1371 }; 1372 1373 } // namespace 1374 1375 RegExpNode* RegExpAlternative::ToNodeImpl(RegExpCompiler* compiler, 1376 RegExpNode* on_success) { 1377 ZoneList<RegExpTree*>* children = nodes(); 1378 1379 AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone()); 1380 1381 RegExpNode* current = on_success; 1382 if (compiler->read_backward()) { 1383 for (int i = 0; i < children->length(); i++) { 1384 current = children->at(i)->ToNode(compiler, current); 1385 } 1386 } else { 1387 for (int i = children->length() - 1; i >= 0; i--) { 1388 current = children->at(i)->ToNode(compiler, current); 1389 } 1390 } 1391 return current; 1392 } 1393 1394 namespace { 1395 1396 void AddClass(const int* elmv, int elmc, ZoneList<CharacterRange>* ranges, 1397 Zone* zone) { 1398 elmc--; 1399 DCHECK_EQ(kRangeEndMarker, elmv[elmc]); 1400 for (int i = 0; i < elmc; i += 2) { 1401 DCHECK(elmv[i] < elmv[i + 1]); 1402 ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone); 1403 } 1404 } 1405 1406 void AddClassNegated(const int* elmv, int elmc, 1407 ZoneList<CharacterRange>* ranges, Zone* zone) { 1408 elmc--; 1409 DCHECK_EQ(kRangeEndMarker, elmv[elmc]); 1410 DCHECK_NE(0x0000, elmv[0]); 1411 DCHECK_NE(kMaxCodePoint, elmv[elmc - 1]); 1412 base::uc16 last = 0x0000; 1413 for (int i = 0; i < elmc; i += 2) { 1414 DCHECK(last <= elmv[i] - 1); 1415 DCHECK(elmv[i] < elmv[i + 1]); 1416 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone); 1417 last = elmv[i + 1]; 1418 } 1419 ranges->Add(CharacterRange::Range(last, kMaxCodePoint), zone); 1420 } 1421 1422 } // namespace 1423 1424 void CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set, 1425 ZoneList<CharacterRange>* ranges, 1426 bool add_unicode_case_equivalents, 1427 Zone* zone) { 1428 if (add_unicode_case_equivalents && 1429 (standard_character_set == StandardCharacterSet::kWord || 1430 standard_character_set == StandardCharacterSet::kNotWord)) { 1431 // See #sec-runtime-semantics-wordcharacters-abstract-operation 1432 // In case of unicode and ignore_case, we need to create the closure over 1433 // case equivalent characters before negating. 1434 ZoneList<CharacterRange>* new_ranges = 1435 zone->New<ZoneList<CharacterRange>>(2, zone); 1436 AddClass(kWordRanges, kWordRangeCount, new_ranges, zone); 1437 AddUnicodeCaseEquivalents(new_ranges, zone); 1438 if (standard_character_set == StandardCharacterSet::kNotWord) { 1439 ZoneList<CharacterRange>* negated = 1440 zone->New<ZoneList<CharacterRange>>(2, zone); 1441 CharacterRange::Negate(new_ranges, negated, zone); 1442 new_ranges = negated; 1443 } 1444 ranges->AddAll(*new_ranges, zone); 1445 return; 1446 } 1447 1448 switch (standard_character_set) { 1449 case StandardCharacterSet::kWhitespace: 1450 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); 1451 break; 1452 case StandardCharacterSet::kNotWhitespace: 1453 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); 1454 break; 1455 case StandardCharacterSet::kWord: 1456 AddClass(kWordRanges, kWordRangeCount, ranges, zone); 1457 break; 1458 case StandardCharacterSet::kNotWord: 1459 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone); 1460 break; 1461 case StandardCharacterSet::kDigit: 1462 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone); 1463 break; 1464 case StandardCharacterSet::kNotDigit: 1465 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone); 1466 break; 1467 // This is the set of characters matched by the $ and ^ symbols 1468 // in multiline mode. 1469 case StandardCharacterSet::kLineTerminator: 1470 AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone); 1471 break; 1472 case StandardCharacterSet::kNotLineTerminator: 1473 AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, 1474 zone); 1475 break; 1476 // This is not a character range as defined by the spec but a 1477 // convenient shorthand for a character class that matches any 1478 // character. 1479 case StandardCharacterSet::kEverything: 1480 ranges->Add(CharacterRange::Everything(), zone); 1481 break; 1482 } 1483 } 1484 1485 // static 1486 // Only for /i, not for /ui or /vi. 1487 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, 1488 ZoneList<CharacterRange>* ranges, 1489 bool is_one_byte) { 1490 CharacterRange::Canonicalize(ranges); 1491 int range_count = ranges->length(); 1492 #ifdef V8_INTL_SUPPORT 1493 icu::UnicodeSet others; 1494 for (int i = 0; i < range_count; i++) { 1495 CharacterRange range = ranges->at(i); 1496 base::uc32 from = range.from(); 1497 if (from > kMaxUtf16CodeUnit) continue; 1498 base::uc32 to = std::min({range.to(), kMaxUtf16CodeUnitU}); 1499 // Nothing to be done for surrogates. 1500 if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue; 1501 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { 1502 if (from > String::kMaxOneByteCharCode) continue; 1503 if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode; 1504 } 1505 others.add(from, to); 1506 } 1507 1508 // Compute the set of additional characters that should be added, 1509 // using UnicodeSet::closeOver. ECMA 262 defines slightly different 1510 // case-folding rules than Unicode, so some characters that are 1511 // added by closeOver do not match anything other than themselves in 1512 // JS. For example, 'Å¿' (U+017F LATIN SMALL LETTER LONG S) is the 1513 // same case-insensitive character as 's' or 'S' according to 1514 // Unicode, but does not match any other character in JS. To handle 1515 // this case, we add such characters to the IgnoreSet and filter 1516 // them out. We filter twice: once before calling closeOver (to 1517 // prevent 'Å¿' from adding 's'), and once after calling closeOver 1518 // (to prevent 's' from adding 'Å¿'). See regexp/special-case.h for 1519 // more information. 1520 icu::UnicodeSet already_added(others); 1521 others.removeAll(RegExpCaseFolding::IgnoreSet()); 1522 others.closeOver(USET_CASE_INSENSITIVE); 1523 others.removeAll(RegExpCaseFolding::IgnoreSet()); 1524 others.removeAll(already_added); 1525 1526 // Add others to the ranges 1527 for (int32_t i = 0; i < others.getRangeCount(); i++) { 1528 UChar32 from = others.getRangeStart(i); 1529 UChar32 to = others.getRangeEnd(i); 1530 if (from == to) { 1531 ranges->Add(CharacterRange::Singleton(from), zone); 1532 } else { 1533 ranges->Add(CharacterRange::Range(from, to), zone); 1534 } 1535 } 1536 #else 1537 for (int i = 0; i < range_count; i++) { 1538 CharacterRange range = ranges->at(i); 1539 base::uc32 bottom = range.from(); 1540 if (bottom > kMaxUtf16CodeUnit) continue; 1541 base::uc32 top = std::min({range.to(), kMaxUtf16CodeUnitU}); 1542 // Nothing to be done for surrogates. 1543 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue; 1544 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { 1545 if (bottom > String::kMaxOneByteCharCode) continue; 1546 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; 1547 } 1548 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1549 if (top == bottom) { 1550 // If this is a singleton we just expand the one character. 1551 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); 1552 for (int j = 0; j < length; j++) { 1553 base::uc32 chr = chars[j]; 1554 if (chr != bottom) { 1555 ranges->Add(CharacterRange::Singleton(chars[j]), zone); 1556 } 1557 } 1558 } else { 1559 // If this is a range we expand the characters block by block, expanding 1560 // contiguous subranges (blocks) one at a time. The approach is as 1561 // follows. For a given start character we look up the remainder of the 1562 // block that contains it (represented by the end point), for instance we 1563 // find 'z' if the character is 'c'. A block is characterized by the 1564 // property that all characters uncanonicalize in the same way, except 1565 // that each entry in the result is incremented by the distance from the 1566 // first element. So a-z is a block because 'a' uncanonicalizes to ['a', 1567 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once 1568 // we've found the end point we look up its uncanonicalization and 1569 // produce a range for each element. For instance for [c-f] we look up 1570 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if 1571 // it is not already contained in the input, so [c-f] will be skipped but 1572 // [C-F] will be added. If this range is not completely contained in a 1573 // block we do this for all the blocks covered by the range (handling 1574 // characters that is not in a block as a "singleton block"). 1575 unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1576 base::uc32 pos = bottom; 1577 while (pos <= top) { 1578 int length = 1579 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents); 1580 base::uc32 block_end; 1581 if (length == 0) { 1582 block_end = pos; 1583 } else { 1584 DCHECK_EQ(1, length); 1585 block_end = equivalents[0]; 1586 } 1587 int end = (block_end > top) ? top : block_end; 1588 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', 1589 equivalents); 1590 for (int j = 0; j < length; j++) { 1591 base::uc32 c = equivalents[j]; 1592 base::uc32 range_from = c - (block_end - pos); 1593 base::uc32 range_to = c - (block_end - end); 1594 if (!(bottom <= range_from && range_to <= top)) { 1595 ranges->Add(CharacterRange::Range(range_from, range_to), zone); 1596 } 1597 } 1598 pos = end + 1; 1599 } 1600 } 1601 } 1602 #endif // V8_INTL_SUPPORT 1603 } 1604 1605 bool CharacterRange::IsCanonical(const ZoneList<CharacterRange>* ranges) { 1606 DCHECK_NOT_NULL(ranges); 1607 int n = ranges->length(); 1608 if (n <= 1) return true; 1609 base::uc32 max = ranges->at(0).to(); 1610 for (int i = 1; i < n; i++) { 1611 CharacterRange next_range = ranges->at(i); 1612 if (next_range.from() <= max + 1) return false; 1613 max = next_range.to(); 1614 } 1615 return true; 1616 } 1617 1618 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { 1619 if (ranges_ == nullptr) { 1620 ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone); 1621 CharacterRange::AddClassEscape(standard_set_type_.value(), ranges_, false, 1622 zone); 1623 } 1624 return ranges_; 1625 } 1626 1627 namespace { 1628 1629 // Move a number of elements in a zonelist to another position 1630 // in the same list. Handles overlapping source and target areas. 1631 void MoveRanges(ZoneList<CharacterRange>* list, int from, int to, int count) { 1632 // Ranges are potentially overlapping. 1633 if (from < to) { 1634 for (int i = count - 1; i >= 0; i--) { 1635 list->at(to + i) = list->at(from + i); 1636 } 1637 } else { 1638 for (int i = 0; i < count; i++) { 1639 list->at(to + i) = list->at(from + i); 1640 } 1641 } 1642 } 1643 1644 int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count, 1645 CharacterRange insert) { 1646 // Inserts a range into list[0..count[, which must be sorted 1647 // by from value and non-overlapping and non-adjacent, using at most 1648 // list[0..count] for the result. Returns the number of resulting 1649 // canonicalized ranges. Inserting a range may collapse existing ranges into 1650 // fewer ranges, so the return value can be anything in the range 1..count+1. 1651 base::uc32 from = insert.from(); 1652 base::uc32 to = insert.to(); 1653 int start_pos = 0; 1654 int end_pos = count; 1655 for (int i = count - 1; i >= 0; i--) { 1656 CharacterRange current = list->at(i); 1657 if (current.from() > to + 1) { 1658 end_pos = i; 1659 } else if (current.to() + 1 < from) { 1660 start_pos = i + 1; 1661 break; 1662 } 1663 } 1664 1665 // Inserted range overlaps, or is adjacent to, ranges at positions 1666 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are 1667 // not affected by the insertion. 1668 // If start_pos == end_pos, the range must be inserted before start_pos. 1669 // if start_pos < end_pos, the entire range from start_pos to end_pos 1670 // must be merged with the insert range. 1671 1672 if (start_pos == end_pos) { 1673 // Insert between existing ranges at position start_pos. 1674 if (start_pos < count) { 1675 MoveRanges(list, start_pos, start_pos + 1, count - start_pos); 1676 } 1677 list->at(start_pos) = insert; 1678 return count + 1; 1679 } 1680 if (start_pos + 1 == end_pos) { 1681 // Replace single existing range at position start_pos. 1682 CharacterRange to_replace = list->at(start_pos); 1683 int new_from = std::min(to_replace.from(), from); 1684 int new_to = std::max(to_replace.to(), to); 1685 list->at(start_pos) = CharacterRange::Range(new_from, new_to); 1686 return count; 1687 } 1688 // Replace a number of existing ranges from start_pos to end_pos - 1. 1689 // Move the remaining ranges down. 1690 1691 int new_from = std::min(list->at(start_pos).from(), from); 1692 int new_to = std::max(list->at(end_pos - 1).to(), to); 1693 if (end_pos < count) { 1694 MoveRanges(list, end_pos, start_pos + 1, count - end_pos); 1695 } 1696 list->at(start_pos) = CharacterRange::Range(new_from, new_to); 1697 return count - (end_pos - start_pos) + 1; 1698 } 1699 1700 } // namespace 1701 1702 void CharacterSet::Canonicalize() { 1703 // Special/default classes are always considered canonical. The result 1704 // of calling ranges() will be sorted. 1705 if (ranges_ == nullptr) return; 1706 CharacterRange::Canonicalize(ranges_); 1707 } 1708 1709 // static 1710 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { 1711 if (character_ranges->length() <= 1) return; 1712 // Check whether ranges are already canonical (increasing, non-overlapping, 1713 // non-adjacent). 1714 int n = character_ranges->length(); 1715 base::uc32 max = character_ranges->at(0).to(); 1716 int i = 1; 1717 while (i < n) { 1718 CharacterRange current = character_ranges->at(i); 1719 if (current.from() <= max + 1) { 1720 break; 1721 } 1722 max = current.to(); 1723 i++; 1724 } 1725 // Canonical until the i'th range. If that's all of them, we are done. 1726 if (i == n) return; 1727 1728 // The ranges at index i and forward are not canonicalized. Make them so by 1729 // doing the equivalent of insertion sort (inserting each into the previous 1730 // list, in order). 1731 // Notice that inserting a range can reduce the number of ranges in the 1732 // result due to combining of adjacent and overlapping ranges. 1733 int read = i; // Range to insert. 1734 int num_canonical = i; // Length of canonicalized part of list. 1735 do { 1736 num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical, 1737 character_ranges->at(read)); 1738 read++; 1739 } while (read < n); 1740 character_ranges->Rewind(num_canonical); 1741 1742 DCHECK(CharacterRange::IsCanonical(character_ranges)); 1743 } 1744 1745 // static 1746 void CharacterRange::Negate(const ZoneList<CharacterRange>* ranges, 1747 ZoneList<CharacterRange>* negated_ranges, 1748 Zone* zone) { 1749 DCHECK(CharacterRange::IsCanonical(ranges)); 1750 DCHECK_EQ(0, negated_ranges->length()); 1751 int range_count = ranges->length(); 1752 base::uc32 from = 0; 1753 int i = 0; 1754 if (range_count > 0 && ranges->at(0).from() == 0) { 1755 from = ranges->at(0).to() + 1; 1756 i = 1; 1757 } 1758 while (i < range_count) { 1759 CharacterRange range = ranges->at(i); 1760 negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone); 1761 from = range.to() + 1; 1762 i++; 1763 } 1764 if (from < kMaxCodePoint) { 1765 negated_ranges->Add(CharacterRange::Range(from, kMaxCodePoint), zone); 1766 } 1767 } 1768 1769 // static 1770 void CharacterRange::Intersect(const ZoneList<CharacterRange>* lhs, 1771 const ZoneList<CharacterRange>* rhs, 1772 ZoneList<CharacterRange>* intersection, 1773 Zone* zone) { 1774 DCHECK(CharacterRange::IsCanonical(lhs)); 1775 DCHECK(CharacterRange::IsCanonical(rhs)); 1776 DCHECK_EQ(0, intersection->length()); 1777 int lhs_index = 0; 1778 int rhs_index = 0; 1779 while (lhs_index < lhs->length() && rhs_index < rhs->length()) { 1780 // Skip non-overlapping ranges. 1781 if (lhs->at(lhs_index).to() < rhs->at(rhs_index).from()) { 1782 lhs_index++; 1783 continue; 1784 } 1785 if (rhs->at(rhs_index).to() < lhs->at(lhs_index).from()) { 1786 rhs_index++; 1787 continue; 1788 } 1789 1790 base::uc32 from = 1791 std::max(lhs->at(lhs_index).from(), rhs->at(rhs_index).from()); 1792 base::uc32 to = std::min(lhs->at(lhs_index).to(), rhs->at(rhs_index).to()); 1793 intersection->Add(CharacterRange::Range(from, to), zone); 1794 if (to == lhs->at(lhs_index).to()) { 1795 lhs_index++; 1796 } else { 1797 rhs_index++; 1798 } 1799 } 1800 1801 DCHECK(IsCanonical(intersection)); 1802 } 1803 1804 namespace { 1805 1806 // Advance |index| and set |from| and |to| to the new range, if not out of 1807 // bounds of |range|, otherwise |from| is set to a code point beyond the legal 1808 // unicode character range. 1809 void SafeAdvanceRange(const ZoneList<CharacterRange>* range, int* index, 1810 base::uc32* from, base::uc32* to) { 1811 ++(*index); 1812 if (*index < range->length()) { 1813 *from = range->at(*index).from(); 1814 *to = range->at(*index).to(); 1815 } else { 1816 *from = kMaxCodePoint + 1; 1817 } 1818 } 1819 1820 } // namespace 1821 1822 // static 1823 void CharacterRange::Subtract(const ZoneList<CharacterRange>* src, 1824 const ZoneList<CharacterRange>* to_remove, 1825 ZoneList<CharacterRange>* result, Zone* zone) { 1826 DCHECK(CharacterRange::IsCanonical(src)); 1827 DCHECK(CharacterRange::IsCanonical(to_remove)); 1828 DCHECK_EQ(0, result->length()); 1829 1830 if (src->is_empty()) return; 1831 1832 int src_index = 0; 1833 int to_remove_index = 0; 1834 base::uc32 from = src->at(src_index).from(); 1835 base::uc32 to = src->at(src_index).to(); 1836 while (src_index < src->length() && to_remove_index < to_remove->length()) { 1837 CharacterRange remove_range = to_remove->at(to_remove_index); 1838 if (remove_range.to() < from) { 1839 // (a) Non-overlapping case, ignore current to_remove range. 1840 // |-------| 1841 // |-------| 1842 to_remove_index++; 1843 } else if (to < remove_range.from()) { 1844 // (b) Non-overlapping case, add full current range to result. 1845 // |-------| 1846 // |-------| 1847 result->Add(CharacterRange::Range(from, to), zone); 1848 SafeAdvanceRange(src, &src_index, &from, &to); 1849 } else if (from >= remove_range.from() && to <= remove_range.to()) { 1850 // (c) Current to_remove range fully covers current range. 1851 // |---| 1852 // |-------| 1853 SafeAdvanceRange(src, &src_index, &from, &to); 1854 } else if (from < remove_range.from() && to > remove_range.to()) { 1855 // (d) Split current range. 1856 // |-------| 1857 // |---| 1858 result->Add(CharacterRange::Range(from, remove_range.from() - 1), zone); 1859 from = remove_range.to() + 1; 1860 to_remove_index++; 1861 } else if (from < remove_range.from()) { 1862 // (e) End current range. 1863 // |-------| 1864 // |-------| 1865 to = remove_range.from() - 1; 1866 result->Add(CharacterRange::Range(from, to), zone); 1867 SafeAdvanceRange(src, &src_index, &from, &to); 1868 } else if (to > remove_range.to()) { 1869 // (f) Modify start of current range. 1870 // |-------| 1871 // |-------| 1872 from = remove_range.to() + 1; 1873 to_remove_index++; 1874 } else { 1875 UNREACHABLE(); 1876 } 1877 } 1878 // The last range needs special treatment after |to_remove| is exhausted, as 1879 // |from| might have been modified by the last |to_remove| range and |to| was 1880 // not yet known (i.e. cases d and f). 1881 if (from <= to) { 1882 result->Add(CharacterRange::Range(from, to), zone); 1883 } 1884 src_index++; 1885 1886 // Add remaining ranges after |to_remove| is exhausted. 1887 for (; src_index < src->length(); src_index++) { 1888 result->Add(src->at(src_index), zone); 1889 } 1890 1891 DCHECK(IsCanonical(result)); 1892 } 1893 1894 // static 1895 void CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) { 1896 DCHECK(IsCanonical(ranges)); 1897 1898 // Drop all ranges that don't contain one-byte code units, and clamp the last 1899 // range s.t. it likewise only contains one-byte code units. Note this relies 1900 // on `ranges` being canonicalized, i.e. sorted and non-overlapping. 1901 1902 static constexpr base::uc32 max_char = String::kMaxOneByteCharCodeU; 1903 int n = ranges->length(); 1904 for (; n > 0; n--) { 1905 CharacterRange& r = ranges->at(n - 1); 1906 if (r.from() <= max_char) { 1907 r.to_ = std::min(r.to_, max_char); 1908 break; 1909 } 1910 } 1911 1912 ranges->Rewind(n); 1913 } 1914 1915 // static 1916 bool CharacterRange::Equals(const ZoneList<CharacterRange>* lhs, 1917 const ZoneList<CharacterRange>* rhs) { 1918 DCHECK(IsCanonical(lhs)); 1919 DCHECK(IsCanonical(rhs)); 1920 if (lhs->length() != rhs->length()) return false; 1921 1922 for (int i = 0; i < lhs->length(); i++) { 1923 if (lhs->at(i) != rhs->at(i)) return false; 1924 } 1925 1926 return true; 1927 } 1928 1929 namespace { 1930 1931 // Scoped object to keep track of how much we unroll quantifier loops in the 1932 // regexp graph generator. 1933 class RegExpExpansionLimiter { 1934 public: 1935 static const int kMaxExpansionFactor = 6; 1936 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor) 1937 : compiler_(compiler), 1938 saved_expansion_factor_(compiler->current_expansion_factor()), 1939 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { 1940 DCHECK_LT(0, factor); 1941 if (ok_to_expand_) { 1942 if (factor > kMaxExpansionFactor) { 1943 // Avoid integer overflow of the current expansion factor. 1944 ok_to_expand_ = false; 1945 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); 1946 } else { 1947 int new_factor = saved_expansion_factor_ * factor; 1948 ok_to_expand_ = (new_factor <= kMaxExpansionFactor); 1949 compiler->set_current_expansion_factor(new_factor); 1950 } 1951 } 1952 } 1953 1954 ~RegExpExpansionLimiter() { 1955 compiler_->set_current_expansion_factor(saved_expansion_factor_); 1956 } 1957 1958 bool ok_to_expand() { return ok_to_expand_; } 1959 1960 private: 1961 RegExpCompiler* compiler_; 1962 int saved_expansion_factor_; 1963 bool ok_to_expand_; 1964 1965 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); 1966 }; 1967 1968 } // namespace 1969 1970 RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy, 1971 RegExpTree* body, RegExpCompiler* compiler, 1972 RegExpNode* on_success, 1973 bool not_at_start) { 1974 // x{f, t} becomes this: 1975 // 1976 // (r++)<-. 1977 // | ` 1978 // | (x) 1979 // v ^ 1980 // (r=0)-->(?)---/ [if r < t] 1981 // | 1982 // [if r >= f] \----> ... 1983 // 1984 1985 // 15.10.2.5 RepeatMatcher algorithm. 1986 // The parser has already eliminated the case where max is 0. In the case 1987 // where max_match is zero the parser has removed the quantifier if min was 1988 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. 1989 1990 // If we know that we cannot match zero length then things are a little 1991 // simpler since we don't need to make the special zero length match check 1992 // from step 2.1. If the min and max are small we can unroll a little in 1993 // this case. 1994 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} 1995 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} 1996 if (max == 0) return on_success; // This can happen due to recursion. 1997 bool body_can_be_empty = (body->min_match() == 0); 1998 int body_start_reg = RegExpCompiler::kNoRegister; 1999 Interval capture_registers = body->CaptureRegisters(); 2000 bool needs_capture_clearing = !capture_registers.is_empty(); 2001 Zone* zone = compiler->zone(); 2002 2003 if (body_can_be_empty) { 2004 body_start_reg = compiler->AllocateRegister(); 2005 } else if (compiler->optimize() && !needs_capture_clearing) { 2006 // Only unroll if there are no captures and the body can't be 2007 // empty. 2008 { 2009 RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0)); 2010 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { 2011 int new_max = (max == kInfinity) ? max : max - min; 2012 // Recurse once to get the loop or optional matches after the fixed 2013 // ones. 2014 RegExpNode* answer = 2015 ToNode(0, new_max, is_greedy, body, compiler, on_success, true); 2016 // Unroll the forced matches from 0 to min. This can cause chains of 2017 // TextNodes (which the parser does not generate). These should be 2018 // combined if it turns out they hinder good code generation. 2019 for (int i = 0; i < min; i++) { 2020 answer = body->ToNode(compiler, answer); 2021 } 2022 return answer; 2023 } 2024 } 2025 if (max <= kMaxUnrolledMaxMatches && min == 0) { 2026 DCHECK_LT(0, max); // Due to the 'if' above. 2027 RegExpExpansionLimiter limiter(compiler, max); 2028 if (limiter.ok_to_expand()) { 2029 // Unroll the optional matches up to max. 2030 RegExpNode* answer = on_success; 2031 for (int i = 0; i < max; i++) { 2032 ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone); 2033 if (is_greedy) { 2034 alternation->AddAlternative( 2035 GuardedAlternative(body->ToNode(compiler, answer))); 2036 alternation->AddAlternative(GuardedAlternative(on_success)); 2037 } else { 2038 alternation->AddAlternative(GuardedAlternative(on_success)); 2039 alternation->AddAlternative( 2040 GuardedAlternative(body->ToNode(compiler, answer))); 2041 } 2042 answer = alternation; 2043 if (not_at_start && !compiler->read_backward()) { 2044 alternation->set_not_at_start(); 2045 } 2046 } 2047 return answer; 2048 } 2049 } 2050 } 2051 bool has_min = min > 0; 2052 bool has_max = max < RegExpTree::kInfinity; 2053 bool needs_counter = has_min || has_max; 2054 int reg_ctr = needs_counter ? compiler->AllocateRegister() 2055 : RegExpCompiler::kNoRegister; 2056 LoopChoiceNode* center = zone->New<LoopChoiceNode>( 2057 body->min_match() == 0, compiler->read_backward(), min, zone); 2058 if (not_at_start && !compiler->read_backward()) center->set_not_at_start(); 2059 RegExpNode* loop_return = 2060 needs_counter ? static_cast<RegExpNode*>( 2061 ActionNode::IncrementRegister(reg_ctr, center)) 2062 : static_cast<RegExpNode*>(center); 2063 if (body_can_be_empty) { 2064 // If the body can be empty we need to check if it was and then 2065 // backtrack. 2066 loop_return = 2067 ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return); 2068 } 2069 RegExpNode* body_node = body->ToNode(compiler, loop_return); 2070 if (body_can_be_empty) { 2071 // If the body can be empty we need to store the start position 2072 // so we can bail out if it was empty. 2073 body_node = ActionNode::RestorePosition(body_start_reg, body_node); 2074 } 2075 if (needs_capture_clearing) { 2076 // Before entering the body of this loop we need to clear captures. 2077 body_node = ActionNode::ClearCaptures(capture_registers, body_node); 2078 } 2079 GuardedAlternative body_alt(body_node); 2080 if (has_max) { 2081 Guard* body_guard = zone->New<Guard>(reg_ctr, Guard::LT, max); 2082 body_alt.AddGuard(body_guard, zone); 2083 } 2084 GuardedAlternative rest_alt(on_success); 2085 if (has_min) { 2086 Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr, Guard::GEQ, min); 2087 rest_alt.AddGuard(rest_guard, zone); 2088 } 2089 if (is_greedy) { 2090 center->AddLoopAlternative(body_alt); 2091 center->AddContinueAlternative(rest_alt); 2092 } else { 2093 center->AddContinueAlternative(rest_alt); 2094 center->AddLoopAlternative(body_alt); 2095 } 2096 if (needs_counter) { 2097 return ActionNode::SetRegisterForLoop(reg_ctr, 0, center); 2098 } else { 2099 return center; 2100 } 2101 } 2102 2103 } // namespace internal 2104 } // namespace v8