regexp-parser.cc (112157B)
1 // Copyright 2016 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "irregexp/imported/regexp-parser.h" 6 7 #include "irregexp/imported/regexp-ast.h" 8 #include "irregexp/imported/regexp-macro-assembler.h" 9 #include "irregexp/imported/regexp.h" 10 11 #ifdef V8_INTL_SUPPORT 12 #include "unicode/uniset.h" 13 #include "unicode/unistr.h" 14 #include "unicode/usetiter.h" 15 #include "unicode/utf16.h" // For U16_NEXT 16 #endif // V8_INTL_SUPPORT 17 18 namespace v8 { 19 namespace internal { 20 21 namespace { 22 23 // Whether we're currently inside the ClassEscape production 24 // (tc39.es/ecma262/#prod-annexB-CharacterEscape). 25 enum class InClassEscapeState { 26 kInClass, 27 kNotInClass, 28 }; 29 30 // The production used to derive ClassSetOperand. 31 enum class ClassSetOperandType { 32 kClassSetCharacter, 33 kClassStringDisjunction, 34 kNestedClass, 35 kCharacterClassEscape, // \ CharacterClassEscape is a special nested class, 36 // as we can fold it directly into another range. 37 kClassSetRange 38 }; 39 40 class RegExpTextBuilder { 41 public: 42 using SmallRegExpTreeVector = SmallZoneVector<RegExpTree*, 8>; 43 44 RegExpTextBuilder(Zone* zone, SmallRegExpTreeVector* terms_storage, 45 RegExpFlags flags) 46 : zone_(zone), flags_(flags), terms_(terms_storage), text_(zone) {} 47 void AddCharacter(base::uc16 character); 48 void AddUnicodeCharacter(base::uc32 character); 49 void AddEscapedUnicodeCharacter(base::uc32 character); 50 void AddAtom(RegExpTree* atom); 51 void AddTerm(RegExpTree* term); 52 void AddClassRanges(RegExpClassRanges* cc); 53 void FlushPendingSurrogate(); 54 void FlushText(); 55 RegExpTree* PopLastAtom(); 56 RegExpTree* ToRegExp(); 57 58 private: 59 static const base::uc16 kNoPendingSurrogate = 0; 60 61 void AddLeadSurrogate(base::uc16 lead_surrogate); 62 void AddTrailSurrogate(base::uc16 trail_surrogate); 63 void FlushCharacters(); 64 bool NeedsDesugaringForUnicode(RegExpClassRanges* cc); 65 bool NeedsDesugaringForIgnoreCase(base::uc32 c); 66 void AddClassRangesForDesugaring(base::uc32 c); 67 bool ignore_case() const { return IsIgnoreCase(flags_); } 68 bool IsUnicodeMode() const { 69 // Either /v or /u enable UnicodeMode 70 // https://tc39.es/ecma262/#sec-parsepattern 71 return IsUnicode(flags_) || IsUnicodeSets(flags_); 72 } 73 Zone* zone() const { return zone_; } 74 75 Zone* const zone_; 76 const RegExpFlags flags_; 77 ZoneList<base::uc16>* characters_ = nullptr; 78 base::uc16 pending_surrogate_ = kNoPendingSurrogate; 79 SmallRegExpTreeVector* terms_; 80 SmallRegExpTreeVector text_; 81 }; 82 83 void RegExpTextBuilder::AddLeadSurrogate(base::uc16 lead_surrogate) { 84 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); 85 FlushPendingSurrogate(); 86 // Hold onto the lead surrogate, waiting for a trail surrogate to follow. 87 pending_surrogate_ = lead_surrogate; 88 } 89 90 void RegExpTextBuilder::AddTrailSurrogate(base::uc16 trail_surrogate) { 91 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); 92 if (pending_surrogate_ != kNoPendingSurrogate) { 93 base::uc16 lead_surrogate = pending_surrogate_; 94 pending_surrogate_ = kNoPendingSurrogate; 95 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); 96 base::uc32 combined = 97 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); 98 if (NeedsDesugaringForIgnoreCase(combined)) { 99 AddClassRangesForDesugaring(combined); 100 } else { 101 ZoneList<base::uc16> surrogate_pair(2, zone()); 102 surrogate_pair.Add(lead_surrogate, zone()); 103 surrogate_pair.Add(trail_surrogate, zone()); 104 RegExpAtom* atom = 105 zone()->New<RegExpAtom>(surrogate_pair.ToConstVector()); 106 AddAtom(atom); 107 } 108 } else { 109 pending_surrogate_ = trail_surrogate; 110 FlushPendingSurrogate(); 111 } 112 } 113 114 void RegExpTextBuilder::FlushPendingSurrogate() { 115 if (pending_surrogate_ != kNoPendingSurrogate) { 116 DCHECK(IsUnicodeMode()); 117 base::uc32 c = pending_surrogate_; 118 pending_surrogate_ = kNoPendingSurrogate; 119 AddClassRangesForDesugaring(c); 120 } 121 } 122 123 void RegExpTextBuilder::FlushCharacters() { 124 FlushPendingSurrogate(); 125 if (characters_ != nullptr) { 126 RegExpTree* atom = zone()->New<RegExpAtom>(characters_->ToConstVector()); 127 characters_ = nullptr; 128 text_.emplace_back(atom); 129 } 130 } 131 132 void RegExpTextBuilder::FlushText() { 133 FlushCharacters(); 134 size_t num_text = text_.size(); 135 if (num_text == 0) { 136 return; 137 } else if (num_text == 1) { 138 terms_->emplace_back(text_.back()); 139 } else { 140 RegExpText* text = zone()->New<RegExpText>(zone()); 141 for (size_t i = 0; i < num_text; i++) { 142 text_[i]->AppendToText(text, zone()); 143 } 144 terms_->emplace_back(text); 145 } 146 text_.clear(); 147 } 148 149 void RegExpTextBuilder::AddCharacter(base::uc16 c) { 150 FlushPendingSurrogate(); 151 if (characters_ == nullptr) { 152 characters_ = zone()->New<ZoneList<base::uc16>>(4, zone()); 153 } 154 characters_->Add(c, zone()); 155 } 156 157 void RegExpTextBuilder::AddUnicodeCharacter(base::uc32 c) { 158 if (c > static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { 159 DCHECK(IsUnicodeMode()); 160 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); 161 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); 162 } else if (IsUnicodeMode() && unibrow::Utf16::IsLeadSurrogate(c)) { 163 AddLeadSurrogate(c); 164 } else if (IsUnicodeMode() && unibrow::Utf16::IsTrailSurrogate(c)) { 165 AddTrailSurrogate(c); 166 } else { 167 AddCharacter(static_cast<base::uc16>(c)); 168 } 169 } 170 171 void RegExpTextBuilder::AddEscapedUnicodeCharacter(base::uc32 character) { 172 // A lead or trail surrogate parsed via escape sequence will not 173 // pair up with any preceding lead or following trail surrogate. 174 FlushPendingSurrogate(); 175 AddUnicodeCharacter(character); 176 FlushPendingSurrogate(); 177 } 178 179 void RegExpTextBuilder::AddClassRanges(RegExpClassRanges* cr) { 180 if (NeedsDesugaringForUnicode(cr)) { 181 // With /u or /v, character class needs to be desugared, so it 182 // must be a standalone term instead of being part of a RegExpText. 183 AddTerm(cr); 184 } else { 185 AddAtom(cr); 186 } 187 } 188 189 void RegExpTextBuilder::AddClassRangesForDesugaring(base::uc32 c) { 190 AddTerm(zone()->New<RegExpClassRanges>( 191 zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)))); 192 } 193 194 void RegExpTextBuilder::AddAtom(RegExpTree* atom) { 195 DCHECK(atom->IsTextElement()); 196 FlushCharacters(); 197 text_.emplace_back(atom); 198 } 199 200 void RegExpTextBuilder::AddTerm(RegExpTree* term) { 201 DCHECK(term->IsTextElement()); 202 FlushText(); 203 terms_->emplace_back(term); 204 } 205 206 bool RegExpTextBuilder::NeedsDesugaringForUnicode(RegExpClassRanges* cc) { 207 if (!IsUnicodeMode()) return false; 208 // TODO(yangguo): we could be smarter than this. Case-insensitivity does not 209 // necessarily mean that we need to desugar. It's probably nicer to have a 210 // separate pass to figure out unicode desugarings. 211 if (ignore_case()) return true; 212 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 213 CharacterRange::Canonicalize(ranges); 214 215 if (cc->is_negated()) { 216 ZoneList<CharacterRange>* negated_ranges = 217 zone()->New<ZoneList<CharacterRange>>(ranges->length(), zone()); 218 CharacterRange::Negate(ranges, negated_ranges, zone()); 219 ranges = negated_ranges; 220 } 221 222 for (int i = ranges->length() - 1; i >= 0; i--) { 223 base::uc32 from = ranges->at(i).from(); 224 base::uc32 to = ranges->at(i).to(); 225 // Check for non-BMP characters. 226 if (to >= kNonBmpStart) return true; 227 // Check for lone surrogates. 228 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; 229 } 230 return false; 231 } 232 233 // We only use this for characters made of surrogate pairs. All other 234 // characters outside of character classes are made case independent in the 235 // code generation. 236 bool RegExpTextBuilder::NeedsDesugaringForIgnoreCase(base::uc32 c) { 237 #ifdef V8_INTL_SUPPORT 238 if (IsUnicodeMode() && ignore_case()) { 239 icu::UnicodeSet set(c, c); 240 set.closeOver(USET_CASE_INSENSITIVE); 241 set.removeAllStrings(); 242 return set.size() > 1; 243 } 244 // In the case where ICU is not included, we act as if the unicode flag is 245 // not set, and do not desugar. 246 #endif // V8_INTL_SUPPORT 247 return false; 248 } 249 250 RegExpTree* RegExpTextBuilder::PopLastAtom() { 251 FlushPendingSurrogate(); 252 RegExpTree* atom; 253 if (characters_ != nullptr) { 254 base::Vector<const base::uc16> char_vector = characters_->ToConstVector(); 255 int num_chars = char_vector.length(); 256 if (num_chars > 1) { 257 base::Vector<const base::uc16> prefix = 258 char_vector.SubVector(0, num_chars - 1); 259 text_.emplace_back(zone()->New<RegExpAtom>(prefix)); 260 char_vector = char_vector.SubVector(num_chars - 1, num_chars); 261 } 262 characters_ = nullptr; 263 atom = zone()->New<RegExpAtom>(char_vector); 264 return atom; 265 } else if (!text_.empty()) { 266 atom = text_.back(); 267 text_.pop_back(); 268 return atom; 269 } 270 return nullptr; 271 } 272 273 RegExpTree* RegExpTextBuilder::ToRegExp() { 274 FlushText(); 275 size_t number_of_terms = terms_->size(); 276 if (number_of_terms == 0) return zone()->New<RegExpEmpty>(); 277 if (number_of_terms == 1) return terms_->back(); 278 return zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>( 279 base::VectorOf(terms_->begin(), terms_->size()), zone())); 280 } 281 282 // Accumulates RegExp atoms and assertions into lists of terms and alternatives. 283 class RegExpBuilder { 284 public: 285 RegExpBuilder(Zone* zone, RegExpFlags flags) 286 : zone_(zone), 287 flags_(flags), 288 terms_(zone), 289 alternatives_(zone), 290 text_builder_(RegExpTextBuilder{zone, &terms_, flags}) {} 291 void AddCharacter(base::uc16 character); 292 void AddUnicodeCharacter(base::uc32 character); 293 void AddEscapedUnicodeCharacter(base::uc32 character); 294 // "Adds" an empty expression. Does nothing except consume a 295 // following quantifier 296 void AddEmpty(); 297 void AddClassRanges(RegExpClassRanges* cc); 298 void AddAtom(RegExpTree* tree); 299 void AddTerm(RegExpTree* tree); 300 void AddAssertion(RegExpTree* tree); 301 void NewAlternative(); // '|' 302 bool AddQuantifierToAtom(int min, int max, int index, 303 RegExpQuantifier::QuantifierType type); 304 void FlushText(); 305 RegExpTree* ToRegExp(); 306 RegExpFlags flags() const { return flags_; } 307 308 bool ignore_case() const { return IsIgnoreCase(flags_); } 309 bool multiline() const { return IsMultiline(flags_); } 310 bool dotall() const { return IsDotAll(flags_); } 311 312 private: 313 void FlushTerms(); 314 bool IsUnicodeMode() const { 315 // Either /v or /u enable UnicodeMode 316 // https://tc39.es/ecma262/#sec-parsepattern 317 return IsUnicode(flags_) || IsUnicodeSets(flags_); 318 } 319 Zone* zone() const { return zone_; } 320 RegExpTextBuilder& text_builder() { return text_builder_; } 321 322 Zone* const zone_; 323 bool pending_empty_ = false; 324 const RegExpFlags flags_; 325 326 using SmallRegExpTreeVector = SmallZoneVector<RegExpTree*, 8>; 327 SmallRegExpTreeVector terms_; 328 SmallRegExpTreeVector alternatives_; 329 RegExpTextBuilder text_builder_; 330 }; 331 332 enum SubexpressionType { 333 INITIAL, 334 CAPTURE, // All positive values represent captures. 335 POSITIVE_LOOKAROUND, 336 NEGATIVE_LOOKAROUND, 337 GROUPING 338 }; 339 340 class RegExpParserState : public ZoneObject { 341 public: 342 // Push a state on the stack. 343 RegExpParserState(RegExpParserState* previous_state, 344 SubexpressionType group_type, 345 RegExpLookaround::Type lookaround_type, 346 int disjunction_capture_index, 347 const ZoneVector<base::uc16>* capture_name, 348 RegExpFlags flags, Zone* zone) 349 : previous_state_(previous_state), 350 builder_(zone, flags), 351 group_type_(group_type), 352 lookaround_type_(lookaround_type), 353 disjunction_capture_index_(disjunction_capture_index), 354 capture_name_(capture_name) { 355 if (previous_state != nullptr) { 356 non_participating_capture_group_interval_ = 357 previous_state->non_participating_capture_group_interval(); 358 } 359 } 360 // Parser state of containing expression, if any. 361 RegExpParserState* previous_state() const { return previous_state_; } 362 bool IsSubexpression() { return previous_state_ != nullptr; } 363 // RegExpBuilder building this regexp's AST. 364 RegExpBuilder* builder() { return &builder_; } 365 // Type of regexp being parsed (parenthesized group or entire regexp). 366 SubexpressionType group_type() const { return group_type_; } 367 // Lookahead or Lookbehind. 368 RegExpLookaround::Type lookaround_type() const { return lookaround_type_; } 369 // Index in captures array of first capture in this sub-expression, if any. 370 // Also the capture index of this sub-expression itself, if group_type 371 // is CAPTURE. 372 int capture_index() const { return disjunction_capture_index_; } 373 // The name of the current sub-expression, if group_type is CAPTURE. Only 374 // used for named captures. 375 const ZoneVector<base::uc16>* capture_name() const { return capture_name_; } 376 std::pair<int, int> non_participating_capture_group_interval() const { 377 return non_participating_capture_group_interval_; 378 } 379 380 bool IsNamedCapture() const { return capture_name_ != nullptr; } 381 382 // Check whether the parser is inside a capture group with the given index. 383 bool IsInsideCaptureGroup(int index) const { 384 for (const RegExpParserState* s = this; s != nullptr; 385 s = s->previous_state()) { 386 if (s->group_type() != CAPTURE) continue; 387 // Return true if we found the matching capture index. 388 if (index == s->capture_index()) return true; 389 // Abort if index is larger than what has been parsed up till this state. 390 if (index > s->capture_index()) return false; 391 } 392 return false; 393 } 394 395 // Check whether the parser is inside a capture group with the given name. 396 bool IsInsideCaptureGroup(const ZoneVector<base::uc16>* name) const { 397 DCHECK_NOT_NULL(name); 398 for (const RegExpParserState* s = this; s != nullptr; 399 s = s->previous_state()) { 400 if (s->capture_name() == nullptr) continue; 401 if (*s->capture_name() == *name) return true; 402 } 403 return false; 404 } 405 406 void NewAlternative(int captures_started) { 407 if (non_participating_capture_group_interval().second != 0) { 408 // Extend the non-participating interval. 409 non_participating_capture_group_interval_.second = captures_started; 410 } else { 411 // Create new non-participating interval from the start of the current 412 // enclosing group to all captures created within that group so far. 413 non_participating_capture_group_interval_ = 414 std::make_pair(capture_index(), captures_started); 415 } 416 } 417 418 private: 419 // Linked list implementation of stack of states. 420 RegExpParserState* const previous_state_; 421 // Builder for the stored disjunction. 422 RegExpBuilder builder_; 423 // Stored disjunction type (capture, look-ahead or grouping), if any. 424 const SubexpressionType group_type_; 425 // Stored read direction. 426 const RegExpLookaround::Type lookaround_type_; 427 // Stored disjunction's capture index (if any). 428 const int disjunction_capture_index_; 429 // Stored capture name (if any). 430 const ZoneVector<base::uc16>* const capture_name_; 431 // Interval of (named) capture indices ]from, to] that are not participating 432 // in the current state (i.e. they cannot match). 433 // Capture indices are not participating if they were created in a different 434 // alternative. 435 std::pair<int, int> non_participating_capture_group_interval_; 436 }; 437 438 template <class CharT> 439 class RegExpParserImpl final { 440 private: 441 RegExpParserImpl(const CharT* input, int input_length, RegExpFlags flags, 442 uintptr_t stack_limit, Zone* zone, 443 const DisallowGarbageCollection& no_gc); 444 445 bool Parse(RegExpCompileData* result); 446 447 RegExpTree* ParsePattern(); 448 RegExpTree* ParseDisjunction(); 449 RegExpTree* ParseGroup(); 450 451 // Parses a {...,...} quantifier and stores the range in the given 452 // out parameters. 453 bool ParseIntervalQuantifier(int* min_out, int* max_out); 454 455 // Checks whether the following is a length-digit hexadecimal number, 456 // and sets the value if it is. 457 bool ParseHexEscape(int length, base::uc32* value); 458 bool ParseUnicodeEscape(base::uc32* value); 459 bool ParseUnlimitedLengthHexNumber(int max_value, base::uc32* value); 460 461 bool ParsePropertyClassName(ZoneVector<char>* name_1, 462 ZoneVector<char>* name_2); 463 bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to_range, 464 CharacterClassStrings* add_to_strings, bool negate, 465 const ZoneVector<char>& name_1, 466 const ZoneVector<char>& name_2); 467 468 RegExpTree* ParseClassRanges(ZoneList<CharacterRange>* ranges, 469 bool add_unicode_case_equivalents); 470 // Parse inside a class. Either add escaped class to the range, or return 471 // false and pass parsed single character through |char_out|. 472 void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone, 473 bool add_unicode_case_equivalents, base::uc32* char_out, 474 bool* is_class_escape); 475 // Returns true iff parsing was successful. 476 bool TryParseCharacterClassEscape(base::uc32 next, 477 InClassEscapeState in_class_escape_state, 478 ZoneList<CharacterRange>* ranges, 479 CharacterClassStrings* strings, Zone* zone, 480 bool add_unicode_case_equivalents); 481 RegExpTree* ParseClassStringDisjunction(ZoneList<CharacterRange>* ranges, 482 CharacterClassStrings* strings); 483 RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder, 484 ClassSetOperandType* type_out); 485 RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder, 486 ClassSetOperandType* type_out, 487 ZoneList<CharacterRange>* ranges, 488 CharacterClassStrings* strings, 489 base::uc32* character); 490 base::uc32 ParseClassSetCharacter(); 491 // Parses and returns a single escaped character. 492 base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state, 493 bool* is_escaped_unicode_character); 494 495 void AddMaybeSimpleCaseFoldedRange(ZoneList<CharacterRange>* ranges, 496 CharacterRange new_range); 497 498 RegExpTree* ParseClassUnion(const RegExpBuilder* builder, bool is_negated, 499 RegExpTree* first_operand, 500 ClassSetOperandType first_operand_type, 501 ZoneList<CharacterRange>* ranges, 502 CharacterClassStrings* strings, 503 base::uc32 first_character); 504 RegExpTree* ParseClassIntersection(const RegExpBuilder* builder, 505 bool is_negated, RegExpTree* first_operand, 506 ClassSetOperandType first_operand_type); 507 RegExpTree* ParseClassSubtraction(const RegExpBuilder* builder, 508 bool is_negated, RegExpTree* first_operand, 509 ClassSetOperandType first_operand_type); 510 RegExpTree* ParseCharacterClass(const RegExpBuilder* state); 511 512 base::uc32 ParseOctalLiteral(); 513 514 // Tries to parse the input as a back reference. If successful it 515 // stores the result in the output parameter and returns true. If 516 // it fails it will push back the characters read so the same characters 517 // can be reparsed. 518 bool ParseBackReferenceIndex(int* index_out); 519 520 RegExpTree* ReportError(RegExpError error); 521 void Advance(); 522 void Advance(int dist); 523 void RewindByOneCodepoint(); // Rewinds to before the previous Advance(). 524 void Reset(int pos); 525 526 // Reports whether the pattern might be used as a literal search string. 527 // Only use if the result of the parse is a single atom node. 528 bool simple() const { return simple_; } 529 bool contains_anchor() const { return contains_anchor_; } 530 void set_contains_anchor() { contains_anchor_ = true; } 531 int captures_started() const { return captures_started_; } 532 int position() const { 533 const bool current_is_surrogate = 534 current() != kEndMarker && 535 current() > unibrow::Utf16::kMaxNonSurrogateCharCode; 536 const int rewind_bytes = current_is_surrogate ? 2 : 1; 537 return next_pos_ - rewind_bytes; 538 } 539 bool failed() const { return failed_; } 540 RegExpFlags flags() const { return flags_; } 541 bool IsUnicodeMode() const { 542 // Either /v or /u enable UnicodeMode 543 // https://tc39.es/ecma262/#sec-parsepattern 544 return IsUnicode(flags()) || IsUnicodeSets(flags()) || force_unicode_; 545 } 546 bool unicode_sets() const { return IsUnicodeSets(flags()); } 547 bool ignore_case() const { return IsIgnoreCase(flags()); } 548 549 static bool IsSyntaxCharacterOrSlash(base::uc32 c); 550 static bool IsClassSetSyntaxCharacter(base::uc32 c); 551 static bool IsClassSetReservedPunctuator(base::uc32 c); 552 bool IsClassSetReservedDoublePunctuator(base::uc32 c); 553 554 static const base::uc32 kEndMarker = (1 << 21); 555 556 private: 557 // Return the 1-indexed RegExpCapture object, allocate if necessary. 558 RegExpCapture* GetCapture(int index); 559 560 // Creates a new named capture at the specified index. Must be called exactly 561 // once for each named capture. Fails if a capture with the same name is 562 // encountered. 563 bool CreateNamedCaptureAtIndex(const RegExpParserState* state, int index); 564 565 // Parses the name of a capture group (?<name>pattern). The name must adhere 566 // to IdentifierName in the ECMAScript standard. 567 const ZoneVector<base::uc16>* ParseCaptureGroupName(); 568 569 bool ParseNamedBackReference(RegExpBuilder* builder, 570 RegExpParserState* state); 571 RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); 572 573 // After the initial parsing pass, patch corresponding RegExpCapture objects 574 // into all RegExpBackReferences. This is done after initial parsing in order 575 // to avoid complicating cases in which references comes before the capture. 576 void PatchNamedBackReferences(); 577 578 ZoneVector<RegExpCapture*>* GetNamedCaptures(); 579 580 // Returns true iff the pattern contains named captures. May call 581 // ScanForCaptures to look ahead at the remaining pattern. 582 bool HasNamedCaptures(InClassEscapeState in_class_escape_state); 583 584 Zone* zone() const { return zone_; } 585 586 base::uc32 current() const { return current_; } 587 bool has_more() const { return has_more_; } 588 bool has_next() const { return next_pos_ < input_length(); } 589 base::uc32 Next(); 590 template <bool update_position> 591 base::uc32 ReadNext(); 592 CharT InputAt(int index) const { 593 DCHECK(0 <= index && index < input_length()); 594 return input_[index]; 595 } 596 int input_length() const { return input_length_; } 597 void ScanForCaptures(InClassEscapeState in_class_escape_state); 598 599 struct RegExpCaptureNameLess { 600 bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { 601 DCHECK_NOT_NULL(lhs); 602 DCHECK_NOT_NULL(rhs); 603 return *lhs->name() < *rhs->name(); 604 } 605 }; 606 607 class ForceUnicodeScope final { 608 public: 609 explicit ForceUnicodeScope(RegExpParserImpl<CharT>* parser) 610 : parser_(parser) { 611 DCHECK(!parser_->force_unicode_); 612 parser_->force_unicode_ = true; 613 } 614 ~ForceUnicodeScope() { 615 DCHECK(parser_->force_unicode_); 616 parser_->force_unicode_ = false; 617 } 618 619 private: 620 RegExpParserImpl<CharT>* const parser_; 621 }; 622 623 const DisallowGarbageCollection no_gc_; 624 Zone* const zone_; 625 RegExpError error_ = RegExpError::kNone; 626 int error_pos_ = 0; 627 ZoneList<RegExpCapture*>* captures_; 628 // Maps capture names to a list of capture indices with this name. 629 ZoneMap<RegExpCapture*, ZoneList<int>*, RegExpCaptureNameLess>* 630 named_captures_; 631 ZoneList<RegExpBackReference*>* named_back_references_; 632 const CharT* const input_; 633 const int input_length_; 634 base::uc32 current_; 635 RegExpFlags flags_; 636 bool force_unicode_ = false; // Force parser to act as if unicode were set. 637 int next_pos_; 638 int captures_started_; 639 int capture_count_; // Only valid after we have scanned for captures. 640 int quantifier_count_; 641 int lookaround_count_; // Only valid after we have scanned for lookbehinds. 642 bool has_more_; 643 bool simple_; 644 bool contains_anchor_; 645 bool is_scanned_for_captures_; 646 bool has_named_captures_; // Only valid after we have scanned for captures. 647 bool failed_; 648 const uintptr_t stack_limit_; 649 650 friend class v8::internal::RegExpParser; 651 }; 652 653 template <class CharT> 654 RegExpParserImpl<CharT>::RegExpParserImpl( 655 const CharT* input, int input_length, RegExpFlags flags, 656 uintptr_t stack_limit, Zone* zone, const DisallowGarbageCollection& no_gc) 657 : zone_(zone), 658 captures_(nullptr), 659 named_captures_(nullptr), 660 named_back_references_(nullptr), 661 input_(input), 662 input_length_(input_length), 663 current_(kEndMarker), 664 flags_(flags), 665 next_pos_(0), 666 captures_started_(0), 667 capture_count_(0), 668 quantifier_count_(0), 669 lookaround_count_(0), 670 has_more_(true), 671 simple_(false), 672 contains_anchor_(false), 673 is_scanned_for_captures_(false), 674 has_named_captures_(false), 675 failed_(false), 676 stack_limit_(stack_limit) { 677 Advance(); 678 } 679 680 template <> 681 template <bool update_position> 682 inline base::uc32 RegExpParserImpl<uint8_t>::ReadNext() { 683 int position = next_pos_; 684 base::uc16 c0 = InputAt(position); 685 position++; 686 DCHECK(!unibrow::Utf16::IsLeadSurrogate(c0)); 687 if (update_position) next_pos_ = position; 688 return c0; 689 } 690 691 template <> 692 template <bool update_position> 693 inline base::uc32 RegExpParserImpl<base::uc16>::ReadNext() { 694 int position = next_pos_; 695 base::uc16 c0 = InputAt(position); 696 base::uc32 result = c0; 697 position++; 698 // Read the whole surrogate pair in case of unicode mode, if possible. 699 if (IsUnicodeMode() && position < input_length() && 700 unibrow::Utf16::IsLeadSurrogate(c0)) { 701 base::uc16 c1 = InputAt(position); 702 if (unibrow::Utf16::IsTrailSurrogate(c1)) { 703 result = unibrow::Utf16::CombineSurrogatePair(c0, c1); 704 position++; 705 } 706 } 707 if (update_position) next_pos_ = position; 708 return result; 709 } 710 711 template <class CharT> 712 base::uc32 RegExpParserImpl<CharT>::Next() { 713 if (has_next()) { 714 return ReadNext<false>(); 715 } else { 716 return kEndMarker; 717 } 718 } 719 720 template <class CharT> 721 void RegExpParserImpl<CharT>::Advance() { 722 if (has_next()) { 723 if (GetCurrentStackPosition() < stack_limit_) { 724 if (v8_flags.correctness_fuzzer_suppressions) { 725 FATAL("Aborting on stack overflow"); 726 } 727 ReportError(RegExpError::kStackOverflow); 728 } else { 729 current_ = ReadNext<true>(); 730 } 731 } else { 732 current_ = kEndMarker; 733 // Advance so that position() points to 1-after-the-last-character. This is 734 // important so that Reset() to this position works correctly. 735 next_pos_ = input_length() + 1; 736 has_more_ = false; 737 } 738 } 739 740 template <class CharT> 741 void RegExpParserImpl<CharT>::RewindByOneCodepoint() { 742 if (!has_more()) return; 743 // Rewinds by one code point, i.e.: two code units if `current` is outside 744 // the basic multilingual plane (= composed of a lead and trail surrogate), 745 // or one code unit otherwise. 746 const int rewind_by = 747 current() > unibrow::Utf16::kMaxNonSurrogateCharCode ? -2 : -1; 748 Advance(rewind_by); // Undo the last Advance. 749 } 750 751 template <class CharT> 752 void RegExpParserImpl<CharT>::Reset(int pos) { 753 next_pos_ = pos; 754 has_more_ = (pos < input_length()); 755 Advance(); 756 } 757 758 template <class CharT> 759 void RegExpParserImpl<CharT>::Advance(int dist) { 760 next_pos_ += dist - 1; 761 Advance(); 762 } 763 764 // static 765 template <class CharT> 766 bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(base::uc32 c) { 767 switch (c) { 768 case '^': 769 case '$': 770 case '\\': 771 case '.': 772 case '*': 773 case '+': 774 case '?': 775 case '(': 776 case ')': 777 case '[': 778 case ']': 779 case '{': 780 case '}': 781 case '|': 782 case '/': 783 return true; 784 default: 785 break; 786 } 787 return false; 788 } 789 790 // static 791 template <class CharT> 792 bool RegExpParserImpl<CharT>::IsClassSetSyntaxCharacter(base::uc32 c) { 793 switch (c) { 794 case '(': 795 case ')': 796 case '[': 797 case ']': 798 case '{': 799 case '}': 800 case '/': 801 case '-': 802 case '\\': 803 case '|': 804 return true; 805 default: 806 break; 807 } 808 return false; 809 } 810 811 // static 812 template <class CharT> 813 bool RegExpParserImpl<CharT>::IsClassSetReservedPunctuator(base::uc32 c) { 814 switch (c) { 815 case '&': 816 case '-': 817 case '!': 818 case '#': 819 case '%': 820 case ',': 821 case ':': 822 case ';': 823 case '<': 824 case '=': 825 case '>': 826 case '@': 827 case '`': 828 case '~': 829 return true; 830 default: 831 break; 832 } 833 return false; 834 } 835 836 template <class CharT> 837 bool RegExpParserImpl<CharT>::IsClassSetReservedDoublePunctuator(base::uc32 c) { 838 #define DOUBLE_PUNCTUATOR_CASE(Char) \ 839 case Char: \ 840 return Next() == Char 841 842 switch (c) { 843 DOUBLE_PUNCTUATOR_CASE('&'); 844 DOUBLE_PUNCTUATOR_CASE('!'); 845 DOUBLE_PUNCTUATOR_CASE('#'); 846 DOUBLE_PUNCTUATOR_CASE('$'); 847 DOUBLE_PUNCTUATOR_CASE('%'); 848 DOUBLE_PUNCTUATOR_CASE('*'); 849 DOUBLE_PUNCTUATOR_CASE('+'); 850 DOUBLE_PUNCTUATOR_CASE(','); 851 DOUBLE_PUNCTUATOR_CASE('.'); 852 DOUBLE_PUNCTUATOR_CASE(':'); 853 DOUBLE_PUNCTUATOR_CASE(';'); 854 DOUBLE_PUNCTUATOR_CASE('<'); 855 DOUBLE_PUNCTUATOR_CASE('='); 856 DOUBLE_PUNCTUATOR_CASE('>'); 857 DOUBLE_PUNCTUATOR_CASE('?'); 858 DOUBLE_PUNCTUATOR_CASE('@'); 859 DOUBLE_PUNCTUATOR_CASE('^'); 860 DOUBLE_PUNCTUATOR_CASE('`'); 861 DOUBLE_PUNCTUATOR_CASE('~'); 862 default: 863 break; 864 } 865 #undef DOUBLE_PUNCTUATOR_CASE 866 867 return false; 868 } 869 870 template <class CharT> 871 RegExpTree* RegExpParserImpl<CharT>::ReportError(RegExpError error) { 872 if (failed_) return nullptr; // Do not overwrite any existing error. 873 failed_ = true; 874 error_ = error; 875 error_pos_ = position(); 876 // Zip to the end to make sure no more input is read. 877 current_ = kEndMarker; 878 next_pos_ = input_length(); 879 has_more_ = false; 880 return nullptr; 881 } 882 883 #define CHECK_FAILED /**/); \ 884 if (failed_) return nullptr; \ 885 ((void)0 886 887 // Pattern :: 888 // Disjunction 889 template <class CharT> 890 RegExpTree* RegExpParserImpl<CharT>::ParsePattern() { 891 RegExpTree* result = ParseDisjunction(CHECK_FAILED); 892 PatchNamedBackReferences(CHECK_FAILED); 893 DCHECK(!has_more()); 894 // If the result of parsing is a literal string atom, and it has the 895 // same length as the input, then the atom is identical to the input. 896 if (result->IsAtom() && result->AsAtom()->length() == input_length()) { 897 simple_ = true; 898 } 899 return result; 900 } 901 902 // Disjunction :: 903 // Alternative 904 // Alternative | Disjunction 905 // Alternative :: 906 // [empty] 907 // Term Alternative 908 // Term :: 909 // Assertion 910 // Atom 911 // Atom Quantifier 912 template <class CharT> 913 RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() { 914 // Used to store current state while parsing subexpressions. 915 RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, 916 0, nullptr, flags(), zone()); 917 RegExpParserState* state = &initial_state; 918 // Cache the builder in a local variable for quick access. 919 RegExpBuilder* builder = initial_state.builder(); 920 while (true) { 921 switch (current()) { 922 case kEndMarker: 923 if (failed()) return nullptr; // E.g. the initial Advance failed. 924 if (state->IsSubexpression()) { 925 // Inside a parenthesized group when hitting end of input. 926 return ReportError(RegExpError::kUnterminatedGroup); 927 } 928 DCHECK_EQ(INITIAL, state->group_type()); 929 // Parsing completed successfully. 930 return builder->ToRegExp(); 931 case ')': { 932 if (!state->IsSubexpression()) { 933 return ReportError(RegExpError::kUnmatchedParen); 934 } 935 DCHECK_NE(INITIAL, state->group_type()); 936 937 Advance(); 938 // End disjunction parsing and convert builder content to new single 939 // regexp atom. 940 RegExpTree* body = builder->ToRegExp(); 941 942 int end_capture_index = captures_started(); 943 944 int capture_index = state->capture_index(); 945 SubexpressionType group_type = state->group_type(); 946 947 // Build result of subexpression. 948 if (group_type == CAPTURE) { 949 if (state->IsNamedCapture()) { 950 CreateNamedCaptureAtIndex(state, capture_index CHECK_FAILED); 951 } 952 RegExpCapture* capture = GetCapture(capture_index); 953 capture->set_body(body); 954 body = capture; 955 } else if (group_type == GROUPING) { 956 body = zone()->template New<RegExpGroup>(body, builder->flags()); 957 } else { 958 DCHECK(group_type == POSITIVE_LOOKAROUND || 959 group_type == NEGATIVE_LOOKAROUND); 960 bool is_positive = (group_type == POSITIVE_LOOKAROUND); 961 body = zone()->template New<RegExpLookaround>( 962 body, is_positive, end_capture_index - capture_index, 963 capture_index, state->lookaround_type(), lookaround_count_); 964 lookaround_count_++; 965 } 966 967 // Restore previous state. 968 state = state->previous_state(); 969 builder = state->builder(); 970 971 builder->AddAtom(body); 972 // For compatibility with JSC and ES3, we allow quantifiers after 973 // lookaheads, and break in all cases. 974 break; 975 } 976 case '|': { 977 Advance(); 978 state->NewAlternative(captures_started()); 979 builder->NewAlternative(); 980 continue; 981 } 982 case '*': 983 case '+': 984 case '?': 985 return ReportError(RegExpError::kNothingToRepeat); 986 case '^': { 987 Advance(); 988 builder->AddAssertion(zone()->template New<RegExpAssertion>( 989 builder->multiline() ? RegExpAssertion::Type::START_OF_LINE 990 : RegExpAssertion::Type::START_OF_INPUT)); 991 set_contains_anchor(); 992 continue; 993 } 994 case '$': { 995 Advance(); 996 RegExpAssertion::Type assertion_type = 997 builder->multiline() ? RegExpAssertion::Type::END_OF_LINE 998 : RegExpAssertion::Type::END_OF_INPUT; 999 builder->AddAssertion( 1000 zone()->template New<RegExpAssertion>(assertion_type)); 1001 continue; 1002 } 1003 case '.': { 1004 Advance(); 1005 ZoneList<CharacterRange>* ranges = 1006 zone()->template New<ZoneList<CharacterRange>>(2, zone()); 1007 1008 if (builder->dotall()) { 1009 // Everything. 1010 CharacterRange::AddClassEscape(StandardCharacterSet::kEverything, 1011 ranges, false, zone()); 1012 } else { 1013 // Everything except \x0A, \x0D, \u2028 and \u2029. 1014 CharacterRange::AddClassEscape( 1015 StandardCharacterSet::kNotLineTerminator, ranges, false, zone()); 1016 } 1017 1018 RegExpClassRanges* cc = 1019 zone()->template New<RegExpClassRanges>(zone(), ranges); 1020 builder->AddClassRanges(cc); 1021 break; 1022 } 1023 case '(': { 1024 state = ParseOpenParenthesis(state CHECK_FAILED); 1025 builder = state->builder(); 1026 flags_ = builder->flags(); 1027 continue; 1028 } 1029 case '[': { 1030 RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED); 1031 if (cc->IsClassRanges()) { 1032 builder->AddClassRanges(cc->AsClassRanges()); 1033 } else { 1034 DCHECK(cc->IsClassSetExpression()); 1035 builder->AddTerm(cc); 1036 } 1037 break; 1038 } 1039 // Atom :: 1040 // \ AtomEscape 1041 case '\\': 1042 switch (Next()) { 1043 case kEndMarker: 1044 return ReportError(RegExpError::kEscapeAtEndOfPattern); 1045 // AtomEscape :: 1046 // [+UnicodeMode] DecimalEscape 1047 // [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber 1048 // of DecimalEscape is ≤ NcapturingParens 1049 // CharacterEscape (some cases of this mixed in too) 1050 // 1051 // TODO(jgruber): It may make sense to disentangle all the different 1052 // cases and make the structure mirror the spec, e.g. for AtomEscape: 1053 // 1054 // if (TryParseDecimalEscape(...)) return; 1055 // if (TryParseCharacterClassEscape(...)) return; 1056 // if (TryParseCharacterEscape(...)) return; 1057 // if (TryParseGroupName(...)) return; 1058 case '1': 1059 case '2': 1060 case '3': 1061 case '4': 1062 case '5': 1063 case '6': 1064 case '7': 1065 case '8': 1066 case '9': { 1067 int index = 0; 1068 const bool is_backref = 1069 ParseBackReferenceIndex(&index CHECK_FAILED); 1070 if (is_backref) { 1071 if (state->IsInsideCaptureGroup(index)) { 1072 // The back reference is inside the capture group it refers to. 1073 // Nothing can possibly have been captured yet, so we use empty 1074 // instead. This ensures that, when checking a back reference, 1075 // the capture registers of the referenced capture are either 1076 // both set or both cleared. 1077 builder->AddEmpty(); 1078 } else { 1079 RegExpCapture* capture = GetCapture(index); 1080 RegExpTree* atom = 1081 zone()->template New<RegExpBackReference>(capture, zone()); 1082 builder->AddAtom(atom); 1083 } 1084 break; 1085 } 1086 // With /u and /v, no identity escapes except for syntax characters 1087 // are allowed. Otherwise, all identity escapes are allowed. 1088 if (IsUnicodeMode()) { 1089 return ReportError(RegExpError::kInvalidEscape); 1090 } 1091 base::uc32 first_digit = Next(); 1092 if (first_digit == '8' || first_digit == '9') { 1093 builder->AddCharacter(first_digit); 1094 Advance(2); 1095 break; 1096 } 1097 [[fallthrough]]; 1098 } 1099 case '0': { 1100 Advance(); 1101 if (IsUnicodeMode() && Next() >= '0' && Next() <= '9') { 1102 // Decimal escape with leading 0 are not parsed as octal. 1103 return ReportError(RegExpError::kInvalidDecimalEscape); 1104 } 1105 base::uc32 octal = ParseOctalLiteral(); 1106 builder->AddCharacter(octal); 1107 break; 1108 } 1109 case 'b': 1110 Advance(2); 1111 builder->AddAssertion(zone()->template New<RegExpAssertion>( 1112 RegExpAssertion::Type::BOUNDARY)); 1113 continue; 1114 case 'B': 1115 Advance(2); 1116 builder->AddAssertion(zone()->template New<RegExpAssertion>( 1117 RegExpAssertion::Type::NON_BOUNDARY)); 1118 continue; 1119 // AtomEscape :: 1120 // CharacterClassEscape 1121 case 'd': 1122 case 'D': 1123 case 's': 1124 case 'S': 1125 case 'w': 1126 case 'W': { 1127 base::uc32 next = Next(); 1128 ZoneList<CharacterRange>* ranges = 1129 zone()->template New<ZoneList<CharacterRange>>(2, zone()); 1130 bool add_unicode_case_equivalents = 1131 IsUnicodeMode() && ignore_case(); 1132 bool parsed_character_class_escape = TryParseCharacterClassEscape( 1133 next, InClassEscapeState::kNotInClass, ranges, nullptr, zone(), 1134 add_unicode_case_equivalents CHECK_FAILED); 1135 1136 if (parsed_character_class_escape) { 1137 RegExpClassRanges* cc = 1138 zone()->template New<RegExpClassRanges>(zone(), ranges); 1139 builder->AddClassRanges(cc); 1140 } else { 1141 CHECK(!IsUnicodeMode()); 1142 Advance(2); 1143 builder->AddCharacter(next); // IdentityEscape. 1144 } 1145 break; 1146 } 1147 case 'p': 1148 case 'P': { 1149 base::uc32 next = Next(); 1150 ZoneList<CharacterRange>* ranges = 1151 zone()->template New<ZoneList<CharacterRange>>(2, zone()); 1152 CharacterClassStrings* strings = nullptr; 1153 if (unicode_sets()) { 1154 strings = zone()->template New<CharacterClassStrings>(zone()); 1155 } 1156 bool add_unicode_case_equivalents = ignore_case(); 1157 bool parsed_character_class_escape = TryParseCharacterClassEscape( 1158 next, InClassEscapeState::kNotInClass, ranges, strings, zone(), 1159 add_unicode_case_equivalents CHECK_FAILED); 1160 1161 if (parsed_character_class_escape) { 1162 if (unicode_sets()) { 1163 RegExpClassSetOperand* op = 1164 zone()->template New<RegExpClassSetOperand>(ranges, 1165 strings); 1166 builder->AddTerm(op); 1167 } else { 1168 RegExpClassRanges* cc = 1169 zone()->template New<RegExpClassRanges>(zone(), ranges); 1170 builder->AddClassRanges(cc); 1171 } 1172 } else { 1173 CHECK(!IsUnicodeMode()); 1174 Advance(2); 1175 builder->AddCharacter(next); // IdentityEscape. 1176 } 1177 break; 1178 } 1179 // AtomEscape :: 1180 // k GroupName 1181 case 'k': { 1182 // Either an identity escape or a named back-reference. The two 1183 // interpretations are mutually exclusive: '\k' is interpreted as 1184 // an identity escape for non-Unicode patterns without named 1185 // capture groups, and as the beginning of a named back-reference 1186 // in all other cases. 1187 const bool has_named_captures = 1188 HasNamedCaptures(InClassEscapeState::kNotInClass CHECK_FAILED); 1189 if (IsUnicodeMode() || has_named_captures) { 1190 Advance(2); 1191 ParseNamedBackReference(builder, state CHECK_FAILED); 1192 break; 1193 } 1194 } 1195 [[fallthrough]]; 1196 // AtomEscape :: 1197 // CharacterEscape 1198 default: { 1199 bool is_escaped_unicode_character = false; 1200 base::uc32 c = ParseCharacterEscape( 1201 InClassEscapeState::kNotInClass, 1202 &is_escaped_unicode_character CHECK_FAILED); 1203 if (is_escaped_unicode_character) { 1204 builder->AddEscapedUnicodeCharacter(c); 1205 } else { 1206 builder->AddCharacter(c); 1207 } 1208 break; 1209 } 1210 } 1211 break; 1212 case '{': { 1213 int dummy; 1214 bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED); 1215 if (parsed) return ReportError(RegExpError::kNothingToRepeat); 1216 [[fallthrough]]; 1217 } 1218 case '}': 1219 case ']': 1220 if (IsUnicodeMode()) { 1221 return ReportError(RegExpError::kLoneQuantifierBrackets); 1222 } 1223 [[fallthrough]]; 1224 default: 1225 builder->AddUnicodeCharacter(current()); 1226 Advance(); 1227 break; 1228 } // end switch(current()) 1229 1230 int min; 1231 int max; 1232 switch (current()) { 1233 // QuantifierPrefix :: 1234 // * 1235 // + 1236 // ? 1237 // { 1238 case '*': 1239 min = 0; 1240 max = RegExpTree::kInfinity; 1241 Advance(); 1242 break; 1243 case '+': 1244 min = 1; 1245 max = RegExpTree::kInfinity; 1246 Advance(); 1247 break; 1248 case '?': 1249 min = 0; 1250 max = 1; 1251 Advance(); 1252 break; 1253 case '{': 1254 if (ParseIntervalQuantifier(&min, &max)) { 1255 if (max < min) { 1256 return ReportError(RegExpError::kRangeOutOfOrder); 1257 } 1258 break; 1259 } else if (IsUnicodeMode()) { 1260 // Incomplete quantifiers are not allowed. 1261 return ReportError(RegExpError::kIncompleteQuantifier); 1262 } 1263 continue; 1264 default: 1265 continue; 1266 } 1267 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; 1268 if (current() == '?') { 1269 quantifier_type = RegExpQuantifier::NON_GREEDY; 1270 Advance(); 1271 } else if (v8_flags.regexp_possessive_quantifier && current() == '+') { 1272 // v8_flags.regexp_possessive_quantifier is a debug-only flag. 1273 quantifier_type = RegExpQuantifier::POSSESSIVE; 1274 Advance(); 1275 } 1276 if (!builder->AddQuantifierToAtom(min, max, quantifier_count_, 1277 quantifier_type)) { 1278 return ReportError(RegExpError::kInvalidQuantifier); 1279 } 1280 ++quantifier_count_; 1281 } 1282 } 1283 1284 template <class CharT> 1285 RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis( 1286 RegExpParserState* state) { 1287 RegExpLookaround::Type lookaround_type = state->lookaround_type(); 1288 bool is_named_capture = false; 1289 const ZoneVector<base::uc16>* capture_name = nullptr; 1290 SubexpressionType subexpr_type = CAPTURE; 1291 RegExpFlags flags = state->builder()->flags(); 1292 bool parsing_modifiers = false; 1293 bool modifiers_polarity = true; 1294 RegExpFlags modifiers; 1295 Advance(); 1296 if (current() == '?') { 1297 do { 1298 base::uc32 next = Next(); 1299 switch (next) { 1300 case '-': 1301 if (!v8_flags.js_regexp_modifiers) { 1302 ReportError(RegExpError::kInvalidGroup); 1303 return nullptr; 1304 } 1305 Advance(); 1306 parsing_modifiers = true; 1307 if (modifiers_polarity == false) { 1308 ReportError(RegExpError::kMultipleFlagDashes); 1309 return nullptr; 1310 } 1311 modifiers_polarity = false; 1312 break; 1313 case 'm': 1314 case 'i': 1315 case 's': { 1316 if (!v8_flags.js_regexp_modifiers) { 1317 ReportError(RegExpError::kInvalidGroup); 1318 return nullptr; 1319 } 1320 Advance(); 1321 parsing_modifiers = true; 1322 RegExpFlag flag = TryRegExpFlagFromChar(next).value(); 1323 if ((modifiers & flag) != 0) { 1324 ReportError(RegExpError::kRepeatedFlag); 1325 return nullptr; 1326 } 1327 modifiers |= flag; 1328 flags.set(flag, modifiers_polarity); 1329 break; 1330 } 1331 case ':': 1332 Advance(2); 1333 parsing_modifiers = false; 1334 subexpr_type = GROUPING; 1335 break; 1336 case '=': 1337 Advance(2); 1338 if (parsing_modifiers) { 1339 DCHECK(v8_flags.js_regexp_modifiers); 1340 ReportError(RegExpError::kInvalidGroup); 1341 return nullptr; 1342 } 1343 lookaround_type = RegExpLookaround::LOOKAHEAD; 1344 subexpr_type = POSITIVE_LOOKAROUND; 1345 break; 1346 case '!': 1347 Advance(2); 1348 if (parsing_modifiers) { 1349 DCHECK(v8_flags.js_regexp_modifiers); 1350 ReportError(RegExpError::kInvalidGroup); 1351 return nullptr; 1352 } 1353 lookaround_type = RegExpLookaround::LOOKAHEAD; 1354 subexpr_type = NEGATIVE_LOOKAROUND; 1355 break; 1356 case '<': 1357 Advance(); 1358 if (parsing_modifiers) { 1359 DCHECK(v8_flags.js_regexp_modifiers); 1360 ReportError(RegExpError::kInvalidGroup); 1361 return nullptr; 1362 } 1363 if (Next() == '=') { 1364 Advance(2); 1365 lookaround_type = RegExpLookaround::LOOKBEHIND; 1366 subexpr_type = POSITIVE_LOOKAROUND; 1367 break; 1368 } else if (Next() == '!') { 1369 Advance(2); 1370 lookaround_type = RegExpLookaround::LOOKBEHIND; 1371 subexpr_type = NEGATIVE_LOOKAROUND; 1372 break; 1373 } 1374 is_named_capture = true; 1375 has_named_captures_ = true; 1376 Advance(); 1377 break; 1378 default: 1379 ReportError(RegExpError::kInvalidGroup); 1380 return nullptr; 1381 } 1382 } while (parsing_modifiers); 1383 } 1384 if (modifiers_polarity == false) { 1385 // We encountered a dash. 1386 if (modifiers == 0) { 1387 ReportError(RegExpError::kInvalidFlagGroup); 1388 return nullptr; 1389 } 1390 } 1391 if (subexpr_type == CAPTURE) { 1392 if (captures_started_ >= RegExpMacroAssembler::kMaxCaptures) { 1393 ReportError(RegExpError::kTooManyCaptures); 1394 return nullptr; 1395 } 1396 captures_started_++; 1397 1398 if (is_named_capture) { 1399 capture_name = ParseCaptureGroupName(CHECK_FAILED); 1400 } 1401 } 1402 // Store current state and begin new disjunction parsing. 1403 return zone()->template New<RegExpParserState>( 1404 state, subexpr_type, lookaround_type, captures_started_, capture_name, 1405 flags, zone()); 1406 } 1407 1408 // In order to know whether an escape is a backreference or not we have to scan 1409 // the entire regexp and find the number of capturing parentheses. However we 1410 // don't want to scan the regexp twice unless it is necessary. This mini-parser 1411 // is called when needed. It can see the difference between capturing and 1412 // noncapturing parentheses and can skip character classes and backslash-escaped 1413 // characters. 1414 // 1415 // Important: The scanner has to be in a consistent state when calling 1416 // ScanForCaptures, e.g. not in the middle of an escape sequence '\[' or while 1417 // parsing a nested class. 1418 template <class CharT> 1419 void RegExpParserImpl<CharT>::ScanForCaptures( 1420 InClassEscapeState in_class_escape_state) { 1421 DCHECK(!is_scanned_for_captures_); 1422 const int saved_position = position(); 1423 // Start with captures started previous to current position 1424 int capture_count = captures_started(); 1425 // When we start inside a character class, skip everything inside the class. 1426 if (in_class_escape_state == InClassEscapeState::kInClass) { 1427 // \k is always invalid within a class in unicode mode, thus we should never 1428 // call ScanForCaptures within a class. 1429 DCHECK(!IsUnicodeMode()); 1430 int c; 1431 while ((c = current()) != kEndMarker) { 1432 Advance(); 1433 if (c == '\\') { 1434 Advance(); 1435 } else { 1436 if (c == ']') break; 1437 } 1438 } 1439 } 1440 // Add count of captures after this position. 1441 int n; 1442 while ((n = current()) != kEndMarker) { 1443 Advance(); 1444 switch (n) { 1445 case '\\': 1446 Advance(); 1447 break; 1448 case '[': { 1449 int class_nest_level = 0; 1450 int c; 1451 while ((c = current()) != kEndMarker) { 1452 Advance(); 1453 if (c == '\\') { 1454 Advance(); 1455 } else if (c == '[') { 1456 // With /v, '[' inside a class is treated as a nested class. 1457 // Without /v, '[' is a normal character. 1458 if (unicode_sets()) class_nest_level++; 1459 } else if (c == ']') { 1460 if (class_nest_level == 0) break; 1461 class_nest_level--; 1462 } 1463 } 1464 break; 1465 } 1466 case '(': 1467 if (current() == '?') { 1468 // At this point we could be in 1469 // * a non-capturing group '(:', 1470 // * a lookbehind assertion '(?<=' '(?<!' 1471 // * or a named capture '(?<'. 1472 // 1473 // Of these, only named captures are capturing groups. 1474 1475 Advance(); 1476 if (current() != '<') break; 1477 1478 Advance(); 1479 if (current() == '=' || current() == '!') break; 1480 1481 // Found a possible named capture. It could turn out to be a syntax 1482 // error (e.g. an unterminated or invalid name), but that distinction 1483 // does not matter for our purposes. 1484 has_named_captures_ = true; 1485 } 1486 capture_count++; 1487 break; 1488 } 1489 } 1490 capture_count_ = capture_count; 1491 is_scanned_for_captures_ = true; 1492 Reset(saved_position); 1493 } 1494 1495 template <class CharT> 1496 bool RegExpParserImpl<CharT>::ParseBackReferenceIndex(int* index_out) { 1497 DCHECK_EQ('\\', current()); 1498 DCHECK('1' <= Next() && Next() <= '9'); 1499 // Try to parse a decimal literal that is no greater than the total number 1500 // of left capturing parentheses in the input. 1501 int start = position(); 1502 int value = Next() - '0'; 1503 Advance(2); 1504 while (true) { 1505 base::uc32 c = current(); 1506 if (IsDecimalDigit(c)) { 1507 value = 10 * value + (c - '0'); 1508 if (value > RegExpMacroAssembler::kMaxCaptures) { 1509 Reset(start); 1510 return false; 1511 } 1512 Advance(); 1513 } else { 1514 break; 1515 } 1516 } 1517 if (value > captures_started()) { 1518 if (!is_scanned_for_captures_) { 1519 ScanForCaptures(InClassEscapeState::kNotInClass); 1520 } 1521 if (value > capture_count_) { 1522 Reset(start); 1523 return false; 1524 } 1525 } 1526 *index_out = value; 1527 return true; 1528 } 1529 1530 namespace { 1531 1532 void push_code_unit(ZoneVector<base::uc16>* v, uint32_t code_unit) { 1533 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { 1534 v->push_back(code_unit); 1535 } else { 1536 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); 1537 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); 1538 } 1539 } 1540 1541 } // namespace 1542 1543 template <class CharT> 1544 const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() { 1545 // Due to special Advance requirements (see the next comment), rewind by one 1546 // such that names starting with a surrogate pair are parsed correctly for 1547 // patterns where the unicode flag is unset. 1548 // 1549 // Note that we use this odd pattern of rewinding the last advance in order 1550 // to adhere to the common parser behavior of expecting `current` to point at 1551 // the first candidate character for a function (e.g. when entering ParseFoo, 1552 // `current` should point at the first character of Foo). 1553 RewindByOneCodepoint(); 1554 1555 ZoneVector<base::uc16>* name = 1556 zone()->template New<ZoneVector<base::uc16>>(zone()); 1557 1558 { 1559 // Advance behavior inside this function is tricky since 1560 // RegExpIdentifierName explicitly enables unicode (in spec terms, sets +U) 1561 // and thus allows surrogate pairs and \u{}-style escapes even in 1562 // non-unicode patterns. Therefore Advance within the capture group name 1563 // has to force-enable unicode, and outside the name revert to default 1564 // behavior. 1565 ForceUnicodeScope force_unicode(this); 1566 1567 bool at_start = true; 1568 while (true) { 1569 Advance(); 1570 base::uc32 c = current(); 1571 1572 // Convert unicode escapes. 1573 if (c == '\\' && Next() == 'u') { 1574 Advance(2); 1575 if (!ParseUnicodeEscape(&c)) { 1576 ReportError(RegExpError::kInvalidUnicodeEscape); 1577 return nullptr; 1578 } 1579 RewindByOneCodepoint(); 1580 } 1581 1582 // The backslash char is misclassified as both ID_Start and ID_Continue. 1583 if (c == '\\') { 1584 ReportError(RegExpError::kInvalidCaptureGroupName); 1585 return nullptr; 1586 } 1587 1588 if (at_start) { 1589 if (!IsIdentifierStart(c)) { 1590 ReportError(RegExpError::kInvalidCaptureGroupName); 1591 return nullptr; 1592 } 1593 push_code_unit(name, c); 1594 at_start = false; 1595 } else { 1596 if (c == '>') { 1597 break; 1598 } else if (IsIdentifierPart(c)) { 1599 push_code_unit(name, c); 1600 } else { 1601 ReportError(RegExpError::kInvalidCaptureGroupName); 1602 return nullptr; 1603 } 1604 } 1605 } 1606 } 1607 1608 // This final advance goes back into the state of pointing at the next 1609 // relevant char, which the rest of the parser expects. See also the previous 1610 // comments in this function. 1611 Advance(); 1612 return name; 1613 } 1614 1615 template <class CharT> 1616 bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex( 1617 const RegExpParserState* state, int index) { 1618 const ZoneVector<base::uc16>* name = state->capture_name(); 1619 const std::pair<int, int> non_participating_capture_group_interval = 1620 state->non_participating_capture_group_interval(); 1621 DCHECK(0 < index && index <= captures_started_); 1622 DCHECK_NOT_NULL(name); 1623 1624 RegExpCapture* capture = GetCapture(index); 1625 DCHECK_NULL(capture->name()); 1626 1627 capture->set_name(name); 1628 1629 if (named_captures_ == nullptr) { 1630 named_captures_ = zone_->template New< 1631 ZoneMap<RegExpCapture*, ZoneList<int>*, RegExpCaptureNameLess>>(zone()); 1632 } else { 1633 // Check for duplicates and bail if we find any. 1634 const auto& named_capture_it = named_captures_->find(capture); 1635 if (named_capture_it != named_captures_->end()) { 1636 if (v8_flags.js_regexp_duplicate_named_groups) { 1637 ZoneList<int>* named_capture_indices = named_capture_it->second; 1638 DCHECK_NOT_NULL(named_capture_indices); 1639 DCHECK(!named_capture_indices->is_empty()); 1640 for (int named_index : *named_capture_indices) { 1641 if (named_index <= non_participating_capture_group_interval.first || 1642 named_index > non_participating_capture_group_interval.second) { 1643 ReportError(RegExpError::kDuplicateCaptureGroupName); 1644 return false; 1645 } 1646 } 1647 } else { 1648 ReportError(RegExpError::kDuplicateCaptureGroupName); 1649 return false; 1650 } 1651 } 1652 } 1653 if (v8_flags.js_regexp_duplicate_named_groups) { 1654 // Check for nested named captures. This is necessary to find duplicate 1655 // named captures within the same disjunct. 1656 RegExpParserState* parent_state = state->previous_state(); 1657 if (parent_state && parent_state->IsInsideCaptureGroup(name)) { 1658 ReportError(RegExpError::kDuplicateCaptureGroupName); 1659 return false; 1660 } 1661 } 1662 1663 auto entry = named_captures_->try_emplace( 1664 capture, zone()->template New<ZoneList<int>>(1, zone())); 1665 entry.first->second->Add(index, zone()); 1666 return true; 1667 } 1668 1669 template <class CharT> 1670 bool RegExpParserImpl<CharT>::ParseNamedBackReference( 1671 RegExpBuilder* builder, RegExpParserState* state) { 1672 // The parser is assumed to be on the '<' in \k<name>. 1673 if (current() != '<') { 1674 ReportError(RegExpError::kInvalidNamedReference); 1675 return false; 1676 } 1677 1678 Advance(); 1679 const ZoneVector<base::uc16>* name = ParseCaptureGroupName(); 1680 if (name == nullptr) { 1681 return false; 1682 } 1683 1684 if (state->IsInsideCaptureGroup(name)) { 1685 builder->AddEmpty(); 1686 } else { 1687 RegExpBackReference* atom = 1688 zone()->template New<RegExpBackReference>(zone()); 1689 atom->set_name(name); 1690 1691 builder->AddAtom(atom); 1692 1693 if (named_back_references_ == nullptr) { 1694 named_back_references_ = 1695 zone()->template New<ZoneList<RegExpBackReference*>>(1, zone()); 1696 } 1697 named_back_references_->Add(atom, zone()); 1698 } 1699 1700 return true; 1701 } 1702 1703 template <class CharT> 1704 void RegExpParserImpl<CharT>::PatchNamedBackReferences() { 1705 if (named_back_references_ == nullptr) return; 1706 1707 if (named_captures_ == nullptr) { 1708 ReportError(RegExpError::kInvalidNamedCaptureReference); 1709 return; 1710 } 1711 1712 // Look up and patch the actual capture for each named back reference. 1713 1714 for (int i = 0; i < named_back_references_->length(); i++) { 1715 RegExpBackReference* ref = named_back_references_->at(i); 1716 1717 // Capture used to search the named_captures_ by name, index of the 1718 // capture is never used. 1719 static const int kInvalidIndex = 0; 1720 RegExpCapture* search_capture = 1721 zone()->template New<RegExpCapture>(kInvalidIndex); 1722 DCHECK_NULL(search_capture->name()); 1723 search_capture->set_name(ref->name()); 1724 1725 const auto& capture_it = named_captures_->find(search_capture); 1726 if (capture_it == named_captures_->end()) { 1727 ReportError(RegExpError::kInvalidNamedCaptureReference); 1728 return; 1729 } 1730 1731 DCHECK_IMPLIES(!v8_flags.js_regexp_duplicate_named_groups, 1732 capture_it->second->length() == 1); 1733 for (int index : *capture_it->second) { 1734 ref->add_capture(GetCapture(index), zone()); 1735 } 1736 } 1737 } 1738 1739 template <class CharT> 1740 RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) { 1741 // The index for the capture groups are one-based. Its index in the list is 1742 // zero-based. 1743 const int known_captures = 1744 is_scanned_for_captures_ ? capture_count_ : captures_started_; 1745 SBXCHECK(index >= 1 && index <= known_captures); 1746 if (captures_ == nullptr) { 1747 captures_ = 1748 zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone()); 1749 } 1750 while (captures_->length() < known_captures) { 1751 captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1), 1752 zone()); 1753 } 1754 return captures_->at(index - 1); 1755 } 1756 1757 template <class CharT> 1758 ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() { 1759 if (named_captures_ == nullptr) { 1760 return nullptr; 1761 } 1762 DCHECK(!named_captures_->empty()); 1763 1764 ZoneVector<RegExpCapture*>* flattened_named_captures = 1765 zone()->template New<ZoneVector<RegExpCapture*>>(zone()); 1766 for (auto capture : *named_captures_) { 1767 DCHECK_IMPLIES(!v8_flags.js_regexp_duplicate_named_groups, 1768 capture.second->length() == 1); 1769 for (int index : *capture.second) { 1770 flattened_named_captures->push_back(GetCapture(index)); 1771 } 1772 } 1773 return flattened_named_captures; 1774 } 1775 1776 template <class CharT> 1777 bool RegExpParserImpl<CharT>::HasNamedCaptures( 1778 InClassEscapeState in_class_escape_state) { 1779 if (has_named_captures_ || is_scanned_for_captures_) { 1780 return has_named_captures_; 1781 } 1782 1783 ScanForCaptures(in_class_escape_state); 1784 DCHECK(is_scanned_for_captures_); 1785 return has_named_captures_; 1786 } 1787 1788 // QuantifierPrefix :: 1789 // { DecimalDigits } 1790 // { DecimalDigits , } 1791 // { DecimalDigits , DecimalDigits } 1792 // 1793 // Returns true if parsing succeeds, and set the min_out and max_out 1794 // values. Values are truncated to RegExpTree::kInfinity if they overflow. 1795 template <class CharT> 1796 bool RegExpParserImpl<CharT>::ParseIntervalQuantifier(int* min_out, 1797 int* max_out) { 1798 DCHECK_EQ(current(), '{'); 1799 int start = position(); 1800 Advance(); 1801 int min = 0; 1802 if (!IsDecimalDigit(current())) { 1803 Reset(start); 1804 return false; 1805 } 1806 while (IsDecimalDigit(current())) { 1807 int next = current() - '0'; 1808 if (min > (RegExpTree::kInfinity - next) / 10) { 1809 // Overflow. Skip past remaining decimal digits and return -1. 1810 do { 1811 Advance(); 1812 } while (IsDecimalDigit(current())); 1813 min = RegExpTree::kInfinity; 1814 break; 1815 } 1816 min = 10 * min + next; 1817 Advance(); 1818 } 1819 int max = 0; 1820 if (current() == '}') { 1821 max = min; 1822 Advance(); 1823 } else if (current() == ',') { 1824 Advance(); 1825 if (current() == '}') { 1826 max = RegExpTree::kInfinity; 1827 Advance(); 1828 } else { 1829 while (IsDecimalDigit(current())) { 1830 int next = current() - '0'; 1831 if (max > (RegExpTree::kInfinity - next) / 10) { 1832 do { 1833 Advance(); 1834 } while (IsDecimalDigit(current())); 1835 max = RegExpTree::kInfinity; 1836 break; 1837 } 1838 max = 10 * max + next; 1839 Advance(); 1840 } 1841 if (current() != '}') { 1842 Reset(start); 1843 return false; 1844 } 1845 Advance(); 1846 } 1847 } else { 1848 Reset(start); 1849 return false; 1850 } 1851 *min_out = min; 1852 *max_out = max; 1853 return true; 1854 } 1855 1856 template <class CharT> 1857 base::uc32 RegExpParserImpl<CharT>::ParseOctalLiteral() { 1858 DCHECK(('0' <= current() && current() <= '7') || !has_more()); 1859 // For compatibility with some other browsers (not all), we parse 1860 // up to three octal digits with a value below 256. 1861 // ES#prod-annexB-LegacyOctalEscapeSequence 1862 base::uc32 value = current() - '0'; 1863 Advance(); 1864 if ('0' <= current() && current() <= '7') { 1865 value = value * 8 + current() - '0'; 1866 Advance(); 1867 if (value < 32 && '0' <= current() && current() <= '7') { 1868 value = value * 8 + current() - '0'; 1869 Advance(); 1870 } 1871 } 1872 return value; 1873 } 1874 1875 template <class CharT> 1876 bool RegExpParserImpl<CharT>::ParseHexEscape(int length, base::uc32* value) { 1877 int start = position(); 1878 base::uc32 val = 0; 1879 for (int i = 0; i < length; ++i) { 1880 base::uc32 c = current(); 1881 int d = base::HexValue(c); 1882 if (d < 0) { 1883 Reset(start); 1884 return false; 1885 } 1886 val = val * 16 + d; 1887 Advance(); 1888 } 1889 *value = val; 1890 return true; 1891 } 1892 1893 // This parses RegExpUnicodeEscapeSequence as described in ECMA262. 1894 template <class CharT> 1895 bool RegExpParserImpl<CharT>::ParseUnicodeEscape(base::uc32* value) { 1896 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are 1897 // allowed). In the latter case, the number of hex digits between { } is 1898 // arbitrary. \ and u have already been read. 1899 if (current() == '{' && IsUnicodeMode()) { 1900 int start = position(); 1901 Advance(); 1902 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { 1903 if (current() == '}') { 1904 Advance(); 1905 return true; 1906 } 1907 } 1908 Reset(start); 1909 return false; 1910 } 1911 // \u but no {, or \u{...} escapes not allowed. 1912 bool result = ParseHexEscape(4, value); 1913 if (result && IsUnicodeMode() && unibrow::Utf16::IsLeadSurrogate(*value) && 1914 current() == '\\') { 1915 // Attempt to read trail surrogate. 1916 int start = position(); 1917 if (Next() == 'u') { 1918 Advance(2); 1919 base::uc32 trail; 1920 if (ParseHexEscape(4, &trail) && 1921 unibrow::Utf16::IsTrailSurrogate(trail)) { 1922 *value = unibrow::Utf16::CombineSurrogatePair( 1923 static_cast<base::uc16>(*value), static_cast<base::uc16>(trail)); 1924 return true; 1925 } 1926 } 1927 Reset(start); 1928 } 1929 return result; 1930 } 1931 1932 #ifdef V8_INTL_SUPPORT 1933 1934 namespace { 1935 1936 bool IsExactPropertyAlias(const char* property_name, UProperty property) { 1937 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); 1938 if (short_name != nullptr && strcmp(property_name, short_name) == 0) 1939 return true; 1940 for (int i = 0;; i++) { 1941 const char* long_name = u_getPropertyName( 1942 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); 1943 if (long_name == nullptr) break; 1944 if (strcmp(property_name, long_name) == 0) return true; 1945 } 1946 return false; 1947 } 1948 1949 bool IsExactPropertyValueAlias(const char* property_value_name, 1950 UProperty property, int32_t property_value) { 1951 const char* short_name = 1952 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME); 1953 if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) { 1954 return true; 1955 } 1956 for (int i = 0;; i++) { 1957 const char* long_name = u_getPropertyValueName( 1958 property, property_value, 1959 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); 1960 if (long_name == nullptr) break; 1961 if (strcmp(property_value_name, long_name) == 0) return true; 1962 } 1963 return false; 1964 } 1965 1966 void ExtractStringsFromUnicodeSet(const icu::UnicodeSet& set, 1967 CharacterClassStrings* strings, 1968 RegExpFlags flags, Zone* zone) { 1969 DCHECK(set.hasStrings()); 1970 DCHECK(IsUnicodeSets(flags)); 1971 DCHECK_NOT_NULL(strings); 1972 1973 RegExpTextBuilder::SmallRegExpTreeVector string_storage(zone); 1974 RegExpTextBuilder string_builder(zone, &string_storage, flags); 1975 const bool needs_case_folding = IsIgnoreCase(flags); 1976 icu::UnicodeSetIterator iter(set); 1977 iter.skipToStrings(); 1978 while (iter.next()) { 1979 const icu::UnicodeString& s = iter.getString(); 1980 const char16_t* p = s.getBuffer(); 1981 int32_t length = s.length(); 1982 ZoneList<base::uc32>* string = 1983 zone->template New<ZoneList<base::uc32>>(length, zone); 1984 for (int32_t i = 0; i < length;) { 1985 UChar32 c; 1986 U16_NEXT(p, i, length, c); 1987 string_builder.AddUnicodeCharacter(c); 1988 if (needs_case_folding) { 1989 c = u_foldCase(c, U_FOLD_CASE_DEFAULT); 1990 } 1991 string->Add(c, zone); 1992 } 1993 strings->emplace(string->ToVector(), string_builder.ToRegExp()); 1994 string_storage.clear(); 1995 } 1996 } 1997 1998 bool LookupPropertyValueName(UProperty property, 1999 const char* property_value_name, bool negate, 2000 ZoneList<CharacterRange>* result_ranges, 2001 CharacterClassStrings* result_strings, 2002 RegExpFlags flags, Zone* zone) { 2003 UProperty property_for_lookup = property; 2004 if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) { 2005 // For the property Script_Extensions, we have to do the property value 2006 // name lookup as if the property is Script. 2007 property_for_lookup = UCHAR_SCRIPT; 2008 } 2009 int32_t property_value = 2010 u_getPropertyValueEnum(property_for_lookup, property_value_name); 2011 if (property_value == UCHAR_INVALID_CODE) return false; 2012 2013 // We require the property name to match exactly to one of the property value 2014 // aliases. However, u_getPropertyValueEnum uses loose matching. 2015 if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup, 2016 property_value)) { 2017 return false; 2018 } 2019 2020 UErrorCode ec = U_ZERO_ERROR; 2021 icu::UnicodeSet set; 2022 set.applyIntPropertyValue(property, property_value, ec); 2023 bool success = ec == U_ZERO_ERROR && !set.isEmpty(); 2024 2025 if (success) { 2026 if (set.hasStrings()) { 2027 ExtractStringsFromUnicodeSet(set, result_strings, flags, zone); 2028 } 2029 const bool needs_case_folding = IsUnicodeSets(flags) && IsIgnoreCase(flags); 2030 if (needs_case_folding) set.closeOver(USET_SIMPLE_CASE_INSENSITIVE); 2031 set.removeAllStrings(); 2032 if (negate) set.complement(); 2033 for (int i = 0; i < set.getRangeCount(); i++) { 2034 result_ranges->Add( 2035 CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), 2036 zone); 2037 } 2038 } 2039 return success; 2040 } 2041 2042 template <size_t N> 2043 inline bool NameEquals(const char* name, const char (&literal)[N]) { 2044 return strncmp(name, literal, N + 1) == 0; 2045 } 2046 2047 bool LookupSpecialPropertyValueName(const char* name, 2048 ZoneList<CharacterRange>* result, 2049 bool negate, RegExpFlags flags, 2050 Zone* zone) { 2051 if (NameEquals(name, "Any")) { 2052 if (negate) { 2053 // Leave the list of character ranges empty, since the negation of 'Any' 2054 // is the empty set. 2055 } else { 2056 result->Add(CharacterRange::Everything(), zone); 2057 } 2058 } else if (NameEquals(name, "ASCII")) { 2059 result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint) 2060 : CharacterRange::Range(0x0, 0x7F), 2061 zone); 2062 } else if (NameEquals(name, "Assigned")) { 2063 return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned", 2064 !negate, result, nullptr, flags, zone); 2065 } else { 2066 return false; 2067 } 2068 return true; 2069 } 2070 2071 // Explicitly allowlist supported binary properties. The spec forbids supporting 2072 // properties outside of this set to ensure interoperability. 2073 bool IsSupportedBinaryProperty(UProperty property, bool unicode_sets) { 2074 switch (property) { 2075 case UCHAR_ALPHABETIC: 2076 // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName. 2077 // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName. 2078 case UCHAR_ASCII_HEX_DIGIT: 2079 // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName. 2080 case UCHAR_BIDI_CONTROL: 2081 case UCHAR_BIDI_MIRRORED: 2082 case UCHAR_CASE_IGNORABLE: 2083 case UCHAR_CASED: 2084 case UCHAR_CHANGES_WHEN_CASEFOLDED: 2085 case UCHAR_CHANGES_WHEN_CASEMAPPED: 2086 case UCHAR_CHANGES_WHEN_LOWERCASED: 2087 case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED: 2088 case UCHAR_CHANGES_WHEN_TITLECASED: 2089 case UCHAR_CHANGES_WHEN_UPPERCASED: 2090 case UCHAR_DASH: 2091 case UCHAR_DEFAULT_IGNORABLE_CODE_POINT: 2092 case UCHAR_DEPRECATED: 2093 case UCHAR_DIACRITIC: 2094 case UCHAR_EMOJI: 2095 case UCHAR_EMOJI_COMPONENT: 2096 case UCHAR_EMOJI_MODIFIER_BASE: 2097 case UCHAR_EMOJI_MODIFIER: 2098 case UCHAR_EMOJI_PRESENTATION: 2099 case UCHAR_EXTENDED_PICTOGRAPHIC: 2100 case UCHAR_EXTENDER: 2101 case UCHAR_GRAPHEME_BASE: 2102 case UCHAR_GRAPHEME_EXTEND: 2103 case UCHAR_HEX_DIGIT: 2104 case UCHAR_ID_CONTINUE: 2105 case UCHAR_ID_START: 2106 case UCHAR_IDEOGRAPHIC: 2107 case UCHAR_IDS_BINARY_OPERATOR: 2108 case UCHAR_IDS_TRINARY_OPERATOR: 2109 case UCHAR_JOIN_CONTROL: 2110 case UCHAR_LOGICAL_ORDER_EXCEPTION: 2111 case UCHAR_LOWERCASE: 2112 case UCHAR_MATH: 2113 case UCHAR_NONCHARACTER_CODE_POINT: 2114 case UCHAR_PATTERN_SYNTAX: 2115 case UCHAR_PATTERN_WHITE_SPACE: 2116 case UCHAR_QUOTATION_MARK: 2117 case UCHAR_RADICAL: 2118 case UCHAR_REGIONAL_INDICATOR: 2119 case UCHAR_S_TERM: 2120 case UCHAR_SOFT_DOTTED: 2121 case UCHAR_TERMINAL_PUNCTUATION: 2122 case UCHAR_UNIFIED_IDEOGRAPH: 2123 case UCHAR_UPPERCASE: 2124 case UCHAR_VARIATION_SELECTOR: 2125 case UCHAR_WHITE_SPACE: 2126 case UCHAR_XID_CONTINUE: 2127 case UCHAR_XID_START: 2128 return true; 2129 case UCHAR_BASIC_EMOJI: 2130 case UCHAR_EMOJI_KEYCAP_SEQUENCE: 2131 case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE: 2132 case UCHAR_RGI_EMOJI_FLAG_SEQUENCE: 2133 case UCHAR_RGI_EMOJI_TAG_SEQUENCE: 2134 case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE: 2135 case UCHAR_RGI_EMOJI: 2136 return unicode_sets; 2137 default: 2138 break; 2139 } 2140 return false; 2141 } 2142 2143 bool IsBinaryPropertyOfStrings(UProperty property) { 2144 switch (property) { 2145 case UCHAR_BASIC_EMOJI: 2146 case UCHAR_EMOJI_KEYCAP_SEQUENCE: 2147 case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE: 2148 case UCHAR_RGI_EMOJI_FLAG_SEQUENCE: 2149 case UCHAR_RGI_EMOJI_TAG_SEQUENCE: 2150 case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE: 2151 case UCHAR_RGI_EMOJI: 2152 return true; 2153 default: 2154 break; 2155 } 2156 return false; 2157 } 2158 2159 bool IsUnicodePropertyValueCharacter(char c) { 2160 // https://tc39.github.io/proposal-regexp-unicode-property-escapes/ 2161 // 2162 // Note that using this to validate each parsed char is quite conservative. 2163 // A possible alternative solution would be to only ensure the parsed 2164 // property name/value candidate string does not contain '\0' characters and 2165 // let ICU lookups trigger the final failure. 2166 if ('a' <= c && c <= 'z') return true; 2167 if ('A' <= c && c <= 'Z') return true; 2168 if ('0' <= c && c <= '9') return true; 2169 return (c == '_'); 2170 } 2171 2172 } // namespace 2173 2174 template <class CharT> 2175 bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1, 2176 ZoneVector<char>* name_2) { 2177 DCHECK(name_1->empty()); 2178 DCHECK(name_2->empty()); 2179 // Parse the property class as follows: 2180 // - In \p{name}, 'name' is interpreted 2181 // - either as a general category property value name. 2182 // - or as a binary property name. 2183 // - In \p{name=value}, 'name' is interpreted as an enumerated property name, 2184 // and 'value' is interpreted as one of the available property value names. 2185 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used. 2186 // - Loose matching is not applied. 2187 if (current() == '{') { 2188 // Parse \p{[PropertyName=]PropertyNameValue} 2189 for (Advance(); current() != '}' && current() != '='; Advance()) { 2190 if (!IsUnicodePropertyValueCharacter(current())) return false; 2191 if (!has_next()) return false; 2192 name_1->push_back(static_cast<char>(current())); 2193 } 2194 if (current() == '=') { 2195 for (Advance(); current() != '}'; Advance()) { 2196 if (!IsUnicodePropertyValueCharacter(current())) return false; 2197 if (!has_next()) return false; 2198 name_2->push_back(static_cast<char>(current())); 2199 } 2200 name_2->push_back(0); // null-terminate string. 2201 } 2202 } else { 2203 return false; 2204 } 2205 Advance(); 2206 name_1->push_back(0); // null-terminate string. 2207 2208 DCHECK(name_1->size() - 1 == std::strlen(name_1->data())); 2209 DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data())); 2210 return true; 2211 } 2212 2213 template <class CharT> 2214 bool RegExpParserImpl<CharT>::AddPropertyClassRange( 2215 ZoneList<CharacterRange>* add_to_ranges, 2216 CharacterClassStrings* add_to_strings, bool negate, 2217 const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) { 2218 if (name_2.empty()) { 2219 // First attempt to interpret as general category property value name. 2220 const char* name = name_1.data(); 2221 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate, 2222 add_to_ranges, add_to_strings, flags(), 2223 zone())) { 2224 return true; 2225 } 2226 // Interpret "Any", "ASCII", and "Assigned". 2227 if (LookupSpecialPropertyValueName(name, add_to_ranges, negate, flags(), 2228 zone())) { 2229 return true; 2230 } 2231 // Then attempt to interpret as binary property name with value name 'Y'. 2232 UProperty property = u_getPropertyEnum(name); 2233 if (!IsSupportedBinaryProperty(property, unicode_sets())) return false; 2234 if (!IsExactPropertyAlias(name, property)) return false; 2235 // Negation of properties with strings is not allowed. 2236 // See 2237 // https://tc39.es/ecma262/#sec-static-semantics-maycontainstrings 2238 if (negate && IsBinaryPropertyOfStrings(property)) return false; 2239 if (unicode_sets()) { 2240 // In /v mode we can't simple lookup the "false" binary property values, 2241 // as the spec requires us to perform case folding before calculating the 2242 // complement. 2243 // See https://tc39.es/ecma262/#sec-compiletocharset 2244 // UnicodePropertyValueExpression :: LoneUnicodePropertyNameOrValue 2245 return LookupPropertyValueName(property, "Y", negate, add_to_ranges, 2246 add_to_strings, flags(), zone()); 2247 } else { 2248 return LookupPropertyValueName(property, negate ? "N" : "Y", false, 2249 add_to_ranges, add_to_strings, flags(), 2250 zone()); 2251 } 2252 } else { 2253 // Both property name and value name are specified. Attempt to interpret 2254 // the property name as enumerated property. 2255 const char* property_name = name_1.data(); 2256 const char* value_name = name_2.data(); 2257 UProperty property = u_getPropertyEnum(property_name); 2258 if (!IsExactPropertyAlias(property_name, property)) return false; 2259 if (property == UCHAR_GENERAL_CATEGORY) { 2260 // We want to allow aggregate value names such as "Letter". 2261 property = UCHAR_GENERAL_CATEGORY_MASK; 2262 } else if (property != UCHAR_SCRIPT && 2263 property != UCHAR_SCRIPT_EXTENSIONS) { 2264 return false; 2265 } 2266 return LookupPropertyValueName(property, value_name, negate, add_to_ranges, 2267 add_to_strings, flags(), zone()); 2268 } 2269 } 2270 2271 #else // V8_INTL_SUPPORT 2272 2273 template <class CharT> 2274 bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1, 2275 ZoneVector<char>* name_2) { 2276 return false; 2277 } 2278 2279 template <class CharT> 2280 bool RegExpParserImpl<CharT>::AddPropertyClassRange( 2281 ZoneList<CharacterRange>* add_to_ranges, 2282 CharacterClassStrings* add_to_strings, bool negate, 2283 const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) { 2284 return false; 2285 } 2286 2287 #endif // V8_INTL_SUPPORT 2288 2289 template <class CharT> 2290 bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value, 2291 base::uc32* value) { 2292 base::uc32 x = 0; 2293 int d = base::HexValue(current()); 2294 if (d < 0) { 2295 return false; 2296 } 2297 while (d >= 0) { 2298 x = x * 16 + d; 2299 if (x > static_cast<base::uc32>(max_value)) { 2300 return false; 2301 } 2302 Advance(); 2303 d = base::HexValue(current()); 2304 } 2305 *value = x; 2306 return true; 2307 } 2308 2309 // https://tc39.es/ecma262/#prod-CharacterEscape 2310 template <class CharT> 2311 base::uc32 RegExpParserImpl<CharT>::ParseCharacterEscape( 2312 InClassEscapeState in_class_escape_state, 2313 bool* is_escaped_unicode_character) { 2314 DCHECK_EQ('\\', current()); 2315 DCHECK(has_next()); 2316 2317 Advance(); 2318 2319 const base::uc32 c = current(); 2320 switch (c) { 2321 // CharacterEscape :: 2322 // ControlEscape :: one of 2323 // f n r t v 2324 case 'f': 2325 Advance(); 2326 return '\f'; 2327 case 'n': 2328 Advance(); 2329 return '\n'; 2330 case 'r': 2331 Advance(); 2332 return '\r'; 2333 case 't': 2334 Advance(); 2335 return '\t'; 2336 case 'v': 2337 Advance(); 2338 return '\v'; 2339 // CharacterEscape :: 2340 // c ControlLetter 2341 case 'c': { 2342 base::uc32 controlLetter = Next(); 2343 base::uc32 letter = controlLetter & ~('A' ^ 'a'); 2344 if (letter >= 'A' && letter <= 'Z') { 2345 Advance(2); 2346 // Control letters mapped to ASCII control characters in the range 2347 // 0x00-0x1F. 2348 return controlLetter & 0x1F; 2349 } 2350 if (IsUnicodeMode()) { 2351 // With /u and /v, invalid escapes are not treated as identity escapes. 2352 ReportError(RegExpError::kInvalidUnicodeEscape); 2353 return 0; 2354 } 2355 if (in_class_escape_state == InClassEscapeState::kInClass) { 2356 // Inside a character class, we also accept digits and underscore as 2357 // control characters, unless with /u or /v. See Annex B: 2358 // ES#prod-annexB-ClassControlLetter 2359 if ((controlLetter >= '0' && controlLetter <= '9') || 2360 controlLetter == '_') { 2361 Advance(2); 2362 return controlLetter & 0x1F; 2363 } 2364 } 2365 // We match JSC in reading the backslash as a literal 2366 // character instead of as starting an escape. 2367 return '\\'; 2368 } 2369 // CharacterEscape :: 2370 // 0 [lookahead ∉ DecimalDigit] 2371 // [~UnicodeMode] LegacyOctalEscapeSequence 2372 case '0': 2373 // \0 is interpreted as NUL if not followed by another digit. 2374 if (Next() < '0' || Next() > '9') { 2375 Advance(); 2376 return 0; 2377 } 2378 [[fallthrough]]; 2379 case '1': 2380 case '2': 2381 case '3': 2382 case '4': 2383 case '5': 2384 case '6': 2385 case '7': 2386 // For compatibility, we interpret a decimal escape that isn't 2387 // a back reference (and therefore either \0 or not valid according 2388 // to the specification) as a 1..3 digit octal character code. 2389 // ES#prod-annexB-LegacyOctalEscapeSequence 2390 if (IsUnicodeMode()) { 2391 // With /u or /v, decimal escape is not interpreted as octal character 2392 // code. 2393 ReportError(RegExpError::kInvalidDecimalEscape); 2394 return 0; 2395 } 2396 return ParseOctalLiteral(); 2397 // CharacterEscape :: 2398 // HexEscapeSequence 2399 case 'x': { 2400 Advance(); 2401 base::uc32 value; 2402 if (ParseHexEscape(2, &value)) return value; 2403 if (IsUnicodeMode()) { 2404 // With /u or /v, invalid escapes are not treated as identity escapes. 2405 ReportError(RegExpError::kInvalidEscape); 2406 return 0; 2407 } 2408 // If \x is not followed by a two-digit hexadecimal, treat it 2409 // as an identity escape. 2410 return 'x'; 2411 } 2412 // CharacterEscape :: 2413 // RegExpUnicodeEscapeSequence [?UnicodeMode] 2414 case 'u': { 2415 Advance(); 2416 base::uc32 value; 2417 if (ParseUnicodeEscape(&value)) { 2418 *is_escaped_unicode_character = true; 2419 return value; 2420 } 2421 if (IsUnicodeMode()) { 2422 // With /u or /v, invalid escapes are not treated as identity escapes. 2423 ReportError(RegExpError::kInvalidUnicodeEscape); 2424 return 0; 2425 } 2426 // If \u is not followed by a two-digit hexadecimal, treat it 2427 // as an identity escape. 2428 return 'u'; 2429 } 2430 default: 2431 break; 2432 } 2433 2434 // CharacterEscape :: 2435 // IdentityEscape[?UnicodeMode, ?N] 2436 // 2437 // * With /u, no identity escapes except for syntax characters are 2438 // allowed. 2439 // * With /v, no identity escapes except for syntax characters and 2440 // ClassSetReservedPunctuators (if within a class) are allowed. 2441 // * Without /u or /v: 2442 // * '\c' is not an IdentityEscape. 2443 // * '\k' is not an IdentityEscape when named captures exist. 2444 // * Otherwise, all identity escapes are allowed. 2445 if (unicode_sets() && in_class_escape_state == InClassEscapeState::kInClass) { 2446 if (IsClassSetReservedPunctuator(c)) { 2447 Advance(); 2448 return c; 2449 } 2450 } 2451 if (IsUnicodeMode()) { 2452 if (!IsSyntaxCharacterOrSlash(c)) { 2453 ReportError(RegExpError::kInvalidEscape); 2454 return 0; 2455 } 2456 Advance(); 2457 return c; 2458 } 2459 DCHECK(!IsUnicodeMode()); 2460 if (c == 'c') { 2461 ReportError(RegExpError::kInvalidEscape); 2462 return 0; 2463 } 2464 Advance(); 2465 // Note: It's important to Advance before the HasNamedCaptures call s.t. we 2466 // don't start scanning in the middle of an escape. 2467 if (c == 'k' && HasNamedCaptures(in_class_escape_state)) { 2468 ReportError(RegExpError::kInvalidEscape); 2469 return 0; 2470 } 2471 return c; 2472 } 2473 2474 // https://tc39.es/ecma262/#prod-ClassRanges 2475 template <class CharT> 2476 RegExpTree* RegExpParserImpl<CharT>::ParseClassRanges( 2477 ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents) { 2478 base::uc32 char_1, char_2; 2479 bool is_class_1, is_class_2; 2480 while (has_more() && current() != ']') { 2481 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1, 2482 &is_class_1 CHECK_FAILED); 2483 // ClassAtom 2484 if (current() == '-') { 2485 Advance(); 2486 if (!has_more()) { 2487 // If we reach the end we break out of the loop and let the 2488 // following code report an error. 2489 break; 2490 } else if (current() == ']') { 2491 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); 2492 ranges->Add(CharacterRange::Singleton('-'), zone()); 2493 break; 2494 } 2495 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2, 2496 &is_class_2 CHECK_FAILED); 2497 if (is_class_1 || is_class_2) { 2498 // Either end is an escaped character class. Treat the '-' verbatim. 2499 if (IsUnicodeMode()) { 2500 // ES2015 21.2.2.15.1 step 1. 2501 return ReportError(RegExpError::kInvalidCharacterClass); 2502 } 2503 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); 2504 ranges->Add(CharacterRange::Singleton('-'), zone()); 2505 if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone()); 2506 continue; 2507 } 2508 // ES2015 21.2.2.15.1 step 6. 2509 if (char_1 > char_2) { 2510 return ReportError(RegExpError::kOutOfOrderCharacterClass); 2511 } 2512 ranges->Add(CharacterRange::Range(char_1, char_2), zone()); 2513 } else { 2514 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); 2515 } 2516 } 2517 return nullptr; 2518 } 2519 2520 // https://tc39.es/ecma262/#prod-ClassEscape 2521 template <class CharT> 2522 void RegExpParserImpl<CharT>::ParseClassEscape( 2523 ZoneList<CharacterRange>* ranges, Zone* zone, 2524 bool add_unicode_case_equivalents, base::uc32* char_out, 2525 bool* is_class_escape) { 2526 *is_class_escape = false; 2527 2528 if (current() != '\\') { 2529 // Not a ClassEscape. 2530 *char_out = current(); 2531 Advance(); 2532 return; 2533 } 2534 2535 const base::uc32 next = Next(); 2536 switch (next) { 2537 case 'b': 2538 *char_out = '\b'; 2539 Advance(2); 2540 return; 2541 case '-': 2542 if (IsUnicodeMode()) { 2543 *char_out = next; 2544 Advance(2); 2545 return; 2546 } 2547 break; 2548 case kEndMarker: 2549 ReportError(RegExpError::kEscapeAtEndOfPattern); 2550 return; 2551 default: 2552 break; 2553 } 2554 2555 static constexpr InClassEscapeState kInClassEscape = 2556 InClassEscapeState::kInClass; 2557 *is_class_escape = 2558 TryParseCharacterClassEscape(next, kInClassEscape, ranges, nullptr, zone, 2559 add_unicode_case_equivalents); 2560 if (*is_class_escape) return; 2561 2562 bool dummy = false; // Unused. 2563 *char_out = ParseCharacterEscape(kInClassEscape, &dummy); 2564 } 2565 2566 // https://tc39.es/ecma262/#prod-CharacterClassEscape 2567 template <class CharT> 2568 bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape( 2569 base::uc32 next, InClassEscapeState in_class_escape_state, 2570 ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings, 2571 Zone* zone, bool add_unicode_case_equivalents) { 2572 DCHECK_EQ(current(), '\\'); 2573 DCHECK_EQ(Next(), next); 2574 2575 switch (next) { 2576 case 'd': 2577 case 'D': 2578 case 's': 2579 case 'S': 2580 case 'w': 2581 case 'W': 2582 CharacterRange::AddClassEscape(static_cast<StandardCharacterSet>(next), 2583 ranges, add_unicode_case_equivalents, 2584 zone); 2585 Advance(2); 2586 return true; 2587 case 'p': 2588 case 'P': { 2589 if (!IsUnicodeMode()) return false; 2590 bool negate = next == 'P'; 2591 Advance(2); 2592 ZoneVector<char> name_1(zone); 2593 ZoneVector<char> name_2(zone); 2594 if (!ParsePropertyClassName(&name_1, &name_2) || 2595 !AddPropertyClassRange(ranges, strings, negate, name_1, name_2)) { 2596 ReportError(in_class_escape_state == InClassEscapeState::kInClass 2597 ? RegExpError::kInvalidClassPropertyName 2598 : RegExpError::kInvalidPropertyName); 2599 } 2600 return true; 2601 } 2602 default: 2603 return false; 2604 } 2605 } 2606 2607 namespace { 2608 2609 // Add |string| to |ranges| if length of |string| == 1, otherwise add |string| 2610 // to |strings|. 2611 void AddClassString(ZoneList<base::uc32>* normalized_string, 2612 RegExpTree* regexp_string, ZoneList<CharacterRange>* ranges, 2613 CharacterClassStrings* strings, Zone* zone) { 2614 if (normalized_string->length() == 1) { 2615 ranges->Add(CharacterRange::Singleton(normalized_string->at(0)), zone); 2616 } else { 2617 strings->emplace(normalized_string->ToVector(), regexp_string); 2618 } 2619 } 2620 2621 } // namespace 2622 2623 // https://tc39.es/ecma262/#prod-ClassStringDisjunction 2624 template <class CharT> 2625 RegExpTree* RegExpParserImpl<CharT>::ParseClassStringDisjunction( 2626 ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings) { 2627 DCHECK(unicode_sets()); 2628 DCHECK_EQ(current(), '\\'); 2629 DCHECK_EQ(Next(), 'q'); 2630 Advance(2); 2631 if (current() != '{') { 2632 // Identity escape of 'q' is not allowed in unicode mode. 2633 return ReportError(RegExpError::kInvalidEscape); 2634 } 2635 Advance(); 2636 2637 ZoneList<base::uc32>* string = 2638 zone()->template New<ZoneList<base::uc32>>(4, zone()); 2639 RegExpTextBuilder::SmallRegExpTreeVector string_storage(zone()); 2640 RegExpTextBuilder string_builder(zone(), &string_storage, flags()); 2641 2642 while (has_more() && current() != '}') { 2643 if (current() == '|') { 2644 AddClassString(string, string_builder.ToRegExp(), ranges, strings, 2645 zone()); 2646 string = zone()->template New<ZoneList<base::uc32>>(4, zone()); 2647 string_storage.clear(); 2648 Advance(); 2649 } else { 2650 base::uc32 c = ParseClassSetCharacter(CHECK_FAILED); 2651 if (ignore_case()) { 2652 #ifdef V8_INTL_SUPPORT 2653 c = u_foldCase(c, U_FOLD_CASE_DEFAULT); 2654 #else 2655 c = AsciiAlphaToLower(c); 2656 #endif 2657 } 2658 string->Add(c, zone()); 2659 string_builder.AddUnicodeCharacter(c); 2660 } 2661 } 2662 2663 AddClassString(string, string_builder.ToRegExp(), ranges, strings, zone()); 2664 CharacterRange::Canonicalize(ranges); 2665 2666 // We don't need to handle missing closing '}' here. 2667 // If the character class is correctly closed, ParseClassSetCharacter will 2668 // report an error. 2669 Advance(); 2670 return nullptr; 2671 } 2672 2673 // https://tc39.es/ecma262/#prod-ClassSetOperand 2674 // Tree returned based on type_out: 2675 // * kNestedClass: RegExpClassSetExpression 2676 // * For all other types: RegExpClassSetOperand 2677 template <class CharT> 2678 RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand( 2679 const RegExpBuilder* builder, ClassSetOperandType* type_out) { 2680 ZoneList<CharacterRange>* ranges = 2681 zone()->template New<ZoneList<CharacterRange>>(1, zone()); 2682 CharacterClassStrings* strings = 2683 zone()->template New<CharacterClassStrings>(zone()); 2684 base::uc32 character; 2685 RegExpTree* tree = ParseClassSetOperand(builder, type_out, ranges, strings, 2686 &character CHECK_FAILED); 2687 DCHECK_IMPLIES(*type_out != ClassSetOperandType::kNestedClass, 2688 tree == nullptr); 2689 DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter, 2690 ranges->is_empty()); 2691 DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter, 2692 strings->empty()); 2693 DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass, 2694 ranges->is_empty()); 2695 DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass, 2696 strings->empty()); 2697 DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass, 2698 tree->IsClassSetExpression()); 2699 // ClassSetRange is only used within ClassSetUnion(). 2700 DCHECK_NE(*type_out, ClassSetOperandType::kClassSetRange); 2701 // There are no restrictions for kCharacterClassEscape. 2702 // CharacterClassEscape includes \p{}, which can contain ranges, strings or 2703 // both and \P{}, which could contain nothing (i.e. \P{Any}). 2704 if (tree == nullptr) { 2705 if (*type_out == ClassSetOperandType::kClassSetCharacter) { 2706 AddMaybeSimpleCaseFoldedRange(ranges, 2707 CharacterRange::Singleton(character)); 2708 } 2709 tree = zone()->template New<RegExpClassSetOperand>(ranges, strings); 2710 } 2711 return tree; 2712 } 2713 2714 // https://tc39.es/ecma262/#prod-ClassSetOperand 2715 // Based on |type_out| either a tree is returned or 2716 // |ranges|/|strings|/|character| modified. If a tree is returned, 2717 // ranges/strings are not modified. If |type_out| is kNestedClass, a tree of 2718 // type RegExpClassSetExpression is returned. If | type_out| is 2719 // kClassSetCharacter, |character| is set and nullptr returned. For all other 2720 // types, |ranges|/|strings|/|character| is modified and nullptr is returned. 2721 template <class CharT> 2722 RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand( 2723 const RegExpBuilder* builder, ClassSetOperandType* type_out, 2724 ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings, 2725 base::uc32* character) { 2726 DCHECK(unicode_sets()); 2727 base::uc32 c = current(); 2728 if (c == '\\') { 2729 const base::uc32 next = Next(); 2730 if (next == 'q') { 2731 *type_out = ClassSetOperandType::kClassStringDisjunction; 2732 ParseClassStringDisjunction(ranges, strings CHECK_FAILED); 2733 return nullptr; 2734 } 2735 static constexpr InClassEscapeState kInClassEscape = 2736 InClassEscapeState::kInClass; 2737 const bool add_unicode_case_equivalents = ignore_case(); 2738 if (TryParseCharacterClassEscape(next, kInClassEscape, ranges, strings, 2739 zone(), add_unicode_case_equivalents)) { 2740 *type_out = ClassSetOperandType::kCharacterClassEscape; 2741 return nullptr; 2742 } 2743 } 2744 2745 if (c == '[') { 2746 *type_out = ClassSetOperandType::kNestedClass; 2747 return ParseCharacterClass(builder); 2748 } 2749 2750 *type_out = ClassSetOperandType::kClassSetCharacter; 2751 c = ParseClassSetCharacter(CHECK_FAILED); 2752 *character = c; 2753 return nullptr; 2754 } 2755 2756 template <class CharT> 2757 base::uc32 RegExpParserImpl<CharT>::ParseClassSetCharacter() { 2758 DCHECK(unicode_sets()); 2759 const base::uc32 c = current(); 2760 if (c == '\\') { 2761 const base::uc32 next = Next(); 2762 switch (next) { 2763 case 'b': 2764 Advance(2); 2765 return '\b'; 2766 case kEndMarker: 2767 ReportError(RegExpError::kEscapeAtEndOfPattern); 2768 return 0; 2769 } 2770 static constexpr InClassEscapeState kInClassEscape = 2771 InClassEscapeState::kInClass; 2772 2773 bool dummy = false; // Unused. 2774 return ParseCharacterEscape(kInClassEscape, &dummy); 2775 } 2776 if (IsClassSetSyntaxCharacter(c)) { 2777 ReportError(RegExpError::kInvalidCharacterInClass); 2778 return 0; 2779 } 2780 if (IsClassSetReservedDoublePunctuator(c)) { 2781 ReportError(RegExpError::kInvalidClassSetOperation); 2782 return 0; 2783 } 2784 Advance(); 2785 return c; 2786 } 2787 2788 namespace { 2789 2790 bool MayContainStrings(ClassSetOperandType type, RegExpTree* operand) { 2791 switch (type) { 2792 case ClassSetOperandType::kClassSetCharacter: 2793 case ClassSetOperandType::kClassSetRange: 2794 return false; 2795 case ClassSetOperandType::kCharacterClassEscape: 2796 case ClassSetOperandType::kClassStringDisjunction: 2797 return operand->AsClassSetOperand()->has_strings(); 2798 case ClassSetOperandType::kNestedClass: 2799 if (operand->IsClassRanges()) return false; 2800 return operand->AsClassSetExpression()->may_contain_strings(); 2801 } 2802 } 2803 2804 } // namespace 2805 2806 template <class CharT> 2807 void RegExpParserImpl<CharT>::AddMaybeSimpleCaseFoldedRange( 2808 ZoneList<CharacterRange>* ranges, CharacterRange new_range) { 2809 DCHECK(unicode_sets()); 2810 if (ignore_case()) { 2811 ZoneList<CharacterRange>* new_ranges = 2812 zone()->template New<ZoneList<CharacterRange>>(2, zone()); 2813 new_ranges->Add(new_range, zone()); 2814 CharacterRange::AddUnicodeCaseEquivalents(new_ranges, zone()); 2815 ranges->AddAll(*new_ranges, zone()); 2816 } else { 2817 ranges->Add(new_range, zone()); 2818 } 2819 CharacterRange::Canonicalize(ranges); 2820 } 2821 2822 // https://tc39.es/ecma262/#prod-ClassUnion 2823 template <class CharT> 2824 RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion( 2825 const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand, 2826 ClassSetOperandType first_operand_type, ZoneList<CharacterRange>* ranges, 2827 CharacterClassStrings* strings, base::uc32 character) { 2828 DCHECK(unicode_sets()); 2829 ZoneList<RegExpTree*>* operands = 2830 zone()->template New<ZoneList<RegExpTree*>>(2, zone()); 2831 bool may_contain_strings = false; 2832 // Add the lhs to operands if necessary. 2833 // Either the lhs values were added to |ranges|/|strings| (in which case 2834 // |first_operand| is nullptr), or the lhs was evaluated to a tree and passed 2835 // as |first_operand| (in which case |ranges| and |strings| are empty). 2836 if (first_operand != nullptr) { 2837 may_contain_strings = MayContainStrings(first_operand_type, first_operand); 2838 operands->Add(first_operand, zone()); 2839 } 2840 ClassSetOperandType last_type = first_operand_type; 2841 while (has_more() && current() != ']') { 2842 if (current() == '-') { 2843 // Mix of ClassSetRange and ClassSubtraction is not allowed. 2844 if (Next() == '-') { 2845 return ReportError(RegExpError::kInvalidClassSetOperation); 2846 } 2847 Advance(); 2848 if (!has_more()) { 2849 // If we reach the end we break out of the loop and let the 2850 // following code report an error. 2851 break; 2852 } 2853 // If the lhs and rhs around '-' are both ClassSetCharacters, they 2854 // represent a character range. 2855 // In case one of them is not a ClassSetCharacter, it is a syntax error, 2856 // as '-' can not be used unescaped within a class with /v. 2857 // See 2858 // https://tc39.es/ecma262/#prod-ClassSetRange 2859 if (last_type != ClassSetOperandType::kClassSetCharacter) { 2860 return ReportError(RegExpError::kInvalidCharacterClass); 2861 } 2862 base::uc32 from = character; 2863 ParseClassSetOperand(builder, &last_type, ranges, strings, 2864 &character CHECK_FAILED); 2865 if (last_type != ClassSetOperandType::kClassSetCharacter) { 2866 return ReportError(RegExpError::kInvalidCharacterClass); 2867 } 2868 if (from > character) { 2869 return ReportError(RegExpError::kOutOfOrderCharacterClass); 2870 } 2871 AddMaybeSimpleCaseFoldedRange(ranges, 2872 CharacterRange::Range(from, character)); 2873 last_type = ClassSetOperandType::kClassSetRange; 2874 } else { 2875 DCHECK_NE(current(), '-'); 2876 if (last_type == ClassSetOperandType::kClassSetCharacter) { 2877 AddMaybeSimpleCaseFoldedRange(ranges, 2878 CharacterRange::Singleton(character)); 2879 } 2880 RegExpTree* operand = ParseClassSetOperand( 2881 builder, &last_type, ranges, strings, &character CHECK_FAILED); 2882 if (operand != nullptr) { 2883 may_contain_strings |= MayContainStrings(last_type, operand); 2884 // Add the range we started building as operand and reset the current 2885 // range. 2886 if (!ranges->is_empty() || !strings->empty()) { 2887 may_contain_strings |= !strings->empty(); 2888 operands->Add( 2889 zone()->template New<RegExpClassSetOperand>(ranges, strings), 2890 zone()); 2891 ranges = zone()->template New<ZoneList<CharacterRange>>(2, zone()); 2892 strings = zone()->template New<CharacterClassStrings>(zone()); 2893 } 2894 operands->Add(operand, zone()); 2895 } 2896 } 2897 } 2898 2899 if (!has_more()) { 2900 return ReportError(RegExpError::kUnterminatedCharacterClass); 2901 } 2902 2903 if (last_type == ClassSetOperandType::kClassSetCharacter) { 2904 AddMaybeSimpleCaseFoldedRange(ranges, CharacterRange::Singleton(character)); 2905 } 2906 2907 // Add the range we started building as operand. 2908 if (!ranges->is_empty() || !strings->empty()) { 2909 may_contain_strings |= !strings->empty(); 2910 operands->Add(zone()->template New<RegExpClassSetOperand>(ranges, strings), 2911 zone()); 2912 } 2913 2914 DCHECK_EQ(current(), ']'); 2915 Advance(); 2916 2917 if (is_negated && may_contain_strings) { 2918 return ReportError(RegExpError::kNegatedCharacterClassWithStrings); 2919 } 2920 2921 if (operands->is_empty()) { 2922 // Return empty expression if no operands were added (e.g. [\P{Any}] 2923 // produces an empty range). 2924 DCHECK(ranges->is_empty()); 2925 DCHECK(strings->empty()); 2926 return RegExpClassSetExpression::Empty(zone(), is_negated); 2927 } 2928 2929 return zone()->template New<RegExpClassSetExpression>( 2930 RegExpClassSetExpression::OperationType::kUnion, is_negated, 2931 may_contain_strings, operands); 2932 } 2933 2934 // https://tc39.es/ecma262/#prod-ClassIntersection 2935 template <class CharT> 2936 RegExpTree* RegExpParserImpl<CharT>::ParseClassIntersection( 2937 const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand, 2938 ClassSetOperandType first_operand_type) { 2939 DCHECK(unicode_sets()); 2940 DCHECK(current() == '&' && Next() == '&'); 2941 bool may_contain_strings = 2942 MayContainStrings(first_operand_type, first_operand); 2943 ZoneList<RegExpTree*>* operands = 2944 zone()->template New<ZoneList<RegExpTree*>>(2, zone()); 2945 operands->Add(first_operand, zone()); 2946 while (has_more() && current() != ']') { 2947 if (current() != '&' || Next() != '&') { 2948 return ReportError(RegExpError::kInvalidClassSetOperation); 2949 } 2950 Advance(2); 2951 // [lookahead ≠&] 2952 if (current() == '&') { 2953 return ReportError(RegExpError::kInvalidCharacterInClass); 2954 } 2955 2956 ClassSetOperandType operand_type; 2957 RegExpTree* operand = 2958 ParseClassSetOperand(builder, &operand_type CHECK_FAILED); 2959 may_contain_strings &= MayContainStrings(operand_type, operand); 2960 operands->Add(operand, zone()); 2961 } 2962 if (!has_more()) { 2963 return ReportError(RegExpError::kUnterminatedCharacterClass); 2964 } 2965 if (is_negated && may_contain_strings) { 2966 return ReportError(RegExpError::kNegatedCharacterClassWithStrings); 2967 } 2968 DCHECK_EQ(current(), ']'); 2969 Advance(); 2970 return zone()->template New<RegExpClassSetExpression>( 2971 RegExpClassSetExpression::OperationType::kIntersection, is_negated, 2972 may_contain_strings, operands); 2973 } 2974 2975 // https://tc39.es/ecma262/#prod-ClassSubtraction 2976 template <class CharT> 2977 RegExpTree* RegExpParserImpl<CharT>::ParseClassSubtraction( 2978 const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand, 2979 ClassSetOperandType first_operand_type) { 2980 DCHECK(unicode_sets()); 2981 DCHECK(current() == '-' && Next() == '-'); 2982 const bool may_contain_strings = 2983 MayContainStrings(first_operand_type, first_operand); 2984 if (is_negated && may_contain_strings) { 2985 return ReportError(RegExpError::kNegatedCharacterClassWithStrings); 2986 } 2987 ZoneList<RegExpTree*>* operands = 2988 zone()->template New<ZoneList<RegExpTree*>>(2, zone()); 2989 operands->Add(first_operand, zone()); 2990 while (has_more() && current() != ']') { 2991 if (current() != '-' || Next() != '-') { 2992 return ReportError(RegExpError::kInvalidClassSetOperation); 2993 } 2994 Advance(2); 2995 ClassSetOperandType dummy; // unused 2996 RegExpTree* operand = ParseClassSetOperand(builder, &dummy CHECK_FAILED); 2997 operands->Add(operand, zone()); 2998 } 2999 if (!has_more()) { 3000 return ReportError(RegExpError::kUnterminatedCharacterClass); 3001 } 3002 DCHECK_EQ(current(), ']'); 3003 Advance(); 3004 return zone()->template New<RegExpClassSetExpression>( 3005 RegExpClassSetExpression::OperationType::kSubtraction, is_negated, 3006 may_contain_strings, operands); 3007 } 3008 3009 // https://tc39.es/ecma262/#prod-CharacterClass 3010 template <class CharT> 3011 RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass( 3012 const RegExpBuilder* builder) { 3013 DCHECK_EQ(current(), '['); 3014 Advance(); 3015 bool is_negated = false; 3016 if (current() == '^') { 3017 is_negated = true; 3018 Advance(); 3019 } 3020 ZoneList<CharacterRange>* ranges = 3021 zone()->template New<ZoneList<CharacterRange>>(2, zone()); 3022 if (current() == ']') { 3023 Advance(); 3024 if (unicode_sets()) { 3025 return RegExpClassSetExpression::Empty(zone(), is_negated); 3026 } else { 3027 RegExpClassRanges::ClassRangesFlags class_ranges_flags; 3028 if (is_negated) class_ranges_flags = RegExpClassRanges::NEGATED; 3029 return zone()->template New<RegExpClassRanges>(zone(), ranges, 3030 class_ranges_flags); 3031 } 3032 } 3033 3034 if (!unicode_sets()) { 3035 bool add_unicode_case_equivalents = IsUnicodeMode() && ignore_case(); 3036 ParseClassRanges(ranges, add_unicode_case_equivalents CHECK_FAILED); 3037 if (!has_more()) { 3038 return ReportError(RegExpError::kUnterminatedCharacterClass); 3039 } 3040 DCHECK_EQ(current(), ']'); 3041 Advance(); 3042 RegExpClassRanges::ClassRangesFlags character_class_flags; 3043 if (is_negated) character_class_flags = RegExpClassRanges::NEGATED; 3044 return zone()->template New<RegExpClassRanges>(zone(), ranges, 3045 character_class_flags); 3046 } else { 3047 ClassSetOperandType operand_type; 3048 CharacterClassStrings* strings = 3049 zone()->template New<CharacterClassStrings>(zone()); 3050 base::uc32 character; 3051 RegExpTree* operand = ParseClassSetOperand( 3052 builder, &operand_type, ranges, strings, &character CHECK_FAILED); 3053 switch (current()) { 3054 case '-': 3055 if (Next() == '-') { 3056 if (operand == nullptr) { 3057 if (operand_type == ClassSetOperandType::kClassSetCharacter) { 3058 AddMaybeSimpleCaseFoldedRange( 3059 ranges, CharacterRange::Singleton(character)); 3060 } 3061 operand = 3062 zone()->template New<RegExpClassSetOperand>(ranges, strings); 3063 } 3064 return ParseClassSubtraction(builder, is_negated, operand, 3065 operand_type); 3066 } 3067 // ClassSetRange is handled in ParseClassUnion(). 3068 break; 3069 case '&': 3070 if (Next() == '&') { 3071 if (operand == nullptr) { 3072 if (operand_type == ClassSetOperandType::kClassSetCharacter) { 3073 AddMaybeSimpleCaseFoldedRange( 3074 ranges, CharacterRange::Singleton(character)); 3075 } 3076 operand = 3077 zone()->template New<RegExpClassSetOperand>(ranges, strings); 3078 } 3079 return ParseClassIntersection(builder, is_negated, operand, 3080 operand_type); 3081 } 3082 } 3083 return ParseClassUnion(builder, is_negated, operand, operand_type, ranges, 3084 strings, character); 3085 } 3086 } 3087 3088 #undef CHECK_FAILED 3089 3090 template <class CharT> 3091 bool RegExpParserImpl<CharT>::Parse(RegExpCompileData* result) { 3092 DCHECK_NOT_NULL(result); 3093 RegExpTree* tree = ParsePattern(); 3094 3095 if (failed()) { 3096 DCHECK_NULL(tree); 3097 DCHECK_NE(error_, RegExpError::kNone); 3098 result->error = error_; 3099 result->error_pos = error_pos_; 3100 return false; 3101 } 3102 3103 DCHECK_NOT_NULL(tree); 3104 DCHECK_EQ(error_, RegExpError::kNone); 3105 if (v8_flags.trace_regexp_parser) { 3106 StdoutStream os; 3107 tree->Print(os, zone()); 3108 os << "\n"; 3109 } 3110 3111 result->tree = tree; 3112 const int capture_count = captures_started(); 3113 result->simple = tree->IsAtom() && simple() && capture_count == 0; 3114 result->contains_anchor = contains_anchor(); 3115 result->capture_count = capture_count; 3116 result->named_captures = GetNamedCaptures(); 3117 return true; 3118 } 3119 3120 void RegExpBuilder::FlushText() { text_builder().FlushText(); } 3121 3122 void RegExpBuilder::AddCharacter(base::uc16 c) { 3123 pending_empty_ = false; 3124 text_builder().AddCharacter(c); 3125 } 3126 3127 void RegExpBuilder::AddUnicodeCharacter(base::uc32 c) { 3128 pending_empty_ = false; 3129 text_builder().AddUnicodeCharacter(c); 3130 } 3131 3132 void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) { 3133 pending_empty_ = false; 3134 text_builder().AddEscapedUnicodeCharacter(character); 3135 } 3136 3137 void RegExpBuilder::AddEmpty() { 3138 text_builder().FlushPendingSurrogate(); 3139 pending_empty_ = true; 3140 } 3141 3142 void RegExpBuilder::AddClassRanges(RegExpClassRanges* cc) { 3143 pending_empty_ = false; 3144 text_builder().AddClassRanges(cc); 3145 } 3146 3147 void RegExpBuilder::AddAtom(RegExpTree* term) { 3148 if (term->IsEmpty()) { 3149 AddEmpty(); 3150 return; 3151 } 3152 pending_empty_ = false; 3153 if (term->IsTextElement()) { 3154 text_builder().AddAtom(term); 3155 } else { 3156 FlushText(); 3157 terms_.emplace_back(term); 3158 } 3159 } 3160 3161 void RegExpBuilder::AddTerm(RegExpTree* term) { 3162 DCHECK(!term->IsEmpty()); 3163 pending_empty_ = false; 3164 if (term->IsTextElement()) { 3165 text_builder().AddTerm(term); 3166 } else { 3167 FlushText(); 3168 terms_.emplace_back(term); 3169 } 3170 } 3171 3172 void RegExpBuilder::AddAssertion(RegExpTree* assert) { 3173 FlushText(); 3174 pending_empty_ = false; 3175 terms_.emplace_back(assert); 3176 } 3177 3178 void RegExpBuilder::NewAlternative() { FlushTerms(); } 3179 3180 void RegExpBuilder::FlushTerms() { 3181 FlushText(); 3182 size_t num_terms = terms_.size(); 3183 RegExpTree* alternative; 3184 if (num_terms == 0) { 3185 alternative = zone()->New<RegExpEmpty>(); 3186 } else if (num_terms == 1) { 3187 alternative = terms_.back(); 3188 } else { 3189 alternative = 3190 zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>( 3191 base::VectorOf(terms_.begin(), terms_.size()), zone())); 3192 } 3193 alternatives_.emplace_back(alternative); 3194 terms_.clear(); 3195 } 3196 3197 RegExpTree* RegExpBuilder::ToRegExp() { 3198 FlushTerms(); 3199 size_t num_alternatives = alternatives_.size(); 3200 if (num_alternatives == 0) return zone()->New<RegExpEmpty>(); 3201 if (num_alternatives == 1) return alternatives_.back(); 3202 return zone()->New<RegExpDisjunction>(zone()->New<ZoneList<RegExpTree*>>( 3203 base::VectorOf(alternatives_.begin(), alternatives_.size()), zone())); 3204 } 3205 3206 bool RegExpBuilder::AddQuantifierToAtom( 3207 int min, int max, int index, 3208 RegExpQuantifier::QuantifierType quantifier_type) { 3209 if (pending_empty_) { 3210 pending_empty_ = false; 3211 return true; 3212 } 3213 RegExpTree* atom = text_builder().PopLastAtom(); 3214 if (atom != nullptr) { 3215 FlushText(); 3216 } else if (!terms_.empty()) { 3217 atom = terms_.back(); 3218 terms_.pop_back(); 3219 if (atom->IsLookaround()) { 3220 // With /u or /v, lookarounds are not quantifiable. 3221 if (IsUnicodeMode()) return false; 3222 // Lookbehinds are not quantifiable. 3223 if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) { 3224 return false; 3225 } 3226 } 3227 if (atom->max_match() == 0) { 3228 // Guaranteed to only match an empty string. 3229 if (min == 0) { 3230 return true; 3231 } 3232 terms_.emplace_back(atom); 3233 return true; 3234 } 3235 } else { 3236 // Only call immediately after adding an atom or character! 3237 UNREACHABLE(); 3238 } 3239 terms_.emplace_back( 3240 zone()->New<RegExpQuantifier>(min, max, quantifier_type, index, atom)); 3241 return true; 3242 } 3243 3244 template class RegExpParserImpl<uint8_t>; 3245 template class RegExpParserImpl<base::uc16>; 3246 3247 } // namespace 3248 3249 // static 3250 bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, Zone* zone, 3251 DirectHandle<String> input, 3252 RegExpFlags flags, 3253 RegExpCompileData* result) { 3254 DisallowGarbageCollection no_gc; 3255 uintptr_t stack_limit = isolate->stack_guard()->real_climit(); 3256 String::FlatContent content = input->GetFlatContent(no_gc); 3257 if (content.IsOneByte()) { 3258 base::Vector<const uint8_t> v = content.ToOneByteVector(); 3259 return RegExpParserImpl<uint8_t>{v.begin(), v.length(), flags, 3260 stack_limit, zone, no_gc} 3261 .Parse(result); 3262 } else { 3263 base::Vector<const base::uc16> v = content.ToUC16Vector(); 3264 return RegExpParserImpl<base::uc16>{v.begin(), v.length(), flags, 3265 stack_limit, zone, no_gc} 3266 .Parse(result); 3267 } 3268 } 3269 3270 // static 3271 template <class CharT> 3272 bool RegExpParser::VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit, 3273 const CharT* input, int input_length, 3274 RegExpFlags flags, 3275 RegExpCompileData* result, 3276 const DisallowGarbageCollection& no_gc) { 3277 return RegExpParserImpl<CharT>{input, input_length, flags, 3278 stack_limit, zone, no_gc} 3279 .Parse(result); 3280 } 3281 3282 template bool RegExpParser::VerifyRegExpSyntax<uint8_t>( 3283 Zone*, uintptr_t, const uint8_t*, int, RegExpFlags, RegExpCompileData*, 3284 const DisallowGarbageCollection&); 3285 template bool RegExpParser::VerifyRegExpSyntax<base::uc16>( 3286 Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, RegExpCompileData*, 3287 const DisallowGarbageCollection&); 3288 3289 } // namespace internal 3290 } // namespace v8