[ tor-browser ].git.dasho

regexp-parser.cc (112157B)
      1 // Copyright 2016 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "irregexp/imported/regexp-parser.h"
      6 
      7 #include "irregexp/imported/regexp-ast.h"
      8 #include "irregexp/imported/regexp-macro-assembler.h"
      9 #include "irregexp/imported/regexp.h"
     10 
     11 #ifdef V8_INTL_SUPPORT
     12 #include "unicode/uniset.h"
     13 #include "unicode/unistr.h"
     14 #include "unicode/usetiter.h"
     15 #include "unicode/utf16.h"  // For U16_NEXT
     16 #endif                      // V8_INTL_SUPPORT
     17 
     18 namespace v8 {
     19 namespace internal {
     20 
     21 namespace {
     22 
     23 // Whether we're currently inside the ClassEscape production
     24 // (tc39.es/ecma262/#prod-annexB-CharacterEscape).
     25 enum class InClassEscapeState {
     26  kInClass,
     27  kNotInClass,
     28 };
     29 
     30 // The production used to derive ClassSetOperand.
     31 enum class ClassSetOperandType {
     32  kClassSetCharacter,
     33  kClassStringDisjunction,
     34  kNestedClass,
     35  kCharacterClassEscape,  // \ CharacterClassEscape is a special nested class,
     36                          // as we can fold it directly into another range.
     37  kClassSetRange
     38 };
     39 
     40 class RegExpTextBuilder {
     41 public:
     42  using SmallRegExpTreeVector = SmallZoneVector<RegExpTree*, 8>;
     43 
     44  RegExpTextBuilder(Zone* zone, SmallRegExpTreeVector* terms_storage,
     45                    RegExpFlags flags)
     46      : zone_(zone), flags_(flags), terms_(terms_storage), text_(zone) {}
     47  void AddCharacter(base::uc16 character);
     48  void AddUnicodeCharacter(base::uc32 character);
     49  void AddEscapedUnicodeCharacter(base::uc32 character);
     50  void AddAtom(RegExpTree* atom);
     51  void AddTerm(RegExpTree* term);
     52  void AddClassRanges(RegExpClassRanges* cc);
     53  void FlushPendingSurrogate();
     54  void FlushText();
     55  RegExpTree* PopLastAtom();
     56  RegExpTree* ToRegExp();
     57 
     58 private:
     59  static const base::uc16 kNoPendingSurrogate = 0;
     60 
     61  void AddLeadSurrogate(base::uc16 lead_surrogate);
     62  void AddTrailSurrogate(base::uc16 trail_surrogate);
     63  void FlushCharacters();
     64  bool NeedsDesugaringForUnicode(RegExpClassRanges* cc);
     65  bool NeedsDesugaringForIgnoreCase(base::uc32 c);
     66  void AddClassRangesForDesugaring(base::uc32 c);
     67  bool ignore_case() const { return IsIgnoreCase(flags_); }
     68  bool IsUnicodeMode() const {
     69    // Either /v or /u enable UnicodeMode
     70    // https://tc39.es/ecma262/#sec-parsepattern
     71    return IsUnicode(flags_) || IsUnicodeSets(flags_);
     72  }
     73  Zone* zone() const { return zone_; }
     74 
     75  Zone* const zone_;
     76  const RegExpFlags flags_;
     77  ZoneList<base::uc16>* characters_ = nullptr;
     78  base::uc16 pending_surrogate_ = kNoPendingSurrogate;
     79  SmallRegExpTreeVector* terms_;
     80  SmallRegExpTreeVector text_;
     81 };
     82 
     83 void RegExpTextBuilder::AddLeadSurrogate(base::uc16 lead_surrogate) {
     84  DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
     85  FlushPendingSurrogate();
     86  // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
     87  pending_surrogate_ = lead_surrogate;
     88 }
     89 
     90 void RegExpTextBuilder::AddTrailSurrogate(base::uc16 trail_surrogate) {
     91  DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
     92  if (pending_surrogate_ != kNoPendingSurrogate) {
     93    base::uc16 lead_surrogate = pending_surrogate_;
     94    pending_surrogate_ = kNoPendingSurrogate;
     95    DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
     96    base::uc32 combined =
     97        unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
     98    if (NeedsDesugaringForIgnoreCase(combined)) {
     99      AddClassRangesForDesugaring(combined);
    100    } else {
    101      ZoneList<base::uc16> surrogate_pair(2, zone());
    102      surrogate_pair.Add(lead_surrogate, zone());
    103      surrogate_pair.Add(trail_surrogate, zone());
    104      RegExpAtom* atom =
    105          zone()->New<RegExpAtom>(surrogate_pair.ToConstVector());
    106      AddAtom(atom);
    107    }
    108  } else {
    109    pending_surrogate_ = trail_surrogate;
    110    FlushPendingSurrogate();
    111  }
    112 }
    113 
    114 void RegExpTextBuilder::FlushPendingSurrogate() {
    115  if (pending_surrogate_ != kNoPendingSurrogate) {
    116    DCHECK(IsUnicodeMode());
    117    base::uc32 c = pending_surrogate_;
    118    pending_surrogate_ = kNoPendingSurrogate;
    119    AddClassRangesForDesugaring(c);
    120  }
    121 }
    122 
    123 void RegExpTextBuilder::FlushCharacters() {
    124  FlushPendingSurrogate();
    125  if (characters_ != nullptr) {
    126    RegExpTree* atom = zone()->New<RegExpAtom>(characters_->ToConstVector());
    127    characters_ = nullptr;
    128    text_.emplace_back(atom);
    129  }
    130 }
    131 
    132 void RegExpTextBuilder::FlushText() {
    133  FlushCharacters();
    134  size_t num_text = text_.size();
    135  if (num_text == 0) {
    136    return;
    137  } else if (num_text == 1) {
    138    terms_->emplace_back(text_.back());
    139  } else {
    140    RegExpText* text = zone()->New<RegExpText>(zone());
    141    for (size_t i = 0; i < num_text; i++) {
    142      text_[i]->AppendToText(text, zone());
    143    }
    144    terms_->emplace_back(text);
    145  }
    146  text_.clear();
    147 }
    148 
    149 void RegExpTextBuilder::AddCharacter(base::uc16 c) {
    150  FlushPendingSurrogate();
    151  if (characters_ == nullptr) {
    152    characters_ = zone()->New<ZoneList<base::uc16>>(4, zone());
    153  }
    154  characters_->Add(c, zone());
    155 }
    156 
    157 void RegExpTextBuilder::AddUnicodeCharacter(base::uc32 c) {
    158  if (c > static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
    159    DCHECK(IsUnicodeMode());
    160    AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
    161    AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
    162  } else if (IsUnicodeMode() && unibrow::Utf16::IsLeadSurrogate(c)) {
    163    AddLeadSurrogate(c);
    164  } else if (IsUnicodeMode() && unibrow::Utf16::IsTrailSurrogate(c)) {
    165    AddTrailSurrogate(c);
    166  } else {
    167    AddCharacter(static_cast<base::uc16>(c));
    168  }
    169 }
    170 
    171 void RegExpTextBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {
    172  // A lead or trail surrogate parsed via escape sequence will not
    173  // pair up with any preceding lead or following trail surrogate.
    174  FlushPendingSurrogate();
    175  AddUnicodeCharacter(character);
    176  FlushPendingSurrogate();
    177 }
    178 
    179 void RegExpTextBuilder::AddClassRanges(RegExpClassRanges* cr) {
    180  if (NeedsDesugaringForUnicode(cr)) {
    181    // With /u or /v, character class needs to be desugared, so it
    182    // must be a standalone term instead of being part of a RegExpText.
    183    AddTerm(cr);
    184  } else {
    185    AddAtom(cr);
    186  }
    187 }
    188 
    189 void RegExpTextBuilder::AddClassRangesForDesugaring(base::uc32 c) {
    190  AddTerm(zone()->New<RegExpClassRanges>(
    191      zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c))));
    192 }
    193 
    194 void RegExpTextBuilder::AddAtom(RegExpTree* atom) {
    195  DCHECK(atom->IsTextElement());
    196  FlushCharacters();
    197  text_.emplace_back(atom);
    198 }
    199 
    200 void RegExpTextBuilder::AddTerm(RegExpTree* term) {
    201  DCHECK(term->IsTextElement());
    202  FlushText();
    203  terms_->emplace_back(term);
    204 }
    205 
    206 bool RegExpTextBuilder::NeedsDesugaringForUnicode(RegExpClassRanges* cc) {
    207  if (!IsUnicodeMode()) return false;
    208  // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
    209  // necessarily mean that we need to desugar. It's probably nicer to have a
    210  // separate pass to figure out unicode desugarings.
    211  if (ignore_case()) return true;
    212  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
    213  CharacterRange::Canonicalize(ranges);
    214 
    215  if (cc->is_negated()) {
    216    ZoneList<CharacterRange>* negated_ranges =
    217        zone()->New<ZoneList<CharacterRange>>(ranges->length(), zone());
    218    CharacterRange::Negate(ranges, negated_ranges, zone());
    219    ranges = negated_ranges;
    220  }
    221 
    222  for (int i = ranges->length() - 1; i >= 0; i--) {
    223    base::uc32 from = ranges->at(i).from();
    224    base::uc32 to = ranges->at(i).to();
    225    // Check for non-BMP characters.
    226    if (to >= kNonBmpStart) return true;
    227    // Check for lone surrogates.
    228    if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
    229  }
    230  return false;
    231 }
    232 
    233 // We only use this for characters made of surrogate pairs.  All other
    234 // characters outside of character classes are made case independent in the
    235 // code generation.
    236 bool RegExpTextBuilder::NeedsDesugaringForIgnoreCase(base::uc32 c) {
    237 #ifdef V8_INTL_SUPPORT
    238  if (IsUnicodeMode() && ignore_case()) {
    239    icu::UnicodeSet set(c, c);
    240    set.closeOver(USET_CASE_INSENSITIVE);
    241    set.removeAllStrings();
    242    return set.size() > 1;
    243  }
    244  // In the case where ICU is not included, we act as if the unicode flag is
    245  // not set, and do not desugar.
    246 #endif  // V8_INTL_SUPPORT
    247  return false;
    248 }
    249 
    250 RegExpTree* RegExpTextBuilder::PopLastAtom() {
    251  FlushPendingSurrogate();
    252  RegExpTree* atom;
    253  if (characters_ != nullptr) {
    254    base::Vector<const base::uc16> char_vector = characters_->ToConstVector();
    255    int num_chars = char_vector.length();
    256    if (num_chars > 1) {
    257      base::Vector<const base::uc16> prefix =
    258          char_vector.SubVector(0, num_chars - 1);
    259      text_.emplace_back(zone()->New<RegExpAtom>(prefix));
    260      char_vector = char_vector.SubVector(num_chars - 1, num_chars);
    261    }
    262    characters_ = nullptr;
    263    atom = zone()->New<RegExpAtom>(char_vector);
    264    return atom;
    265  } else if (!text_.empty()) {
    266    atom = text_.back();
    267    text_.pop_back();
    268    return atom;
    269  }
    270  return nullptr;
    271 }
    272 
    273 RegExpTree* RegExpTextBuilder::ToRegExp() {
    274  FlushText();
    275  size_t number_of_terms = terms_->size();
    276  if (number_of_terms == 0) return zone()->New<RegExpEmpty>();
    277  if (number_of_terms == 1) return terms_->back();
    278  return zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>(
    279      base::VectorOf(terms_->begin(), terms_->size()), zone()));
    280 }
    281 
    282 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
    283 class RegExpBuilder {
    284 public:
    285  RegExpBuilder(Zone* zone, RegExpFlags flags)
    286      : zone_(zone),
    287        flags_(flags),
    288        terms_(zone),
    289        alternatives_(zone),
    290        text_builder_(RegExpTextBuilder{zone, &terms_, flags}) {}
    291  void AddCharacter(base::uc16 character);
    292  void AddUnicodeCharacter(base::uc32 character);
    293  void AddEscapedUnicodeCharacter(base::uc32 character);
    294  // "Adds" an empty expression. Does nothing except consume a
    295  // following quantifier
    296  void AddEmpty();
    297  void AddClassRanges(RegExpClassRanges* cc);
    298  void AddAtom(RegExpTree* tree);
    299  void AddTerm(RegExpTree* tree);
    300  void AddAssertion(RegExpTree* tree);
    301  void NewAlternative();  // '|'
    302  bool AddQuantifierToAtom(int min, int max, int index,
    303                           RegExpQuantifier::QuantifierType type);
    304  void FlushText();
    305  RegExpTree* ToRegExp();
    306  RegExpFlags flags() const { return flags_; }
    307 
    308  bool ignore_case() const { return IsIgnoreCase(flags_); }
    309  bool multiline() const { return IsMultiline(flags_); }
    310  bool dotall() const { return IsDotAll(flags_); }
    311 
    312 private:
    313  void FlushTerms();
    314  bool IsUnicodeMode() const {
    315    // Either /v or /u enable UnicodeMode
    316    // https://tc39.es/ecma262/#sec-parsepattern
    317    return IsUnicode(flags_) || IsUnicodeSets(flags_);
    318  }
    319  Zone* zone() const { return zone_; }
    320  RegExpTextBuilder& text_builder() { return text_builder_; }
    321 
    322  Zone* const zone_;
    323  bool pending_empty_ = false;
    324  const RegExpFlags flags_;
    325 
    326  using SmallRegExpTreeVector = SmallZoneVector<RegExpTree*, 8>;
    327  SmallRegExpTreeVector terms_;
    328  SmallRegExpTreeVector alternatives_;
    329  RegExpTextBuilder text_builder_;
    330 };
    331 
    332 enum SubexpressionType {
    333  INITIAL,
    334  CAPTURE,  // All positive values represent captures.
    335  POSITIVE_LOOKAROUND,
    336  NEGATIVE_LOOKAROUND,
    337  GROUPING
    338 };
    339 
    340 class RegExpParserState : public ZoneObject {
    341 public:
    342  // Push a state on the stack.
    343  RegExpParserState(RegExpParserState* previous_state,
    344                    SubexpressionType group_type,
    345                    RegExpLookaround::Type lookaround_type,
    346                    int disjunction_capture_index,
    347                    const ZoneVector<base::uc16>* capture_name,
    348                    RegExpFlags flags, Zone* zone)
    349      : previous_state_(previous_state),
    350        builder_(zone, flags),
    351        group_type_(group_type),
    352        lookaround_type_(lookaround_type),
    353        disjunction_capture_index_(disjunction_capture_index),
    354        capture_name_(capture_name) {
    355    if (previous_state != nullptr) {
    356      non_participating_capture_group_interval_ =
    357          previous_state->non_participating_capture_group_interval();
    358    }
    359  }
    360  // Parser state of containing expression, if any.
    361  RegExpParserState* previous_state() const { return previous_state_; }
    362  bool IsSubexpression() { return previous_state_ != nullptr; }
    363  // RegExpBuilder building this regexp's AST.
    364  RegExpBuilder* builder() { return &builder_; }
    365  // Type of regexp being parsed (parenthesized group or entire regexp).
    366  SubexpressionType group_type() const { return group_type_; }
    367  // Lookahead or Lookbehind.
    368  RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
    369  // Index in captures array of first capture in this sub-expression, if any.
    370  // Also the capture index of this sub-expression itself, if group_type
    371  // is CAPTURE.
    372  int capture_index() const { return disjunction_capture_index_; }
    373  // The name of the current sub-expression, if group_type is CAPTURE. Only
    374  // used for named captures.
    375  const ZoneVector<base::uc16>* capture_name() const { return capture_name_; }
    376  std::pair<int, int> non_participating_capture_group_interval() const {
    377    return non_participating_capture_group_interval_;
    378  }
    379 
    380  bool IsNamedCapture() const { return capture_name_ != nullptr; }
    381 
    382  // Check whether the parser is inside a capture group with the given index.
    383  bool IsInsideCaptureGroup(int index) const {
    384    for (const RegExpParserState* s = this; s != nullptr;
    385         s = s->previous_state()) {
    386      if (s->group_type() != CAPTURE) continue;
    387      // Return true if we found the matching capture index.
    388      if (index == s->capture_index()) return true;
    389      // Abort if index is larger than what has been parsed up till this state.
    390      if (index > s->capture_index()) return false;
    391    }
    392    return false;
    393  }
    394 
    395  // Check whether the parser is inside a capture group with the given name.
    396  bool IsInsideCaptureGroup(const ZoneVector<base::uc16>* name) const {
    397    DCHECK_NOT_NULL(name);
    398    for (const RegExpParserState* s = this; s != nullptr;
    399         s = s->previous_state()) {
    400      if (s->capture_name() == nullptr) continue;
    401      if (*s->capture_name() == *name) return true;
    402    }
    403    return false;
    404  }
    405 
    406  void NewAlternative(int captures_started) {
    407    if (non_participating_capture_group_interval().second != 0) {
    408      // Extend the non-participating interval.
    409      non_participating_capture_group_interval_.second = captures_started;
    410    } else {
    411      // Create new non-participating interval from the start of the current
    412      // enclosing group to all captures created within that group so far.
    413      non_participating_capture_group_interval_ =
    414          std::make_pair(capture_index(), captures_started);
    415    }
    416  }
    417 
    418 private:
    419  // Linked list implementation of stack of states.
    420  RegExpParserState* const previous_state_;
    421  // Builder for the stored disjunction.
    422  RegExpBuilder builder_;
    423  // Stored disjunction type (capture, look-ahead or grouping), if any.
    424  const SubexpressionType group_type_;
    425  // Stored read direction.
    426  const RegExpLookaround::Type lookaround_type_;
    427  // Stored disjunction's capture index (if any).
    428  const int disjunction_capture_index_;
    429  // Stored capture name (if any).
    430  const ZoneVector<base::uc16>* const capture_name_;
    431  // Interval of (named) capture indices ]from, to] that are not participating
    432  // in the current state (i.e. they cannot match).
    433  // Capture indices are not participating if they were created in a different
    434  // alternative.
    435  std::pair<int, int> non_participating_capture_group_interval_;
    436 };
    437 
    438 template <class CharT>
    439 class RegExpParserImpl final {
    440 private:
    441  RegExpParserImpl(const CharT* input, int input_length, RegExpFlags flags,
    442                   uintptr_t stack_limit, Zone* zone,
    443                   const DisallowGarbageCollection& no_gc);
    444 
    445  bool Parse(RegExpCompileData* result);
    446 
    447  RegExpTree* ParsePattern();
    448  RegExpTree* ParseDisjunction();
    449  RegExpTree* ParseGroup();
    450 
    451  // Parses a {...,...} quantifier and stores the range in the given
    452  // out parameters.
    453  bool ParseIntervalQuantifier(int* min_out, int* max_out);
    454 
    455  // Checks whether the following is a length-digit hexadecimal number,
    456  // and sets the value if it is.
    457  bool ParseHexEscape(int length, base::uc32* value);
    458  bool ParseUnicodeEscape(base::uc32* value);
    459  bool ParseUnlimitedLengthHexNumber(int max_value, base::uc32* value);
    460 
    461  bool ParsePropertyClassName(ZoneVector<char>* name_1,
    462                              ZoneVector<char>* name_2);
    463  bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to_range,
    464                             CharacterClassStrings* add_to_strings, bool negate,
    465                             const ZoneVector<char>& name_1,
    466                             const ZoneVector<char>& name_2);
    467 
    468  RegExpTree* ParseClassRanges(ZoneList<CharacterRange>* ranges,
    469                               bool add_unicode_case_equivalents);
    470  // Parse inside a class. Either add escaped class to the range, or return
    471  // false and pass parsed single character through |char_out|.
    472  void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
    473                        bool add_unicode_case_equivalents, base::uc32* char_out,
    474                        bool* is_class_escape);
    475  // Returns true iff parsing was successful.
    476  bool TryParseCharacterClassEscape(base::uc32 next,
    477                                    InClassEscapeState in_class_escape_state,
    478                                    ZoneList<CharacterRange>* ranges,
    479                                    CharacterClassStrings* strings, Zone* zone,
    480                                    bool add_unicode_case_equivalents);
    481  RegExpTree* ParseClassStringDisjunction(ZoneList<CharacterRange>* ranges,
    482                                          CharacterClassStrings* strings);
    483  RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder,
    484                                   ClassSetOperandType* type_out);
    485  RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder,
    486                                   ClassSetOperandType* type_out,
    487                                   ZoneList<CharacterRange>* ranges,
    488                                   CharacterClassStrings* strings,
    489                                   base::uc32* character);
    490  base::uc32 ParseClassSetCharacter();
    491  // Parses and returns a single escaped character.
    492  base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state,
    493                                  bool* is_escaped_unicode_character);
    494 
    495  void AddMaybeSimpleCaseFoldedRange(ZoneList<CharacterRange>* ranges,
    496                                     CharacterRange new_range);
    497 
    498  RegExpTree* ParseClassUnion(const RegExpBuilder* builder, bool is_negated,
    499                              RegExpTree* first_operand,
    500                              ClassSetOperandType first_operand_type,
    501                              ZoneList<CharacterRange>* ranges,
    502                              CharacterClassStrings* strings,
    503                              base::uc32 first_character);
    504  RegExpTree* ParseClassIntersection(const RegExpBuilder* builder,
    505                                     bool is_negated, RegExpTree* first_operand,
    506                                     ClassSetOperandType first_operand_type);
    507  RegExpTree* ParseClassSubtraction(const RegExpBuilder* builder,
    508                                    bool is_negated, RegExpTree* first_operand,
    509                                    ClassSetOperandType first_operand_type);
    510  RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
    511 
    512  base::uc32 ParseOctalLiteral();
    513 
    514  // Tries to parse the input as a back reference.  If successful it
    515  // stores the result in the output parameter and returns true.  If
    516  // it fails it will push back the characters read so the same characters
    517  // can be reparsed.
    518  bool ParseBackReferenceIndex(int* index_out);
    519 
    520  RegExpTree* ReportError(RegExpError error);
    521  void Advance();
    522  void Advance(int dist);
    523  void RewindByOneCodepoint();  // Rewinds to before the previous Advance().
    524  void Reset(int pos);
    525 
    526  // Reports whether the pattern might be used as a literal search string.
    527  // Only use if the result of the parse is a single atom node.
    528  bool simple() const { return simple_; }
    529  bool contains_anchor() const { return contains_anchor_; }
    530  void set_contains_anchor() { contains_anchor_ = true; }
    531  int captures_started() const { return captures_started_; }
    532  int position() const {
    533    const bool current_is_surrogate =
    534        current() != kEndMarker &&
    535        current() > unibrow::Utf16::kMaxNonSurrogateCharCode;
    536    const int rewind_bytes = current_is_surrogate ? 2 : 1;
    537    return next_pos_ - rewind_bytes;
    538  }
    539  bool failed() const { return failed_; }
    540  RegExpFlags flags() const { return flags_; }
    541  bool IsUnicodeMode() const {
    542    // Either /v or /u enable UnicodeMode
    543    // https://tc39.es/ecma262/#sec-parsepattern
    544    return IsUnicode(flags()) || IsUnicodeSets(flags()) || force_unicode_;
    545  }
    546  bool unicode_sets() const { return IsUnicodeSets(flags()); }
    547  bool ignore_case() const { return IsIgnoreCase(flags()); }
    548 
    549  static bool IsSyntaxCharacterOrSlash(base::uc32 c);
    550  static bool IsClassSetSyntaxCharacter(base::uc32 c);
    551  static bool IsClassSetReservedPunctuator(base::uc32 c);
    552  bool IsClassSetReservedDoublePunctuator(base::uc32 c);
    553 
    554  static const base::uc32 kEndMarker = (1 << 21);
    555 
    556 private:
    557  // Return the 1-indexed RegExpCapture object, allocate if necessary.
    558  RegExpCapture* GetCapture(int index);
    559 
    560  // Creates a new named capture at the specified index. Must be called exactly
    561  // once for each named capture. Fails if a capture with the same name is
    562  // encountered.
    563  bool CreateNamedCaptureAtIndex(const RegExpParserState* state, int index);
    564 
    565  // Parses the name of a capture group (?<name>pattern). The name must adhere
    566  // to IdentifierName in the ECMAScript standard.
    567  const ZoneVector<base::uc16>* ParseCaptureGroupName();
    568 
    569  bool ParseNamedBackReference(RegExpBuilder* builder,
    570                               RegExpParserState* state);
    571  RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
    572 
    573  // After the initial parsing pass, patch corresponding RegExpCapture objects
    574  // into all RegExpBackReferences. This is done after initial parsing in order
    575  // to avoid complicating cases in which references comes before the capture.
    576  void PatchNamedBackReferences();
    577 
    578  ZoneVector<RegExpCapture*>* GetNamedCaptures();
    579 
    580  // Returns true iff the pattern contains named captures. May call
    581  // ScanForCaptures to look ahead at the remaining pattern.
    582  bool HasNamedCaptures(InClassEscapeState in_class_escape_state);
    583 
    584  Zone* zone() const { return zone_; }
    585 
    586  base::uc32 current() const { return current_; }
    587  bool has_more() const { return has_more_; }
    588  bool has_next() const { return next_pos_ < input_length(); }
    589  base::uc32 Next();
    590  template <bool update_position>
    591  base::uc32 ReadNext();
    592  CharT InputAt(int index) const {
    593    DCHECK(0 <= index && index < input_length());
    594    return input_[index];
    595  }
    596  int input_length() const { return input_length_; }
    597  void ScanForCaptures(InClassEscapeState in_class_escape_state);
    598 
    599  struct RegExpCaptureNameLess {
    600    bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
    601      DCHECK_NOT_NULL(lhs);
    602      DCHECK_NOT_NULL(rhs);
    603      return *lhs->name() < *rhs->name();
    604    }
    605  };
    606 
    607  class ForceUnicodeScope final {
    608   public:
    609    explicit ForceUnicodeScope(RegExpParserImpl<CharT>* parser)
    610        : parser_(parser) {
    611      DCHECK(!parser_->force_unicode_);
    612      parser_->force_unicode_ = true;
    613    }
    614    ~ForceUnicodeScope() {
    615      DCHECK(parser_->force_unicode_);
    616      parser_->force_unicode_ = false;
    617    }
    618 
    619   private:
    620    RegExpParserImpl<CharT>* const parser_;
    621  };
    622 
    623  const DisallowGarbageCollection no_gc_;
    624  Zone* const zone_;
    625  RegExpError error_ = RegExpError::kNone;
    626  int error_pos_ = 0;
    627  ZoneList<RegExpCapture*>* captures_;
    628  // Maps capture names to a list of capture indices with this name.
    629  ZoneMap<RegExpCapture*, ZoneList<int>*, RegExpCaptureNameLess>*
    630      named_captures_;
    631  ZoneList<RegExpBackReference*>* named_back_references_;
    632  const CharT* const input_;
    633  const int input_length_;
    634  base::uc32 current_;
    635  RegExpFlags flags_;
    636  bool force_unicode_ = false;  // Force parser to act as if unicode were set.
    637  int next_pos_;
    638  int captures_started_;
    639  int capture_count_;  // Only valid after we have scanned for captures.
    640  int quantifier_count_;
    641  int lookaround_count_;  // Only valid after we have scanned for lookbehinds.
    642  bool has_more_;
    643  bool simple_;
    644  bool contains_anchor_;
    645  bool is_scanned_for_captures_;
    646  bool has_named_captures_;  // Only valid after we have scanned for captures.
    647  bool failed_;
    648  const uintptr_t stack_limit_;
    649 
    650  friend class v8::internal::RegExpParser;
    651 };
    652 
    653 template <class CharT>
    654 RegExpParserImpl<CharT>::RegExpParserImpl(
    655    const CharT* input, int input_length, RegExpFlags flags,
    656    uintptr_t stack_limit, Zone* zone, const DisallowGarbageCollection& no_gc)
    657    : zone_(zone),
    658      captures_(nullptr),
    659      named_captures_(nullptr),
    660      named_back_references_(nullptr),
    661      input_(input),
    662      input_length_(input_length),
    663      current_(kEndMarker),
    664      flags_(flags),
    665      next_pos_(0),
    666      captures_started_(0),
    667      capture_count_(0),
    668      quantifier_count_(0),
    669      lookaround_count_(0),
    670      has_more_(true),
    671      simple_(false),
    672      contains_anchor_(false),
    673      is_scanned_for_captures_(false),
    674      has_named_captures_(false),
    675      failed_(false),
    676      stack_limit_(stack_limit) {
    677  Advance();
    678 }
    679 
    680 template <>
    681 template <bool update_position>
    682 inline base::uc32 RegExpParserImpl<uint8_t>::ReadNext() {
    683  int position = next_pos_;
    684  base::uc16 c0 = InputAt(position);
    685  position++;
    686  DCHECK(!unibrow::Utf16::IsLeadSurrogate(c0));
    687  if (update_position) next_pos_ = position;
    688  return c0;
    689 }
    690 
    691 template <>
    692 template <bool update_position>
    693 inline base::uc32 RegExpParserImpl<base::uc16>::ReadNext() {
    694  int position = next_pos_;
    695  base::uc16 c0 = InputAt(position);
    696  base::uc32 result = c0;
    697  position++;
    698  // Read the whole surrogate pair in case of unicode mode, if possible.
    699  if (IsUnicodeMode() && position < input_length() &&
    700      unibrow::Utf16::IsLeadSurrogate(c0)) {
    701    base::uc16 c1 = InputAt(position);
    702    if (unibrow::Utf16::IsTrailSurrogate(c1)) {
    703      result = unibrow::Utf16::CombineSurrogatePair(c0, c1);
    704      position++;
    705    }
    706  }
    707  if (update_position) next_pos_ = position;
    708  return result;
    709 }
    710 
    711 template <class CharT>
    712 base::uc32 RegExpParserImpl<CharT>::Next() {
    713  if (has_next()) {
    714    return ReadNext<false>();
    715  } else {
    716    return kEndMarker;
    717  }
    718 }
    719 
    720 template <class CharT>
    721 void RegExpParserImpl<CharT>::Advance() {
    722  if (has_next()) {
    723    if (GetCurrentStackPosition() < stack_limit_) {
    724      if (v8_flags.correctness_fuzzer_suppressions) {
    725        FATAL("Aborting on stack overflow");
    726      }
    727      ReportError(RegExpError::kStackOverflow);
    728    } else {
    729      current_ = ReadNext<true>();
    730    }
    731  } else {
    732    current_ = kEndMarker;
    733    // Advance so that position() points to 1-after-the-last-character. This is
    734    // important so that Reset() to this position works correctly.
    735    next_pos_ = input_length() + 1;
    736    has_more_ = false;
    737  }
    738 }
    739 
    740 template <class CharT>
    741 void RegExpParserImpl<CharT>::RewindByOneCodepoint() {
    742  if (!has_more()) return;
    743  // Rewinds by one code point, i.e.: two code units if `current` is outside
    744  // the basic multilingual plane (= composed of a lead and trail surrogate),
    745  // or one code unit otherwise.
    746  const int rewind_by =
    747      current() > unibrow::Utf16::kMaxNonSurrogateCharCode ? -2 : -1;
    748  Advance(rewind_by);  // Undo the last Advance.
    749 }
    750 
    751 template <class CharT>
    752 void RegExpParserImpl<CharT>::Reset(int pos) {
    753  next_pos_ = pos;
    754  has_more_ = (pos < input_length());
    755  Advance();
    756 }
    757 
    758 template <class CharT>
    759 void RegExpParserImpl<CharT>::Advance(int dist) {
    760  next_pos_ += dist - 1;
    761  Advance();
    762 }
    763 
    764 // static
    765 template <class CharT>
    766 bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(base::uc32 c) {
    767  switch (c) {
    768    case '^':
    769    case '$':
    770    case '\\':
    771    case '.':
    772    case '*':
    773    case '+':
    774    case '?':
    775    case '(':
    776    case ')':
    777    case '[':
    778    case ']':
    779    case '{':
    780    case '}':
    781    case '|':
    782    case '/':
    783      return true;
    784    default:
    785      break;
    786  }
    787  return false;
    788 }
    789 
    790 // static
    791 template <class CharT>
    792 bool RegExpParserImpl<CharT>::IsClassSetSyntaxCharacter(base::uc32 c) {
    793  switch (c) {
    794    case '(':
    795    case ')':
    796    case '[':
    797    case ']':
    798    case '{':
    799    case '}':
    800    case '/':
    801    case '-':
    802    case '\\':
    803    case '|':
    804      return true;
    805    default:
    806      break;
    807  }
    808  return false;
    809 }
    810 
    811 // static
    812 template <class CharT>
    813 bool RegExpParserImpl<CharT>::IsClassSetReservedPunctuator(base::uc32 c) {
    814  switch (c) {
    815    case '&':
    816    case '-':
    817    case '!':
    818    case '#':
    819    case '%':
    820    case ',':
    821    case ':':
    822    case ';':
    823    case '<':
    824    case '=':
    825    case '>':
    826    case '@':
    827    case '`':
    828    case '~':
    829      return true;
    830    default:
    831      break;
    832  }
    833  return false;
    834 }
    835 
    836 template <class CharT>
    837 bool RegExpParserImpl<CharT>::IsClassSetReservedDoublePunctuator(base::uc32 c) {
    838 #define DOUBLE_PUNCTUATOR_CASE(Char) \
    839  case Char:                         \
    840    return Next() == Char
    841 
    842  switch (c) {
    843    DOUBLE_PUNCTUATOR_CASE('&');
    844    DOUBLE_PUNCTUATOR_CASE('!');
    845    DOUBLE_PUNCTUATOR_CASE('#');
    846    DOUBLE_PUNCTUATOR_CASE('$');
    847    DOUBLE_PUNCTUATOR_CASE('%');
    848    DOUBLE_PUNCTUATOR_CASE('*');
    849    DOUBLE_PUNCTUATOR_CASE('+');
    850    DOUBLE_PUNCTUATOR_CASE(',');
    851    DOUBLE_PUNCTUATOR_CASE('.');
    852    DOUBLE_PUNCTUATOR_CASE(':');
    853    DOUBLE_PUNCTUATOR_CASE(';');
    854    DOUBLE_PUNCTUATOR_CASE('<');
    855    DOUBLE_PUNCTUATOR_CASE('=');
    856    DOUBLE_PUNCTUATOR_CASE('>');
    857    DOUBLE_PUNCTUATOR_CASE('?');
    858    DOUBLE_PUNCTUATOR_CASE('@');
    859    DOUBLE_PUNCTUATOR_CASE('^');
    860    DOUBLE_PUNCTUATOR_CASE('`');
    861    DOUBLE_PUNCTUATOR_CASE('~');
    862    default:
    863      break;
    864  }
    865 #undef DOUBLE_PUNCTUATOR_CASE
    866 
    867  return false;
    868 }
    869 
    870 template <class CharT>
    871 RegExpTree* RegExpParserImpl<CharT>::ReportError(RegExpError error) {
    872  if (failed_) return nullptr;  // Do not overwrite any existing error.
    873  failed_ = true;
    874  error_ = error;
    875  error_pos_ = position();
    876  // Zip to the end to make sure no more input is read.
    877  current_ = kEndMarker;
    878  next_pos_ = input_length();
    879  has_more_ = false;
    880  return nullptr;
    881 }
    882 
    883 #define CHECK_FAILED /**/);    \
    884  if (failed_) return nullptr; \
    885  ((void)0
    886 
    887 // Pattern ::
    888 //   Disjunction
    889 template <class CharT>
    890 RegExpTree* RegExpParserImpl<CharT>::ParsePattern() {
    891  RegExpTree* result = ParseDisjunction(CHECK_FAILED);
    892  PatchNamedBackReferences(CHECK_FAILED);
    893  DCHECK(!has_more());
    894  // If the result of parsing is a literal string atom, and it has the
    895  // same length as the input, then the atom is identical to the input.
    896  if (result->IsAtom() && result->AsAtom()->length() == input_length()) {
    897    simple_ = true;
    898  }
    899  return result;
    900 }
    901 
    902 // Disjunction ::
    903 //   Alternative
    904 //   Alternative | Disjunction
    905 // Alternative ::
    906 //   [empty]
    907 //   Term Alternative
    908 // Term ::
    909 //   Assertion
    910 //   Atom
    911 //   Atom Quantifier
    912 template <class CharT>
    913 RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
    914  // Used to store current state while parsing subexpressions.
    915  RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
    916                                  0, nullptr, flags(), zone());
    917  RegExpParserState* state = &initial_state;
    918  // Cache the builder in a local variable for quick access.
    919  RegExpBuilder* builder = initial_state.builder();
    920  while (true) {
    921    switch (current()) {
    922      case kEndMarker:
    923        if (failed()) return nullptr;  // E.g. the initial Advance failed.
    924        if (state->IsSubexpression()) {
    925          // Inside a parenthesized group when hitting end of input.
    926          return ReportError(RegExpError::kUnterminatedGroup);
    927        }
    928        DCHECK_EQ(INITIAL, state->group_type());
    929        // Parsing completed successfully.
    930        return builder->ToRegExp();
    931      case ')': {
    932        if (!state->IsSubexpression()) {
    933          return ReportError(RegExpError::kUnmatchedParen);
    934        }
    935        DCHECK_NE(INITIAL, state->group_type());
    936 
    937        Advance();
    938        // End disjunction parsing and convert builder content to new single
    939        // regexp atom.
    940        RegExpTree* body = builder->ToRegExp();
    941 
    942        int end_capture_index = captures_started();
    943 
    944        int capture_index = state->capture_index();
    945        SubexpressionType group_type = state->group_type();
    946 
    947        // Build result of subexpression.
    948        if (group_type == CAPTURE) {
    949          if (state->IsNamedCapture()) {
    950            CreateNamedCaptureAtIndex(state, capture_index CHECK_FAILED);
    951          }
    952          RegExpCapture* capture = GetCapture(capture_index);
    953          capture->set_body(body);
    954          body = capture;
    955        } else if (group_type == GROUPING) {
    956          body = zone()->template New<RegExpGroup>(body, builder->flags());
    957        } else {
    958          DCHECK(group_type == POSITIVE_LOOKAROUND ||
    959                 group_type == NEGATIVE_LOOKAROUND);
    960          bool is_positive = (group_type == POSITIVE_LOOKAROUND);
    961          body = zone()->template New<RegExpLookaround>(
    962              body, is_positive, end_capture_index - capture_index,
    963              capture_index, state->lookaround_type(), lookaround_count_);
    964          lookaround_count_++;
    965        }
    966 
    967        // Restore previous state.
    968        state = state->previous_state();
    969        builder = state->builder();
    970 
    971        builder->AddAtom(body);
    972        // For compatibility with JSC and ES3, we allow quantifiers after
    973        // lookaheads, and break in all cases.
    974        break;
    975      }
    976      case '|': {
    977        Advance();
    978        state->NewAlternative(captures_started());
    979        builder->NewAlternative();
    980        continue;
    981      }
    982      case '*':
    983      case '+':
    984      case '?':
    985        return ReportError(RegExpError::kNothingToRepeat);
    986      case '^': {
    987        Advance();
    988        builder->AddAssertion(zone()->template New<RegExpAssertion>(
    989            builder->multiline() ? RegExpAssertion::Type::START_OF_LINE
    990                                 : RegExpAssertion::Type::START_OF_INPUT));
    991        set_contains_anchor();
    992        continue;
    993      }
    994      case '$': {
    995        Advance();
    996        RegExpAssertion::Type assertion_type =
    997            builder->multiline() ? RegExpAssertion::Type::END_OF_LINE
    998                                 : RegExpAssertion::Type::END_OF_INPUT;
    999        builder->AddAssertion(
   1000            zone()->template New<RegExpAssertion>(assertion_type));
   1001        continue;
   1002      }
   1003      case '.': {
   1004        Advance();
   1005        ZoneList<CharacterRange>* ranges =
   1006            zone()->template New<ZoneList<CharacterRange>>(2, zone());
   1007 
   1008        if (builder->dotall()) {
   1009          // Everything.
   1010          CharacterRange::AddClassEscape(StandardCharacterSet::kEverything,
   1011                                         ranges, false, zone());
   1012        } else {
   1013          // Everything except \x0A, \x0D, \u2028 and \u2029.
   1014          CharacterRange::AddClassEscape(
   1015              StandardCharacterSet::kNotLineTerminator, ranges, false, zone());
   1016        }
   1017 
   1018        RegExpClassRanges* cc =
   1019            zone()->template New<RegExpClassRanges>(zone(), ranges);
   1020        builder->AddClassRanges(cc);
   1021        break;
   1022      }
   1023      case '(': {
   1024        state = ParseOpenParenthesis(state CHECK_FAILED);
   1025        builder = state->builder();
   1026        flags_ = builder->flags();
   1027        continue;
   1028      }
   1029      case '[': {
   1030        RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
   1031        if (cc->IsClassRanges()) {
   1032          builder->AddClassRanges(cc->AsClassRanges());
   1033        } else {
   1034          DCHECK(cc->IsClassSetExpression());
   1035          builder->AddTerm(cc);
   1036        }
   1037        break;
   1038      }
   1039      // Atom ::
   1040      //   \ AtomEscape
   1041      case '\\':
   1042        switch (Next()) {
   1043          case kEndMarker:
   1044            return ReportError(RegExpError::kEscapeAtEndOfPattern);
   1045          // AtomEscape ::
   1046          //   [+UnicodeMode] DecimalEscape
   1047          //   [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber
   1048          //                  of DecimalEscape is ≤ NcapturingParens
   1049          //   CharacterEscape (some cases of this mixed in too)
   1050          //
   1051          // TODO(jgruber): It may make sense to disentangle all the different
   1052          // cases and make the structure mirror the spec, e.g. for AtomEscape:
   1053          //
   1054          //  if (TryParseDecimalEscape(...)) return;
   1055          //  if (TryParseCharacterClassEscape(...)) return;
   1056          //  if (TryParseCharacterEscape(...)) return;
   1057          //  if (TryParseGroupName(...)) return;
   1058          case '1':
   1059          case '2':
   1060          case '3':
   1061          case '4':
   1062          case '5':
   1063          case '6':
   1064          case '7':
   1065          case '8':
   1066          case '9': {
   1067            int index = 0;
   1068            const bool is_backref =
   1069                ParseBackReferenceIndex(&index CHECK_FAILED);
   1070            if (is_backref) {
   1071              if (state->IsInsideCaptureGroup(index)) {
   1072                // The back reference is inside the capture group it refers to.
   1073                // Nothing can possibly have been captured yet, so we use empty
   1074                // instead. This ensures that, when checking a back reference,
   1075                // the capture registers of the referenced capture are either
   1076                // both set or both cleared.
   1077                builder->AddEmpty();
   1078              } else {
   1079                RegExpCapture* capture = GetCapture(index);
   1080                RegExpTree* atom =
   1081                    zone()->template New<RegExpBackReference>(capture, zone());
   1082                builder->AddAtom(atom);
   1083              }
   1084              break;
   1085            }
   1086            // With /u and /v, no identity escapes except for syntax characters
   1087            // are allowed. Otherwise, all identity escapes are allowed.
   1088            if (IsUnicodeMode()) {
   1089              return ReportError(RegExpError::kInvalidEscape);
   1090            }
   1091            base::uc32 first_digit = Next();
   1092            if (first_digit == '8' || first_digit == '9') {
   1093              builder->AddCharacter(first_digit);
   1094              Advance(2);
   1095              break;
   1096            }
   1097            [[fallthrough]];
   1098          }
   1099          case '0': {
   1100            Advance();
   1101            if (IsUnicodeMode() && Next() >= '0' && Next() <= '9') {
   1102              // Decimal escape with leading 0 are not parsed as octal.
   1103              return ReportError(RegExpError::kInvalidDecimalEscape);
   1104            }
   1105            base::uc32 octal = ParseOctalLiteral();
   1106            builder->AddCharacter(octal);
   1107            break;
   1108          }
   1109          case 'b':
   1110            Advance(2);
   1111            builder->AddAssertion(zone()->template New<RegExpAssertion>(
   1112                RegExpAssertion::Type::BOUNDARY));
   1113            continue;
   1114          case 'B':
   1115            Advance(2);
   1116            builder->AddAssertion(zone()->template New<RegExpAssertion>(
   1117                RegExpAssertion::Type::NON_BOUNDARY));
   1118            continue;
   1119          // AtomEscape ::
   1120          //   CharacterClassEscape
   1121          case 'd':
   1122          case 'D':
   1123          case 's':
   1124          case 'S':
   1125          case 'w':
   1126          case 'W': {
   1127            base::uc32 next = Next();
   1128            ZoneList<CharacterRange>* ranges =
   1129                zone()->template New<ZoneList<CharacterRange>>(2, zone());
   1130            bool add_unicode_case_equivalents =
   1131                IsUnicodeMode() && ignore_case();
   1132            bool parsed_character_class_escape = TryParseCharacterClassEscape(
   1133                next, InClassEscapeState::kNotInClass, ranges, nullptr, zone(),
   1134                add_unicode_case_equivalents CHECK_FAILED);
   1135 
   1136            if (parsed_character_class_escape) {
   1137              RegExpClassRanges* cc =
   1138                  zone()->template New<RegExpClassRanges>(zone(), ranges);
   1139              builder->AddClassRanges(cc);
   1140            } else {
   1141              CHECK(!IsUnicodeMode());
   1142              Advance(2);
   1143              builder->AddCharacter(next);  // IdentityEscape.
   1144            }
   1145            break;
   1146          }
   1147          case 'p':
   1148          case 'P': {
   1149            base::uc32 next = Next();
   1150            ZoneList<CharacterRange>* ranges =
   1151                zone()->template New<ZoneList<CharacterRange>>(2, zone());
   1152            CharacterClassStrings* strings = nullptr;
   1153            if (unicode_sets()) {
   1154              strings = zone()->template New<CharacterClassStrings>(zone());
   1155            }
   1156            bool add_unicode_case_equivalents = ignore_case();
   1157            bool parsed_character_class_escape = TryParseCharacterClassEscape(
   1158                next, InClassEscapeState::kNotInClass, ranges, strings, zone(),
   1159                add_unicode_case_equivalents CHECK_FAILED);
   1160 
   1161            if (parsed_character_class_escape) {
   1162              if (unicode_sets()) {
   1163                RegExpClassSetOperand* op =
   1164                    zone()->template New<RegExpClassSetOperand>(ranges,
   1165                                                                strings);
   1166                builder->AddTerm(op);
   1167              } else {
   1168                RegExpClassRanges* cc =
   1169                    zone()->template New<RegExpClassRanges>(zone(), ranges);
   1170                builder->AddClassRanges(cc);
   1171              }
   1172            } else {
   1173              CHECK(!IsUnicodeMode());
   1174              Advance(2);
   1175              builder->AddCharacter(next);  // IdentityEscape.
   1176            }
   1177            break;
   1178          }
   1179          // AtomEscape ::
   1180          //   k GroupName
   1181          case 'k': {
   1182            // Either an identity escape or a named back-reference.  The two
   1183            // interpretations are mutually exclusive: '\k' is interpreted as
   1184            // an identity escape for non-Unicode patterns without named
   1185            // capture groups, and as the beginning of a named back-reference
   1186            // in all other cases.
   1187            const bool has_named_captures =
   1188                HasNamedCaptures(InClassEscapeState::kNotInClass CHECK_FAILED);
   1189            if (IsUnicodeMode() || has_named_captures) {
   1190              Advance(2);
   1191              ParseNamedBackReference(builder, state CHECK_FAILED);
   1192              break;
   1193            }
   1194          }
   1195            [[fallthrough]];
   1196          // AtomEscape ::
   1197          //   CharacterEscape
   1198          default: {
   1199            bool is_escaped_unicode_character = false;
   1200            base::uc32 c = ParseCharacterEscape(
   1201                InClassEscapeState::kNotInClass,
   1202                &is_escaped_unicode_character CHECK_FAILED);
   1203            if (is_escaped_unicode_character) {
   1204              builder->AddEscapedUnicodeCharacter(c);
   1205            } else {
   1206              builder->AddCharacter(c);
   1207            }
   1208            break;
   1209          }
   1210        }
   1211        break;
   1212      case '{': {
   1213        int dummy;
   1214        bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
   1215        if (parsed) return ReportError(RegExpError::kNothingToRepeat);
   1216        [[fallthrough]];
   1217      }
   1218      case '}':
   1219      case ']':
   1220        if (IsUnicodeMode()) {
   1221          return ReportError(RegExpError::kLoneQuantifierBrackets);
   1222        }
   1223        [[fallthrough]];
   1224      default:
   1225        builder->AddUnicodeCharacter(current());
   1226        Advance();
   1227        break;
   1228    }  // end switch(current())
   1229 
   1230    int min;
   1231    int max;
   1232    switch (current()) {
   1233      // QuantifierPrefix ::
   1234      //   *
   1235      //   +
   1236      //   ?
   1237      //   {
   1238      case '*':
   1239        min = 0;
   1240        max = RegExpTree::kInfinity;
   1241        Advance();
   1242        break;
   1243      case '+':
   1244        min = 1;
   1245        max = RegExpTree::kInfinity;
   1246        Advance();
   1247        break;
   1248      case '?':
   1249        min = 0;
   1250        max = 1;
   1251        Advance();
   1252        break;
   1253      case '{':
   1254        if (ParseIntervalQuantifier(&min, &max)) {
   1255          if (max < min) {
   1256            return ReportError(RegExpError::kRangeOutOfOrder);
   1257          }
   1258          break;
   1259        } else if (IsUnicodeMode()) {
   1260          // Incomplete quantifiers are not allowed.
   1261          return ReportError(RegExpError::kIncompleteQuantifier);
   1262        }
   1263        continue;
   1264      default:
   1265        continue;
   1266    }
   1267    RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
   1268    if (current() == '?') {
   1269      quantifier_type = RegExpQuantifier::NON_GREEDY;
   1270      Advance();
   1271    } else if (v8_flags.regexp_possessive_quantifier && current() == '+') {
   1272      // v8_flags.regexp_possessive_quantifier is a debug-only flag.
   1273      quantifier_type = RegExpQuantifier::POSSESSIVE;
   1274      Advance();
   1275    }
   1276    if (!builder->AddQuantifierToAtom(min, max, quantifier_count_,
   1277                                      quantifier_type)) {
   1278      return ReportError(RegExpError::kInvalidQuantifier);
   1279    }
   1280    ++quantifier_count_;
   1281  }
   1282 }
   1283 
   1284 template <class CharT>
   1285 RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis(
   1286    RegExpParserState* state) {
   1287  RegExpLookaround::Type lookaround_type = state->lookaround_type();
   1288  bool is_named_capture = false;
   1289  const ZoneVector<base::uc16>* capture_name = nullptr;
   1290  SubexpressionType subexpr_type = CAPTURE;
   1291  RegExpFlags flags = state->builder()->flags();
   1292  bool parsing_modifiers = false;
   1293  bool modifiers_polarity = true;
   1294  RegExpFlags modifiers;
   1295  Advance();
   1296  if (current() == '?') {
   1297    do {
   1298      base::uc32 next = Next();
   1299      switch (next) {
   1300        case '-':
   1301          if (!v8_flags.js_regexp_modifiers) {
   1302            ReportError(RegExpError::kInvalidGroup);
   1303            return nullptr;
   1304          }
   1305          Advance();
   1306          parsing_modifiers = true;
   1307          if (modifiers_polarity == false) {
   1308            ReportError(RegExpError::kMultipleFlagDashes);
   1309            return nullptr;
   1310          }
   1311          modifiers_polarity = false;
   1312          break;
   1313        case 'm':
   1314        case 'i':
   1315        case 's': {
   1316          if (!v8_flags.js_regexp_modifiers) {
   1317            ReportError(RegExpError::kInvalidGroup);
   1318            return nullptr;
   1319          }
   1320          Advance();
   1321          parsing_modifiers = true;
   1322          RegExpFlag flag = TryRegExpFlagFromChar(next).value();
   1323          if ((modifiers & flag) != 0) {
   1324            ReportError(RegExpError::kRepeatedFlag);
   1325            return nullptr;
   1326          }
   1327          modifiers |= flag;
   1328          flags.set(flag, modifiers_polarity);
   1329          break;
   1330        }
   1331        case ':':
   1332          Advance(2);
   1333          parsing_modifiers = false;
   1334          subexpr_type = GROUPING;
   1335          break;
   1336        case '=':
   1337          Advance(2);
   1338          if (parsing_modifiers) {
   1339            DCHECK(v8_flags.js_regexp_modifiers);
   1340            ReportError(RegExpError::kInvalidGroup);
   1341            return nullptr;
   1342          }
   1343          lookaround_type = RegExpLookaround::LOOKAHEAD;
   1344          subexpr_type = POSITIVE_LOOKAROUND;
   1345          break;
   1346        case '!':
   1347          Advance(2);
   1348          if (parsing_modifiers) {
   1349            DCHECK(v8_flags.js_regexp_modifiers);
   1350            ReportError(RegExpError::kInvalidGroup);
   1351            return nullptr;
   1352          }
   1353          lookaround_type = RegExpLookaround::LOOKAHEAD;
   1354          subexpr_type = NEGATIVE_LOOKAROUND;
   1355          break;
   1356        case '<':
   1357          Advance();
   1358          if (parsing_modifiers) {
   1359            DCHECK(v8_flags.js_regexp_modifiers);
   1360            ReportError(RegExpError::kInvalidGroup);
   1361            return nullptr;
   1362          }
   1363          if (Next() == '=') {
   1364            Advance(2);
   1365            lookaround_type = RegExpLookaround::LOOKBEHIND;
   1366            subexpr_type = POSITIVE_LOOKAROUND;
   1367            break;
   1368          } else if (Next() == '!') {
   1369            Advance(2);
   1370            lookaround_type = RegExpLookaround::LOOKBEHIND;
   1371            subexpr_type = NEGATIVE_LOOKAROUND;
   1372            break;
   1373          }
   1374          is_named_capture = true;
   1375          has_named_captures_ = true;
   1376          Advance();
   1377          break;
   1378        default:
   1379          ReportError(RegExpError::kInvalidGroup);
   1380          return nullptr;
   1381      }
   1382    } while (parsing_modifiers);
   1383  }
   1384  if (modifiers_polarity == false) {
   1385    // We encountered a dash.
   1386    if (modifiers == 0) {
   1387      ReportError(RegExpError::kInvalidFlagGroup);
   1388      return nullptr;
   1389    }
   1390  }
   1391  if (subexpr_type == CAPTURE) {
   1392    if (captures_started_ >= RegExpMacroAssembler::kMaxCaptures) {
   1393      ReportError(RegExpError::kTooManyCaptures);
   1394      return nullptr;
   1395    }
   1396    captures_started_++;
   1397 
   1398    if (is_named_capture) {
   1399      capture_name = ParseCaptureGroupName(CHECK_FAILED);
   1400    }
   1401  }
   1402  // Store current state and begin new disjunction parsing.
   1403  return zone()->template New<RegExpParserState>(
   1404      state, subexpr_type, lookaround_type, captures_started_, capture_name,
   1405      flags, zone());
   1406 }
   1407 
   1408 // In order to know whether an escape is a backreference or not we have to scan
   1409 // the entire regexp and find the number of capturing parentheses.  However we
   1410 // don't want to scan the regexp twice unless it is necessary.  This mini-parser
   1411 // is called when needed.  It can see the difference between capturing and
   1412 // noncapturing parentheses and can skip character classes and backslash-escaped
   1413 // characters.
   1414 //
   1415 // Important: The scanner has to be in a consistent state when calling
   1416 // ScanForCaptures, e.g. not in the middle of an escape sequence '\[' or while
   1417 // parsing a nested class.
   1418 template <class CharT>
   1419 void RegExpParserImpl<CharT>::ScanForCaptures(
   1420    InClassEscapeState in_class_escape_state) {
   1421  DCHECK(!is_scanned_for_captures_);
   1422  const int saved_position = position();
   1423  // Start with captures started previous to current position
   1424  int capture_count = captures_started();
   1425  // When we start inside a character class, skip everything inside the class.
   1426  if (in_class_escape_state == InClassEscapeState::kInClass) {
   1427    // \k is always invalid within a class in unicode mode, thus we should never
   1428    // call ScanForCaptures within a class.
   1429    DCHECK(!IsUnicodeMode());
   1430    int c;
   1431    while ((c = current()) != kEndMarker) {
   1432      Advance();
   1433      if (c == '\\') {
   1434        Advance();
   1435      } else {
   1436        if (c == ']') break;
   1437      }
   1438    }
   1439  }
   1440  // Add count of captures after this position.
   1441  int n;
   1442  while ((n = current()) != kEndMarker) {
   1443    Advance();
   1444    switch (n) {
   1445      case '\\':
   1446        Advance();
   1447        break;
   1448      case '[': {
   1449        int class_nest_level = 0;
   1450        int c;
   1451        while ((c = current()) != kEndMarker) {
   1452          Advance();
   1453          if (c == '\\') {
   1454            Advance();
   1455          } else if (c == '[') {
   1456            // With /v, '[' inside a class is treated as a nested class.
   1457            // Without /v, '[' is a normal character.
   1458            if (unicode_sets()) class_nest_level++;
   1459          } else if (c == ']') {
   1460            if (class_nest_level == 0) break;
   1461            class_nest_level--;
   1462          }
   1463        }
   1464        break;
   1465      }
   1466      case '(':
   1467        if (current() == '?') {
   1468          // At this point we could be in
   1469          // * a non-capturing group '(:',
   1470          // * a lookbehind assertion '(?<=' '(?<!'
   1471          // * or a named capture '(?<'.
   1472          //
   1473          // Of these, only named captures are capturing groups.
   1474 
   1475          Advance();
   1476          if (current() != '<') break;
   1477 
   1478          Advance();
   1479          if (current() == '=' || current() == '!') break;
   1480 
   1481          // Found a possible named capture. It could turn out to be a syntax
   1482          // error (e.g. an unterminated or invalid name), but that distinction
   1483          // does not matter for our purposes.
   1484          has_named_captures_ = true;
   1485        }
   1486        capture_count++;
   1487        break;
   1488    }
   1489  }
   1490  capture_count_ = capture_count;
   1491  is_scanned_for_captures_ = true;
   1492  Reset(saved_position);
   1493 }
   1494 
   1495 template <class CharT>
   1496 bool RegExpParserImpl<CharT>::ParseBackReferenceIndex(int* index_out) {
   1497  DCHECK_EQ('\\', current());
   1498  DCHECK('1' <= Next() && Next() <= '9');
   1499  // Try to parse a decimal literal that is no greater than the total number
   1500  // of left capturing parentheses in the input.
   1501  int start = position();
   1502  int value = Next() - '0';
   1503  Advance(2);
   1504  while (true) {
   1505    base::uc32 c = current();
   1506    if (IsDecimalDigit(c)) {
   1507      value = 10 * value + (c - '0');
   1508      if (value > RegExpMacroAssembler::kMaxCaptures) {
   1509        Reset(start);
   1510        return false;
   1511      }
   1512      Advance();
   1513    } else {
   1514      break;
   1515    }
   1516  }
   1517  if (value > captures_started()) {
   1518    if (!is_scanned_for_captures_) {
   1519      ScanForCaptures(InClassEscapeState::kNotInClass);
   1520    }
   1521    if (value > capture_count_) {
   1522      Reset(start);
   1523      return false;
   1524    }
   1525  }
   1526  *index_out = value;
   1527  return true;
   1528 }
   1529 
   1530 namespace {
   1531 
   1532 void push_code_unit(ZoneVector<base::uc16>* v, uint32_t code_unit) {
   1533  if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
   1534    v->push_back(code_unit);
   1535  } else {
   1536    v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
   1537    v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
   1538  }
   1539 }
   1540 
   1541 }  // namespace
   1542 
   1543 template <class CharT>
   1544 const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() {
   1545  // Due to special Advance requirements (see the next comment), rewind by one
   1546  // such that names starting with a surrogate pair are parsed correctly for
   1547  // patterns where the unicode flag is unset.
   1548  //
   1549  // Note that we use this odd pattern of rewinding the last advance in order
   1550  // to adhere to the common parser behavior of expecting `current` to point at
   1551  // the first candidate character for a function (e.g. when entering ParseFoo,
   1552  // `current` should point at the first character of Foo).
   1553  RewindByOneCodepoint();
   1554 
   1555  ZoneVector<base::uc16>* name =
   1556      zone()->template New<ZoneVector<base::uc16>>(zone());
   1557 
   1558  {
   1559    // Advance behavior inside this function is tricky since
   1560    // RegExpIdentifierName explicitly enables unicode (in spec terms, sets +U)
   1561    // and thus allows surrogate pairs and \u{}-style escapes even in
   1562    // non-unicode patterns. Therefore Advance within the capture group name
   1563    // has to force-enable unicode, and outside the name revert to default
   1564    // behavior.
   1565    ForceUnicodeScope force_unicode(this);
   1566 
   1567    bool at_start = true;
   1568    while (true) {
   1569      Advance();
   1570      base::uc32 c = current();
   1571 
   1572      // Convert unicode escapes.
   1573      if (c == '\\' && Next() == 'u') {
   1574        Advance(2);
   1575        if (!ParseUnicodeEscape(&c)) {
   1576          ReportError(RegExpError::kInvalidUnicodeEscape);
   1577          return nullptr;
   1578        }
   1579        RewindByOneCodepoint();
   1580      }
   1581 
   1582      // The backslash char is misclassified as both ID_Start and ID_Continue.
   1583      if (c == '\\') {
   1584        ReportError(RegExpError::kInvalidCaptureGroupName);
   1585        return nullptr;
   1586      }
   1587 
   1588      if (at_start) {
   1589        if (!IsIdentifierStart(c)) {
   1590          ReportError(RegExpError::kInvalidCaptureGroupName);
   1591          return nullptr;
   1592        }
   1593        push_code_unit(name, c);
   1594        at_start = false;
   1595      } else {
   1596        if (c == '>') {
   1597          break;
   1598        } else if (IsIdentifierPart(c)) {
   1599          push_code_unit(name, c);
   1600        } else {
   1601          ReportError(RegExpError::kInvalidCaptureGroupName);
   1602          return nullptr;
   1603        }
   1604      }
   1605    }
   1606  }
   1607 
   1608  // This final advance goes back into the state of pointing at the next
   1609  // relevant char, which the rest of the parser expects. See also the previous
   1610  // comments in this function.
   1611  Advance();
   1612  return name;
   1613 }
   1614 
   1615 template <class CharT>
   1616 bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex(
   1617    const RegExpParserState* state, int index) {
   1618  const ZoneVector<base::uc16>* name = state->capture_name();
   1619  const std::pair<int, int> non_participating_capture_group_interval =
   1620      state->non_participating_capture_group_interval();
   1621  DCHECK(0 < index && index <= captures_started_);
   1622  DCHECK_NOT_NULL(name);
   1623 
   1624  RegExpCapture* capture = GetCapture(index);
   1625  DCHECK_NULL(capture->name());
   1626 
   1627  capture->set_name(name);
   1628 
   1629  if (named_captures_ == nullptr) {
   1630    named_captures_ = zone_->template New<
   1631        ZoneMap<RegExpCapture*, ZoneList<int>*, RegExpCaptureNameLess>>(zone());
   1632  } else {
   1633    // Check for duplicates and bail if we find any.
   1634    const auto& named_capture_it = named_captures_->find(capture);
   1635    if (named_capture_it != named_captures_->end()) {
   1636      if (v8_flags.js_regexp_duplicate_named_groups) {
   1637        ZoneList<int>* named_capture_indices = named_capture_it->second;
   1638        DCHECK_NOT_NULL(named_capture_indices);
   1639        DCHECK(!named_capture_indices->is_empty());
   1640        for (int named_index : *named_capture_indices) {
   1641          if (named_index <= non_participating_capture_group_interval.first ||
   1642              named_index > non_participating_capture_group_interval.second) {
   1643            ReportError(RegExpError::kDuplicateCaptureGroupName);
   1644            return false;
   1645          }
   1646        }
   1647      } else {
   1648        ReportError(RegExpError::kDuplicateCaptureGroupName);
   1649        return false;
   1650      }
   1651    }
   1652  }
   1653  if (v8_flags.js_regexp_duplicate_named_groups) {
   1654    // Check for nested named captures. This is necessary to find duplicate
   1655    // named captures within the same disjunct.
   1656    RegExpParserState* parent_state = state->previous_state();
   1657    if (parent_state && parent_state->IsInsideCaptureGroup(name)) {
   1658      ReportError(RegExpError::kDuplicateCaptureGroupName);
   1659      return false;
   1660    }
   1661  }
   1662 
   1663  auto entry = named_captures_->try_emplace(
   1664      capture, zone()->template New<ZoneList<int>>(1, zone()));
   1665  entry.first->second->Add(index, zone());
   1666  return true;
   1667 }
   1668 
   1669 template <class CharT>
   1670 bool RegExpParserImpl<CharT>::ParseNamedBackReference(
   1671    RegExpBuilder* builder, RegExpParserState* state) {
   1672  // The parser is assumed to be on the '<' in \k<name>.
   1673  if (current() != '<') {
   1674    ReportError(RegExpError::kInvalidNamedReference);
   1675    return false;
   1676  }
   1677 
   1678  Advance();
   1679  const ZoneVector<base::uc16>* name = ParseCaptureGroupName();
   1680  if (name == nullptr) {
   1681    return false;
   1682  }
   1683 
   1684  if (state->IsInsideCaptureGroup(name)) {
   1685    builder->AddEmpty();
   1686  } else {
   1687    RegExpBackReference* atom =
   1688        zone()->template New<RegExpBackReference>(zone());
   1689    atom->set_name(name);
   1690 
   1691    builder->AddAtom(atom);
   1692 
   1693    if (named_back_references_ == nullptr) {
   1694      named_back_references_ =
   1695          zone()->template New<ZoneList<RegExpBackReference*>>(1, zone());
   1696    }
   1697    named_back_references_->Add(atom, zone());
   1698  }
   1699 
   1700  return true;
   1701 }
   1702 
   1703 template <class CharT>
   1704 void RegExpParserImpl<CharT>::PatchNamedBackReferences() {
   1705  if (named_back_references_ == nullptr) return;
   1706 
   1707  if (named_captures_ == nullptr) {
   1708    ReportError(RegExpError::kInvalidNamedCaptureReference);
   1709    return;
   1710  }
   1711 
   1712  // Look up and patch the actual capture for each named back reference.
   1713 
   1714  for (int i = 0; i < named_back_references_->length(); i++) {
   1715    RegExpBackReference* ref = named_back_references_->at(i);
   1716 
   1717    // Capture used to search the named_captures_ by name, index of the
   1718    // capture is never used.
   1719    static const int kInvalidIndex = 0;
   1720    RegExpCapture* search_capture =
   1721        zone()->template New<RegExpCapture>(kInvalidIndex);
   1722    DCHECK_NULL(search_capture->name());
   1723    search_capture->set_name(ref->name());
   1724 
   1725    const auto& capture_it = named_captures_->find(search_capture);
   1726    if (capture_it == named_captures_->end()) {
   1727      ReportError(RegExpError::kInvalidNamedCaptureReference);
   1728      return;
   1729    }
   1730 
   1731    DCHECK_IMPLIES(!v8_flags.js_regexp_duplicate_named_groups,
   1732                   capture_it->second->length() == 1);
   1733    for (int index : *capture_it->second) {
   1734      ref->add_capture(GetCapture(index), zone());
   1735    }
   1736  }
   1737 }
   1738 
   1739 template <class CharT>
   1740 RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) {
   1741  // The index for the capture groups are one-based. Its index in the list is
   1742  // zero-based.
   1743  const int known_captures =
   1744      is_scanned_for_captures_ ? capture_count_ : captures_started_;
   1745  SBXCHECK(index >= 1 && index <= known_captures);
   1746  if (captures_ == nullptr) {
   1747    captures_ =
   1748        zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone());
   1749  }
   1750  while (captures_->length() < known_captures) {
   1751    captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1),
   1752                   zone());
   1753  }
   1754  return captures_->at(index - 1);
   1755 }
   1756 
   1757 template <class CharT>
   1758 ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() {
   1759  if (named_captures_ == nullptr) {
   1760    return nullptr;
   1761  }
   1762  DCHECK(!named_captures_->empty());
   1763 
   1764  ZoneVector<RegExpCapture*>* flattened_named_captures =
   1765      zone()->template New<ZoneVector<RegExpCapture*>>(zone());
   1766  for (auto capture : *named_captures_) {
   1767    DCHECK_IMPLIES(!v8_flags.js_regexp_duplicate_named_groups,
   1768                   capture.second->length() == 1);
   1769    for (int index : *capture.second) {
   1770      flattened_named_captures->push_back(GetCapture(index));
   1771    }
   1772  }
   1773  return flattened_named_captures;
   1774 }
   1775 
   1776 template <class CharT>
   1777 bool RegExpParserImpl<CharT>::HasNamedCaptures(
   1778    InClassEscapeState in_class_escape_state) {
   1779  if (has_named_captures_ || is_scanned_for_captures_) {
   1780    return has_named_captures_;
   1781  }
   1782 
   1783  ScanForCaptures(in_class_escape_state);
   1784  DCHECK(is_scanned_for_captures_);
   1785  return has_named_captures_;
   1786 }
   1787 
   1788 // QuantifierPrefix ::
   1789 //   { DecimalDigits }
   1790 //   { DecimalDigits , }
   1791 //   { DecimalDigits , DecimalDigits }
   1792 //
   1793 // Returns true if parsing succeeds, and set the min_out and max_out
   1794 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
   1795 template <class CharT>
   1796 bool RegExpParserImpl<CharT>::ParseIntervalQuantifier(int* min_out,
   1797                                                      int* max_out) {
   1798  DCHECK_EQ(current(), '{');
   1799  int start = position();
   1800  Advance();
   1801  int min = 0;
   1802  if (!IsDecimalDigit(current())) {
   1803    Reset(start);
   1804    return false;
   1805  }
   1806  while (IsDecimalDigit(current())) {
   1807    int next = current() - '0';
   1808    if (min > (RegExpTree::kInfinity - next) / 10) {
   1809      // Overflow. Skip past remaining decimal digits and return -1.
   1810      do {
   1811        Advance();
   1812      } while (IsDecimalDigit(current()));
   1813      min = RegExpTree::kInfinity;
   1814      break;
   1815    }
   1816    min = 10 * min + next;
   1817    Advance();
   1818  }
   1819  int max = 0;
   1820  if (current() == '}') {
   1821    max = min;
   1822    Advance();
   1823  } else if (current() == ',') {
   1824    Advance();
   1825    if (current() == '}') {
   1826      max = RegExpTree::kInfinity;
   1827      Advance();
   1828    } else {
   1829      while (IsDecimalDigit(current())) {
   1830        int next = current() - '0';
   1831        if (max > (RegExpTree::kInfinity - next) / 10) {
   1832          do {
   1833            Advance();
   1834          } while (IsDecimalDigit(current()));
   1835          max = RegExpTree::kInfinity;
   1836          break;
   1837        }
   1838        max = 10 * max + next;
   1839        Advance();
   1840      }
   1841      if (current() != '}') {
   1842        Reset(start);
   1843        return false;
   1844      }
   1845      Advance();
   1846    }
   1847  } else {
   1848    Reset(start);
   1849    return false;
   1850  }
   1851  *min_out = min;
   1852  *max_out = max;
   1853  return true;
   1854 }
   1855 
   1856 template <class CharT>
   1857 base::uc32 RegExpParserImpl<CharT>::ParseOctalLiteral() {
   1858  DCHECK(('0' <= current() && current() <= '7') || !has_more());
   1859  // For compatibility with some other browsers (not all), we parse
   1860  // up to three octal digits with a value below 256.
   1861  // ES#prod-annexB-LegacyOctalEscapeSequence
   1862  base::uc32 value = current() - '0';
   1863  Advance();
   1864  if ('0' <= current() && current() <= '7') {
   1865    value = value * 8 + current() - '0';
   1866    Advance();
   1867    if (value < 32 && '0' <= current() && current() <= '7') {
   1868      value = value * 8 + current() - '0';
   1869      Advance();
   1870    }
   1871  }
   1872  return value;
   1873 }
   1874 
   1875 template <class CharT>
   1876 bool RegExpParserImpl<CharT>::ParseHexEscape(int length, base::uc32* value) {
   1877  int start = position();
   1878  base::uc32 val = 0;
   1879  for (int i = 0; i < length; ++i) {
   1880    base::uc32 c = current();
   1881    int d = base::HexValue(c);
   1882    if (d < 0) {
   1883      Reset(start);
   1884      return false;
   1885    }
   1886    val = val * 16 + d;
   1887    Advance();
   1888  }
   1889  *value = val;
   1890  return true;
   1891 }
   1892 
   1893 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
   1894 template <class CharT>
   1895 bool RegExpParserImpl<CharT>::ParseUnicodeEscape(base::uc32* value) {
   1896  // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
   1897  // allowed). In the latter case, the number of hex digits between { } is
   1898  // arbitrary. \ and u have already been read.
   1899  if (current() == '{' && IsUnicodeMode()) {
   1900    int start = position();
   1901    Advance();
   1902    if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
   1903      if (current() == '}') {
   1904        Advance();
   1905        return true;
   1906      }
   1907    }
   1908    Reset(start);
   1909    return false;
   1910  }
   1911  // \u but no {, or \u{...} escapes not allowed.
   1912  bool result = ParseHexEscape(4, value);
   1913  if (result && IsUnicodeMode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
   1914      current() == '\\') {
   1915    // Attempt to read trail surrogate.
   1916    int start = position();
   1917    if (Next() == 'u') {
   1918      Advance(2);
   1919      base::uc32 trail;
   1920      if (ParseHexEscape(4, &trail) &&
   1921          unibrow::Utf16::IsTrailSurrogate(trail)) {
   1922        *value = unibrow::Utf16::CombineSurrogatePair(
   1923            static_cast<base::uc16>(*value), static_cast<base::uc16>(trail));
   1924        return true;
   1925      }
   1926    }
   1927    Reset(start);
   1928  }
   1929  return result;
   1930 }
   1931 
   1932 #ifdef V8_INTL_SUPPORT
   1933 
   1934 namespace {
   1935 
   1936 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
   1937  const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
   1938  if (short_name != nullptr && strcmp(property_name, short_name) == 0)
   1939    return true;
   1940  for (int i = 0;; i++) {
   1941    const char* long_name = u_getPropertyName(
   1942        property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
   1943    if (long_name == nullptr) break;
   1944    if (strcmp(property_name, long_name) == 0) return true;
   1945  }
   1946  return false;
   1947 }
   1948 
   1949 bool IsExactPropertyValueAlias(const char* property_value_name,
   1950                               UProperty property, int32_t property_value) {
   1951  const char* short_name =
   1952      u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
   1953  if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
   1954    return true;
   1955  }
   1956  for (int i = 0;; i++) {
   1957    const char* long_name = u_getPropertyValueName(
   1958        property, property_value,
   1959        static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
   1960    if (long_name == nullptr) break;
   1961    if (strcmp(property_value_name, long_name) == 0) return true;
   1962  }
   1963  return false;
   1964 }
   1965 
   1966 void ExtractStringsFromUnicodeSet(const icu::UnicodeSet& set,
   1967                                  CharacterClassStrings* strings,
   1968                                  RegExpFlags flags, Zone* zone) {
   1969  DCHECK(set.hasStrings());
   1970  DCHECK(IsUnicodeSets(flags));
   1971  DCHECK_NOT_NULL(strings);
   1972 
   1973  RegExpTextBuilder::SmallRegExpTreeVector string_storage(zone);
   1974  RegExpTextBuilder string_builder(zone, &string_storage, flags);
   1975  const bool needs_case_folding = IsIgnoreCase(flags);
   1976  icu::UnicodeSetIterator iter(set);
   1977  iter.skipToStrings();
   1978  while (iter.next()) {
   1979    const icu::UnicodeString& s = iter.getString();
   1980    const char16_t* p = s.getBuffer();
   1981    int32_t length = s.length();
   1982    ZoneList<base::uc32>* string =
   1983        zone->template New<ZoneList<base::uc32>>(length, zone);
   1984    for (int32_t i = 0; i < length;) {
   1985      UChar32 c;
   1986      U16_NEXT(p, i, length, c);
   1987      string_builder.AddUnicodeCharacter(c);
   1988      if (needs_case_folding) {
   1989        c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
   1990      }
   1991      string->Add(c, zone);
   1992    }
   1993    strings->emplace(string->ToVector(), string_builder.ToRegExp());
   1994    string_storage.clear();
   1995  }
   1996 }
   1997 
   1998 bool LookupPropertyValueName(UProperty property,
   1999                             const char* property_value_name, bool negate,
   2000                             ZoneList<CharacterRange>* result_ranges,
   2001                             CharacterClassStrings* result_strings,
   2002                             RegExpFlags flags, Zone* zone) {
   2003  UProperty property_for_lookup = property;
   2004  if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
   2005    // For the property Script_Extensions, we have to do the property value
   2006    // name lookup as if the property is Script.
   2007    property_for_lookup = UCHAR_SCRIPT;
   2008  }
   2009  int32_t property_value =
   2010      u_getPropertyValueEnum(property_for_lookup, property_value_name);
   2011  if (property_value == UCHAR_INVALID_CODE) return false;
   2012 
   2013  // We require the property name to match exactly to one of the property value
   2014  // aliases. However, u_getPropertyValueEnum uses loose matching.
   2015  if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
   2016                                 property_value)) {
   2017    return false;
   2018  }
   2019 
   2020  UErrorCode ec = U_ZERO_ERROR;
   2021  icu::UnicodeSet set;
   2022  set.applyIntPropertyValue(property, property_value, ec);
   2023  bool success = ec == U_ZERO_ERROR && !set.isEmpty();
   2024 
   2025  if (success) {
   2026    if (set.hasStrings()) {
   2027      ExtractStringsFromUnicodeSet(set, result_strings, flags, zone);
   2028    }
   2029    const bool needs_case_folding = IsUnicodeSets(flags) && IsIgnoreCase(flags);
   2030    if (needs_case_folding) set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
   2031    set.removeAllStrings();
   2032    if (negate) set.complement();
   2033    for (int i = 0; i < set.getRangeCount(); i++) {
   2034      result_ranges->Add(
   2035          CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
   2036          zone);
   2037    }
   2038  }
   2039  return success;
   2040 }
   2041 
   2042 template <size_t N>
   2043 inline bool NameEquals(const char* name, const char (&literal)[N]) {
   2044  return strncmp(name, literal, N + 1) == 0;
   2045 }
   2046 
   2047 bool LookupSpecialPropertyValueName(const char* name,
   2048                                    ZoneList<CharacterRange>* result,
   2049                                    bool negate, RegExpFlags flags,
   2050                                    Zone* zone) {
   2051  if (NameEquals(name, "Any")) {
   2052    if (negate) {
   2053      // Leave the list of character ranges empty, since the negation of 'Any'
   2054      // is the empty set.
   2055    } else {
   2056      result->Add(CharacterRange::Everything(), zone);
   2057    }
   2058  } else if (NameEquals(name, "ASCII")) {
   2059    result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
   2060                       : CharacterRange::Range(0x0, 0x7F),
   2061                zone);
   2062  } else if (NameEquals(name, "Assigned")) {
   2063    return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
   2064                                   !negate, result, nullptr, flags, zone);
   2065  } else {
   2066    return false;
   2067  }
   2068  return true;
   2069 }
   2070 
   2071 // Explicitly allowlist supported binary properties. The spec forbids supporting
   2072 // properties outside of this set to ensure interoperability.
   2073 bool IsSupportedBinaryProperty(UProperty property, bool unicode_sets) {
   2074  switch (property) {
   2075    case UCHAR_ALPHABETIC:
   2076    // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
   2077    // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
   2078    case UCHAR_ASCII_HEX_DIGIT:
   2079    // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
   2080    case UCHAR_BIDI_CONTROL:
   2081    case UCHAR_BIDI_MIRRORED:
   2082    case UCHAR_CASE_IGNORABLE:
   2083    case UCHAR_CASED:
   2084    case UCHAR_CHANGES_WHEN_CASEFOLDED:
   2085    case UCHAR_CHANGES_WHEN_CASEMAPPED:
   2086    case UCHAR_CHANGES_WHEN_LOWERCASED:
   2087    case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
   2088    case UCHAR_CHANGES_WHEN_TITLECASED:
   2089    case UCHAR_CHANGES_WHEN_UPPERCASED:
   2090    case UCHAR_DASH:
   2091    case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
   2092    case UCHAR_DEPRECATED:
   2093    case UCHAR_DIACRITIC:
   2094    case UCHAR_EMOJI:
   2095    case UCHAR_EMOJI_COMPONENT:
   2096    case UCHAR_EMOJI_MODIFIER_BASE:
   2097    case UCHAR_EMOJI_MODIFIER:
   2098    case UCHAR_EMOJI_PRESENTATION:
   2099    case UCHAR_EXTENDED_PICTOGRAPHIC:
   2100    case UCHAR_EXTENDER:
   2101    case UCHAR_GRAPHEME_BASE:
   2102    case UCHAR_GRAPHEME_EXTEND:
   2103    case UCHAR_HEX_DIGIT:
   2104    case UCHAR_ID_CONTINUE:
   2105    case UCHAR_ID_START:
   2106    case UCHAR_IDEOGRAPHIC:
   2107    case UCHAR_IDS_BINARY_OPERATOR:
   2108    case UCHAR_IDS_TRINARY_OPERATOR:
   2109    case UCHAR_JOIN_CONTROL:
   2110    case UCHAR_LOGICAL_ORDER_EXCEPTION:
   2111    case UCHAR_LOWERCASE:
   2112    case UCHAR_MATH:
   2113    case UCHAR_NONCHARACTER_CODE_POINT:
   2114    case UCHAR_PATTERN_SYNTAX:
   2115    case UCHAR_PATTERN_WHITE_SPACE:
   2116    case UCHAR_QUOTATION_MARK:
   2117    case UCHAR_RADICAL:
   2118    case UCHAR_REGIONAL_INDICATOR:
   2119    case UCHAR_S_TERM:
   2120    case UCHAR_SOFT_DOTTED:
   2121    case UCHAR_TERMINAL_PUNCTUATION:
   2122    case UCHAR_UNIFIED_IDEOGRAPH:
   2123    case UCHAR_UPPERCASE:
   2124    case UCHAR_VARIATION_SELECTOR:
   2125    case UCHAR_WHITE_SPACE:
   2126    case UCHAR_XID_CONTINUE:
   2127    case UCHAR_XID_START:
   2128      return true;
   2129    case UCHAR_BASIC_EMOJI:
   2130    case UCHAR_EMOJI_KEYCAP_SEQUENCE:
   2131    case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE:
   2132    case UCHAR_RGI_EMOJI_FLAG_SEQUENCE:
   2133    case UCHAR_RGI_EMOJI_TAG_SEQUENCE:
   2134    case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE:
   2135    case UCHAR_RGI_EMOJI:
   2136      return unicode_sets;
   2137    default:
   2138      break;
   2139  }
   2140  return false;
   2141 }
   2142 
   2143 bool IsBinaryPropertyOfStrings(UProperty property) {
   2144  switch (property) {
   2145    case UCHAR_BASIC_EMOJI:
   2146    case UCHAR_EMOJI_KEYCAP_SEQUENCE:
   2147    case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE:
   2148    case UCHAR_RGI_EMOJI_FLAG_SEQUENCE:
   2149    case UCHAR_RGI_EMOJI_TAG_SEQUENCE:
   2150    case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE:
   2151    case UCHAR_RGI_EMOJI:
   2152      return true;
   2153    default:
   2154      break;
   2155  }
   2156  return false;
   2157 }
   2158 
   2159 bool IsUnicodePropertyValueCharacter(char c) {
   2160  // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
   2161  //
   2162  // Note that using this to validate each parsed char is quite conservative.
   2163  // A possible alternative solution would be to only ensure the parsed
   2164  // property name/value candidate string does not contain '\0' characters and
   2165  // let ICU lookups trigger the final failure.
   2166  if ('a' <= c && c <= 'z') return true;
   2167  if ('A' <= c && c <= 'Z') return true;
   2168  if ('0' <= c && c <= '9') return true;
   2169  return (c == '_');
   2170 }
   2171 
   2172 }  // namespace
   2173 
   2174 template <class CharT>
   2175 bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
   2176                                                     ZoneVector<char>* name_2) {
   2177  DCHECK(name_1->empty());
   2178  DCHECK(name_2->empty());
   2179  // Parse the property class as follows:
   2180  // - In \p{name}, 'name' is interpreted
   2181  //   - either as a general category property value name.
   2182  //   - or as a binary property name.
   2183  // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
   2184  //   and 'value' is interpreted as one of the available property value names.
   2185  // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
   2186  // - Loose matching is not applied.
   2187  if (current() == '{') {
   2188    // Parse \p{[PropertyName=]PropertyNameValue}
   2189    for (Advance(); current() != '}' && current() != '='; Advance()) {
   2190      if (!IsUnicodePropertyValueCharacter(current())) return false;
   2191      if (!has_next()) return false;
   2192      name_1->push_back(static_cast<char>(current()));
   2193    }
   2194    if (current() == '=') {
   2195      for (Advance(); current() != '}'; Advance()) {
   2196        if (!IsUnicodePropertyValueCharacter(current())) return false;
   2197        if (!has_next()) return false;
   2198        name_2->push_back(static_cast<char>(current()));
   2199      }
   2200      name_2->push_back(0);  // null-terminate string.
   2201    }
   2202  } else {
   2203    return false;
   2204  }
   2205  Advance();
   2206  name_1->push_back(0);  // null-terminate string.
   2207 
   2208  DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
   2209  DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
   2210  return true;
   2211 }
   2212 
   2213 template <class CharT>
   2214 bool RegExpParserImpl<CharT>::AddPropertyClassRange(
   2215    ZoneList<CharacterRange>* add_to_ranges,
   2216    CharacterClassStrings* add_to_strings, bool negate,
   2217    const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
   2218  if (name_2.empty()) {
   2219    // First attempt to interpret as general category property value name.
   2220    const char* name = name_1.data();
   2221    if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
   2222                                add_to_ranges, add_to_strings, flags(),
   2223                                zone())) {
   2224      return true;
   2225    }
   2226    // Interpret "Any", "ASCII", and "Assigned".
   2227    if (LookupSpecialPropertyValueName(name, add_to_ranges, negate, flags(),
   2228                                       zone())) {
   2229      return true;
   2230    }
   2231    // Then attempt to interpret as binary property name with value name 'Y'.
   2232    UProperty property = u_getPropertyEnum(name);
   2233    if (!IsSupportedBinaryProperty(property, unicode_sets())) return false;
   2234    if (!IsExactPropertyAlias(name, property)) return false;
   2235    // Negation of properties with strings is not allowed.
   2236    // See
   2237    // https://tc39.es/ecma262/#sec-static-semantics-maycontainstrings
   2238    if (negate && IsBinaryPropertyOfStrings(property)) return false;
   2239    if (unicode_sets()) {
   2240      // In /v mode we can't simple lookup the "false" binary property values,
   2241      // as the spec requires us to perform case folding before calculating the
   2242      // complement.
   2243      // See https://tc39.es/ecma262/#sec-compiletocharset
   2244      // UnicodePropertyValueExpression :: LoneUnicodePropertyNameOrValue
   2245      return LookupPropertyValueName(property, "Y", negate, add_to_ranges,
   2246                                     add_to_strings, flags(), zone());
   2247    } else {
   2248      return LookupPropertyValueName(property, negate ? "N" : "Y", false,
   2249                                     add_to_ranges, add_to_strings, flags(),
   2250                                     zone());
   2251    }
   2252  } else {
   2253    // Both property name and value name are specified. Attempt to interpret
   2254    // the property name as enumerated property.
   2255    const char* property_name = name_1.data();
   2256    const char* value_name = name_2.data();
   2257    UProperty property = u_getPropertyEnum(property_name);
   2258    if (!IsExactPropertyAlias(property_name, property)) return false;
   2259    if (property == UCHAR_GENERAL_CATEGORY) {
   2260      // We want to allow aggregate value names such as "Letter".
   2261      property = UCHAR_GENERAL_CATEGORY_MASK;
   2262    } else if (property != UCHAR_SCRIPT &&
   2263               property != UCHAR_SCRIPT_EXTENSIONS) {
   2264      return false;
   2265    }
   2266    return LookupPropertyValueName(property, value_name, negate, add_to_ranges,
   2267                                   add_to_strings, flags(), zone());
   2268  }
   2269 }
   2270 
   2271 #else  // V8_INTL_SUPPORT
   2272 
   2273 template <class CharT>
   2274 bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
   2275                                                     ZoneVector<char>* name_2) {
   2276  return false;
   2277 }
   2278 
   2279 template <class CharT>
   2280 bool RegExpParserImpl<CharT>::AddPropertyClassRange(
   2281    ZoneList<CharacterRange>* add_to_ranges,
   2282    CharacterClassStrings* add_to_strings, bool negate,
   2283    const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
   2284  return false;
   2285 }
   2286 
   2287 #endif  // V8_INTL_SUPPORT
   2288 
   2289 template <class CharT>
   2290 bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value,
   2291                                                            base::uc32* value) {
   2292  base::uc32 x = 0;
   2293  int d = base::HexValue(current());
   2294  if (d < 0) {
   2295    return false;
   2296  }
   2297  while (d >= 0) {
   2298    x = x * 16 + d;
   2299    if (x > static_cast<base::uc32>(max_value)) {
   2300      return false;
   2301    }
   2302    Advance();
   2303    d = base::HexValue(current());
   2304  }
   2305  *value = x;
   2306  return true;
   2307 }
   2308 
   2309 // https://tc39.es/ecma262/#prod-CharacterEscape
   2310 template <class CharT>
   2311 base::uc32 RegExpParserImpl<CharT>::ParseCharacterEscape(
   2312    InClassEscapeState in_class_escape_state,
   2313    bool* is_escaped_unicode_character) {
   2314  DCHECK_EQ('\\', current());
   2315  DCHECK(has_next());
   2316 
   2317  Advance();
   2318 
   2319  const base::uc32 c = current();
   2320  switch (c) {
   2321    // CharacterEscape ::
   2322    //   ControlEscape :: one of
   2323    //     f n r t v
   2324    case 'f':
   2325      Advance();
   2326      return '\f';
   2327    case 'n':
   2328      Advance();
   2329      return '\n';
   2330    case 'r':
   2331      Advance();
   2332      return '\r';
   2333    case 't':
   2334      Advance();
   2335      return '\t';
   2336    case 'v':
   2337      Advance();
   2338      return '\v';
   2339    // CharacterEscape ::
   2340    //   c ControlLetter
   2341    case 'c': {
   2342      base::uc32 controlLetter = Next();
   2343      base::uc32 letter = controlLetter & ~('A' ^ 'a');
   2344      if (letter >= 'A' && letter <= 'Z') {
   2345        Advance(2);
   2346        // Control letters mapped to ASCII control characters in the range
   2347        // 0x00-0x1F.
   2348        return controlLetter & 0x1F;
   2349      }
   2350      if (IsUnicodeMode()) {
   2351        // With /u and /v, invalid escapes are not treated as identity escapes.
   2352        ReportError(RegExpError::kInvalidUnicodeEscape);
   2353        return 0;
   2354      }
   2355      if (in_class_escape_state == InClassEscapeState::kInClass) {
   2356        // Inside a character class, we also accept digits and underscore as
   2357        // control characters, unless with /u or /v. See Annex B:
   2358        // ES#prod-annexB-ClassControlLetter
   2359        if ((controlLetter >= '0' && controlLetter <= '9') ||
   2360            controlLetter == '_') {
   2361          Advance(2);
   2362          return controlLetter & 0x1F;
   2363        }
   2364      }
   2365      // We match JSC in reading the backslash as a literal
   2366      // character instead of as starting an escape.
   2367      return '\\';
   2368    }
   2369    // CharacterEscape ::
   2370    //   0 [lookahead ∉ DecimalDigit]
   2371    //   [~UnicodeMode] LegacyOctalEscapeSequence
   2372    case '0':
   2373      // \0 is interpreted as NUL if not followed by another digit.
   2374      if (Next() < '0' || Next() > '9') {
   2375        Advance();
   2376        return 0;
   2377      }
   2378      [[fallthrough]];
   2379    case '1':
   2380    case '2':
   2381    case '3':
   2382    case '4':
   2383    case '5':
   2384    case '6':
   2385    case '7':
   2386      // For compatibility, we interpret a decimal escape that isn't
   2387      // a back reference (and therefore either \0 or not valid according
   2388      // to the specification) as a 1..3 digit octal character code.
   2389      // ES#prod-annexB-LegacyOctalEscapeSequence
   2390      if (IsUnicodeMode()) {
   2391        // With /u or /v, decimal escape is not interpreted as octal character
   2392        // code.
   2393        ReportError(RegExpError::kInvalidDecimalEscape);
   2394        return 0;
   2395      }
   2396      return ParseOctalLiteral();
   2397    // CharacterEscape ::
   2398    //   HexEscapeSequence
   2399    case 'x': {
   2400      Advance();
   2401      base::uc32 value;
   2402      if (ParseHexEscape(2, &value)) return value;
   2403      if (IsUnicodeMode()) {
   2404        // With /u or /v, invalid escapes are not treated as identity escapes.
   2405        ReportError(RegExpError::kInvalidEscape);
   2406        return 0;
   2407      }
   2408      // If \x is not followed by a two-digit hexadecimal, treat it
   2409      // as an identity escape.
   2410      return 'x';
   2411    }
   2412    // CharacterEscape ::
   2413    //   RegExpUnicodeEscapeSequence [?UnicodeMode]
   2414    case 'u': {
   2415      Advance();
   2416      base::uc32 value;
   2417      if (ParseUnicodeEscape(&value)) {
   2418        *is_escaped_unicode_character = true;
   2419        return value;
   2420      }
   2421      if (IsUnicodeMode()) {
   2422        // With /u or /v, invalid escapes are not treated as identity escapes.
   2423        ReportError(RegExpError::kInvalidUnicodeEscape);
   2424        return 0;
   2425      }
   2426      // If \u is not followed by a two-digit hexadecimal, treat it
   2427      // as an identity escape.
   2428      return 'u';
   2429    }
   2430    default:
   2431      break;
   2432  }
   2433 
   2434  // CharacterEscape ::
   2435  //   IdentityEscape[?UnicodeMode, ?N]
   2436  //
   2437  // * With /u, no identity escapes except for syntax characters are
   2438  //   allowed.
   2439  // * With /v, no identity escapes except for syntax characters and
   2440  //   ClassSetReservedPunctuators (if within a class) are allowed.
   2441  // * Without /u or /v:
   2442  //   * '\c' is not an IdentityEscape.
   2443  //   * '\k' is not an IdentityEscape when named captures exist.
   2444  //   * Otherwise, all identity escapes are allowed.
   2445  if (unicode_sets() && in_class_escape_state == InClassEscapeState::kInClass) {
   2446    if (IsClassSetReservedPunctuator(c)) {
   2447      Advance();
   2448      return c;
   2449    }
   2450  }
   2451  if (IsUnicodeMode()) {
   2452    if (!IsSyntaxCharacterOrSlash(c)) {
   2453      ReportError(RegExpError::kInvalidEscape);
   2454      return 0;
   2455    }
   2456    Advance();
   2457    return c;
   2458  }
   2459  DCHECK(!IsUnicodeMode());
   2460  if (c == 'c') {
   2461    ReportError(RegExpError::kInvalidEscape);
   2462    return 0;
   2463  }
   2464  Advance();
   2465  // Note: It's important to Advance before the HasNamedCaptures call s.t. we
   2466  // don't start scanning in the middle of an escape.
   2467  if (c == 'k' && HasNamedCaptures(in_class_escape_state)) {
   2468    ReportError(RegExpError::kInvalidEscape);
   2469    return 0;
   2470  }
   2471  return c;
   2472 }
   2473 
   2474 // https://tc39.es/ecma262/#prod-ClassRanges
   2475 template <class CharT>
   2476 RegExpTree* RegExpParserImpl<CharT>::ParseClassRanges(
   2477    ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents) {
   2478  base::uc32 char_1, char_2;
   2479  bool is_class_1, is_class_2;
   2480  while (has_more() && current() != ']') {
   2481    ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
   2482                     &is_class_1 CHECK_FAILED);
   2483    // ClassAtom
   2484    if (current() == '-') {
   2485      Advance();
   2486      if (!has_more()) {
   2487        // If we reach the end we break out of the loop and let the
   2488        // following code report an error.
   2489        break;
   2490      } else if (current() == ']') {
   2491        if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
   2492        ranges->Add(CharacterRange::Singleton('-'), zone());
   2493        break;
   2494      }
   2495      ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
   2496                       &is_class_2 CHECK_FAILED);
   2497      if (is_class_1 || is_class_2) {
   2498        // Either end is an escaped character class. Treat the '-' verbatim.
   2499        if (IsUnicodeMode()) {
   2500          // ES2015 21.2.2.15.1 step 1.
   2501          return ReportError(RegExpError::kInvalidCharacterClass);
   2502        }
   2503        if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
   2504        ranges->Add(CharacterRange::Singleton('-'), zone());
   2505        if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
   2506        continue;
   2507      }
   2508      // ES2015 21.2.2.15.1 step 6.
   2509      if (char_1 > char_2) {
   2510        return ReportError(RegExpError::kOutOfOrderCharacterClass);
   2511      }
   2512      ranges->Add(CharacterRange::Range(char_1, char_2), zone());
   2513    } else {
   2514      if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
   2515    }
   2516  }
   2517  return nullptr;
   2518 }
   2519 
   2520 // https://tc39.es/ecma262/#prod-ClassEscape
   2521 template <class CharT>
   2522 void RegExpParserImpl<CharT>::ParseClassEscape(
   2523    ZoneList<CharacterRange>* ranges, Zone* zone,
   2524    bool add_unicode_case_equivalents, base::uc32* char_out,
   2525    bool* is_class_escape) {
   2526  *is_class_escape = false;
   2527 
   2528  if (current() != '\\') {
   2529    // Not a ClassEscape.
   2530    *char_out = current();
   2531    Advance();
   2532    return;
   2533  }
   2534 
   2535  const base::uc32 next = Next();
   2536  switch (next) {
   2537    case 'b':
   2538      *char_out = '\b';
   2539      Advance(2);
   2540      return;
   2541    case '-':
   2542      if (IsUnicodeMode()) {
   2543        *char_out = next;
   2544        Advance(2);
   2545        return;
   2546      }
   2547      break;
   2548    case kEndMarker:
   2549      ReportError(RegExpError::kEscapeAtEndOfPattern);
   2550      return;
   2551    default:
   2552      break;
   2553  }
   2554 
   2555  static constexpr InClassEscapeState kInClassEscape =
   2556      InClassEscapeState::kInClass;
   2557  *is_class_escape =
   2558      TryParseCharacterClassEscape(next, kInClassEscape, ranges, nullptr, zone,
   2559                                   add_unicode_case_equivalents);
   2560  if (*is_class_escape) return;
   2561 
   2562  bool dummy = false;  // Unused.
   2563  *char_out = ParseCharacterEscape(kInClassEscape, &dummy);
   2564 }
   2565 
   2566 // https://tc39.es/ecma262/#prod-CharacterClassEscape
   2567 template <class CharT>
   2568 bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape(
   2569    base::uc32 next, InClassEscapeState in_class_escape_state,
   2570    ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings,
   2571    Zone* zone, bool add_unicode_case_equivalents) {
   2572  DCHECK_EQ(current(), '\\');
   2573  DCHECK_EQ(Next(), next);
   2574 
   2575  switch (next) {
   2576    case 'd':
   2577    case 'D':
   2578    case 's':
   2579    case 'S':
   2580    case 'w':
   2581    case 'W':
   2582      CharacterRange::AddClassEscape(static_cast<StandardCharacterSet>(next),
   2583                                     ranges, add_unicode_case_equivalents,
   2584                                     zone);
   2585      Advance(2);
   2586      return true;
   2587    case 'p':
   2588    case 'P': {
   2589      if (!IsUnicodeMode()) return false;
   2590      bool negate = next == 'P';
   2591      Advance(2);
   2592      ZoneVector<char> name_1(zone);
   2593      ZoneVector<char> name_2(zone);
   2594      if (!ParsePropertyClassName(&name_1, &name_2) ||
   2595          !AddPropertyClassRange(ranges, strings, negate, name_1, name_2)) {
   2596        ReportError(in_class_escape_state == InClassEscapeState::kInClass
   2597                        ? RegExpError::kInvalidClassPropertyName
   2598                        : RegExpError::kInvalidPropertyName);
   2599      }
   2600      return true;
   2601    }
   2602    default:
   2603      return false;
   2604  }
   2605 }
   2606 
   2607 namespace {
   2608 
   2609 // Add |string| to |ranges| if length of |string| == 1, otherwise add |string|
   2610 // to |strings|.
   2611 void AddClassString(ZoneList<base::uc32>* normalized_string,
   2612                    RegExpTree* regexp_string, ZoneList<CharacterRange>* ranges,
   2613                    CharacterClassStrings* strings, Zone* zone) {
   2614  if (normalized_string->length() == 1) {
   2615    ranges->Add(CharacterRange::Singleton(normalized_string->at(0)), zone);
   2616  } else {
   2617    strings->emplace(normalized_string->ToVector(), regexp_string);
   2618  }
   2619 }
   2620 
   2621 }  // namespace
   2622 
   2623 // https://tc39.es/ecma262/#prod-ClassStringDisjunction
   2624 template <class CharT>
   2625 RegExpTree* RegExpParserImpl<CharT>::ParseClassStringDisjunction(
   2626    ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings) {
   2627  DCHECK(unicode_sets());
   2628  DCHECK_EQ(current(), '\\');
   2629  DCHECK_EQ(Next(), 'q');
   2630  Advance(2);
   2631  if (current() != '{') {
   2632    // Identity escape of 'q' is not allowed in unicode mode.
   2633    return ReportError(RegExpError::kInvalidEscape);
   2634  }
   2635  Advance();
   2636 
   2637  ZoneList<base::uc32>* string =
   2638      zone()->template New<ZoneList<base::uc32>>(4, zone());
   2639  RegExpTextBuilder::SmallRegExpTreeVector string_storage(zone());
   2640  RegExpTextBuilder string_builder(zone(), &string_storage, flags());
   2641 
   2642  while (has_more() && current() != '}') {
   2643    if (current() == '|') {
   2644      AddClassString(string, string_builder.ToRegExp(), ranges, strings,
   2645                     zone());
   2646      string = zone()->template New<ZoneList<base::uc32>>(4, zone());
   2647      string_storage.clear();
   2648      Advance();
   2649    } else {
   2650      base::uc32 c = ParseClassSetCharacter(CHECK_FAILED);
   2651      if (ignore_case()) {
   2652 #ifdef V8_INTL_SUPPORT
   2653        c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
   2654 #else
   2655        c = AsciiAlphaToLower(c);
   2656 #endif
   2657      }
   2658      string->Add(c, zone());
   2659      string_builder.AddUnicodeCharacter(c);
   2660    }
   2661  }
   2662 
   2663  AddClassString(string, string_builder.ToRegExp(), ranges, strings, zone());
   2664  CharacterRange::Canonicalize(ranges);
   2665 
   2666  // We don't need to handle missing closing '}' here.
   2667  // If the character class is correctly closed, ParseClassSetCharacter will
   2668  // report an error.
   2669  Advance();
   2670  return nullptr;
   2671 }
   2672 
   2673 // https://tc39.es/ecma262/#prod-ClassSetOperand
   2674 // Tree returned based on type_out:
   2675 //  * kNestedClass: RegExpClassSetExpression
   2676 //  * For all other types: RegExpClassSetOperand
   2677 template <class CharT>
   2678 RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
   2679    const RegExpBuilder* builder, ClassSetOperandType* type_out) {
   2680  ZoneList<CharacterRange>* ranges =
   2681      zone()->template New<ZoneList<CharacterRange>>(1, zone());
   2682  CharacterClassStrings* strings =
   2683      zone()->template New<CharacterClassStrings>(zone());
   2684  base::uc32 character;
   2685  RegExpTree* tree = ParseClassSetOperand(builder, type_out, ranges, strings,
   2686                                          &character CHECK_FAILED);
   2687  DCHECK_IMPLIES(*type_out != ClassSetOperandType::kNestedClass,
   2688                 tree == nullptr);
   2689  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,
   2690                 ranges->is_empty());
   2691  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,
   2692                 strings->empty());
   2693  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
   2694                 ranges->is_empty());
   2695  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
   2696                 strings->empty());
   2697  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
   2698                 tree->IsClassSetExpression());
   2699  // ClassSetRange is only used within ClassSetUnion().
   2700  DCHECK_NE(*type_out, ClassSetOperandType::kClassSetRange);
   2701  // There are no restrictions for kCharacterClassEscape.
   2702  // CharacterClassEscape includes \p{}, which can contain ranges, strings or
   2703  // both and \P{}, which could contain nothing (i.e. \P{Any}).
   2704  if (tree == nullptr) {
   2705    if (*type_out == ClassSetOperandType::kClassSetCharacter) {
   2706      AddMaybeSimpleCaseFoldedRange(ranges,
   2707                                    CharacterRange::Singleton(character));
   2708    }
   2709    tree = zone()->template New<RegExpClassSetOperand>(ranges, strings);
   2710  }
   2711  return tree;
   2712 }
   2713 
   2714 // https://tc39.es/ecma262/#prod-ClassSetOperand
   2715 // Based on |type_out| either a tree is returned or
   2716 // |ranges|/|strings|/|character| modified. If a tree is returned,
   2717 // ranges/strings are not modified. If |type_out| is kNestedClass, a tree of
   2718 // type RegExpClassSetExpression is returned. If | type_out| is
   2719 // kClassSetCharacter, |character| is set and nullptr returned. For all other
   2720 // types, |ranges|/|strings|/|character| is modified and nullptr is returned.
   2721 template <class CharT>
   2722 RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
   2723    const RegExpBuilder* builder, ClassSetOperandType* type_out,
   2724    ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings,
   2725    base::uc32* character) {
   2726  DCHECK(unicode_sets());
   2727  base::uc32 c = current();
   2728  if (c == '\\') {
   2729    const base::uc32 next = Next();
   2730    if (next == 'q') {
   2731      *type_out = ClassSetOperandType::kClassStringDisjunction;
   2732      ParseClassStringDisjunction(ranges, strings CHECK_FAILED);
   2733      return nullptr;
   2734    }
   2735    static constexpr InClassEscapeState kInClassEscape =
   2736        InClassEscapeState::kInClass;
   2737    const bool add_unicode_case_equivalents = ignore_case();
   2738    if (TryParseCharacterClassEscape(next, kInClassEscape, ranges, strings,
   2739                                     zone(), add_unicode_case_equivalents)) {
   2740      *type_out = ClassSetOperandType::kCharacterClassEscape;
   2741      return nullptr;
   2742    }
   2743  }
   2744 
   2745  if (c == '[') {
   2746    *type_out = ClassSetOperandType::kNestedClass;
   2747    return ParseCharacterClass(builder);
   2748  }
   2749 
   2750  *type_out = ClassSetOperandType::kClassSetCharacter;
   2751  c = ParseClassSetCharacter(CHECK_FAILED);
   2752  *character = c;
   2753  return nullptr;
   2754 }
   2755 
   2756 template <class CharT>
   2757 base::uc32 RegExpParserImpl<CharT>::ParseClassSetCharacter() {
   2758  DCHECK(unicode_sets());
   2759  const base::uc32 c = current();
   2760  if (c == '\\') {
   2761    const base::uc32 next = Next();
   2762    switch (next) {
   2763      case 'b':
   2764        Advance(2);
   2765        return '\b';
   2766      case kEndMarker:
   2767        ReportError(RegExpError::kEscapeAtEndOfPattern);
   2768        return 0;
   2769    }
   2770    static constexpr InClassEscapeState kInClassEscape =
   2771        InClassEscapeState::kInClass;
   2772 
   2773    bool dummy = false;  // Unused.
   2774    return ParseCharacterEscape(kInClassEscape, &dummy);
   2775  }
   2776  if (IsClassSetSyntaxCharacter(c)) {
   2777    ReportError(RegExpError::kInvalidCharacterInClass);
   2778    return 0;
   2779  }
   2780  if (IsClassSetReservedDoublePunctuator(c)) {
   2781    ReportError(RegExpError::kInvalidClassSetOperation);
   2782    return 0;
   2783  }
   2784  Advance();
   2785  return c;
   2786 }
   2787 
   2788 namespace {
   2789 
   2790 bool MayContainStrings(ClassSetOperandType type, RegExpTree* operand) {
   2791  switch (type) {
   2792    case ClassSetOperandType::kClassSetCharacter:
   2793    case ClassSetOperandType::kClassSetRange:
   2794      return false;
   2795    case ClassSetOperandType::kCharacterClassEscape:
   2796    case ClassSetOperandType::kClassStringDisjunction:
   2797      return operand->AsClassSetOperand()->has_strings();
   2798    case ClassSetOperandType::kNestedClass:
   2799      if (operand->IsClassRanges()) return false;
   2800      return operand->AsClassSetExpression()->may_contain_strings();
   2801  }
   2802 }
   2803 
   2804 }  // namespace
   2805 
   2806 template <class CharT>
   2807 void RegExpParserImpl<CharT>::AddMaybeSimpleCaseFoldedRange(
   2808    ZoneList<CharacterRange>* ranges, CharacterRange new_range) {
   2809  DCHECK(unicode_sets());
   2810  if (ignore_case()) {
   2811    ZoneList<CharacterRange>* new_ranges =
   2812        zone()->template New<ZoneList<CharacterRange>>(2, zone());
   2813    new_ranges->Add(new_range, zone());
   2814    CharacterRange::AddUnicodeCaseEquivalents(new_ranges, zone());
   2815    ranges->AddAll(*new_ranges, zone());
   2816  } else {
   2817    ranges->Add(new_range, zone());
   2818  }
   2819  CharacterRange::Canonicalize(ranges);
   2820 }
   2821 
   2822 // https://tc39.es/ecma262/#prod-ClassUnion
   2823 template <class CharT>
   2824 RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion(
   2825    const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
   2826    ClassSetOperandType first_operand_type, ZoneList<CharacterRange>* ranges,
   2827    CharacterClassStrings* strings, base::uc32 character) {
   2828  DCHECK(unicode_sets());
   2829  ZoneList<RegExpTree*>* operands =
   2830      zone()->template New<ZoneList<RegExpTree*>>(2, zone());
   2831  bool may_contain_strings = false;
   2832  // Add the lhs to operands if necessary.
   2833  // Either the lhs values were added to |ranges|/|strings| (in which case
   2834  // |first_operand| is nullptr), or the lhs was evaluated to a tree and passed
   2835  // as |first_operand| (in which case |ranges| and |strings| are empty).
   2836  if (first_operand != nullptr) {
   2837    may_contain_strings = MayContainStrings(first_operand_type, first_operand);
   2838    operands->Add(first_operand, zone());
   2839  }
   2840  ClassSetOperandType last_type = first_operand_type;
   2841  while (has_more() && current() != ']') {
   2842    if (current() == '-') {
   2843      // Mix of ClassSetRange and ClassSubtraction is not allowed.
   2844      if (Next() == '-') {
   2845        return ReportError(RegExpError::kInvalidClassSetOperation);
   2846      }
   2847      Advance();
   2848      if (!has_more()) {
   2849        // If we reach the end we break out of the loop and let the
   2850        // following code report an error.
   2851        break;
   2852      }
   2853      // If the lhs and rhs around '-' are both ClassSetCharacters, they
   2854      // represent a character range.
   2855      // In case one of them is not a ClassSetCharacter, it is a syntax error,
   2856      // as '-' can not be used unescaped within a class with /v.
   2857      // See
   2858      // https://tc39.es/ecma262/#prod-ClassSetRange
   2859      if (last_type != ClassSetOperandType::kClassSetCharacter) {
   2860        return ReportError(RegExpError::kInvalidCharacterClass);
   2861      }
   2862      base::uc32 from = character;
   2863      ParseClassSetOperand(builder, &last_type, ranges, strings,
   2864                           &character CHECK_FAILED);
   2865      if (last_type != ClassSetOperandType::kClassSetCharacter) {
   2866        return ReportError(RegExpError::kInvalidCharacterClass);
   2867      }
   2868      if (from > character) {
   2869        return ReportError(RegExpError::kOutOfOrderCharacterClass);
   2870      }
   2871      AddMaybeSimpleCaseFoldedRange(ranges,
   2872                                    CharacterRange::Range(from, character));
   2873      last_type = ClassSetOperandType::kClassSetRange;
   2874    } else {
   2875      DCHECK_NE(current(), '-');
   2876      if (last_type == ClassSetOperandType::kClassSetCharacter) {
   2877        AddMaybeSimpleCaseFoldedRange(ranges,
   2878                                      CharacterRange::Singleton(character));
   2879      }
   2880      RegExpTree* operand = ParseClassSetOperand(
   2881          builder, &last_type, ranges, strings, &character CHECK_FAILED);
   2882      if (operand != nullptr) {
   2883        may_contain_strings |= MayContainStrings(last_type, operand);
   2884        // Add the range we started building as operand and reset the current
   2885        // range.
   2886        if (!ranges->is_empty() || !strings->empty()) {
   2887          may_contain_strings |= !strings->empty();
   2888          operands->Add(
   2889              zone()->template New<RegExpClassSetOperand>(ranges, strings),
   2890              zone());
   2891          ranges = zone()->template New<ZoneList<CharacterRange>>(2, zone());
   2892          strings = zone()->template New<CharacterClassStrings>(zone());
   2893        }
   2894        operands->Add(operand, zone());
   2895      }
   2896    }
   2897  }
   2898 
   2899  if (!has_more()) {
   2900    return ReportError(RegExpError::kUnterminatedCharacterClass);
   2901  }
   2902 
   2903  if (last_type == ClassSetOperandType::kClassSetCharacter) {
   2904    AddMaybeSimpleCaseFoldedRange(ranges, CharacterRange::Singleton(character));
   2905  }
   2906 
   2907  // Add the range we started building as operand.
   2908  if (!ranges->is_empty() || !strings->empty()) {
   2909    may_contain_strings |= !strings->empty();
   2910    operands->Add(zone()->template New<RegExpClassSetOperand>(ranges, strings),
   2911                  zone());
   2912  }
   2913 
   2914  DCHECK_EQ(current(), ']');
   2915  Advance();
   2916 
   2917  if (is_negated && may_contain_strings) {
   2918    return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
   2919  }
   2920 
   2921  if (operands->is_empty()) {
   2922    // Return empty expression if no operands were added (e.g. [\P{Any}]
   2923    // produces an empty range).
   2924    DCHECK(ranges->is_empty());
   2925    DCHECK(strings->empty());
   2926    return RegExpClassSetExpression::Empty(zone(), is_negated);
   2927  }
   2928 
   2929  return zone()->template New<RegExpClassSetExpression>(
   2930      RegExpClassSetExpression::OperationType::kUnion, is_negated,
   2931      may_contain_strings, operands);
   2932 }
   2933 
   2934 // https://tc39.es/ecma262/#prod-ClassIntersection
   2935 template <class CharT>
   2936 RegExpTree* RegExpParserImpl<CharT>::ParseClassIntersection(
   2937    const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
   2938    ClassSetOperandType first_operand_type) {
   2939  DCHECK(unicode_sets());
   2940  DCHECK(current() == '&' && Next() == '&');
   2941  bool may_contain_strings =
   2942      MayContainStrings(first_operand_type, first_operand);
   2943  ZoneList<RegExpTree*>* operands =
   2944      zone()->template New<ZoneList<RegExpTree*>>(2, zone());
   2945  operands->Add(first_operand, zone());
   2946  while (has_more() && current() != ']') {
   2947    if (current() != '&' || Next() != '&') {
   2948      return ReportError(RegExpError::kInvalidClassSetOperation);
   2949    }
   2950    Advance(2);
   2951    // [lookahead ≠ &]
   2952    if (current() == '&') {
   2953      return ReportError(RegExpError::kInvalidCharacterInClass);
   2954    }
   2955 
   2956    ClassSetOperandType operand_type;
   2957    RegExpTree* operand =
   2958        ParseClassSetOperand(builder, &operand_type CHECK_FAILED);
   2959    may_contain_strings &= MayContainStrings(operand_type, operand);
   2960    operands->Add(operand, zone());
   2961  }
   2962  if (!has_more()) {
   2963    return ReportError(RegExpError::kUnterminatedCharacterClass);
   2964  }
   2965  if (is_negated && may_contain_strings) {
   2966    return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
   2967  }
   2968  DCHECK_EQ(current(), ']');
   2969  Advance();
   2970  return zone()->template New<RegExpClassSetExpression>(
   2971      RegExpClassSetExpression::OperationType::kIntersection, is_negated,
   2972      may_contain_strings, operands);
   2973 }
   2974 
   2975 // https://tc39.es/ecma262/#prod-ClassSubtraction
   2976 template <class CharT>
   2977 RegExpTree* RegExpParserImpl<CharT>::ParseClassSubtraction(
   2978    const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
   2979    ClassSetOperandType first_operand_type) {
   2980  DCHECK(unicode_sets());
   2981  DCHECK(current() == '-' && Next() == '-');
   2982  const bool may_contain_strings =
   2983      MayContainStrings(first_operand_type, first_operand);
   2984  if (is_negated && may_contain_strings) {
   2985    return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
   2986  }
   2987  ZoneList<RegExpTree*>* operands =
   2988      zone()->template New<ZoneList<RegExpTree*>>(2, zone());
   2989  operands->Add(first_operand, zone());
   2990  while (has_more() && current() != ']') {
   2991    if (current() != '-' || Next() != '-') {
   2992      return ReportError(RegExpError::kInvalidClassSetOperation);
   2993    }
   2994    Advance(2);
   2995    ClassSetOperandType dummy;  // unused
   2996    RegExpTree* operand = ParseClassSetOperand(builder, &dummy CHECK_FAILED);
   2997    operands->Add(operand, zone());
   2998  }
   2999  if (!has_more()) {
   3000    return ReportError(RegExpError::kUnterminatedCharacterClass);
   3001  }
   3002  DCHECK_EQ(current(), ']');
   3003  Advance();
   3004  return zone()->template New<RegExpClassSetExpression>(
   3005      RegExpClassSetExpression::OperationType::kSubtraction, is_negated,
   3006      may_contain_strings, operands);
   3007 }
   3008 
   3009 // https://tc39.es/ecma262/#prod-CharacterClass
   3010 template <class CharT>
   3011 RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(
   3012    const RegExpBuilder* builder) {
   3013  DCHECK_EQ(current(), '[');
   3014  Advance();
   3015  bool is_negated = false;
   3016  if (current() == '^') {
   3017    is_negated = true;
   3018    Advance();
   3019  }
   3020  ZoneList<CharacterRange>* ranges =
   3021      zone()->template New<ZoneList<CharacterRange>>(2, zone());
   3022  if (current() == ']') {
   3023    Advance();
   3024    if (unicode_sets()) {
   3025      return RegExpClassSetExpression::Empty(zone(), is_negated);
   3026    } else {
   3027      RegExpClassRanges::ClassRangesFlags class_ranges_flags;
   3028      if (is_negated) class_ranges_flags = RegExpClassRanges::NEGATED;
   3029      return zone()->template New<RegExpClassRanges>(zone(), ranges,
   3030                                                     class_ranges_flags);
   3031    }
   3032  }
   3033 
   3034  if (!unicode_sets()) {
   3035    bool add_unicode_case_equivalents = IsUnicodeMode() && ignore_case();
   3036    ParseClassRanges(ranges, add_unicode_case_equivalents CHECK_FAILED);
   3037    if (!has_more()) {
   3038      return ReportError(RegExpError::kUnterminatedCharacterClass);
   3039    }
   3040    DCHECK_EQ(current(), ']');
   3041    Advance();
   3042    RegExpClassRanges::ClassRangesFlags character_class_flags;
   3043    if (is_negated) character_class_flags = RegExpClassRanges::NEGATED;
   3044    return zone()->template New<RegExpClassRanges>(zone(), ranges,
   3045                                                   character_class_flags);
   3046  } else {
   3047    ClassSetOperandType operand_type;
   3048    CharacterClassStrings* strings =
   3049        zone()->template New<CharacterClassStrings>(zone());
   3050    base::uc32 character;
   3051    RegExpTree* operand = ParseClassSetOperand(
   3052        builder, &operand_type, ranges, strings, &character CHECK_FAILED);
   3053    switch (current()) {
   3054      case '-':
   3055        if (Next() == '-') {
   3056          if (operand == nullptr) {
   3057            if (operand_type == ClassSetOperandType::kClassSetCharacter) {
   3058              AddMaybeSimpleCaseFoldedRange(
   3059                  ranges, CharacterRange::Singleton(character));
   3060            }
   3061            operand =
   3062                zone()->template New<RegExpClassSetOperand>(ranges, strings);
   3063          }
   3064          return ParseClassSubtraction(builder, is_negated, operand,
   3065                                       operand_type);
   3066        }
   3067        // ClassSetRange is handled in ParseClassUnion().
   3068        break;
   3069      case '&':
   3070        if (Next() == '&') {
   3071          if (operand == nullptr) {
   3072            if (operand_type == ClassSetOperandType::kClassSetCharacter) {
   3073              AddMaybeSimpleCaseFoldedRange(
   3074                  ranges, CharacterRange::Singleton(character));
   3075            }
   3076            operand =
   3077                zone()->template New<RegExpClassSetOperand>(ranges, strings);
   3078          }
   3079          return ParseClassIntersection(builder, is_negated, operand,
   3080                                        operand_type);
   3081        }
   3082    }
   3083    return ParseClassUnion(builder, is_negated, operand, operand_type, ranges,
   3084                           strings, character);
   3085  }
   3086 }
   3087 
   3088 #undef CHECK_FAILED
   3089 
   3090 template <class CharT>
   3091 bool RegExpParserImpl<CharT>::Parse(RegExpCompileData* result) {
   3092  DCHECK_NOT_NULL(result);
   3093  RegExpTree* tree = ParsePattern();
   3094 
   3095  if (failed()) {
   3096    DCHECK_NULL(tree);
   3097    DCHECK_NE(error_, RegExpError::kNone);
   3098    result->error = error_;
   3099    result->error_pos = error_pos_;
   3100    return false;
   3101  }
   3102 
   3103  DCHECK_NOT_NULL(tree);
   3104  DCHECK_EQ(error_, RegExpError::kNone);
   3105  if (v8_flags.trace_regexp_parser) {
   3106    StdoutStream os;
   3107    tree->Print(os, zone());
   3108    os << "\n";
   3109  }
   3110 
   3111  result->tree = tree;
   3112  const int capture_count = captures_started();
   3113  result->simple = tree->IsAtom() && simple() && capture_count == 0;
   3114  result->contains_anchor = contains_anchor();
   3115  result->capture_count = capture_count;
   3116  result->named_captures = GetNamedCaptures();
   3117  return true;
   3118 }
   3119 
   3120 void RegExpBuilder::FlushText() { text_builder().FlushText(); }
   3121 
   3122 void RegExpBuilder::AddCharacter(base::uc16 c) {
   3123  pending_empty_ = false;
   3124  text_builder().AddCharacter(c);
   3125 }
   3126 
   3127 void RegExpBuilder::AddUnicodeCharacter(base::uc32 c) {
   3128  pending_empty_ = false;
   3129  text_builder().AddUnicodeCharacter(c);
   3130 }
   3131 
   3132 void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {
   3133  pending_empty_ = false;
   3134  text_builder().AddEscapedUnicodeCharacter(character);
   3135 }
   3136 
   3137 void RegExpBuilder::AddEmpty() {
   3138  text_builder().FlushPendingSurrogate();
   3139  pending_empty_ = true;
   3140 }
   3141 
   3142 void RegExpBuilder::AddClassRanges(RegExpClassRanges* cc) {
   3143  pending_empty_ = false;
   3144  text_builder().AddClassRanges(cc);
   3145 }
   3146 
   3147 void RegExpBuilder::AddAtom(RegExpTree* term) {
   3148  if (term->IsEmpty()) {
   3149    AddEmpty();
   3150    return;
   3151  }
   3152  pending_empty_ = false;
   3153  if (term->IsTextElement()) {
   3154    text_builder().AddAtom(term);
   3155  } else {
   3156    FlushText();
   3157    terms_.emplace_back(term);
   3158  }
   3159 }
   3160 
   3161 void RegExpBuilder::AddTerm(RegExpTree* term) {
   3162  DCHECK(!term->IsEmpty());
   3163  pending_empty_ = false;
   3164  if (term->IsTextElement()) {
   3165    text_builder().AddTerm(term);
   3166  } else {
   3167    FlushText();
   3168    terms_.emplace_back(term);
   3169  }
   3170 }
   3171 
   3172 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
   3173  FlushText();
   3174  pending_empty_ = false;
   3175  terms_.emplace_back(assert);
   3176 }
   3177 
   3178 void RegExpBuilder::NewAlternative() { FlushTerms(); }
   3179 
   3180 void RegExpBuilder::FlushTerms() {
   3181  FlushText();
   3182  size_t num_terms = terms_.size();
   3183  RegExpTree* alternative;
   3184  if (num_terms == 0) {
   3185    alternative = zone()->New<RegExpEmpty>();
   3186  } else if (num_terms == 1) {
   3187    alternative = terms_.back();
   3188  } else {
   3189    alternative =
   3190        zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>(
   3191            base::VectorOf(terms_.begin(), terms_.size()), zone()));
   3192  }
   3193  alternatives_.emplace_back(alternative);
   3194  terms_.clear();
   3195 }
   3196 
   3197 RegExpTree* RegExpBuilder::ToRegExp() {
   3198  FlushTerms();
   3199  size_t num_alternatives = alternatives_.size();
   3200  if (num_alternatives == 0) return zone()->New<RegExpEmpty>();
   3201  if (num_alternatives == 1) return alternatives_.back();
   3202  return zone()->New<RegExpDisjunction>(zone()->New<ZoneList<RegExpTree*>>(
   3203      base::VectorOf(alternatives_.begin(), alternatives_.size()), zone()));
   3204 }
   3205 
   3206 bool RegExpBuilder::AddQuantifierToAtom(
   3207    int min, int max, int index,
   3208    RegExpQuantifier::QuantifierType quantifier_type) {
   3209  if (pending_empty_) {
   3210    pending_empty_ = false;
   3211    return true;
   3212  }
   3213  RegExpTree* atom = text_builder().PopLastAtom();
   3214  if (atom != nullptr) {
   3215    FlushText();
   3216  } else if (!terms_.empty()) {
   3217    atom = terms_.back();
   3218    terms_.pop_back();
   3219    if (atom->IsLookaround()) {
   3220      // With /u or /v, lookarounds are not quantifiable.
   3221      if (IsUnicodeMode()) return false;
   3222      // Lookbehinds are not quantifiable.
   3223      if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
   3224        return false;
   3225      }
   3226    }
   3227    if (atom->max_match() == 0) {
   3228      // Guaranteed to only match an empty string.
   3229      if (min == 0) {
   3230        return true;
   3231      }
   3232      terms_.emplace_back(atom);
   3233      return true;
   3234    }
   3235  } else {
   3236    // Only call immediately after adding an atom or character!
   3237    UNREACHABLE();
   3238  }
   3239  terms_.emplace_back(
   3240      zone()->New<RegExpQuantifier>(min, max, quantifier_type, index, atom));
   3241  return true;
   3242 }
   3243 
   3244 template class RegExpParserImpl<uint8_t>;
   3245 template class RegExpParserImpl<base::uc16>;
   3246 
   3247 }  // namespace
   3248 
   3249 // static
   3250 bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, Zone* zone,
   3251                                             DirectHandle<String> input,
   3252                                             RegExpFlags flags,
   3253                                             RegExpCompileData* result) {
   3254  DisallowGarbageCollection no_gc;
   3255  uintptr_t stack_limit = isolate->stack_guard()->real_climit();
   3256  String::FlatContent content = input->GetFlatContent(no_gc);
   3257  if (content.IsOneByte()) {
   3258    base::Vector<const uint8_t> v = content.ToOneByteVector();
   3259    return RegExpParserImpl<uint8_t>{v.begin(),   v.length(), flags,
   3260                                     stack_limit, zone,       no_gc}
   3261        .Parse(result);
   3262  } else {
   3263    base::Vector<const base::uc16> v = content.ToUC16Vector();
   3264    return RegExpParserImpl<base::uc16>{v.begin(),   v.length(), flags,
   3265                                        stack_limit, zone,       no_gc}
   3266        .Parse(result);
   3267  }
   3268 }
   3269 
   3270 // static
   3271 template <class CharT>
   3272 bool RegExpParser::VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit,
   3273                                      const CharT* input, int input_length,
   3274                                      RegExpFlags flags,
   3275                                      RegExpCompileData* result,
   3276                                      const DisallowGarbageCollection& no_gc) {
   3277  return RegExpParserImpl<CharT>{input,       input_length, flags,
   3278                                 stack_limit, zone,         no_gc}
   3279      .Parse(result);
   3280 }
   3281 
   3282 template bool RegExpParser::VerifyRegExpSyntax<uint8_t>(
   3283    Zone*, uintptr_t, const uint8_t*, int, RegExpFlags, RegExpCompileData*,
   3284    const DisallowGarbageCollection&);
   3285 template bool RegExpParser::VerifyRegExpSyntax<base::uc16>(
   3286    Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, RegExpCompileData*,
   3287    const DisallowGarbageCollection&);
   3288 
   3289 }  // namespace internal
   3290 }  // namespace v8
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE