tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

regexp-compiler-tonode.cc (80803B)


      1 // Copyright 2019 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "irregexp/imported/regexp-compiler.h"
      6 #include "irregexp/imported/regexp.h"
      7 
      8 #ifdef V8_INTL_SUPPORT
      9 #include "irregexp/imported/special-case.h"
     10 #include "unicode/locid.h"
     11 #include "unicode/uniset.h"
     12 #include "unicode/utypes.h"
     13 #endif  // V8_INTL_SUPPORT
     14 
     15 namespace v8 {
     16 namespace internal {
     17 
     18 using namespace regexp_compiler_constants;  // NOLINT(build/namespaces)
     19 
     20 constexpr base::uc32 kMaxCodePoint = 0x10ffff;
     21 constexpr int kMaxUtf16CodeUnit = 0xffff;
     22 constexpr uint32_t kMaxUtf16CodeUnitU = 0xffff;
     23 
     24 // -------------------------------------------------------------------
     25 // Tree to graph conversion
     26 
     27 RegExpNode* RegExpTree::ToNode(RegExpCompiler* compiler,
     28                               RegExpNode* on_success) {
     29  compiler->ToNodeMaybeCheckForStackOverflow();
     30  return ToNodeImpl(compiler, on_success);
     31 }
     32 
     33 RegExpNode* RegExpAtom::ToNodeImpl(RegExpCompiler* compiler,
     34                                   RegExpNode* on_success) {
     35  ZoneList<TextElement>* elms =
     36      compiler->zone()->New<ZoneList<TextElement>>(1, compiler->zone());
     37  elms->Add(TextElement::Atom(this), compiler->zone());
     38  return compiler->zone()->New<TextNode>(elms, compiler->read_backward(),
     39                                         on_success);
     40 }
     41 
     42 RegExpNode* RegExpText::ToNodeImpl(RegExpCompiler* compiler,
     43                                   RegExpNode* on_success) {
     44  return compiler->zone()->New<TextNode>(elements(), compiler->read_backward(),
     45                                         on_success);
     46 }
     47 
     48 namespace {
     49 
     50 bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
     51                          const int* special_class, int length) {
     52  length--;  // Remove final marker.
     53 
     54  DCHECK_EQ(kRangeEndMarker, special_class[length]);
     55  DCHECK_NE(0, ranges->length());
     56  DCHECK_NE(0, length);
     57  DCHECK_NE(0, special_class[0]);
     58 
     59  if (ranges->length() != (length >> 1) + 1) return false;
     60 
     61  CharacterRange range = ranges->at(0);
     62  if (range.from() != 0) return false;
     63 
     64  for (int i = 0; i < length; i += 2) {
     65    if (static_cast<base::uc32>(special_class[i]) != (range.to() + 1)) {
     66      return false;
     67    }
     68    range = ranges->at((i >> 1) + 1);
     69    if (static_cast<base::uc32>(special_class[i + 1]) != range.from()) {
     70      return false;
     71    }
     72  }
     73 
     74  return range.to() == kMaxCodePoint;
     75 }
     76 
     77 bool CompareRanges(ZoneList<CharacterRange>* ranges, const int* special_class,
     78                   int length) {
     79  length--;  // Remove final marker.
     80 
     81  DCHECK_EQ(kRangeEndMarker, special_class[length]);
     82  if (ranges->length() * 2 != length) return false;
     83 
     84  for (int i = 0; i < length; i += 2) {
     85    CharacterRange range = ranges->at(i >> 1);
     86    if (range.from() != static_cast<base::uc32>(special_class[i]) ||
     87        range.to() != static_cast<base::uc32>(special_class[i + 1] - 1)) {
     88      return false;
     89    }
     90  }
     91  return true;
     92 }
     93 
     94 }  // namespace
     95 
     96 bool RegExpClassRanges::is_standard(Zone* zone) {
     97  // TODO(lrn): Remove need for this function, by not throwing away information
     98  // along the way.
     99  if (is_negated()) {
    100    return false;
    101  }
    102  if (set_.is_standard()) {
    103    return true;
    104  }
    105  if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
    106    set_.set_standard_set_type(StandardCharacterSet::kWhitespace);
    107    return true;
    108  }
    109  if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
    110    set_.set_standard_set_type(StandardCharacterSet::kNotWhitespace);
    111    return true;
    112  }
    113  if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges,
    114                           kLineTerminatorRangeCount)) {
    115    set_.set_standard_set_type(StandardCharacterSet::kNotLineTerminator);
    116    return true;
    117  }
    118  if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges,
    119                    kLineTerminatorRangeCount)) {
    120    set_.set_standard_set_type(StandardCharacterSet::kLineTerminator);
    121    return true;
    122  }
    123  if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
    124    set_.set_standard_set_type(StandardCharacterSet::kWord);
    125    return true;
    126  }
    127  if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
    128    set_.set_standard_set_type(StandardCharacterSet::kNotWord);
    129    return true;
    130  }
    131  return false;
    132 }
    133 
    134 UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) {
    135  // The unicode range splitter categorizes given character ranges into:
    136  // - Code points from the BMP representable by one code unit.
    137  // - Code points outside the BMP that need to be split into
    138  // surrogate pairs.
    139  // - Lone lead surrogates.
    140  // - Lone trail surrogates.
    141  // Lone surrogates are valid code points, even though no actual characters.
    142  // They require special matching to make sure we do not split surrogate pairs.
    143 
    144  for (int i = 0; i < base->length(); i++) AddRange(base->at(i));
    145 }
    146 
    147 void UnicodeRangeSplitter::AddRange(CharacterRange range) {
    148  static constexpr base::uc32 kBmp1Start = 0;
    149  static constexpr base::uc32 kBmp1End = kLeadSurrogateStart - 1;
    150  static constexpr base::uc32 kBmp2Start = kTrailSurrogateEnd + 1;
    151  static constexpr base::uc32 kBmp2End = kNonBmpStart - 1;
    152 
    153  // Ends are all inclusive.
    154  static_assert(kBmp1Start == 0);
    155  static_assert(kBmp1Start < kBmp1End);
    156  static_assert(kBmp1End + 1 == kLeadSurrogateStart);
    157  static_assert(kLeadSurrogateStart < kLeadSurrogateEnd);
    158  static_assert(kLeadSurrogateEnd + 1 == kTrailSurrogateStart);
    159  static_assert(kTrailSurrogateStart < kTrailSurrogateEnd);
    160  static_assert(kTrailSurrogateEnd + 1 == kBmp2Start);
    161  static_assert(kBmp2Start < kBmp2End);
    162  static_assert(kBmp2End + 1 == kNonBmpStart);
    163  static_assert(kNonBmpStart < kNonBmpEnd);
    164 
    165  static constexpr base::uc32 kStarts[] = {
    166      kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart,
    167      kBmp2Start, kNonBmpStart,
    168  };
    169 
    170  static constexpr base::uc32 kEnds[] = {
    171      kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd,
    172  };
    173 
    174  CharacterRangeVector* const kTargets[] = {
    175      &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_,
    176  };
    177 
    178  static constexpr int kCount = arraysize(kStarts);
    179  static_assert(kCount == arraysize(kEnds));
    180  static_assert(kCount == arraysize(kTargets));
    181 
    182  for (int i = 0; i < kCount; i++) {
    183    if (kStarts[i] > range.to()) break;
    184    const base::uc32 from = std::max(kStarts[i], range.from());
    185    const base::uc32 to = std::min(kEnds[i], range.to());
    186    if (from > to) continue;
    187    kTargets[i]->emplace_back(CharacterRange::Range(from, to));
    188  }
    189 }
    190 
    191 namespace {
    192 
    193 // Translates between new and old V8-isms (SmallVector, ZoneList).
    194 ZoneList<CharacterRange>* ToCanonicalZoneList(
    195    const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) {
    196  if (v->empty()) return nullptr;
    197 
    198  ZoneList<CharacterRange>* result =
    199      zone->New<ZoneList<CharacterRange>>(static_cast<int>(v->size()), zone);
    200  for (size_t i = 0; i < v->size(); i++) {
    201    result->Add(v->at(i), zone);
    202  }
    203 
    204  CharacterRange::Canonicalize(result);
    205  return result;
    206 }
    207 
    208 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
    209                      RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
    210  ZoneList<CharacterRange>* bmp =
    211      ToCanonicalZoneList(splitter->bmp(), compiler->zone());
    212  if (bmp == nullptr) return;
    213  result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
    214      compiler->zone(), bmp, compiler->read_backward(), on_success)));
    215 }
    216 
    217 using UC16Range = uint32_t;  // {from, to} packed into one uint32_t.
    218 constexpr UC16Range ToUC16Range(base::uc16 from, base::uc16 to) {
    219  return (static_cast<uint32_t>(from) << 16) | to;
    220 }
    221 constexpr base::uc16 ExtractFrom(UC16Range r) {
    222  return static_cast<base::uc16>(r >> 16);
    223 }
    224 constexpr base::uc16 ExtractTo(UC16Range r) {
    225  return static_cast<base::uc16>(r);
    226 }
    227 
    228 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
    229                             RegExpNode* on_success,
    230                             UnicodeRangeSplitter* splitter) {
    231  DCHECK(!compiler->one_byte());
    232  Zone* const zone = compiler->zone();
    233  ZoneList<CharacterRange>* non_bmp =
    234      ToCanonicalZoneList(splitter->non_bmp(), zone);
    235  if (non_bmp == nullptr) return;
    236 
    237  // Translate each 32-bit code point range into the corresponding 16-bit code
    238  // unit representation consisting of the lead- and trail surrogate.
    239  //
    240  // The generated alternatives are grouped by the leading surrogate to avoid
    241  // emitting excessive code. For example, for
    242  //
    243  //  { \ud800[\udc00-\udc01]
    244  //  , \ud800[\udc05-\udc06]
    245  //  }
    246  //
    247  // there's no need to emit matching code for the leading surrogate \ud800
    248  // twice. We also create a dedicated grouping for full trailing ranges, i.e.
    249  // [dc00-dfff].
    250  ZoneUnorderedMap<UC16Range, ZoneList<CharacterRange>*> grouped_by_leading(
    251      zone);
    252  ZoneList<CharacterRange>* leading_with_full_trailing_range =
    253      zone->New<ZoneList<CharacterRange>>(1, zone);
    254  const auto AddRange = [&](base::uc16 from_l, base::uc16 to_l,
    255                            base::uc16 from_t, base::uc16 to_t) {
    256    const UC16Range leading_range = ToUC16Range(from_l, to_l);
    257    if (grouped_by_leading.count(leading_range) == 0) {
    258      if (from_t == kTrailSurrogateStart && to_t == kTrailSurrogateEnd) {
    259        leading_with_full_trailing_range->Add(
    260            CharacterRange::Range(from_l, to_l), zone);
    261        return;
    262      }
    263      grouped_by_leading[leading_range] =
    264          zone->New<ZoneList<CharacterRange>>(2, zone);
    265    }
    266    grouped_by_leading[leading_range]->Add(CharacterRange::Range(from_t, to_t),
    267                                           zone);
    268  };
    269 
    270  // First, create the grouped ranges.
    271  CharacterRange::Canonicalize(non_bmp);
    272  for (int i = 0; i < non_bmp->length(); i++) {
    273    // Match surrogate pair.
    274    // E.g. [\u10005-\u11005] becomes
    275    //      \ud800[\udc05-\udfff]|
    276    //      [\ud801-\ud803][\udc00-\udfff]|
    277    //      \ud804[\udc00-\udc05]
    278    base::uc32 from = non_bmp->at(i).from();
    279    base::uc32 to = non_bmp->at(i).to();
    280    base::uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
    281    base::uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
    282    base::uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
    283    base::uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
    284 
    285    if (from_l == to_l) {
    286      // The lead surrogate is the same.
    287      AddRange(from_l, to_l, from_t, to_t);
    288      continue;
    289    }
    290 
    291    if (from_t != kTrailSurrogateStart) {
    292      // Add [from_l][from_t-\udfff].
    293      AddRange(from_l, from_l, from_t, kTrailSurrogateEnd);
    294      from_l++;
    295    }
    296    if (to_t != kTrailSurrogateEnd) {
    297      // Add [to_l][\udc00-to_t].
    298      AddRange(to_l, to_l, kTrailSurrogateStart, to_t);
    299      to_l--;
    300    }
    301    if (from_l <= to_l) {
    302      // Add [from_l-to_l][\udc00-\udfff].
    303      AddRange(from_l, to_l, kTrailSurrogateStart, kTrailSurrogateEnd);
    304    }
    305  }
    306 
    307  // Create the actual TextNode now that ranges are fully grouped.
    308  if (!leading_with_full_trailing_range->is_empty()) {
    309    CharacterRange::Canonicalize(leading_with_full_trailing_range);
    310    result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
    311        zone, leading_with_full_trailing_range,
    312        CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
    313        compiler->read_backward(), on_success)));
    314  }
    315  for (const auto& it : grouped_by_leading) {
    316    CharacterRange leading_range =
    317        CharacterRange::Range(ExtractFrom(it.first), ExtractTo(it.first));
    318    ZoneList<CharacterRange>* trailing_ranges = it.second;
    319    CharacterRange::Canonicalize(trailing_ranges);
    320    result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
    321        zone, leading_range, trailing_ranges, compiler->read_backward(),
    322        on_success)));
    323  }
    324 }
    325 
    326 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
    327    RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
    328    ZoneList<CharacterRange>* match, RegExpNode* on_success,
    329    bool read_backward) {
    330  Zone* zone = compiler->zone();
    331  RegExpNode* match_node = TextNode::CreateForCharacterRanges(
    332      zone, match, read_backward, on_success);
    333  int stack_register = compiler->UnicodeLookaroundStackRegister();
    334  int position_register = compiler->UnicodeLookaroundPositionRegister();
    335  RegExpLookaround::Builder lookaround(false, match_node, stack_register,
    336                                       position_register);
    337  RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
    338      zone, lookbehind, !read_backward, lookaround.on_match_success());
    339  return lookaround.ForMatch(negative_match);
    340 }
    341 
    342 RegExpNode* MatchAndNegativeLookaroundInReadDirection(
    343    RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
    344    ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
    345    bool read_backward) {
    346  Zone* zone = compiler->zone();
    347  int stack_register = compiler->UnicodeLookaroundStackRegister();
    348  int position_register = compiler->UnicodeLookaroundPositionRegister();
    349  RegExpLookaround::Builder lookaround(false, on_success, stack_register,
    350                                       position_register);
    351  RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
    352      zone, lookahead, read_backward, lookaround.on_match_success());
    353  return TextNode::CreateForCharacterRanges(
    354      zone, match, read_backward, lookaround.ForMatch(negative_match));
    355 }
    356 
    357 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
    358                           RegExpNode* on_success,
    359                           UnicodeRangeSplitter* splitter) {
    360  ZoneList<CharacterRange>* lead_surrogates =
    361      ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone());
    362  if (lead_surrogates == nullptr) return;
    363  Zone* zone = compiler->zone();
    364  // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
    365  ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
    366      zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
    367 
    368  RegExpNode* match;
    369  if (compiler->read_backward()) {
    370    // Reading backward. Assert that reading forward, there is no trail
    371    // surrogate, and then backward match the lead surrogate.
    372    match = NegativeLookaroundAgainstReadDirectionAndMatch(
    373        compiler, trail_surrogates, lead_surrogates, on_success, true);
    374  } else {
    375    // Reading forward. Forward match the lead surrogate and assert that
    376    // no trail surrogate follows.
    377    match = MatchAndNegativeLookaroundInReadDirection(
    378        compiler, lead_surrogates, trail_surrogates, on_success, false);
    379  }
    380  result->AddAlternative(GuardedAlternative(match));
    381 }
    382 
    383 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
    384                            RegExpNode* on_success,
    385                            UnicodeRangeSplitter* splitter) {
    386  ZoneList<CharacterRange>* trail_surrogates =
    387      ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone());
    388  if (trail_surrogates == nullptr) return;
    389  Zone* zone = compiler->zone();
    390  // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
    391  ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
    392      zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
    393 
    394  RegExpNode* match;
    395  if (compiler->read_backward()) {
    396    // Reading backward. Backward match the trail surrogate and assert that no
    397    // lead surrogate precedes it.
    398    match = MatchAndNegativeLookaroundInReadDirection(
    399        compiler, trail_surrogates, lead_surrogates, on_success, true);
    400  } else {
    401    // Reading forward. Assert that reading backward, there is no lead
    402    // surrogate, and then forward match the trail surrogate.
    403    match = NegativeLookaroundAgainstReadDirectionAndMatch(
    404        compiler, lead_surrogates, trail_surrogates, on_success, false);
    405  }
    406  result->AddAlternative(GuardedAlternative(match));
    407 }
    408 
    409 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
    410                              RegExpNode* on_success) {
    411  // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
    412  DCHECK(!compiler->read_backward());
    413  Zone* zone = compiler->zone();
    414  // Advance any character. If the character happens to be a lead surrogate and
    415  // we advanced into the middle of a surrogate pair, it will work out, as
    416  // nothing will match from there. We will have to advance again, consuming
    417  // the associated trail surrogate.
    418  ZoneList<CharacterRange>* range =
    419      CharacterRange::List(zone, CharacterRange::Range(0, kMaxUtf16CodeUnit));
    420  return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
    421 }
    422 
    423 }  // namespace
    424 
    425 // static
    426 // Only for /ui and /vi, not for /i regexps.
    427 void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
    428                                               Zone* zone) {
    429 #ifdef V8_INTL_SUPPORT
    430  DCHECK(IsCanonical(ranges));
    431 
    432  // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
    433  // See also https://crbug.com/v8/6727.
    434  // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
    435  // which we use frequently internally. But large ranges can also easily be
    436  // created by the user. We might want to have a more general caching mechanism
    437  // for such ranges.
    438  if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
    439 
    440  // Use ICU to compute the case fold closure over the ranges.
    441  icu::UnicodeSet set;
    442  for (int i = 0; i < ranges->length(); i++) {
    443    set.add(ranges->at(i).from(), ranges->at(i).to());
    444  }
    445  // Clear the ranges list without freeing the backing store.
    446  ranges->Rewind(0);
    447  set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
    448  for (int i = 0; i < set.getRangeCount(); i++) {
    449    ranges->Add(Range(set.getRangeStart(i), set.getRangeEnd(i)), zone);
    450  }
    451  // No errors and everything we collected have been ranges.
    452  Canonicalize(ranges);
    453 #endif  // V8_INTL_SUPPORT
    454 }
    455 
    456 RegExpNode* RegExpClassRanges::ToNodeImpl(RegExpCompiler* compiler,
    457                                          RegExpNode* on_success) {
    458  set_.Canonicalize();
    459  Zone* const zone = compiler->zone();
    460  ZoneList<CharacterRange>* ranges = this->ranges(zone);
    461 
    462  const bool needs_case_folding =
    463      NeedsUnicodeCaseEquivalents(compiler->flags()) && !is_case_folded();
    464  if (needs_case_folding) {
    465    CharacterRange::AddUnicodeCaseEquivalents(ranges, zone);
    466  }
    467 
    468  if (!IsEitherUnicode(compiler->flags()) || compiler->one_byte() ||
    469      contains_split_surrogate()) {
    470    return zone->New<TextNode>(this, compiler->read_backward(), on_success);
    471  }
    472 
    473  if (is_negated()) {
    474    // With /v, character classes are never negated.
    475    // https://tc39.es/ecma262/#sec-compileatom
    476    // Atom :: CharacterClass
    477    //   4. Assert: cc.[[Invert]] is false.
    478    // Instead the complement is created when evaluating the class set.
    479    // The only exception is the "nothing range" (negated everything), which is
    480    // internally created for an empty set.
    481    DCHECK_IMPLIES(
    482        IsUnicodeSets(compiler->flags()),
    483        ranges->length() == 1 && ranges->first().IsEverything(kMaxCodePoint));
    484    ZoneList<CharacterRange>* negated =
    485        zone->New<ZoneList<CharacterRange>>(2, zone);
    486    CharacterRange::Negate(ranges, negated, zone);
    487    ranges = negated;
    488  }
    489 
    490  if (ranges->length() == 0) {
    491    // The empty character class is used as a 'fail' node.
    492    RegExpClassRanges* fail = zone->New<RegExpClassRanges>(zone, ranges);
    493    return zone->New<TextNode>(fail, compiler->read_backward(), on_success);
    494  }
    495 
    496  if (set_.is_standard() &&
    497      standard_type() == StandardCharacterSet::kEverything) {
    498    return UnanchoredAdvance(compiler, on_success);
    499  }
    500 
    501  // Split ranges in order to handle surrogates correctly:
    502  // - Surrogate pairs: translate the 32-bit code point into two uc16 code
    503  //   units (irregexp operates only on code units).
    504  // - Lone surrogates: these require lookarounds to ensure we don't match in
    505  //   the middle of a surrogate pair.
    506  ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
    507  UnicodeRangeSplitter splitter(ranges);
    508  AddBmpCharacters(compiler, result, on_success, &splitter);
    509  AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
    510  AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
    511  AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
    512 
    513  static constexpr int kMaxRangesToInline = 32;  // Arbitrary.
    514  if (ranges->length() > kMaxRangesToInline) result->SetDoNotInline();
    515 
    516  if (result->alternatives()->length() == 1) {
    517    return result->alternatives()->at(0).node();
    518  }
    519 
    520  return result;
    521 }
    522 
    523 RegExpNode* RegExpClassSetOperand::ToNodeImpl(RegExpCompiler* compiler,
    524                                              RegExpNode* on_success) {
    525  Zone* zone = compiler->zone();
    526  const int size = (has_strings() ? static_cast<int>(strings()->size()) : 0) +
    527                   (ranges()->is_empty() ? 0 : 1);
    528  if (size == 0) {
    529    // If neither ranges nor strings are present, the operand is equal to an
    530    // empty range (matching nothing).
    531    ZoneList<CharacterRange>* empty =
    532        zone->template New<ZoneList<CharacterRange>>(0, zone);
    533    return zone->template New<RegExpClassRanges>(zone, empty)
    534        ->ToNode(compiler, on_success);
    535  }
    536  ZoneList<RegExpTree*>* alternatives =
    537      zone->template New<ZoneList<RegExpTree*>>(size, zone);
    538  // Strings are sorted by length first (larger strings before shorter ones).
    539  // See the comment on CharacterClassStrings.
    540  // Empty strings (if present) are added after character ranges.
    541  RegExpTree* empty_string = nullptr;
    542  if (has_strings()) {
    543    for (auto string : *strings()) {
    544      if (string.second->IsEmpty()) {
    545        empty_string = string.second;
    546      } else {
    547        alternatives->Add(string.second, zone);
    548      }
    549    }
    550  }
    551  if (!ranges()->is_empty()) {
    552    // In unicode sets mode case folding has to be done at precise locations
    553    // (e.g. before building complements).
    554    // It is therefore the parsers responsibility to case fold (sub-) ranges
    555    // before creating ClassSetOperands.
    556    alternatives->Add(zone->template New<RegExpClassRanges>(
    557                          zone, ranges(), RegExpClassRanges::IS_CASE_FOLDED),
    558                      zone);
    559  }
    560  if (empty_string != nullptr) {
    561    alternatives->Add(empty_string, zone);
    562  }
    563 
    564  RegExpTree* node = nullptr;
    565  if (size == 1) {
    566    DCHECK_EQ(alternatives->length(), 1);
    567    node = alternatives->first();
    568  } else {
    569    node = zone->template New<RegExpDisjunction>(alternatives);
    570  }
    571  return node->ToNode(compiler, on_success);
    572 }
    573 
    574 RegExpNode* RegExpClassSetExpression::ToNodeImpl(RegExpCompiler* compiler,
    575                                                 RegExpNode* on_success) {
    576  Zone* zone = compiler->zone();
    577  ZoneList<CharacterRange>* temp_ranges =
    578      zone->template New<ZoneList<CharacterRange>>(4, zone);
    579  RegExpClassSetOperand* root = ComputeExpression(this, temp_ranges, zone);
    580  return root->ToNode(compiler, on_success);
    581 }
    582 
    583 void RegExpClassSetOperand::Union(RegExpClassSetOperand* other, Zone* zone) {
    584  ranges()->AddAll(*other->ranges(), zone);
    585  if (other->has_strings()) {
    586    if (strings_ == nullptr) {
    587      strings_ = zone->template New<CharacterClassStrings>(zone);
    588    }
    589    strings()->insert(other->strings()->begin(), other->strings()->end());
    590  }
    591 }
    592 
    593 void RegExpClassSetOperand::Intersect(RegExpClassSetOperand* other,
    594                                      ZoneList<CharacterRange>* temp_ranges,
    595                                      Zone* zone) {
    596  CharacterRange::Intersect(ranges(), other->ranges(), temp_ranges, zone);
    597  std::swap(*ranges(), *temp_ranges);
    598  temp_ranges->Rewind(0);
    599  if (has_strings()) {
    600    if (!other->has_strings()) {
    601      strings()->clear();
    602    } else {
    603      for (auto iter = strings()->begin(); iter != strings()->end();) {
    604        if (other->strings()->find(iter->first) == other->strings()->end()) {
    605          iter = strings()->erase(iter);
    606        } else {
    607          iter++;
    608        }
    609      }
    610    }
    611  }
    612 }
    613 
    614 void RegExpClassSetOperand::Subtract(RegExpClassSetOperand* other,
    615                                     ZoneList<CharacterRange>* temp_ranges,
    616                                     Zone* zone) {
    617  CharacterRange::Subtract(ranges(), other->ranges(), temp_ranges, zone);
    618  std::swap(*ranges(), *temp_ranges);
    619  temp_ranges->Rewind(0);
    620  if (has_strings() && other->has_strings()) {
    621    for (auto iter = strings()->begin(); iter != strings()->end();) {
    622      if (other->strings()->find(iter->first) != other->strings()->end()) {
    623        iter = strings()->erase(iter);
    624      } else {
    625        iter++;
    626      }
    627    }
    628  }
    629 }
    630 
    631 // static
    632 RegExpClassSetOperand* RegExpClassSetExpression::ComputeExpression(
    633    RegExpTree* root, ZoneList<CharacterRange>* temp_ranges, Zone* zone) {
    634  DCHECK(temp_ranges->is_empty());
    635  if (root->IsClassSetOperand()) {
    636    return root->AsClassSetOperand();
    637  }
    638  DCHECK(root->IsClassSetExpression());
    639  RegExpClassSetExpression* node = root->AsClassSetExpression();
    640  RegExpClassSetOperand* result =
    641      ComputeExpression(node->operands()->at(0), temp_ranges, zone);
    642  switch (node->operation()) {
    643    case OperationType::kUnion: {
    644      for (int i = 1; i < node->operands()->length(); i++) {
    645        RegExpClassSetOperand* op =
    646            ComputeExpression(node->operands()->at(i), temp_ranges, zone);
    647        result->Union(op, zone);
    648      }
    649      CharacterRange::Canonicalize(result->ranges());
    650      break;
    651    }
    652    case OperationType::kIntersection: {
    653      for (int i = 1; i < node->operands()->length(); i++) {
    654        RegExpClassSetOperand* op =
    655            ComputeExpression(node->operands()->at(i), temp_ranges, zone);
    656        result->Intersect(op, temp_ranges, zone);
    657      }
    658      break;
    659    }
    660    case OperationType::kSubtraction: {
    661      for (int i = 1; i < node->operands()->length(); i++) {
    662        RegExpClassSetOperand* op =
    663            ComputeExpression(node->operands()->at(i), temp_ranges, zone);
    664        result->Subtract(op, temp_ranges, zone);
    665      }
    666      break;
    667    }
    668  }
    669  if (node->is_negated()) {
    670    DCHECK(!result->has_strings());
    671    CharacterRange::Negate(result->ranges(), temp_ranges, zone);
    672    std::swap(*result->ranges(), *temp_ranges);
    673    temp_ranges->Rewind(0);
    674    node->is_negated_ = false;
    675  }
    676  // Store the result as single operand of the current node.
    677  node->operands()->Set(0, result);
    678  node->operands()->Rewind(1);
    679 
    680  return result;
    681 }
    682 
    683 namespace {
    684 
    685 bool StartsWithAtom(RegExpTree* tree) {
    686  if (tree->IsAtom()) return true;
    687  return tree->IsText() && tree->AsText()->StartsWithAtom();
    688 }
    689 
    690 RegExpAtom* FirstAtom(RegExpTree* tree) {
    691  if (tree->IsAtom()) return tree->AsAtom();
    692  return tree->AsText()->FirstAtom();
    693 }
    694 
    695 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
    696  RegExpAtom* atom1 = FirstAtom(*a);
    697  RegExpAtom* atom2 = FirstAtom(*b);
    698  base::uc16 character1 = atom1->data().at(0);
    699  base::uc16 character2 = atom2->data().at(0);
    700  if (character1 < character2) return -1;
    701  if (character1 > character2) return 1;
    702  return 0;
    703 }
    704 
    705 #ifdef V8_INTL_SUPPORT
    706 
    707 int CompareCaseInsensitive(const icu::UnicodeString& a,
    708                           const icu::UnicodeString& b) {
    709  return a.caseCompare(b, U_FOLD_CASE_DEFAULT);
    710 }
    711 
    712 int CompareFirstCharCaseInsensitive(RegExpTree* const* a,
    713                                    RegExpTree* const* b) {
    714  RegExpAtom* atom1 = FirstAtom(*a);
    715  RegExpAtom* atom2 = FirstAtom(*b);
    716  return CompareCaseInsensitive(icu::UnicodeString{atom1->data().at(0)},
    717                                icu::UnicodeString{atom2->data().at(0)});
    718 }
    719 
    720 bool Equals(bool ignore_case, const icu::UnicodeString& a,
    721            const icu::UnicodeString& b) {
    722  if (a == b) return true;
    723  if (ignore_case) return CompareCaseInsensitive(a, b) == 0;
    724  return false;  // Case-sensitive equality already checked above.
    725 }
    726 
    727 bool CharAtEquals(bool ignore_case, int index, const RegExpAtom* a,
    728                  const RegExpAtom* b) {
    729  return Equals(ignore_case, a->data().at(index), b->data().at(index));
    730 }
    731 
    732 #else
    733 
    734 unibrow::uchar Canonical(
    735    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    736    unibrow::uchar c) {
    737  unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
    738  int length = canonicalize->get(c, '\0', chars);
    739  DCHECK_LE(length, 1);
    740  unibrow::uchar canonical = c;
    741  if (length == 1) canonical = chars[0];
    742  return canonical;
    743 }
    744 
    745 int CompareCaseInsensitive(
    746    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    747    unibrow::uchar a, unibrow::uchar b) {
    748  if (a == b) return 0;
    749  if (a >= 'a' || b >= 'a') {
    750    a = Canonical(canonicalize, a);
    751    b = Canonical(canonicalize, b);
    752  }
    753  return static_cast<int>(a) - static_cast<int>(b);
    754 }
    755 
    756 int CompareFirstCharCaseInsensitive(
    757    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    758    RegExpTree* const* a, RegExpTree* const* b) {
    759  RegExpAtom* atom1 = FirstAtom(*a);
    760  RegExpAtom* atom2 = FirstAtom(*b);
    761  return CompareCaseInsensitive(canonicalize, atom1->data().at(0),
    762                                atom2->data().at(0));
    763 }
    764 
    765 bool Equals(bool ignore_case,
    766            unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    767            unibrow::uchar a, unibrow::uchar b) {
    768  if (a == b) return true;
    769  if (ignore_case) {
    770    return CompareCaseInsensitive(canonicalize, a, b) == 0;
    771  }
    772  return false;  // Case-sensitive equality already checked above.
    773 }
    774 
    775 bool CharAtEquals(bool ignore_case,
    776                  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    777                  int index, const RegExpAtom* a, const RegExpAtom* b) {
    778  return Equals(ignore_case, canonicalize, a->data().at(index),
    779                b->data().at(index));
    780 }
    781 
    782 #endif  // V8_INTL_SUPPORT
    783 
    784 }  // namespace
    785 
    786 // We can stable sort runs of atoms, since the order does not matter if they
    787 // start with different characters.
    788 // Returns true if any consecutive atoms were found.
    789 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
    790  ZoneList<RegExpTree*>* alternatives = this->alternatives();
    791  int length = alternatives->length();
    792  bool found_consecutive_atoms = false;
    793  for (int i = 0; i < length; i++) {
    794    while (i < length) {
    795      RegExpTree* alternative = alternatives->at(i);
    796      if (StartsWithAtom(alternative)) break;
    797      i++;
    798    }
    799    // i is length or it is the index of an atom.
    800    if (i == length) break;
    801    int first_atom = i;
    802    i++;
    803    while (i < length) {
    804      RegExpTree* alternative = alternatives->at(i);
    805      if (!StartsWithAtom(alternative)) break;
    806      i++;
    807    }
    808    // Sort atoms to get ones with common prefixes together.
    809    // This step is more tricky if we are in a case-independent regexp,
    810    // because it would change /is|I/ to /I|is/, and order matters when
    811    // the regexp parts don't match only disjoint starting points. To fix
    812    // this we have a version of CompareFirstChar that uses case-
    813    // independent character classes for comparison.
    814    DCHECK_LT(first_atom, alternatives->length());
    815    DCHECK_LE(i, alternatives->length());
    816    DCHECK_LE(first_atom, i);
    817    if (IsIgnoreCase(compiler->flags())) {
    818 #ifdef V8_INTL_SUPPORT
    819      alternatives->StableSort(CompareFirstCharCaseInsensitive, first_atom,
    820                               i - first_atom);
    821 #else
    822      unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
    823          compiler->isolate()->regexp_macro_assembler_canonicalize();
    824      auto compare_closure = [canonicalize](RegExpTree* const* a,
    825                                            RegExpTree* const* b) {
    826        return CompareFirstCharCaseInsensitive(canonicalize, a, b);
    827      };
    828      alternatives->StableSort(compare_closure, first_atom, i - first_atom);
    829 #endif  // V8_INTL_SUPPORT
    830    } else {
    831      alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
    832    }
    833    if (i - first_atom > 1) found_consecutive_atoms = true;
    834  }
    835  return found_consecutive_atoms;
    836 }
    837 
    838 // Optimizes ab|ac|az to a(?:b|c|d).
    839 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
    840  Zone* zone = compiler->zone();
    841  ZoneList<RegExpTree*>* alternatives = this->alternatives();
    842  int length = alternatives->length();
    843  const bool ignore_case = IsIgnoreCase(compiler->flags());
    844 
    845  int write_posn = 0;
    846  int i = 0;
    847  while (i < length) {
    848    RegExpTree* alternative = alternatives->at(i);
    849    if (!StartsWithAtom(alternative)) {
    850      alternatives->at(write_posn++) = alternatives->at(i);
    851      i++;
    852      continue;
    853    }
    854    RegExpAtom* const atom = FirstAtom(alternative);
    855 
    856 #ifdef V8_INTL_SUPPORT
    857    icu::UnicodeString common_prefix(atom->data().at(0));
    858 #else
    859    unibrow::Mapping<unibrow::Ecma262Canonicalize>* const canonicalize =
    860        compiler->isolate()->regexp_macro_assembler_canonicalize();
    861    unibrow::uchar common_prefix = atom->data().at(0);
    862    if (ignore_case) {
    863      common_prefix = Canonical(canonicalize, common_prefix);
    864    }
    865 #endif  // V8_INTL_SUPPORT
    866    int first_with_prefix = i;
    867    int prefix_length = atom->length();
    868    i++;
    869    while (i < length) {
    870      alternative = alternatives->at(i);
    871      if (!StartsWithAtom(alternative)) break;
    872      RegExpAtom* const alt_atom = FirstAtom(alternative);
    873 #ifdef V8_INTL_SUPPORT
    874      icu::UnicodeString new_prefix(alt_atom->data().at(0));
    875      if (!Equals(ignore_case, new_prefix, common_prefix)) break;
    876 #else
    877      unibrow::uchar new_prefix = alt_atom->data().at(0);
    878      if (!Equals(ignore_case, canonicalize, new_prefix, common_prefix)) break;
    879 #endif  // V8_INTL_SUPPORT
    880      prefix_length = std::min(prefix_length, alt_atom->length());
    881      i++;
    882    }
    883    if (i > first_with_prefix + 2) {
    884      // Found worthwhile run of alternatives with common prefix of at least one
    885      // character.  The sorting function above did not sort on more than one
    886      // character for reasons of correctness, but there may still be a longer
    887      // common prefix if the terms were similar or presorted in the input.
    888      // Find out how long the common prefix is.
    889      int run_length = i - first_with_prefix;
    890      RegExpAtom* const alt_atom =
    891          FirstAtom(alternatives->at(first_with_prefix));
    892      alternatives->at(first_with_prefix)->AsAtom();
    893      for (int j = 1; j < run_length && prefix_length > 1; j++) {
    894        RegExpAtom* old_atom =
    895            FirstAtom(alternatives->at(j + first_with_prefix));
    896        for (int k = 1; k < prefix_length; k++) {
    897 #ifdef V8_INTL_SUPPORT
    898          if (!CharAtEquals(ignore_case, k, alt_atom, old_atom)) {
    899 #else
    900          if (!CharAtEquals(ignore_case, canonicalize, k, alt_atom, old_atom)) {
    901 #endif  // V8_INTL_SUPPORT
    902            prefix_length = k;
    903            break;
    904          }
    905        }
    906      }
    907      RegExpAtom* prefix =
    908          zone->New<RegExpAtom>(alt_atom->data().SubVector(0, prefix_length));
    909      ZoneList<RegExpTree*>* pair = zone->New<ZoneList<RegExpTree*>>(2, zone);
    910      pair->Add(prefix, zone);
    911      ZoneList<RegExpTree*>* suffixes =
    912          zone->New<ZoneList<RegExpTree*>>(run_length, zone);
    913      for (int j = 0; j < run_length; j++) {
    914        if (alternatives->at(j + first_with_prefix)->IsAtom()) {
    915          RegExpAtom* old_atom =
    916              alternatives->at(j + first_with_prefix)->AsAtom();
    917          int len = old_atom->length();
    918          if (len == prefix_length) {
    919            suffixes->Add(zone->New<RegExpEmpty>(), zone);
    920          } else {
    921            RegExpTree* suffix = zone->New<RegExpAtom>(
    922                old_atom->data().SubVector(prefix_length, len));
    923            suffixes->Add(suffix, zone);
    924          }
    925        } else {
    926          RegExpText* new_text = zone->New<RegExpText>(zone);
    927          RegExpText* old_text =
    928              alternatives->at(j + first_with_prefix)->AsText();
    929          RegExpAtom* old_atom = old_text->FirstAtom();
    930          int len = old_atom->length();
    931          if (len != prefix_length) {
    932            RegExpAtom* suffix = zone->New<RegExpAtom>(
    933                old_atom->data().SubVector(prefix_length, len));
    934            new_text->AddElement(TextElement::Atom(suffix), zone);
    935          }
    936          for (int k = 1; k < old_text->elements()->length(); k++) {
    937            new_text->AddElement(old_text->elements()->at(k), zone);
    938          }
    939          if (new_text->elements()->length() != 0) {
    940            suffixes->Add(new_text, zone);
    941          } else {
    942            suffixes->Add(zone->New<RegExpEmpty>(), zone);
    943          }
    944        }
    945      }
    946      pair->Add(zone->New<RegExpDisjunction>(suffixes), zone);
    947      alternatives->at(write_posn++) = zone->New<RegExpAlternative>(pair);
    948    } else {
    949      // Just copy any non-worthwhile alternatives.
    950      for (int j = first_with_prefix; j < i; j++) {
    951        alternatives->at(write_posn++) = alternatives->at(j);
    952      }
    953    }
    954  }
    955  alternatives->Rewind(write_posn);  // Trim end of array.
    956 }
    957 
    958 // Optimizes b|c|z to [bcz].
    959 void RegExpDisjunction::FixSingleCharacterDisjunctions(
    960    RegExpCompiler* compiler) {
    961  Zone* zone = compiler->zone();
    962  ZoneList<RegExpTree*>* alternatives = this->alternatives();
    963  int length = alternatives->length();
    964 
    965  int write_posn = 0;
    966  int i = 0;
    967  while (i < length) {
    968    RegExpTree* alternative = alternatives->at(i);
    969    if (!alternative->IsAtom()) {
    970      alternatives->at(write_posn++) = alternatives->at(i);
    971      i++;
    972      continue;
    973    }
    974    RegExpAtom* const atom = alternative->AsAtom();
    975    if (atom->length() != 1) {
    976      alternatives->at(write_posn++) = alternatives->at(i);
    977      i++;
    978      continue;
    979    }
    980    const RegExpFlags flags = compiler->flags();
    981    DCHECK_IMPLIES(IsEitherUnicode(flags),
    982                   !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
    983    bool contains_trail_surrogate =
    984        unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
    985    int first_in_run = i;
    986    i++;
    987    // Find a run of single-character atom alternatives that have identical
    988    // flags (case independence and unicode-ness).
    989    while (i < length) {
    990      alternative = alternatives->at(i);
    991      if (!alternative->IsAtom()) break;
    992      RegExpAtom* const alt_atom = alternative->AsAtom();
    993      if (alt_atom->length() != 1) break;
    994      DCHECK_IMPLIES(IsEitherUnicode(flags),
    995                     !unibrow::Utf16::IsLeadSurrogate(alt_atom->data().at(0)));
    996      contains_trail_surrogate |=
    997          unibrow::Utf16::IsTrailSurrogate(alt_atom->data().at(0));
    998      i++;
    999    }
   1000    if (i > first_in_run + 1) {
   1001      // Found non-trivial run of single-character alternatives.
   1002      int run_length = i - first_in_run;
   1003      ZoneList<CharacterRange>* ranges =
   1004          zone->New<ZoneList<CharacterRange>>(2, zone);
   1005      for (int j = 0; j < run_length; j++) {
   1006        RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
   1007        DCHECK_EQ(old_atom->length(), 1);
   1008        ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
   1009      }
   1010      RegExpClassRanges::ClassRangesFlags class_ranges_flags;
   1011      if (IsEitherUnicode(flags) && contains_trail_surrogate) {
   1012        class_ranges_flags = RegExpClassRanges::CONTAINS_SPLIT_SURROGATE;
   1013      }
   1014      alternatives->at(write_posn++) =
   1015          zone->New<RegExpClassRanges>(zone, ranges, class_ranges_flags);
   1016    } else {
   1017      // Just copy any trivial alternatives.
   1018      for (int j = first_in_run; j < i; j++) {
   1019        alternatives->at(write_posn++) = alternatives->at(j);
   1020      }
   1021    }
   1022  }
   1023  alternatives->Rewind(write_posn);  // Trim end of array.
   1024 }
   1025 
   1026 RegExpNode* RegExpDisjunction::ToNodeImpl(RegExpCompiler* compiler,
   1027                                          RegExpNode* on_success) {
   1028  ZoneList<RegExpTree*>* alternatives = this->alternatives();
   1029 
   1030  if (alternatives->length() > 2) {
   1031    bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
   1032    if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
   1033    FixSingleCharacterDisjunctions(compiler);
   1034    if (alternatives->length() == 1) {
   1035      return alternatives->at(0)->ToNode(compiler, on_success);
   1036    }
   1037  }
   1038 
   1039  int length = alternatives->length();
   1040 
   1041  ChoiceNode* result =
   1042      compiler->zone()->New<ChoiceNode>(length, compiler->zone());
   1043  for (int i = 0; i < length; i++) {
   1044    GuardedAlternative alternative(
   1045        alternatives->at(i)->ToNode(compiler, on_success));
   1046    result->AddAlternative(alternative);
   1047  }
   1048  return result;
   1049 }
   1050 
   1051 RegExpNode* RegExpQuantifier::ToNodeImpl(RegExpCompiler* compiler,
   1052                                         RegExpNode* on_success) {
   1053  return ToNode(min(), max(), is_greedy(), body(), compiler, on_success);
   1054 }
   1055 
   1056 namespace {
   1057 // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
   1058 //         \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
   1059 RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
   1060                                          RegExpNode* on_success,
   1061                                          RegExpAssertion::Type type) {
   1062  CHECK(NeedsUnicodeCaseEquivalents(compiler->flags()));
   1063  Zone* zone = compiler->zone();
   1064  ZoneList<CharacterRange>* word_range =
   1065      zone->New<ZoneList<CharacterRange>>(2, zone);
   1066  CharacterRange::AddClassEscape(StandardCharacterSet::kWord, word_range, true,
   1067                                 zone);
   1068  int stack_register = compiler->UnicodeLookaroundStackRegister();
   1069  int position_register = compiler->UnicodeLookaroundPositionRegister();
   1070  ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
   1071  // Add two choices. The (non-)boundary could start with a word or
   1072  // a non-word-character.
   1073  for (int i = 0; i < 2; i++) {
   1074    bool lookbehind_for_word = i == 0;
   1075    bool lookahead_for_word =
   1076        (type == RegExpAssertion::Type::BOUNDARY) ^ lookbehind_for_word;
   1077    // Look to the left.
   1078    RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
   1079                                         stack_register, position_register);
   1080    RegExpNode* backward = TextNode::CreateForCharacterRanges(
   1081        zone, word_range, true, lookbehind.on_match_success());
   1082    // Look to the right.
   1083    RegExpLookaround::Builder lookahead(lookahead_for_word,
   1084                                        lookbehind.ForMatch(backward),
   1085                                        stack_register, position_register);
   1086    RegExpNode* forward = TextNode::CreateForCharacterRanges(
   1087        zone, word_range, false, lookahead.on_match_success());
   1088    result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
   1089  }
   1090  return result;
   1091 }
   1092 }  // anonymous namespace
   1093 
   1094 RegExpNode* RegExpAssertion::ToNodeImpl(RegExpCompiler* compiler,
   1095                                        RegExpNode* on_success) {
   1096  NodeInfo info;
   1097  Zone* zone = compiler->zone();
   1098 
   1099  switch (assertion_type()) {
   1100    case Type::START_OF_LINE:
   1101      return AssertionNode::AfterNewline(on_success);
   1102    case Type::START_OF_INPUT:
   1103      return AssertionNode::AtStart(on_success);
   1104    case Type::BOUNDARY:
   1105      return NeedsUnicodeCaseEquivalents(compiler->flags())
   1106                 ? BoundaryAssertionAsLookaround(compiler, on_success,
   1107                                                 Type::BOUNDARY)
   1108                 : AssertionNode::AtBoundary(on_success);
   1109    case Type::NON_BOUNDARY:
   1110      return NeedsUnicodeCaseEquivalents(compiler->flags())
   1111                 ? BoundaryAssertionAsLookaround(compiler, on_success,
   1112                                                 Type::NON_BOUNDARY)
   1113                 : AssertionNode::AtNonBoundary(on_success);
   1114    case Type::END_OF_INPUT:
   1115      return AssertionNode::AtEnd(on_success);
   1116    case Type::END_OF_LINE: {
   1117      // Compile $ in multiline regexps as an alternation with a positive
   1118      // lookahead in one side and an end-of-input on the other side.
   1119      // We need two registers for the lookahead.
   1120      int stack_pointer_register = compiler->AllocateRegister();
   1121      int position_register = compiler->AllocateRegister();
   1122      // The ChoiceNode to distinguish between a newline and end-of-input.
   1123      ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
   1124      // Create a newline atom.
   1125      ZoneList<CharacterRange>* newline_ranges =
   1126          zone->New<ZoneList<CharacterRange>>(3, zone);
   1127      CharacterRange::AddClassEscape(StandardCharacterSet::kLineTerminator,
   1128                                     newline_ranges, false, zone);
   1129      RegExpClassRanges* newline_atom =
   1130          zone->New<RegExpClassRanges>(StandardCharacterSet::kLineTerminator);
   1131      ActionNode* submatch_success = ActionNode::PositiveSubmatchSuccess(
   1132          stack_pointer_register, position_register,
   1133          0,   // No captures inside.
   1134          -1,  // Ignored if no captures.
   1135          on_success);
   1136      TextNode* newline_matcher =
   1137          zone->New<TextNode>(newline_atom, false, submatch_success);
   1138      // Create an end-of-input matcher.
   1139      RegExpNode* end_of_line = ActionNode::BeginPositiveSubmatch(
   1140          stack_pointer_register, position_register, newline_matcher,
   1141          submatch_success);
   1142      // Add the two alternatives to the ChoiceNode.
   1143      GuardedAlternative eol_alternative(end_of_line);
   1144      result->AddAlternative(eol_alternative);
   1145      GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
   1146      result->AddAlternative(end_alternative);
   1147      return result;
   1148    }
   1149    default:
   1150      UNREACHABLE();
   1151  }
   1152 }
   1153 
   1154 RegExpNode* RegExpBackReference::ToNodeImpl(RegExpCompiler* compiler,
   1155                                            RegExpNode* on_success) {
   1156  RegExpNode* backref_node = on_success;
   1157  // Only one of the captures in the list can actually match. Since
   1158  // back-references to unmatched captures are treated as empty, we can simply
   1159  // create back-references to all possible captures.
   1160  for (auto capture : *captures()) {
   1161    backref_node = compiler->zone()->New<BackReferenceNode>(
   1162        RegExpCapture::StartRegister(capture->index()),
   1163        RegExpCapture::EndRegister(capture->index()), compiler->read_backward(),
   1164        backref_node);
   1165  }
   1166  return backref_node;
   1167 }
   1168 
   1169 RegExpNode* RegExpEmpty::ToNodeImpl(RegExpCompiler* compiler,
   1170                                    RegExpNode* on_success) {
   1171  return on_success;
   1172 }
   1173 
   1174 namespace {
   1175 
   1176 class V8_NODISCARD ModifiersScope {
   1177 public:
   1178  ModifiersScope(RegExpCompiler* compiler, RegExpFlags flags)
   1179      : compiler_(compiler), previous_flags_(compiler->flags()) {
   1180    compiler->set_flags(flags);
   1181  }
   1182  ~ModifiersScope() { compiler_->set_flags(previous_flags_); }
   1183 
   1184 private:
   1185  RegExpCompiler* compiler_;
   1186  const RegExpFlags previous_flags_;
   1187 };
   1188 
   1189 }  // namespace
   1190 
   1191 RegExpNode* RegExpGroup::ToNodeImpl(RegExpCompiler* compiler,
   1192                                    RegExpNode* on_success) {
   1193  // If no flags are modified, simply convert and return the body.
   1194  if (flags() == compiler->flags()) {
   1195    return body_->ToNode(compiler, on_success);
   1196  }
   1197  // Reset flags for successor node.
   1198  const RegExpFlags old_flags = compiler->flags();
   1199  on_success = ActionNode::ModifyFlags(old_flags, on_success);
   1200 
   1201  // Convert body using modifier.
   1202  ModifiersScope modifiers_scope(compiler, flags());
   1203  RegExpNode* body = body_->ToNode(compiler, on_success);
   1204 
   1205  // Wrap body into modifier node.
   1206  RegExpNode* modified_body = ActionNode::ModifyFlags(flags(), body);
   1207  return modified_body;
   1208 }
   1209 
   1210 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
   1211                                   int stack_pointer_register,
   1212                                   int position_register,
   1213                                   int capture_register_count,
   1214                                   int capture_register_start)
   1215    : is_positive_(is_positive),
   1216      on_success_(on_success),
   1217      stack_pointer_register_(stack_pointer_register),
   1218      position_register_(position_register) {
   1219  if (is_positive_) {
   1220    on_match_success_ = ActionNode::PositiveSubmatchSuccess(
   1221        stack_pointer_register, position_register, capture_register_count,
   1222        capture_register_start, on_success_);
   1223  } else {
   1224    Zone* zone = on_success_->zone();
   1225    on_match_success_ = zone->New<NegativeSubmatchSuccess>(
   1226        stack_pointer_register, position_register, capture_register_count,
   1227        capture_register_start, zone);
   1228  }
   1229 }
   1230 
   1231 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
   1232  if (is_positive_) {
   1233    ActionNode* on_match_success = on_match_success_->AsActionNode();
   1234    return ActionNode::BeginPositiveSubmatch(
   1235        stack_pointer_register_, position_register_, match, on_match_success);
   1236  } else {
   1237    Zone* zone = on_success_->zone();
   1238    // We use a ChoiceNode to represent the negative lookaround. The first
   1239    // alternative is the negative match. On success, the end node backtracks.
   1240    // On failure, the second alternative is tried and leads to success.
   1241    // NegativeLookaroundChoiceNode is a special ChoiceNode that ignores the
   1242    // first exit when calculating quick checks.
   1243    ChoiceNode* choice_node = zone->New<NegativeLookaroundChoiceNode>(
   1244        GuardedAlternative(match), GuardedAlternative(on_success_), zone);
   1245    return ActionNode::BeginNegativeSubmatch(stack_pointer_register_,
   1246                                             position_register_, choice_node);
   1247  }
   1248 }
   1249 
   1250 RegExpNode* RegExpLookaround::ToNodeImpl(RegExpCompiler* compiler,
   1251                                         RegExpNode* on_success) {
   1252  int stack_pointer_register = compiler->AllocateRegister();
   1253  int position_register = compiler->AllocateRegister();
   1254 
   1255  const int registers_per_capture = 2;
   1256  const int register_of_first_capture = 2;
   1257  int register_count = capture_count_ * registers_per_capture;
   1258  int register_start =
   1259      register_of_first_capture + capture_from_ * registers_per_capture;
   1260 
   1261  RegExpNode* result;
   1262  bool was_reading_backward = compiler->read_backward();
   1263  compiler->set_read_backward(type() == LOOKBEHIND);
   1264  Builder builder(is_positive(), on_success, stack_pointer_register,
   1265                  position_register, register_count, register_start);
   1266  RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
   1267  result = builder.ForMatch(match);
   1268  compiler->set_read_backward(was_reading_backward);
   1269  return result;
   1270 }
   1271 
   1272 RegExpNode* RegExpCapture::ToNodeImpl(RegExpCompiler* compiler,
   1273                                      RegExpNode* on_success) {
   1274  return ToNode(body(), index(), compiler, on_success);
   1275 }
   1276 
   1277 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index,
   1278                                  RegExpCompiler* compiler,
   1279                                  RegExpNode* on_success) {
   1280  DCHECK_NOT_NULL(body);
   1281  int start_reg = RegExpCapture::StartRegister(index);
   1282  int end_reg = RegExpCapture::EndRegister(index);
   1283  if (compiler->read_backward()) std::swap(start_reg, end_reg);
   1284  RegExpNode* store_end = ActionNode::ClearPosition(end_reg, on_success);
   1285  RegExpNode* body_node = body->ToNode(compiler, store_end);
   1286  return ActionNode::ClearPosition(start_reg, body_node);
   1287 }
   1288 
   1289 namespace {
   1290 
   1291 class AssertionSequenceRewriter final {
   1292 public:
   1293  // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass
   1294  // instead of sprinkling rewrites into the AST->Node conversion process.
   1295  static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) {
   1296    AssertionSequenceRewriter rewriter(terms, zone);
   1297 
   1298    static constexpr int kNoIndex = -1;
   1299    int from = kNoIndex;
   1300 
   1301    for (int i = 0; i < terms->length(); i++) {
   1302      RegExpTree* t = terms->at(i);
   1303      if (from == kNoIndex && t->IsAssertion()) {
   1304        from = i;  // Start a sequence.
   1305      } else if (from != kNoIndex && !t->IsAssertion()) {
   1306        // Terminate and process the sequence.
   1307        if (i - from > 1) rewriter.Rewrite(from, i);
   1308        from = kNoIndex;
   1309      }
   1310    }
   1311 
   1312    if (from != kNoIndex && terms->length() - from > 1) {
   1313      rewriter.Rewrite(from, terms->length());
   1314    }
   1315  }
   1316 
   1317  // All assertions are zero width. A consecutive sequence of assertions is
   1318  // order-independent. There's two ways we can optimize here:
   1319  // 1. fold all identical assertions.
   1320  // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire
   1321  //    sequence fails.
   1322  void Rewrite(int from, int to) {
   1323    DCHECK_GT(to, from + 1);
   1324 
   1325    // Bitfield of all seen assertions.
   1326    uint32_t seen_assertions = 0;
   1327    static_assert(static_cast<int>(RegExpAssertion::Type::LAST_ASSERTION_TYPE) <
   1328                  kUInt32Size * kBitsPerByte);
   1329 
   1330    for (int i = from; i < to; i++) {
   1331      RegExpAssertion* t = terms_->at(i)->AsAssertion();
   1332      const uint32_t bit = 1 << static_cast<int>(t->assertion_type());
   1333 
   1334      if (seen_assertions & bit) {
   1335        // Fold duplicates.
   1336        terms_->Set(i, zone_->New<RegExpEmpty>());
   1337      }
   1338 
   1339      seen_assertions |= bit;
   1340    }
   1341 
   1342    // Collapse failures.
   1343    const uint32_t always_fails_mask =
   1344        1 << static_cast<int>(RegExpAssertion::Type::BOUNDARY) |
   1345        1 << static_cast<int>(RegExpAssertion::Type::NON_BOUNDARY);
   1346    if ((seen_assertions & always_fails_mask) == always_fails_mask) {
   1347      ReplaceSequenceWithFailure(from, to);
   1348    }
   1349  }
   1350 
   1351  void ReplaceSequenceWithFailure(int from, int to) {
   1352    // Replace the entire sequence with a single node that always fails.
   1353    // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the
   1354    // negated '*' (everything) range serves the purpose.
   1355    ZoneList<CharacterRange>* ranges =
   1356        zone_->New<ZoneList<CharacterRange>>(0, zone_);
   1357    RegExpClassRanges* cc = zone_->New<RegExpClassRanges>(zone_, ranges);
   1358    terms_->Set(from, cc);
   1359 
   1360    // Zero out the rest.
   1361    RegExpEmpty* empty = zone_->New<RegExpEmpty>();
   1362    for (int i = from + 1; i < to; i++) terms_->Set(i, empty);
   1363  }
   1364 
   1365 private:
   1366  AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone)
   1367      : zone_(zone), terms_(terms) {}
   1368 
   1369  Zone* zone_;
   1370  ZoneList<RegExpTree*>* terms_;
   1371 };
   1372 
   1373 }  // namespace
   1374 
   1375 RegExpNode* RegExpAlternative::ToNodeImpl(RegExpCompiler* compiler,
   1376                                          RegExpNode* on_success) {
   1377  ZoneList<RegExpTree*>* children = nodes();
   1378 
   1379  AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone());
   1380 
   1381  RegExpNode* current = on_success;
   1382  if (compiler->read_backward()) {
   1383    for (int i = 0; i < children->length(); i++) {
   1384      current = children->at(i)->ToNode(compiler, current);
   1385    }
   1386  } else {
   1387    for (int i = children->length() - 1; i >= 0; i--) {
   1388      current = children->at(i)->ToNode(compiler, current);
   1389    }
   1390  }
   1391  return current;
   1392 }
   1393 
   1394 namespace {
   1395 
   1396 void AddClass(const int* elmv, int elmc, ZoneList<CharacterRange>* ranges,
   1397              Zone* zone) {
   1398  elmc--;
   1399  DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
   1400  for (int i = 0; i < elmc; i += 2) {
   1401    DCHECK(elmv[i] < elmv[i + 1]);
   1402    ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
   1403  }
   1404 }
   1405 
   1406 void AddClassNegated(const int* elmv, int elmc,
   1407                     ZoneList<CharacterRange>* ranges, Zone* zone) {
   1408  elmc--;
   1409  DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
   1410  DCHECK_NE(0x0000, elmv[0]);
   1411  DCHECK_NE(kMaxCodePoint, elmv[elmc - 1]);
   1412  base::uc16 last = 0x0000;
   1413  for (int i = 0; i < elmc; i += 2) {
   1414    DCHECK(last <= elmv[i] - 1);
   1415    DCHECK(elmv[i] < elmv[i + 1]);
   1416    ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
   1417    last = elmv[i + 1];
   1418  }
   1419  ranges->Add(CharacterRange::Range(last, kMaxCodePoint), zone);
   1420 }
   1421 
   1422 }  // namespace
   1423 
   1424 void CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set,
   1425                                    ZoneList<CharacterRange>* ranges,
   1426                                    bool add_unicode_case_equivalents,
   1427                                    Zone* zone) {
   1428  if (add_unicode_case_equivalents &&
   1429      (standard_character_set == StandardCharacterSet::kWord ||
   1430       standard_character_set == StandardCharacterSet::kNotWord)) {
   1431    // See #sec-runtime-semantics-wordcharacters-abstract-operation
   1432    // In case of unicode and ignore_case, we need to create the closure over
   1433    // case equivalent characters before negating.
   1434    ZoneList<CharacterRange>* new_ranges =
   1435        zone->New<ZoneList<CharacterRange>>(2, zone);
   1436    AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
   1437    AddUnicodeCaseEquivalents(new_ranges, zone);
   1438    if (standard_character_set == StandardCharacterSet::kNotWord) {
   1439      ZoneList<CharacterRange>* negated =
   1440          zone->New<ZoneList<CharacterRange>>(2, zone);
   1441      CharacterRange::Negate(new_ranges, negated, zone);
   1442      new_ranges = negated;
   1443    }
   1444    ranges->AddAll(*new_ranges, zone);
   1445    return;
   1446  }
   1447 
   1448  switch (standard_character_set) {
   1449    case StandardCharacterSet::kWhitespace:
   1450      AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
   1451      break;
   1452    case StandardCharacterSet::kNotWhitespace:
   1453      AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
   1454      break;
   1455    case StandardCharacterSet::kWord:
   1456      AddClass(kWordRanges, kWordRangeCount, ranges, zone);
   1457      break;
   1458    case StandardCharacterSet::kNotWord:
   1459      AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
   1460      break;
   1461    case StandardCharacterSet::kDigit:
   1462      AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
   1463      break;
   1464    case StandardCharacterSet::kNotDigit:
   1465      AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
   1466      break;
   1467    // This is the set of characters matched by the $ and ^ symbols
   1468    // in multiline mode.
   1469    case StandardCharacterSet::kLineTerminator:
   1470      AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone);
   1471      break;
   1472    case StandardCharacterSet::kNotLineTerminator:
   1473      AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges,
   1474                      zone);
   1475      break;
   1476    // This is not a character range as defined by the spec but a
   1477    // convenient shorthand for a character class that matches any
   1478    // character.
   1479    case StandardCharacterSet::kEverything:
   1480      ranges->Add(CharacterRange::Everything(), zone);
   1481      break;
   1482  }
   1483 }
   1484 
   1485 // static
   1486 // Only for /i, not for /ui or /vi.
   1487 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
   1488                                        ZoneList<CharacterRange>* ranges,
   1489                                        bool is_one_byte) {
   1490  CharacterRange::Canonicalize(ranges);
   1491  int range_count = ranges->length();
   1492 #ifdef V8_INTL_SUPPORT
   1493  icu::UnicodeSet others;
   1494  for (int i = 0; i < range_count; i++) {
   1495    CharacterRange range = ranges->at(i);
   1496    base::uc32 from = range.from();
   1497    if (from > kMaxUtf16CodeUnit) continue;
   1498    base::uc32 to = std::min({range.to(), kMaxUtf16CodeUnitU});
   1499    // Nothing to be done for surrogates.
   1500    if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
   1501    if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
   1502      if (from > String::kMaxOneByteCharCode) continue;
   1503      if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode;
   1504    }
   1505    others.add(from, to);
   1506  }
   1507 
   1508  // Compute the set of additional characters that should be added,
   1509  // using UnicodeSet::closeOver. ECMA 262 defines slightly different
   1510  // case-folding rules than Unicode, so some characters that are
   1511  // added by closeOver do not match anything other than themselves in
   1512  // JS. For example, 'Å¿' (U+017F LATIN SMALL LETTER LONG S) is the
   1513  // same case-insensitive character as 's' or 'S' according to
   1514  // Unicode, but does not match any other character in JS. To handle
   1515  // this case, we add such characters to the IgnoreSet and filter
   1516  // them out. We filter twice: once before calling closeOver (to
   1517  // prevent 'Å¿' from adding 's'), and once after calling closeOver
   1518  // (to prevent 's' from adding 'Å¿'). See regexp/special-case.h for
   1519  // more information.
   1520  icu::UnicodeSet already_added(others);
   1521  others.removeAll(RegExpCaseFolding::IgnoreSet());
   1522  others.closeOver(USET_CASE_INSENSITIVE);
   1523  others.removeAll(RegExpCaseFolding::IgnoreSet());
   1524  others.removeAll(already_added);
   1525 
   1526  // Add others to the ranges
   1527  for (int32_t i = 0; i < others.getRangeCount(); i++) {
   1528    UChar32 from = others.getRangeStart(i);
   1529    UChar32 to = others.getRangeEnd(i);
   1530    if (from == to) {
   1531      ranges->Add(CharacterRange::Singleton(from), zone);
   1532    } else {
   1533      ranges->Add(CharacterRange::Range(from, to), zone);
   1534    }
   1535  }
   1536 #else
   1537  for (int i = 0; i < range_count; i++) {
   1538    CharacterRange range = ranges->at(i);
   1539    base::uc32 bottom = range.from();
   1540    if (bottom > kMaxUtf16CodeUnit) continue;
   1541    base::uc32 top = std::min({range.to(), kMaxUtf16CodeUnitU});
   1542    // Nothing to be done for surrogates.
   1543    if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
   1544    if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
   1545      if (bottom > String::kMaxOneByteCharCode) continue;
   1546      if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
   1547    }
   1548    unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1549    if (top == bottom) {
   1550      // If this is a singleton we just expand the one character.
   1551      int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
   1552      for (int j = 0; j < length; j++) {
   1553        base::uc32 chr = chars[j];
   1554        if (chr != bottom) {
   1555          ranges->Add(CharacterRange::Singleton(chars[j]), zone);
   1556        }
   1557      }
   1558    } else {
   1559      // If this is a range we expand the characters block by block, expanding
   1560      // contiguous subranges (blocks) one at a time.  The approach is as
   1561      // follows.  For a given start character we look up the remainder of the
   1562      // block that contains it (represented by the end point), for instance we
   1563      // find 'z' if the character is 'c'.  A block is characterized by the
   1564      // property that all characters uncanonicalize in the same way, except
   1565      // that each entry in the result is incremented by the distance from the
   1566      // first element.  So a-z is a block because 'a' uncanonicalizes to ['a',
   1567      // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].  Once
   1568      // we've found the end point we look up its uncanonicalization and
   1569      // produce a range for each element.  For instance for [c-f] we look up
   1570      // ['z', 'Z'] and produce [c-f] and [C-F].  We then only add a range if
   1571      // it is not already contained in the input, so [c-f] will be skipped but
   1572      // [C-F] will be added.  If this range is not completely contained in a
   1573      // block we do this for all the blocks covered by the range (handling
   1574      // characters that is not in a block as a "singleton block").
   1575      unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1576      base::uc32 pos = bottom;
   1577      while (pos <= top) {
   1578        int length =
   1579            isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
   1580        base::uc32 block_end;
   1581        if (length == 0) {
   1582          block_end = pos;
   1583        } else {
   1584          DCHECK_EQ(1, length);
   1585          block_end = equivalents[0];
   1586        }
   1587        int end = (block_end > top) ? top : block_end;
   1588        length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
   1589                                                         equivalents);
   1590        for (int j = 0; j < length; j++) {
   1591          base::uc32 c = equivalents[j];
   1592          base::uc32 range_from = c - (block_end - pos);
   1593          base::uc32 range_to = c - (block_end - end);
   1594          if (!(bottom <= range_from && range_to <= top)) {
   1595            ranges->Add(CharacterRange::Range(range_from, range_to), zone);
   1596          }
   1597        }
   1598        pos = end + 1;
   1599      }
   1600    }
   1601  }
   1602 #endif  // V8_INTL_SUPPORT
   1603 }
   1604 
   1605 bool CharacterRange::IsCanonical(const ZoneList<CharacterRange>* ranges) {
   1606  DCHECK_NOT_NULL(ranges);
   1607  int n = ranges->length();
   1608  if (n <= 1) return true;
   1609  base::uc32 max = ranges->at(0).to();
   1610  for (int i = 1; i < n; i++) {
   1611    CharacterRange next_range = ranges->at(i);
   1612    if (next_range.from() <= max + 1) return false;
   1613    max = next_range.to();
   1614  }
   1615  return true;
   1616 }
   1617 
   1618 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
   1619  if (ranges_ == nullptr) {
   1620    ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone);
   1621    CharacterRange::AddClassEscape(standard_set_type_.value(), ranges_, false,
   1622                                   zone);
   1623  }
   1624  return ranges_;
   1625 }
   1626 
   1627 namespace {
   1628 
   1629 // Move a number of elements in a zonelist to another position
   1630 // in the same list. Handles overlapping source and target areas.
   1631 void MoveRanges(ZoneList<CharacterRange>* list, int from, int to, int count) {
   1632  // Ranges are potentially overlapping.
   1633  if (from < to) {
   1634    for (int i = count - 1; i >= 0; i--) {
   1635      list->at(to + i) = list->at(from + i);
   1636    }
   1637  } else {
   1638    for (int i = 0; i < count; i++) {
   1639      list->at(to + i) = list->at(from + i);
   1640    }
   1641  }
   1642 }
   1643 
   1644 int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count,
   1645                               CharacterRange insert) {
   1646  // Inserts a range into list[0..count[, which must be sorted
   1647  // by from value and non-overlapping and non-adjacent, using at most
   1648  // list[0..count] for the result. Returns the number of resulting
   1649  // canonicalized ranges. Inserting a range may collapse existing ranges into
   1650  // fewer ranges, so the return value can be anything in the range 1..count+1.
   1651  base::uc32 from = insert.from();
   1652  base::uc32 to = insert.to();
   1653  int start_pos = 0;
   1654  int end_pos = count;
   1655  for (int i = count - 1; i >= 0; i--) {
   1656    CharacterRange current = list->at(i);
   1657    if (current.from() > to + 1) {
   1658      end_pos = i;
   1659    } else if (current.to() + 1 < from) {
   1660      start_pos = i + 1;
   1661      break;
   1662    }
   1663  }
   1664 
   1665  // Inserted range overlaps, or is adjacent to, ranges at positions
   1666  // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
   1667  // not affected by the insertion.
   1668  // If start_pos == end_pos, the range must be inserted before start_pos.
   1669  // if start_pos < end_pos, the entire range from start_pos to end_pos
   1670  // must be merged with the insert range.
   1671 
   1672  if (start_pos == end_pos) {
   1673    // Insert between existing ranges at position start_pos.
   1674    if (start_pos < count) {
   1675      MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
   1676    }
   1677    list->at(start_pos) = insert;
   1678    return count + 1;
   1679  }
   1680  if (start_pos + 1 == end_pos) {
   1681    // Replace single existing range at position start_pos.
   1682    CharacterRange to_replace = list->at(start_pos);
   1683    int new_from = std::min(to_replace.from(), from);
   1684    int new_to = std::max(to_replace.to(), to);
   1685    list->at(start_pos) = CharacterRange::Range(new_from, new_to);
   1686    return count;
   1687  }
   1688  // Replace a number of existing ranges from start_pos to end_pos - 1.
   1689  // Move the remaining ranges down.
   1690 
   1691  int new_from = std::min(list->at(start_pos).from(), from);
   1692  int new_to = std::max(list->at(end_pos - 1).to(), to);
   1693  if (end_pos < count) {
   1694    MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
   1695  }
   1696  list->at(start_pos) = CharacterRange::Range(new_from, new_to);
   1697  return count - (end_pos - start_pos) + 1;
   1698 }
   1699 
   1700 }  // namespace
   1701 
   1702 void CharacterSet::Canonicalize() {
   1703  // Special/default classes are always considered canonical. The result
   1704  // of calling ranges() will be sorted.
   1705  if (ranges_ == nullptr) return;
   1706  CharacterRange::Canonicalize(ranges_);
   1707 }
   1708 
   1709 // static
   1710 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
   1711  if (character_ranges->length() <= 1) return;
   1712  // Check whether ranges are already canonical (increasing, non-overlapping,
   1713  // non-adjacent).
   1714  int n = character_ranges->length();
   1715  base::uc32 max = character_ranges->at(0).to();
   1716  int i = 1;
   1717  while (i < n) {
   1718    CharacterRange current = character_ranges->at(i);
   1719    if (current.from() <= max + 1) {
   1720      break;
   1721    }
   1722    max = current.to();
   1723    i++;
   1724  }
   1725  // Canonical until the i'th range. If that's all of them, we are done.
   1726  if (i == n) return;
   1727 
   1728  // The ranges at index i and forward are not canonicalized. Make them so by
   1729  // doing the equivalent of insertion sort (inserting each into the previous
   1730  // list, in order).
   1731  // Notice that inserting a range can reduce the number of ranges in the
   1732  // result due to combining of adjacent and overlapping ranges.
   1733  int read = i;           // Range to insert.
   1734  int num_canonical = i;  // Length of canonicalized part of list.
   1735  do {
   1736    num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
   1737                                               character_ranges->at(read));
   1738    read++;
   1739  } while (read < n);
   1740  character_ranges->Rewind(num_canonical);
   1741 
   1742  DCHECK(CharacterRange::IsCanonical(character_ranges));
   1743 }
   1744 
   1745 // static
   1746 void CharacterRange::Negate(const ZoneList<CharacterRange>* ranges,
   1747                            ZoneList<CharacterRange>* negated_ranges,
   1748                            Zone* zone) {
   1749  DCHECK(CharacterRange::IsCanonical(ranges));
   1750  DCHECK_EQ(0, negated_ranges->length());
   1751  int range_count = ranges->length();
   1752  base::uc32 from = 0;
   1753  int i = 0;
   1754  if (range_count > 0 && ranges->at(0).from() == 0) {
   1755    from = ranges->at(0).to() + 1;
   1756    i = 1;
   1757  }
   1758  while (i < range_count) {
   1759    CharacterRange range = ranges->at(i);
   1760    negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
   1761    from = range.to() + 1;
   1762    i++;
   1763  }
   1764  if (from < kMaxCodePoint) {
   1765    negated_ranges->Add(CharacterRange::Range(from, kMaxCodePoint), zone);
   1766  }
   1767 }
   1768 
   1769 // static
   1770 void CharacterRange::Intersect(const ZoneList<CharacterRange>* lhs,
   1771                               const ZoneList<CharacterRange>* rhs,
   1772                               ZoneList<CharacterRange>* intersection,
   1773                               Zone* zone) {
   1774  DCHECK(CharacterRange::IsCanonical(lhs));
   1775  DCHECK(CharacterRange::IsCanonical(rhs));
   1776  DCHECK_EQ(0, intersection->length());
   1777  int lhs_index = 0;
   1778  int rhs_index = 0;
   1779  while (lhs_index < lhs->length() && rhs_index < rhs->length()) {
   1780    // Skip non-overlapping ranges.
   1781    if (lhs->at(lhs_index).to() < rhs->at(rhs_index).from()) {
   1782      lhs_index++;
   1783      continue;
   1784    }
   1785    if (rhs->at(rhs_index).to() < lhs->at(lhs_index).from()) {
   1786      rhs_index++;
   1787      continue;
   1788    }
   1789 
   1790    base::uc32 from =
   1791        std::max(lhs->at(lhs_index).from(), rhs->at(rhs_index).from());
   1792    base::uc32 to = std::min(lhs->at(lhs_index).to(), rhs->at(rhs_index).to());
   1793    intersection->Add(CharacterRange::Range(from, to), zone);
   1794    if (to == lhs->at(lhs_index).to()) {
   1795      lhs_index++;
   1796    } else {
   1797      rhs_index++;
   1798    }
   1799  }
   1800 
   1801  DCHECK(IsCanonical(intersection));
   1802 }
   1803 
   1804 namespace {
   1805 
   1806 // Advance |index| and set |from| and |to| to the new range, if not out of
   1807 // bounds of |range|, otherwise |from| is set to a code point beyond the legal
   1808 // unicode character range.
   1809 void SafeAdvanceRange(const ZoneList<CharacterRange>* range, int* index,
   1810                      base::uc32* from, base::uc32* to) {
   1811  ++(*index);
   1812  if (*index < range->length()) {
   1813    *from = range->at(*index).from();
   1814    *to = range->at(*index).to();
   1815  } else {
   1816    *from = kMaxCodePoint + 1;
   1817  }
   1818 }
   1819 
   1820 }  // namespace
   1821 
   1822 // static
   1823 void CharacterRange::Subtract(const ZoneList<CharacterRange>* src,
   1824                              const ZoneList<CharacterRange>* to_remove,
   1825                              ZoneList<CharacterRange>* result, Zone* zone) {
   1826  DCHECK(CharacterRange::IsCanonical(src));
   1827  DCHECK(CharacterRange::IsCanonical(to_remove));
   1828  DCHECK_EQ(0, result->length());
   1829 
   1830  if (src->is_empty()) return;
   1831 
   1832  int src_index = 0;
   1833  int to_remove_index = 0;
   1834  base::uc32 from = src->at(src_index).from();
   1835  base::uc32 to = src->at(src_index).to();
   1836  while (src_index < src->length() && to_remove_index < to_remove->length()) {
   1837    CharacterRange remove_range = to_remove->at(to_remove_index);
   1838    if (remove_range.to() < from) {
   1839      // (a) Non-overlapping case, ignore current to_remove range.
   1840      //            |-------|
   1841      // |-------|
   1842      to_remove_index++;
   1843    } else if (to < remove_range.from()) {
   1844      // (b) Non-overlapping case, add full current range to result.
   1845      // |-------|
   1846      //            |-------|
   1847      result->Add(CharacterRange::Range(from, to), zone);
   1848      SafeAdvanceRange(src, &src_index, &from, &to);
   1849    } else if (from >= remove_range.from() && to <= remove_range.to()) {
   1850      // (c) Current to_remove range fully covers current range.
   1851      //   |---|
   1852      // |-------|
   1853      SafeAdvanceRange(src, &src_index, &from, &to);
   1854    } else if (from < remove_range.from() && to > remove_range.to()) {
   1855      // (d) Split current range.
   1856      // |-------|
   1857      //   |---|
   1858      result->Add(CharacterRange::Range(from, remove_range.from() - 1), zone);
   1859      from = remove_range.to() + 1;
   1860      to_remove_index++;
   1861    } else if (from < remove_range.from()) {
   1862      // (e) End current range.
   1863      // |-------|
   1864      //    |-------|
   1865      to = remove_range.from() - 1;
   1866      result->Add(CharacterRange::Range(from, to), zone);
   1867      SafeAdvanceRange(src, &src_index, &from, &to);
   1868    } else if (to > remove_range.to()) {
   1869      // (f) Modify start of current range.
   1870      //    |-------|
   1871      // |-------|
   1872      from = remove_range.to() + 1;
   1873      to_remove_index++;
   1874    } else {
   1875      UNREACHABLE();
   1876    }
   1877  }
   1878  // The last range needs special treatment after |to_remove| is exhausted, as
   1879  // |from| might have been modified by the last |to_remove| range and |to| was
   1880  // not yet known (i.e. cases d and f).
   1881  if (from <= to) {
   1882    result->Add(CharacterRange::Range(from, to), zone);
   1883  }
   1884  src_index++;
   1885 
   1886  // Add remaining ranges after |to_remove| is exhausted.
   1887  for (; src_index < src->length(); src_index++) {
   1888    result->Add(src->at(src_index), zone);
   1889  }
   1890 
   1891  DCHECK(IsCanonical(result));
   1892 }
   1893 
   1894 // static
   1895 void CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) {
   1896  DCHECK(IsCanonical(ranges));
   1897 
   1898  // Drop all ranges that don't contain one-byte code units, and clamp the last
   1899  // range s.t. it likewise only contains one-byte code units. Note this relies
   1900  // on `ranges` being canonicalized, i.e. sorted and non-overlapping.
   1901 
   1902  static constexpr base::uc32 max_char = String::kMaxOneByteCharCodeU;
   1903  int n = ranges->length();
   1904  for (; n > 0; n--) {
   1905    CharacterRange& r = ranges->at(n - 1);
   1906    if (r.from() <= max_char) {
   1907      r.to_ = std::min(r.to_, max_char);
   1908      break;
   1909    }
   1910  }
   1911 
   1912  ranges->Rewind(n);
   1913 }
   1914 
   1915 // static
   1916 bool CharacterRange::Equals(const ZoneList<CharacterRange>* lhs,
   1917                            const ZoneList<CharacterRange>* rhs) {
   1918  DCHECK(IsCanonical(lhs));
   1919  DCHECK(IsCanonical(rhs));
   1920  if (lhs->length() != rhs->length()) return false;
   1921 
   1922  for (int i = 0; i < lhs->length(); i++) {
   1923    if (lhs->at(i) != rhs->at(i)) return false;
   1924  }
   1925 
   1926  return true;
   1927 }
   1928 
   1929 namespace {
   1930 
   1931 // Scoped object to keep track of how much we unroll quantifier loops in the
   1932 // regexp graph generator.
   1933 class RegExpExpansionLimiter {
   1934 public:
   1935  static const int kMaxExpansionFactor = 6;
   1936  RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
   1937      : compiler_(compiler),
   1938        saved_expansion_factor_(compiler->current_expansion_factor()),
   1939        ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
   1940    DCHECK_LT(0, factor);
   1941    if (ok_to_expand_) {
   1942      if (factor > kMaxExpansionFactor) {
   1943        // Avoid integer overflow of the current expansion factor.
   1944        ok_to_expand_ = false;
   1945        compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
   1946      } else {
   1947        int new_factor = saved_expansion_factor_ * factor;
   1948        ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
   1949        compiler->set_current_expansion_factor(new_factor);
   1950      }
   1951    }
   1952  }
   1953 
   1954  ~RegExpExpansionLimiter() {
   1955    compiler_->set_current_expansion_factor(saved_expansion_factor_);
   1956  }
   1957 
   1958  bool ok_to_expand() { return ok_to_expand_; }
   1959 
   1960 private:
   1961  RegExpCompiler* compiler_;
   1962  int saved_expansion_factor_;
   1963  bool ok_to_expand_;
   1964 
   1965  DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
   1966 };
   1967 
   1968 }  // namespace
   1969 
   1970 RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
   1971                                     RegExpTree* body, RegExpCompiler* compiler,
   1972                                     RegExpNode* on_success,
   1973                                     bool not_at_start) {
   1974  // x{f, t} becomes this:
   1975  //
   1976  //             (r++)<-.
   1977  //               |     `
   1978  //               |     (x)
   1979  //               v     ^
   1980  //      (r=0)-->(?)---/ [if r < t]
   1981  //               |
   1982  //   [if r >= f] \----> ...
   1983  //
   1984 
   1985  // 15.10.2.5 RepeatMatcher algorithm.
   1986  // The parser has already eliminated the case where max is 0.  In the case
   1987  // where max_match is zero the parser has removed the quantifier if min was
   1988  // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
   1989 
   1990  // If we know that we cannot match zero length then things are a little
   1991  // simpler since we don't need to make the special zero length match check
   1992  // from step 2.1.  If the min and max are small we can unroll a little in
   1993  // this case.
   1994  static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
   1995  static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
   1996  if (max == 0) return on_success;  // This can happen due to recursion.
   1997  bool body_can_be_empty = (body->min_match() == 0);
   1998  int body_start_reg = RegExpCompiler::kNoRegister;
   1999  Interval capture_registers = body->CaptureRegisters();
   2000  bool needs_capture_clearing = !capture_registers.is_empty();
   2001  Zone* zone = compiler->zone();
   2002 
   2003  if (body_can_be_empty) {
   2004    body_start_reg = compiler->AllocateRegister();
   2005  } else if (compiler->optimize() && !needs_capture_clearing) {
   2006    // Only unroll if there are no captures and the body can't be
   2007    // empty.
   2008    {
   2009      RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
   2010      if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
   2011        int new_max = (max == kInfinity) ? max : max - min;
   2012        // Recurse once to get the loop or optional matches after the fixed
   2013        // ones.
   2014        RegExpNode* answer =
   2015            ToNode(0, new_max, is_greedy, body, compiler, on_success, true);
   2016        // Unroll the forced matches from 0 to min.  This can cause chains of
   2017        // TextNodes (which the parser does not generate).  These should be
   2018        // combined if it turns out they hinder good code generation.
   2019        for (int i = 0; i < min; i++) {
   2020          answer = body->ToNode(compiler, answer);
   2021        }
   2022        return answer;
   2023      }
   2024    }
   2025    if (max <= kMaxUnrolledMaxMatches && min == 0) {
   2026      DCHECK_LT(0, max);  // Due to the 'if' above.
   2027      RegExpExpansionLimiter limiter(compiler, max);
   2028      if (limiter.ok_to_expand()) {
   2029        // Unroll the optional matches up to max.
   2030        RegExpNode* answer = on_success;
   2031        for (int i = 0; i < max; i++) {
   2032          ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone);
   2033          if (is_greedy) {
   2034            alternation->AddAlternative(
   2035                GuardedAlternative(body->ToNode(compiler, answer)));
   2036            alternation->AddAlternative(GuardedAlternative(on_success));
   2037          } else {
   2038            alternation->AddAlternative(GuardedAlternative(on_success));
   2039            alternation->AddAlternative(
   2040                GuardedAlternative(body->ToNode(compiler, answer)));
   2041          }
   2042          answer = alternation;
   2043          if (not_at_start && !compiler->read_backward()) {
   2044            alternation->set_not_at_start();
   2045          }
   2046        }
   2047        return answer;
   2048      }
   2049    }
   2050  }
   2051  bool has_min = min > 0;
   2052  bool has_max = max < RegExpTree::kInfinity;
   2053  bool needs_counter = has_min || has_max;
   2054  int reg_ctr = needs_counter ? compiler->AllocateRegister()
   2055                              : RegExpCompiler::kNoRegister;
   2056  LoopChoiceNode* center = zone->New<LoopChoiceNode>(
   2057      body->min_match() == 0, compiler->read_backward(), min, zone);
   2058  if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
   2059  RegExpNode* loop_return =
   2060      needs_counter ? static_cast<RegExpNode*>(
   2061                          ActionNode::IncrementRegister(reg_ctr, center))
   2062                    : static_cast<RegExpNode*>(center);
   2063  if (body_can_be_empty) {
   2064    // If the body can be empty we need to check if it was and then
   2065    // backtrack.
   2066    loop_return =
   2067        ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
   2068  }
   2069  RegExpNode* body_node = body->ToNode(compiler, loop_return);
   2070  if (body_can_be_empty) {
   2071    // If the body can be empty we need to store the start position
   2072    // so we can bail out if it was empty.
   2073    body_node = ActionNode::RestorePosition(body_start_reg, body_node);
   2074  }
   2075  if (needs_capture_clearing) {
   2076    // Before entering the body of this loop we need to clear captures.
   2077    body_node = ActionNode::ClearCaptures(capture_registers, body_node);
   2078  }
   2079  GuardedAlternative body_alt(body_node);
   2080  if (has_max) {
   2081    Guard* body_guard = zone->New<Guard>(reg_ctr, Guard::LT, max);
   2082    body_alt.AddGuard(body_guard, zone);
   2083  }
   2084  GuardedAlternative rest_alt(on_success);
   2085  if (has_min) {
   2086    Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr, Guard::GEQ, min);
   2087    rest_alt.AddGuard(rest_guard, zone);
   2088  }
   2089  if (is_greedy) {
   2090    center->AddLoopAlternative(body_alt);
   2091    center->AddContinueAlternative(rest_alt);
   2092  } else {
   2093    center->AddContinueAlternative(rest_alt);
   2094    center->AddLoopAlternative(body_alt);
   2095  }
   2096  if (needs_counter) {
   2097    return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);
   2098  } else {
   2099    return center;
   2100  }
   2101 }
   2102 
   2103 }  // namespace internal
   2104 }  // namespace v8