[ tor-browser ].git.dasho

regexp-compiler.cc (163809B)
      1 // Copyright 2019 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "irregexp/imported/regexp-compiler.h"
      6 
      7 #include <optional>
      8 
      9 #include "irregexp/imported/regexp-macro-assembler-arch.h"
     10 
     11 #ifdef V8_INTL_SUPPORT
     12 #include "irregexp/imported/special-case.h"
     13 #include "unicode/locid.h"
     14 #include "unicode/uniset.h"
     15 #include "unicode/utypes.h"
     16 #endif  // V8_INTL_SUPPORT
     17 
     18 namespace v8::internal {
     19 
     20 using namespace regexp_compiler_constants;  // NOLINT(build/namespaces)
     21 
     22 // -------------------------------------------------------------------
     23 // Implementation of the Irregexp regular expression engine.
     24 //
     25 // The Irregexp regular expression engine is intended to be a complete
     26 // implementation of ECMAScript regular expressions.  It generates either
     27 // bytecodes or native code.
     28 
     29 //   The Irregexp regexp engine is structured in three steps.
     30 //   1) The parser generates an abstract syntax tree.  See ast.cc.
     31 //   2) From the AST a node network is created.  The nodes are all
     32 //      subclasses of RegExpNode.  The nodes represent states when
     33 //      executing a regular expression.  Several optimizations are
     34 //      performed on the node network.
     35 //   3) From the nodes we generate either byte codes or native code
     36 //      that can actually execute the regular expression (perform
     37 //      the search).  The code generation step is described in more
     38 //      detail below.
     39 
     40 // Code generation.
     41 //
     42 //   The nodes are divided into four main categories.
     43 //   * Choice nodes
     44 //        These represent places where the regular expression can
     45 //        match in more than one way.  For example on entry to an
     46 //        alternation (foo|bar) or a repetition (*, +, ? or {}).
     47 //   * Action nodes
     48 //        These represent places where some action should be
     49 //        performed.  Examples include recording the current position
     50 //        in the input string to a register (in order to implement
     51 //        captures) or other actions on register for example in order
     52 //        to implement the counters needed for {} repetitions.
     53 //   * Matching nodes
     54 //        These attempt to match some element part of the input string.
     55 //        Examples of elements include character classes, plain strings
     56 //        or back references.
     57 //   * End nodes
     58 //        These are used to implement the actions required on finding
     59 //        a successful match or failing to find a match.
     60 //
     61 //   The code generated (whether as byte codes or native code) maintains
     62 //   some state as it runs.  This consists of the following elements:
     63 //
     64 //   * The capture registers.  Used for string captures.
     65 //   * Other registers.  Used for counters etc.
     66 //   * The current position.
     67 //   * The stack of backtracking information.  Used when a matching node
     68 //     fails to find a match and needs to try an alternative.
     69 //
     70 // Conceptual regular expression execution model:
     71 //
     72 //   There is a simple conceptual model of regular expression execution
     73 //   which will be presented first.  The actual code generated is a more
     74 //   efficient simulation of the simple conceptual model:
     75 //
     76 //   * Choice nodes are implemented as follows:
     77 //     For each choice except the last {
     78 //       push current position
     79 //       push backtrack code location
     80 //       <generate code to test for choice>
     81 //       backtrack code location:
     82 //       pop current position
     83 //     }
     84 //     <generate code to test for last choice>
     85 //
     86 //   * Actions nodes are generated as follows
     87 //     <push affected registers on backtrack stack>
     88 //     <generate code to perform action>
     89 //     push backtrack code location
     90 //     <generate code to test for following nodes>
     91 //     backtrack code location:
     92 //     <pop affected registers to restore their state>
     93 //     <pop backtrack location from stack and go to it>
     94 //
     95 //   * Matching nodes are generated as follows:
     96 //     if input string matches at current position
     97 //       update current position
     98 //       <generate code to test for following nodes>
     99 //     else
    100 //       <pop backtrack location from stack and go to it>
    101 //
    102 //   Thus it can be seen that the current position is saved and restored
    103 //   by the choice nodes, whereas the registers are saved and restored by
    104 //   by the action nodes that manipulate them.
    105 //
    106 //   The other interesting aspect of this model is that nodes are generated
    107 //   at the point where they are needed by a recursive call to Emit().  If
    108 //   the node has already been code generated then the Emit() call will
    109 //   generate a jump to the previously generated code instead.  In order to
    110 //   limit recursion it is possible for the Emit() function to put the node
    111 //   on a work list for later generation and instead generate a jump.  The
    112 //   destination of the jump is resolved later when the code is generated.
    113 //
    114 // Actual regular expression code generation.
    115 //
    116 //   Code generation is actually more complicated than the above.  In order to
    117 //   improve the efficiency of the generated code some optimizations are
    118 //   performed
    119 //
    120 //   * Choice nodes have 1-character lookahead.
    121 //     A choice node looks at the following character and eliminates some of
    122 //     the choices immediately based on that character.  This is not yet
    123 //     implemented.
    124 //   * Simple greedy loops store reduced backtracking information.  We call
    125 //     these fixed length loops
    126 //     A quantifier like /.*foo/m will greedily match the whole input.  It will
    127 //     then need to backtrack to a point where it can match "foo".  The naive
    128 //     implementation of this would push each character position onto the
    129 //     backtracking stack, then pop them off one by one.  This would use space
    130 //     proportional to the length of the input string.  However since the "."
    131 //     can only match in one way and always has a constant length (in this case
    132 //     of 1) it suffices to store the current position on the top of the stack
    133 //     once.  Matching now becomes merely incrementing the current position and
    134 //     backtracking becomes decrementing the current position and checking the
    135 //     result against the stored current position.  This is faster and saves
    136 //     space.
    137 //   * The current state is virtualized.
    138 //     This is used to defer expensive operations until it is clear that they
    139 //     are needed and to generate code for a node more than once, allowing
    140 //     specialized an efficient versions of the code to be created. This is
    141 //     explained in the section below.
    142 //
    143 // Execution state virtualization.
    144 //
    145 //   Instead of emitting code, nodes that manipulate the state can record their
    146 //   manipulation in an object called the Trace.  The Trace object can record a
    147 //   current position offset, an optional backtrack code location on the top of
    148 //   the virtualized backtrack stack and some register changes.  When a node is
    149 //   to be emitted it can flush the Trace or update it.  Flushing the Trace
    150 //   will emit code to bring the actual state into line with the virtual state.
    151 //   Avoiding flushing the state can postpone some work (e.g. updates of capture
    152 //   registers).  Postponing work can save time when executing the regular
    153 //   expression since it may be found that the work never has to be done as a
    154 //   failure to match can occur.  In addition it is much faster to jump to a
    155 //   known backtrack code location than it is to pop an unknown backtrack
    156 //   location from the stack and jump there.
    157 //
    158 //   The virtual state found in the Trace affects code generation.  For example
    159 //   the virtual state contains the difference between the actual current
    160 //   position and the virtual current position, and matching code needs to use
    161 //   this offset to attempt a match in the correct location of the input
    162 //   string.  Therefore code generated for a non-trivial trace is specialized
    163 //   to that trace.  The code generator therefore has the ability to generate
    164 //   code for each node several times.  In order to limit the size of the
    165 //   generated code there is an arbitrary limit on how many specialized sets of
    166 //   code may be generated for a given node.  If the limit is reached, the
    167 //   trace is flushed and a generic version of the code for a node is emitted.
    168 //   This is subsequently used for that node.  The code emitted for non-generic
    169 //   trace is not recorded in the node and so it cannot currently be reused in
    170 //   the event that code generation is requested for an identical trace.
    171 
    172 namespace {
    173 
    174 constexpr base::uc32 MaxCodeUnit(const bool one_byte) {
    175  static_assert(String::kMaxOneByteCharCodeU <=
    176                std::numeric_limits<uint16_t>::max());
    177  static_assert(String::kMaxUtf16CodeUnitU <=
    178                std::numeric_limits<uint16_t>::max());
    179  return one_byte ? String::kMaxOneByteCharCodeU : String::kMaxUtf16CodeUnitU;
    180 }
    181 
    182 constexpr uint32_t CharMask(const bool one_byte) {
    183  static_assert(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1));
    184  static_assert(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1));
    185  return MaxCodeUnit(one_byte);
    186 }
    187 
    188 }  // namespace
    189 
    190 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { UNREACHABLE(); }
    191 
    192 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
    193  text->AddElement(TextElement::Atom(this), zone);
    194 }
    195 
    196 void RegExpClassRanges::AppendToText(RegExpText* text, Zone* zone) {
    197  text->AddElement(TextElement::ClassRanges(this), zone);
    198 }
    199 
    200 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
    201  for (int i = 0; i < elements()->length(); i++)
    202    text->AddElement(elements()->at(i), zone);
    203 }
    204 
    205 TextElement TextElement::Atom(RegExpAtom* atom) {
    206  return TextElement(ATOM, atom);
    207 }
    208 
    209 TextElement TextElement::ClassRanges(RegExpClassRanges* class_ranges) {
    210  return TextElement(CLASS_RANGES, class_ranges);
    211 }
    212 
    213 int TextElement::length() const {
    214  switch (text_type()) {
    215    case ATOM:
    216      return atom()->length();
    217 
    218    case CLASS_RANGES:
    219      return 1;
    220  }
    221  UNREACHABLE();
    222 }
    223 
    224 class RecursionCheck {
    225 public:
    226  explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
    227    compiler->IncrementRecursionDepth();
    228  }
    229  ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
    230 
    231 private:
    232  RegExpCompiler* compiler_;
    233 };
    234 
    235 // Attempts to compile the regexp using an Irregexp code generator.  Returns
    236 // a fixed array or a null handle depending on whether it succeeded.
    237 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
    238                               RegExpFlags flags, bool one_byte)
    239    : next_register_(JSRegExp::RegistersForCaptureCount(capture_count)),
    240      unicode_lookaround_stack_register_(kNoRegister),
    241      unicode_lookaround_position_register_(kNoRegister),
    242      work_list_(nullptr),
    243      recursion_depth_(0),
    244      flags_(flags),
    245      one_byte_(one_byte),
    246      reg_exp_too_big_(false),
    247      limiting_recursion_(false),
    248      optimize_(v8_flags.regexp_optimization),
    249      read_backward_(false),
    250      current_expansion_factor_(1),
    251      frequency_collator_(),
    252      isolate_(isolate),
    253      zone_(zone) {
    254  accept_ = zone->New<EndNode>(EndNode::ACCEPT, zone);
    255  DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
    256 }
    257 
    258 RegExpCompiler::CompilationResult RegExpCompiler::Assemble(
    259    Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
    260    int capture_count, DirectHandle<String> pattern) {
    261  macro_assembler_ = macro_assembler;
    262 
    263  ZoneVector<RegExpNode*> work_list(zone());
    264  work_list_ = &work_list;
    265  Label fail;
    266  macro_assembler_->PushBacktrack(&fail);
    267  Trace new_trace;
    268  start->Emit(this, &new_trace);
    269  macro_assembler_->BindJumpTarget(&fail);
    270  macro_assembler_->Fail();
    271  while (!work_list.empty()) {
    272    RegExpNode* node = work_list.back();
    273    work_list.pop_back();
    274    node->set_on_work_list(false);
    275    if (!node->label()->is_bound()) node->Emit(this, &new_trace);
    276  }
    277  if (reg_exp_too_big_) {
    278    if (v8_flags.correctness_fuzzer_suppressions) {
    279      FATAL("Aborting on excess zone allocation");
    280    }
    281    macro_assembler_->AbortedCodeGeneration();
    282    return CompilationResult::RegExpTooBig();
    283  }
    284 
    285  DirectHandle<HeapObject> code = macro_assembler_->GetCode(pattern, flags_);
    286  isolate->IncreaseTotalRegexpCodeGenerated(code);
    287  work_list_ = nullptr;
    288 
    289  return {code, next_register_};
    290 }
    291 
    292 bool Trace::mentions_reg(int reg) const {
    293  for (auto trace : *this) {
    294    if (trace->has_action() && trace->action()->Mentions(reg)) return true;
    295  }
    296  return false;
    297 }
    298 
    299 bool Trace::GetStoredPosition(int reg, int* cp_offset) const {
    300  DCHECK_EQ(0, *cp_offset);
    301  for (auto trace : *this) {
    302    if (trace->has_action() && trace->action()->Mentions(reg)) {
    303      if (trace->action_->action_type() == ActionNode::CLEAR_POSITION ||
    304          trace->action_->action_type() == ActionNode::RESTORE_POSITION) {
    305        *cp_offset = trace->next_->cp_offset();
    306        return true;
    307      } else {
    308        return false;
    309      }
    310    }
    311  }
    312  return false;
    313 }
    314 
    315 // A (dynamically-sized) set of unsigned integers that behaves especially well
    316 // on small integers (< kFirstLimit). May do zone-allocation.
    317 class DynamicBitSet : public ZoneObject {
    318 public:
    319  V8_EXPORT_PRIVATE bool Get(unsigned value) const {
    320    if (value < kFirstLimit) {
    321      return (first_ & (1 << value)) != 0;
    322    } else if (remaining_ == nullptr) {
    323      return false;
    324    } else {
    325      return remaining_->Contains(value);
    326    }
    327  }
    328 
    329  // Destructively set a value in this set.
    330  void Set(unsigned value, Zone* zone) {
    331    if (value < kFirstLimit) {
    332      first_ |= (1 << value);
    333    } else {
    334      if (remaining_ == nullptr)
    335        remaining_ = zone->New<ZoneList<unsigned>>(1, zone);
    336      if (remaining_->is_empty() || !remaining_->Contains(value))
    337        remaining_->Add(value, zone);
    338    }
    339  }
    340 
    341 private:
    342  static constexpr unsigned kFirstLimit = 32;
    343 
    344  uint32_t first_ = 0;
    345  ZoneList<unsigned>* remaining_ = nullptr;
    346 };
    347 
    348 int Trace::FindAffectedRegisters(DynamicBitSet* affected_registers,
    349                                 Zone* zone) {
    350  int max_register = RegExpCompiler::kNoRegister;
    351  for (auto trace : *this) {
    352    if (ActionNode* action = trace->action_) {
    353      int to = action->register_to();
    354      for (int i = action->register_from(); i <= to; i++) {
    355        affected_registers->Set(i, zone);
    356      }
    357      if (to > max_register) max_register = to;
    358    }
    359  }
    360  return max_register;
    361 }
    362 
    363 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
    364                                     int max_register,
    365                                     const DynamicBitSet& registers_to_pop,
    366                                     const DynamicBitSet& registers_to_clear) {
    367  for (int reg = max_register; reg >= 0; reg--) {
    368    if (registers_to_pop.Get(reg)) {
    369      assembler->PopRegister(reg);
    370    } else if (registers_to_clear.Get(reg)) {
    371      int clear_to = reg;
    372      while (reg > 0 && registers_to_clear.Get(reg - 1)) {
    373        reg--;
    374      }
    375      assembler->ClearRegisters(reg, clear_to);
    376    }
    377  }
    378 }
    379 
    380 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
    381                                   int max_register,
    382                                   const DynamicBitSet& affected_registers,
    383                                   DynamicBitSet* registers_to_pop,
    384                                   DynamicBitSet* registers_to_clear,
    385                                   Zone* zone) {
    386  // Count pushes performed to force a stack limit check occasionally.
    387  int pushes = 0;
    388 
    389  for (int reg = 0; reg <= max_register; reg++) {
    390    if (!affected_registers.Get(reg)) continue;
    391 
    392    // The chronologically first deferred action in the trace
    393    // is used to infer the action needed to restore a register
    394    // to its previous state (or not, if it's safe to ignore it).
    395    enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
    396    DeferredActionUndoType undo_action = IGNORE;
    397 
    398    int value = 0;
    399    bool absolute = false;
    400    bool clear = false;
    401    static const int kNoStore = kMinInt;
    402    int store_position = kNoStore;
    403    // This is a little tricky because we are scanning the actions in reverse
    404    // historical order (newest first).
    405    for (auto trace : *this) {
    406      ActionNode* action = trace->action_;
    407      if (!action) continue;
    408      if (action->Mentions(reg)) {
    409        switch (action->action_type()) {
    410          case ActionNode::SET_REGISTER_FOR_LOOP: {
    411            if (!absolute) {
    412              value += action->value();
    413              absolute = true;
    414            }
    415            // SET_REGISTER_FOR_LOOP is only used for newly introduced loop
    416            // counters. They can have a significant previous value if they
    417            // occur in a loop. TODO(lrn): Propagate this information, so
    418            // we can set undo_action to IGNORE if we know there is no value to
    419            // restore.
    420            undo_action = RESTORE;
    421            DCHECK_EQ(store_position, kNoStore);
    422            DCHECK(!clear);
    423            break;
    424          }
    425          case ActionNode::INCREMENT_REGISTER:
    426            if (!absolute) {
    427              value++;
    428            }
    429            DCHECK_EQ(store_position, kNoStore);
    430            DCHECK(!clear);
    431            undo_action = RESTORE;
    432            break;
    433          case ActionNode::CLEAR_POSITION:
    434          case ActionNode::RESTORE_POSITION: {
    435            if (!clear && store_position == kNoStore) {
    436              store_position = trace->next()->cp_offset();
    437            }
    438 
    439            // For captures we know that stores and clears alternate.
    440            // Other register, are never cleared, and if the occur
    441            // inside a loop, they might be assigned more than once.
    442            if (reg <= 1) {
    443              // Registers zero and one, aka "capture zero", is
    444              // always set correctly if we succeed. There is no
    445              // need to undo a setting on backtrack, because we
    446              // will set it again or fail.
    447              undo_action = IGNORE;
    448            } else {
    449              if (action->action_type() == ActionNode::CLEAR_POSITION) {
    450                undo_action = CLEAR;
    451              } else {
    452                undo_action = RESTORE;
    453              }
    454            }
    455            DCHECK(!absolute);
    456            DCHECK_EQ(value, 0);
    457            break;
    458          }
    459          case ActionNode::CLEAR_CAPTURES: {
    460            // Since we're scanning in reverse order, if we've already
    461            // set the position we have to ignore historically earlier
    462            // clearing operations.
    463            if (store_position == kNoStore) {
    464              clear = true;
    465            }
    466            undo_action = RESTORE;
    467            DCHECK(!absolute);
    468            DCHECK_EQ(value, 0);
    469            break;
    470          }
    471          default:
    472            UNREACHABLE();
    473        }
    474      }
    475    }
    476    // Prepare for the undo-action (e.g., push if it's going to be popped).
    477    if (undo_action == RESTORE) {
    478      pushes++;
    479      RegExpMacroAssembler::StackCheckFlag stack_check =
    480          RegExpMacroAssembler::kNoStackLimitCheck;
    481      DCHECK_GT(assembler->stack_limit_slack_slot_count(), 0);
    482      if (pushes == assembler->stack_limit_slack_slot_count()) {
    483        stack_check = RegExpMacroAssembler::kCheckStackLimit;
    484        pushes = 0;
    485      }
    486 
    487      assembler->PushRegister(reg, stack_check);
    488      registers_to_pop->Set(reg, zone);
    489    } else if (undo_action == CLEAR) {
    490      registers_to_clear->Set(reg, zone);
    491    }
    492    // Perform the chronologically last action (or accumulated increment)
    493    // for the register.
    494    if (store_position != kNoStore) {
    495      assembler->WriteCurrentPositionToRegister(reg, store_position);
    496    } else if (clear) {
    497      assembler->ClearRegisters(reg, reg);
    498    } else if (absolute) {
    499      assembler->SetRegister(reg, value);
    500    } else if (value != 0) {
    501      assembler->AdvanceRegister(reg, value);
    502    }
    503  }
    504 }
    505 
    506 // This is called as we come into a loop choice node and some other tricky
    507 // nodes.  It normalizes the state of the code generator to ensure we can
    508 // generate generic code.  If the mode indicates that we are in a success
    509 // situation then don't push anything, because the stack is about to be
    510 // discarded, and also don't update the current position.
    511 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor,
    512                  Trace::FlushMode mode) {
    513  RegExpMacroAssembler* assembler = compiler->macro_assembler();
    514 
    515  DCHECK(!is_trivial());
    516 
    517  // Normally we don't need to update the current position register if we are
    518  // about to stop because we had a successful match, but the global mode
    519  // requires the current position register to be updated so it can start the
    520  // next match.  TODO(erikcorry): Perhaps it should use capture register 1
    521  // instead.
    522  bool update_current_position =
    523      cp_offset_ != 0 && (mode != kFlushSuccess || assembler->global());
    524 
    525  if (!has_any_actions() && (backtrack() == nullptr || mode == kFlushSuccess)) {
    526    // Here we just have some deferred cp advances to fix and we are back to
    527    // a normal situation.  We may also have to forget some information gained
    528    // through a quick check that was already performed.
    529    if (update_current_position) assembler->AdvanceCurrentPosition(cp_offset_);
    530    // Create a new trivial state and generate the node with that.
    531    Trace new_state;
    532    successor->Emit(compiler, &new_state);
    533    return;
    534  }
    535 
    536  // Generate deferred actions here along with code to undo them again.
    537  DynamicBitSet affected_registers;
    538 
    539  if (backtrack() != nullptr && mode != kFlushSuccess) {
    540    // Here we have a concrete backtrack location.  These are set up by choice
    541    // nodes and so they indicate that we have a deferred save of the current
    542    // position which we may need to emit here.
    543    assembler->PushCurrentPosition();
    544  }
    545 
    546  int max_register =
    547      FindAffectedRegisters(&affected_registers, compiler->zone());
    548  DynamicBitSet registers_to_pop;
    549  DynamicBitSet registers_to_clear;
    550  PerformDeferredActions(assembler, max_register, affected_registers,
    551                         &registers_to_pop, &registers_to_clear,
    552                         compiler->zone());
    553  if (update_current_position) assembler->AdvanceCurrentPosition(cp_offset_);
    554 
    555  if (mode == kFlushSuccess) {
    556    Trace new_state;
    557    successor->Emit(compiler, &new_state);
    558    return;
    559  }
    560 
    561  // Create a new trivial state and generate the node with that.
    562  Label undo;
    563  assembler->PushBacktrack(&undo);
    564  if (successor->KeepRecursing(compiler)) {
    565    Trace new_state;
    566    successor->Emit(compiler, &new_state);
    567  } else {
    568    compiler->AddWork(successor);
    569    assembler->GoTo(successor->label());
    570  }
    571 
    572  // On backtrack we need to restore state.
    573  assembler->BindJumpTarget(&undo);
    574  RestoreAffectedRegisters(assembler, max_register, registers_to_pop,
    575                           registers_to_clear);
    576  if (backtrack() == nullptr) {
    577    assembler->Backtrack();
    578  } else {
    579    assembler->PopCurrentPosition();
    580    assembler->GoTo(backtrack());
    581  }
    582 }
    583 
    584 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
    585  RegExpMacroAssembler* assembler = compiler->macro_assembler();
    586 
    587  // Omit flushing the trace. We discard the entire stack frame anyway.
    588 
    589  if (!label()->is_bound()) {
    590    // We are completely independent of the trace, since we ignore it,
    591    // so this code can be used as the generic version.
    592    assembler->Bind(label());
    593  }
    594 
    595  // Throw away everything on the backtrack stack since the start
    596  // of the negative submatch and restore the character position.
    597  assembler->ReadCurrentPositionFromRegister(current_position_register_);
    598  assembler->ReadStackPointerFromRegister(stack_pointer_register_);
    599  if (clear_capture_count_ > 0) {
    600    // Clear any captures that might have been performed during the success
    601    // of the body of the negative look-ahead.
    602    int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
    603    assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
    604  }
    605  // Now that we have unwound the stack we find at the top of the stack the
    606  // backtrack that the BeginNegativeSubmatch node got.
    607  assembler->Backtrack();
    608 }
    609 
    610 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
    611  if (!trace->is_trivial()) {
    612    DCHECK(action_ == ACCEPT);
    613    trace->Flush(compiler, this, Trace::kFlushSuccess);
    614    return;
    615  }
    616  RegExpMacroAssembler* assembler = compiler->macro_assembler();
    617  if (!label()->is_bound()) {
    618    assembler->Bind(label());
    619  }
    620  switch (action_) {
    621    case ACCEPT:
    622      assembler->Succeed();
    623      return;
    624    case BACKTRACK:
    625      assembler->GoTo(trace->backtrack());
    626      return;
    627    case NEGATIVE_SUBMATCH_SUCCESS:
    628      // This case is handled in a different virtual method.
    629      UNREACHABLE();
    630  }
    631  UNIMPLEMENTED();
    632 }
    633 
    634 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
    635  if (guards_ == nullptr) guards_ = zone->New<ZoneList<Guard*>>(1, zone);
    636  guards_->Add(guard, zone);
    637 }
    638 
    639 ActionNode* ActionNode::SetRegisterForLoop(int reg, int val,
    640                                           RegExpNode* on_success) {
    641  return on_success->zone()->New<ActionNode>(SET_REGISTER_FOR_LOOP, on_success,
    642                                             reg, reg, val);
    643 }
    644 
    645 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
    646  return on_success->zone()->New<ActionNode>(INCREMENT_REGISTER, on_success,
    647                                             reg);
    648 }
    649 
    650 ActionNode* ActionNode::ClearPosition(int reg, RegExpNode* on_success) {
    651  return on_success->zone()->New<ActionNode>(CLEAR_POSITION, on_success, reg);
    652 }
    653 
    654 ActionNode* ActionNode::RestorePosition(int reg, RegExpNode* on_success) {
    655  return on_success->zone()->New<ActionNode>(RESTORE_POSITION, on_success, reg);
    656 }
    657 
    658 ActionNode* ActionNode::ClearCaptures(Interval range, RegExpNode* on_success) {
    659  return on_success->zone()->New<ActionNode>(CLEAR_CAPTURES, on_success,
    660                                             range.from(), range.to());
    661 }
    662 
    663 ActionNode* ActionNode::BeginPositiveSubmatch(int stack_reg, int position_reg,
    664                                              RegExpNode* body,
    665                                              ActionNode* success_node) {
    666  ActionNode* result =
    667      body->zone()->New<ActionNode>(BEGIN_POSITIVE_SUBMATCH, body);
    668  result->data_.u_submatch.stack_pointer_register = stack_reg;
    669  result->data_.u_submatch.current_position_register = position_reg;
    670  result->data_.u_submatch.success_node = success_node;
    671  return result;
    672 }
    673 
    674 ActionNode* ActionNode::BeginNegativeSubmatch(int stack_reg, int position_reg,
    675                                              RegExpNode* on_success) {
    676  ActionNode* result =
    677      on_success->zone()->New<ActionNode>(BEGIN_NEGATIVE_SUBMATCH, on_success);
    678  result->data_.u_submatch.stack_pointer_register = stack_reg;
    679  result->data_.u_submatch.current_position_register = position_reg;
    680  return result;
    681 }
    682 
    683 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, int position_reg,
    684                                                int clear_register_count,
    685                                                int clear_register_from,
    686                                                RegExpNode* on_success) {
    687  ActionNode* result = on_success->zone()->New<ActionNode>(
    688      POSITIVE_SUBMATCH_SUCCESS, on_success);
    689  result->data_.u_submatch.stack_pointer_register = stack_reg;
    690  result->data_.u_submatch.current_position_register = position_reg;
    691  result->data_.u_submatch.clear_register_count = clear_register_count;
    692  result->data_.u_submatch.clear_register_from = clear_register_from;
    693  return result;
    694 }
    695 
    696 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
    697                                        int repetition_register,
    698                                        int repetition_limit,
    699                                        RegExpNode* on_success) {
    700  ActionNode* result =
    701      on_success->zone()->New<ActionNode>(EMPTY_MATCH_CHECK, on_success);
    702  result->data_.u_empty_match_check.start_register = start_register;
    703  result->data_.u_empty_match_check.repetition_register = repetition_register;
    704  result->data_.u_empty_match_check.repetition_limit = repetition_limit;
    705  return result;
    706 }
    707 
    708 ActionNode* ActionNode::ModifyFlags(RegExpFlags flags, RegExpNode* on_success) {
    709  ActionNode* result =
    710      on_success->zone()->New<ActionNode>(MODIFY_FLAGS, on_success);
    711  result->data_.u_modify_flags.flags = flags;
    712  return result;
    713 }
    714 
    715 #define DEFINE_ACCEPT(Type) \
    716  void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); }
    717 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
    718 #undef DEFINE_ACCEPT
    719 
    720 // -------------------------------------------------------------------
    721 // Emit code.
    722 
    723 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
    724                               Guard* guard, Trace* trace) {
    725  switch (guard->op()) {
    726    case Guard::LT:
    727      DCHECK(!trace->mentions_reg(guard->reg()));
    728      macro_assembler->IfRegisterGE(guard->reg(), guard->value(),
    729                                    trace->backtrack());
    730      break;
    731    case Guard::GEQ:
    732      DCHECK(!trace->mentions_reg(guard->reg()));
    733      macro_assembler->IfRegisterLT(guard->reg(), guard->value(),
    734                                    trace->backtrack());
    735      break;
    736  }
    737 }
    738 
    739 namespace {
    740 
    741 #ifdef DEBUG
    742 bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) {
    743  static_assert(sizeof(unibrow::uchar) == 4);
    744  for (int i = 0; i < length; i++) {
    745    if (chars[i] > String::kMaxUtf16CodeUnit) return false;
    746  }
    747  return true;
    748 }
    749 #endif  // DEBUG
    750 
    751 // Returns the number of characters in the equivalence class, omitting those
    752 // that cannot occur in the source string because it is Latin1.  This is called
    753 // both for unicode modes /ui and /vi, and also for legacy case independent
    754 // mode /i.  In the case of Unicode modes we handled surrogate pair expansions
    755 // earlier so at this point it's all about single-code-unit expansions.
    756 int GetCaseIndependentLetters(Isolate* isolate, base::uc16 character,
    757                              RegExpCompiler* compiler, unibrow::uchar* letters,
    758                              int letter_length) {
    759  bool one_byte_subject = compiler->one_byte();
    760  bool unicode = IsEitherUnicode(compiler->flags());
    761  static const base::uc16 kMaxAscii = 0x7f;
    762  if (!unicode && character <= kMaxAscii) {
    763    // Fast case for common characters.
    764    base::uc16 upper = character & ~0x20;
    765    if ('A' <= upper && upper <= 'Z') {
    766      letters[0] = upper;
    767      letters[1] = upper | 0x20;
    768      return 2;
    769    }
    770    letters[0] = character;
    771    return 1;
    772  }
    773 #ifdef V8_INTL_SUPPORT
    774 
    775  if (!unicode && RegExpCaseFolding::IgnoreSet().contains(character)) {
    776    if (one_byte_subject && character > String::kMaxOneByteCharCode) {
    777      // This function promises not to return a character that is impossible
    778      // for the subject encoding.
    779      return 0;
    780    }
    781    letters[0] = character;
    782    DCHECK(ContainsOnlyUtf16CodeUnits(letters, 1));
    783    return 1;
    784  }
    785  bool in_special_add_set =
    786      RegExpCaseFolding::SpecialAddSet().contains(character);
    787 
    788  icu::UnicodeSet set;
    789  set.add(character);
    790  set = set.closeOver(unicode ? USET_SIMPLE_CASE_INSENSITIVE
    791                              : USET_CASE_INSENSITIVE);
    792 
    793  UChar32 canon = 0;
    794  if (in_special_add_set && !unicode) {
    795    canon = RegExpCaseFolding::Canonicalize(character);
    796  }
    797 
    798  int32_t range_count = set.getRangeCount();
    799  int items = 0;
    800  for (int32_t i = 0; i < range_count; i++) {
    801    UChar32 start = set.getRangeStart(i);
    802    UChar32 end = set.getRangeEnd(i);
    803    CHECK(end - start + items <= letter_length);
    804    for (UChar32 cu = start; cu <= end; cu++) {
    805      if (one_byte_subject && cu > String::kMaxOneByteCharCode) continue;
    806      if (!unicode && in_special_add_set &&
    807          RegExpCaseFolding::Canonicalize(cu) != canon) {
    808        continue;
    809      }
    810      letters[items++] = static_cast<unibrow::uchar>(cu);
    811    }
    812  }
    813  DCHECK(ContainsOnlyUtf16CodeUnits(letters, items));
    814  return items;
    815 #else
    816  int length =
    817      isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
    818  // Unibrow returns 0 or 1 for characters where case independence is
    819  // trivial.
    820  if (length == 0) {
    821    letters[0] = character;
    822    length = 1;
    823  }
    824 
    825  if (one_byte_subject) {
    826    int new_length = 0;
    827    for (int i = 0; i < length; i++) {
    828      if (letters[i] <= String::kMaxOneByteCharCode) {
    829        letters[new_length++] = letters[i];
    830      }
    831    }
    832    length = new_length;
    833  }
    834 
    835  DCHECK(ContainsOnlyUtf16CodeUnits(letters, length));
    836  return length;
    837 #endif  // V8_INTL_SUPPORT
    838 }
    839 
    840 inline bool EmitSimpleCharacter(Isolate* isolate, RegExpCompiler* compiler,
    841                                base::uc16 c, Label* on_failure, int cp_offset,
    842                                bool check, bool preloaded) {
    843  RegExpMacroAssembler* assembler = compiler->macro_assembler();
    844  bool bound_checked = false;
    845  if (!preloaded) {
    846    assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
    847    bound_checked = true;
    848  }
    849  assembler->CheckNotCharacter(c, on_failure);
    850  return bound_checked;
    851 }
    852 
    853 // Only emits non-letters (things that don't have case).  Only used for case
    854 // independent matches.
    855 inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler,
    856                              base::uc16 c, Label* on_failure, int cp_offset,
    857                              bool check, bool preloaded) {
    858  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
    859  bool one_byte = compiler->one_byte();
    860  unibrow::uchar chars[4];
    861  int length = GetCaseIndependentLetters(isolate, c, compiler, chars, 4);
    862  if (length < 1) {
    863    // This can't match.  Must be an one-byte subject and a non-one-byte
    864    // character.  We do not need to do anything since the one-byte pass
    865    // already handled this.
    866    CHECK(one_byte);
    867    return false;  // Bounds not checked.
    868  }
    869  bool checked = false;
    870  // We handle the length > 1 case in a later pass.
    871  if (length == 1) {
    872    // GetCaseIndependentLetters promises not to return characters that can't
    873    // match because of the subject encoding.  This case is already handled by
    874    // the one-byte pass.
    875    CHECK_IMPLIES(one_byte, chars[0] <= String::kMaxOneByteCharCodeU);
    876    if (!preloaded) {
    877      macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
    878      checked = check;
    879    }
    880    macro_assembler->CheckNotCharacter(chars[0], on_failure);
    881  }
    882  return checked;
    883 }
    884 
    885 bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
    886                               bool one_byte, base::uc16 c1, base::uc16 c2,
    887                               Label* on_failure) {
    888  const uint32_t char_mask = CharMask(one_byte);
    889  base::uc16 exor = c1 ^ c2;
    890  // Check whether exor has only one bit set.
    891  if (((exor - 1) & exor) == 0) {
    892    // If c1 and c2 differ only by one bit.
    893    // Ecma262UnCanonicalize always gives the highest number last.
    894    DCHECK(c2 > c1);
    895    base::uc16 mask = char_mask ^ exor;
    896    macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
    897    return true;
    898  }
    899  DCHECK(c2 > c1);
    900  base::uc16 diff = c2 - c1;
    901  if (((diff - 1) & diff) == 0 && c1 >= diff) {
    902    // If the characters differ by 2^n but don't differ by one bit then
    903    // subtract the difference from the found character, then do the or
    904    // trick.  We avoid the theoretical case where negative numbers are
    905    // involved in order to simplify code generation.
    906    base::uc16 mask = char_mask ^ diff;
    907    macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask,
    908                                                    on_failure);
    909    return true;
    910  }
    911  return false;
    912 }
    913 
    914 // Only emits letters (things that have case).  Only used for case independent
    915 // matches.
    916 inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
    917                           base::uc16 c, Label* on_failure, int cp_offset,
    918                           bool check, bool preloaded) {
    919  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
    920  bool one_byte = compiler->one_byte();
    921  unibrow::uchar chars[4];
    922  int length = GetCaseIndependentLetters(isolate, c, compiler, chars, 4);
    923  // The 0 and 1 case are handled by earlier passes.
    924  if (length <= 1) return false;
    925  // We may not need to check against the end of the input string
    926  // if this character lies before a character that matched.
    927  if (!preloaded) {
    928    macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
    929  }
    930  Label ok;
    931  switch (length) {
    932    case 2: {
    933      if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
    934                                    chars[1], on_failure)) {
    935      } else {
    936        macro_assembler->CheckCharacter(chars[0], &ok);
    937        macro_assembler->CheckNotCharacter(chars[1], on_failure);
    938        macro_assembler->Bind(&ok);
    939      }
    940      break;
    941    }
    942    case 4:
    943      macro_assembler->CheckCharacter(chars[3], &ok);
    944      [[fallthrough]];
    945    case 3:
    946      macro_assembler->CheckCharacter(chars[0], &ok);
    947      macro_assembler->CheckCharacter(chars[1], &ok);
    948      macro_assembler->CheckNotCharacter(chars[2], on_failure);
    949      macro_assembler->Bind(&ok);
    950      break;
    951    default:
    952      UNREACHABLE();
    953  }
    954  return true;
    955 }
    956 
    957 void EmitBoundaryTest(RegExpMacroAssembler* masm, int border,
    958                      Label* fall_through, Label* above_or_equal,
    959                      Label* below) {
    960  if (below != fall_through) {
    961    masm->CheckCharacterLT(border, below);
    962    if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
    963  } else {
    964    masm->CheckCharacterGT(border - 1, above_or_equal);
    965  }
    966 }
    967 
    968 void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first, int last,
    969                            Label* fall_through, Label* in_range,
    970                            Label* out_of_range) {
    971  if (in_range == fall_through) {
    972    if (first == last) {
    973      masm->CheckNotCharacter(first, out_of_range);
    974    } else {
    975      masm->CheckCharacterNotInRange(first, last, out_of_range);
    976    }
    977  } else {
    978    if (first == last) {
    979      masm->CheckCharacter(first, in_range);
    980    } else {
    981      masm->CheckCharacterInRange(first, last, in_range);
    982    }
    983    if (out_of_range != fall_through) masm->GoTo(out_of_range);
    984  }
    985 }
    986 
    987 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
    988 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
    989 void EmitUseLookupTable(RegExpMacroAssembler* masm,
    990                        ZoneList<base::uc32>* ranges, uint32_t start_index,
    991                        uint32_t end_index, base::uc32 min_char,
    992                        Label* fall_through, Label* even_label,
    993                        Label* odd_label) {
    994  static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
    995  static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
    996 
    997  base::uc32 base = (min_char & ~kMask);
    998  USE(base);
    999 
   1000  // Assert that everything is on one kTableSize page.
   1001  for (uint32_t i = start_index; i <= end_index; i++) {
   1002    DCHECK_EQ(ranges->at(i) & ~kMask, base);
   1003  }
   1004  DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
   1005 
   1006  char templ[kSize];
   1007  Label* on_bit_set;
   1008  Label* on_bit_clear;
   1009  int bit;
   1010  if (even_label == fall_through) {
   1011    on_bit_set = odd_label;
   1012    on_bit_clear = even_label;
   1013    bit = 1;
   1014  } else {
   1015    on_bit_set = even_label;
   1016    on_bit_clear = odd_label;
   1017    bit = 0;
   1018  }
   1019  for (uint32_t i = 0; i < (ranges->at(start_index) & kMask) && i < kSize;
   1020       i++) {
   1021    templ[i] = bit;
   1022  }
   1023  uint32_t j = 0;
   1024  bit ^= 1;
   1025  for (uint32_t i = start_index; i < end_index; i++) {
   1026    for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
   1027      templ[j] = bit;
   1028    }
   1029    bit ^= 1;
   1030  }
   1031  for (uint32_t i = j; i < kSize; i++) {
   1032    templ[i] = bit;
   1033  }
   1034  Factory* factory = masm->isolate()->factory();
   1035  // TODO(erikcorry): Cache these.
   1036  Handle<ByteArray> ba = factory->NewByteArray(kSize, AllocationType::kOld);
   1037  for (uint32_t i = 0; i < kSize; i++) {
   1038    ba->set(i, templ[i]);
   1039  }
   1040  masm->CheckBitInTable(ba, on_bit_set);
   1041  if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
   1042 }
   1043 
   1044 void CutOutRange(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges,
   1045                 uint32_t start_index, uint32_t end_index, uint32_t cut_index,
   1046                 Label* even_label, Label* odd_label) {
   1047  bool odd = (((cut_index - start_index) & 1) == 1);
   1048  Label* in_range_label = odd ? odd_label : even_label;
   1049  Label dummy;
   1050  EmitDoubleBoundaryTest(masm, ranges->at(cut_index),
   1051                         ranges->at(cut_index + 1) - 1, &dummy, in_range_label,
   1052                         &dummy);
   1053  DCHECK(!dummy.is_linked());
   1054  // Cut out the single range by rewriting the array.  This creates a new
   1055  // range that is a merger of the two ranges on either side of the one we
   1056  // are cutting out.  The oddity of the labels is preserved.
   1057  for (uint32_t j = cut_index; j > start_index; j--) {
   1058    ranges->at(j) = ranges->at(j - 1);
   1059  }
   1060  for (uint32_t j = cut_index + 1; j < end_index; j++) {
   1061    ranges->at(j) = ranges->at(j + 1);
   1062  }
   1063 }
   1064 
   1065 // Unicode case.  Split the search space into kSize spaces that are handled
   1066 // with recursion.
   1067 void SplitSearchSpace(ZoneList<base::uc32>* ranges, uint32_t start_index,
   1068                      uint32_t end_index, uint32_t* new_start_index,
   1069                      uint32_t* new_end_index, base::uc32* border) {
   1070  static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
   1071  static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
   1072 
   1073  base::uc32 first = ranges->at(start_index);
   1074  base::uc32 last = ranges->at(end_index) - 1;
   1075 
   1076  *new_start_index = start_index;
   1077  *border = (ranges->at(start_index) & ~kMask) + kSize;
   1078  while (*new_start_index < end_index) {
   1079    if (ranges->at(*new_start_index) > *border) break;
   1080    (*new_start_index)++;
   1081  }
   1082  // new_start_index is the index of the first edge that is beyond the
   1083  // current kSize space.
   1084 
   1085  // For very large search spaces we do a binary chop search of the non-Latin1
   1086  // space instead of just going to the end of the current kSize space.  The
   1087  // heuristics are complicated a little by the fact that any 128-character
   1088  // encoding space can be quickly tested with a table lookup, so we don't
   1089  // wish to do binary chop search at a smaller granularity than that.  A
   1090  // 128-character space can take up a lot of space in the ranges array if,
   1091  // for example, we only want to match every second character (eg. the lower
   1092  // case characters on some Unicode pages).
   1093  uint32_t binary_chop_index = (end_index + start_index) / 2;
   1094  // The first test ensures that we get to the code that handles the Latin1
   1095  // range with a single not-taken branch, speeding up this important
   1096  // character range (even non-Latin1 charset-based text has spaces and
   1097  // punctuation).
   1098  if (*border - 1 > String::kMaxOneByteCharCode &&  // Latin1 case.
   1099      end_index - start_index > (*new_start_index - start_index) * 2 &&
   1100      last - first > kSize * 2 && binary_chop_index > *new_start_index &&
   1101      ranges->at(binary_chop_index) >= first + 2 * kSize) {
   1102    uint32_t scan_forward_for_section_border = binary_chop_index;
   1103    uint32_t new_border = (ranges->at(binary_chop_index) | kMask) + 1;
   1104 
   1105    while (scan_forward_for_section_border < end_index) {
   1106      if (ranges->at(scan_forward_for_section_border) > new_border) {
   1107        *new_start_index = scan_forward_for_section_border;
   1108        *border = new_border;
   1109        break;
   1110      }
   1111      scan_forward_for_section_border++;
   1112    }
   1113  }
   1114 
   1115  DCHECK(*new_start_index > start_index);
   1116  *new_end_index = *new_start_index - 1;
   1117  if (ranges->at(*new_end_index) == *border) {
   1118    (*new_end_index)--;
   1119  }
   1120  if (*border >= ranges->at(end_index)) {
   1121    *border = ranges->at(end_index);
   1122    *new_start_index = end_index;  // Won't be used.
   1123    *new_end_index = end_index - 1;
   1124  }
   1125 }
   1126 
   1127 // Gets a series of segment boundaries representing a character class.  If the
   1128 // character is in the range between an even and an odd boundary (counting from
   1129 // start_index) then go to even_label, otherwise go to odd_label.  We already
   1130 // know that the character is in the range of min_char to max_char inclusive.
   1131 // Either label can be nullptr indicating backtracking.  Either label can also
   1132 // be equal to the fall_through label.
   1133 void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges,
   1134                      uint32_t start_index, uint32_t end_index,
   1135                      base::uc32 min_char, base::uc32 max_char,
   1136                      Label* fall_through, Label* even_label,
   1137                      Label* odd_label) {
   1138  DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
   1139  DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
   1140 
   1141  base::uc32 first = ranges->at(start_index);
   1142  base::uc32 last = ranges->at(end_index) - 1;
   1143 
   1144  DCHECK_LT(min_char, first);
   1145 
   1146  // Just need to test if the character is before or on-or-after
   1147  // a particular character.
   1148  if (start_index == end_index) {
   1149    EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
   1150    return;
   1151  }
   1152 
   1153  // Another almost trivial case:  There is one interval in the middle that is
   1154  // different from the end intervals.
   1155  if (start_index + 1 == end_index) {
   1156    EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label,
   1157                           odd_label);
   1158    return;
   1159  }
   1160 
   1161  // It's not worth using table lookup if there are very few intervals in the
   1162  // character class.
   1163  if (end_index - start_index <= 6) {
   1164    // It is faster to test for individual characters, so we look for those
   1165    // first, then try arbitrary ranges in the second round.
   1166    static uint32_t kNoCutIndex = -1;
   1167    uint32_t cut = kNoCutIndex;
   1168    for (uint32_t i = start_index; i < end_index; i++) {
   1169      if (ranges->at(i) == ranges->at(i + 1) - 1) {
   1170        cut = i;
   1171        break;
   1172      }
   1173    }
   1174    if (cut == kNoCutIndex) cut = start_index;
   1175    CutOutRange(masm, ranges, start_index, end_index, cut, even_label,
   1176                odd_label);
   1177    DCHECK_GE(end_index - start_index, 2);
   1178    GenerateBranches(masm, ranges, start_index + 1, end_index - 1, min_char,
   1179                     max_char, fall_through, even_label, odd_label);
   1180    return;
   1181  }
   1182 
   1183  // If there are a lot of intervals in the regexp, then we will use tables to
   1184  // determine whether the character is inside or outside the character class.
   1185  static const int kBits = RegExpMacroAssembler::kTableSizeBits;
   1186 
   1187  if ((max_char >> kBits) == (min_char >> kBits)) {
   1188    EmitUseLookupTable(masm, ranges, start_index, end_index, min_char,
   1189                       fall_through, even_label, odd_label);
   1190    return;
   1191  }
   1192 
   1193  if ((min_char >> kBits) != first >> kBits) {
   1194    masm->CheckCharacterLT(first, odd_label);
   1195    GenerateBranches(masm, ranges, start_index + 1, end_index, first, max_char,
   1196                     fall_through, odd_label, even_label);
   1197    return;
   1198  }
   1199 
   1200  uint32_t new_start_index = 0;
   1201  uint32_t new_end_index = 0;
   1202  base::uc32 border = 0;
   1203 
   1204  SplitSearchSpace(ranges, start_index, end_index, &new_start_index,
   1205                   &new_end_index, &border);
   1206 
   1207  Label handle_rest;
   1208  Label* above = &handle_rest;
   1209  if (border == last + 1) {
   1210    // We didn't find any section that started after the limit, so everything
   1211    // above the border is one of the terminal labels.
   1212    above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
   1213    DCHECK(new_end_index == end_index - 1);
   1214  }
   1215 
   1216  DCHECK_LE(start_index, new_end_index);
   1217  DCHECK_LE(new_start_index, end_index);
   1218  DCHECK_LT(start_index, new_start_index);
   1219  DCHECK_LT(new_end_index, end_index);
   1220  DCHECK(new_end_index + 1 == new_start_index ||
   1221         (new_end_index + 2 == new_start_index &&
   1222          border == ranges->at(new_end_index + 1)));
   1223  DCHECK_LT(min_char, border - 1);
   1224  DCHECK_LT(border, max_char);
   1225  DCHECK_LT(ranges->at(new_end_index), border);
   1226  DCHECK(border < ranges->at(new_start_index) ||
   1227         (border == ranges->at(new_start_index) &&
   1228          new_start_index == end_index && new_end_index == end_index - 1 &&
   1229          border == last + 1));
   1230  DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
   1231 
   1232  masm->CheckCharacterGT(border - 1, above);
   1233  Label dummy;
   1234  GenerateBranches(masm, ranges, start_index, new_end_index, min_char,
   1235                   border - 1, &dummy, even_label, odd_label);
   1236  if (handle_rest.is_linked()) {
   1237    masm->Bind(&handle_rest);
   1238    bool flip = (new_start_index & 1) != (start_index & 1);
   1239    GenerateBranches(masm, ranges, new_start_index, end_index, border, max_char,
   1240                     &dummy, flip ? odd_label : even_label,
   1241                     flip ? even_label : odd_label);
   1242  }
   1243 }
   1244 
   1245 void EmitClassRanges(RegExpMacroAssembler* macro_assembler,
   1246                     RegExpClassRanges* cr, bool one_byte, Label* on_failure,
   1247                     int cp_offset, bool check_offset, bool preloaded,
   1248                     Zone* zone) {
   1249  ZoneList<CharacterRange>* ranges = cr->ranges(zone);
   1250  CharacterRange::Canonicalize(ranges);
   1251 
   1252  // Now that all processing (like case-insensitivity) is done, clamp the
   1253  // ranges to the set of ranges that may actually occur in the subject string.
   1254  if (one_byte) CharacterRange::ClampToOneByte(ranges);
   1255 
   1256  const int ranges_length = ranges->length();
   1257  if (ranges_length == 0) {
   1258    if (!cr->is_negated()) {
   1259      macro_assembler->GoTo(on_failure);
   1260    }
   1261    if (check_offset) {
   1262      macro_assembler->CheckPosition(cp_offset, on_failure);
   1263    }
   1264    return;
   1265  }
   1266 
   1267  const base::uc32 max_char = MaxCodeUnit(one_byte);
   1268  if (ranges_length == 1 && ranges->at(0).IsEverything(max_char)) {
   1269    if (cr->is_negated()) {
   1270      macro_assembler->GoTo(on_failure);
   1271    } else {
   1272      // This is a common case hit by non-anchored expressions.
   1273      if (check_offset) {
   1274        macro_assembler->CheckPosition(cp_offset, on_failure);
   1275      }
   1276    }
   1277    return;
   1278  }
   1279 
   1280  if (!preloaded) {
   1281    macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
   1282  }
   1283 
   1284  if (cr->is_standard(zone) && macro_assembler->CheckSpecialClassRanges(
   1285                                   cr->standard_type(), on_failure)) {
   1286    return;
   1287  }
   1288 
   1289  static constexpr int kMaxRangesForInlineBranchGeneration = 16;
   1290  if (ranges_length > kMaxRangesForInlineBranchGeneration) {
   1291    // For large range sets, emit a more compact instruction sequence to avoid
   1292    // a potentially problematic increase in code size.
   1293    // Note the flipped logic below (we check InRange if negated, NotInRange if
   1294    // not negated); this is necessary since the method falls through on
   1295    // failure whereas we want to fall through on success.
   1296    if (cr->is_negated()) {
   1297      if (macro_assembler->CheckCharacterInRangeArray(ranges, on_failure)) {
   1298        return;
   1299      }
   1300    } else {
   1301      if (macro_assembler->CheckCharacterNotInRangeArray(ranges, on_failure)) {
   1302        return;
   1303      }
   1304    }
   1305  }
   1306 
   1307  // Generate a flat list of range boundaries for consumption by
   1308  // GenerateBranches. See the comment on that function for how the list should
   1309  // be structured
   1310  ZoneList<base::uc32>* range_boundaries =
   1311      zone->New<ZoneList<base::uc32>>(ranges_length * 2, zone);
   1312 
   1313  bool zeroth_entry_is_failure = !cr->is_negated();
   1314 
   1315  for (int i = 0; i < ranges_length; i++) {
   1316    CharacterRange& range = ranges->at(i);
   1317    if (range.from() == 0) {
   1318      DCHECK_EQ(i, 0);
   1319      zeroth_entry_is_failure = !zeroth_entry_is_failure;
   1320    } else {
   1321      range_boundaries->Add(range.from(), zone);
   1322    }
   1323    // `+ 1` to convert from inclusive to exclusive `to`.
   1324    // [from, to] == [from, to+1[.
   1325    range_boundaries->Add(range.to() + 1, zone);
   1326  }
   1327  int end_index = range_boundaries->length() - 1;
   1328  if (range_boundaries->at(end_index) > max_char) {
   1329    end_index--;
   1330  }
   1331 
   1332  Label fall_through;
   1333  GenerateBranches(macro_assembler, range_boundaries,
   1334                   0,  // start_index.
   1335                   end_index,
   1336                   0,  // min_char.
   1337                   max_char, &fall_through,
   1338                   zeroth_entry_is_failure ? &fall_through : on_failure,
   1339                   zeroth_entry_is_failure ? on_failure : &fall_through);
   1340  macro_assembler->Bind(&fall_through);
   1341 }
   1342 
   1343 }  // namespace
   1344 
   1345 RegExpNode::~RegExpNode() = default;
   1346 
   1347 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
   1348                                                  Trace* trace) {
   1349  // If we are generating a fixed length loop then don't stop and don't reuse
   1350  // code.
   1351  if (trace->fixed_length_loop_state() != nullptr) {
   1352    return CONTINUE;
   1353  }
   1354 
   1355  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   1356  if (trace->is_trivial()) {
   1357    if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
   1358      // If a generic version is already scheduled to be generated or we have
   1359      // recursed too deeply then just generate a jump to that code.
   1360      macro_assembler->GoTo(&label_);
   1361      // This will queue it up for generation of a generic version if it hasn't
   1362      // already been queued.
   1363      compiler->AddWork(this);
   1364      return DONE;
   1365    }
   1366    // Generate generic version of the node and bind the label for later use.
   1367    macro_assembler->Bind(&label_);
   1368    return CONTINUE;
   1369  }
   1370 
   1371  // We are being asked to make a non-generic version.  Keep track of how many
   1372  // non-generic versions we generate so as not to overdo it.
   1373  trace_count_++;
   1374  if (KeepRecursing(compiler) && compiler->optimize() &&
   1375      trace_count_ < kMaxCopiesCodeGenerated) {
   1376    return CONTINUE;
   1377  }
   1378 
   1379  // If we get here code has been generated for this node too many times or
   1380  // recursion is too deep.  Time to switch to a generic version.  The code for
   1381  // generic versions above can handle deep recursion properly.
   1382  bool was_limiting = compiler->limiting_recursion();
   1383  compiler->set_limiting_recursion(true);
   1384  trace->Flush(compiler, this);
   1385  compiler->set_limiting_recursion(was_limiting);
   1386  return DONE;
   1387 }
   1388 
   1389 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
   1390  return !compiler->limiting_recursion() &&
   1391         compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
   1392 }
   1393 
   1394 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
   1395                              BoyerMooreLookahead* bm, bool not_at_start) {
   1396  std::optional<RegExpFlags> old_flags;
   1397  if (action_type_ == MODIFY_FLAGS) {
   1398    // It is not guaranteed that we hit the resetting modify flags node, due to
   1399    // recursion budget limitation for filling in BMInfo. Therefore we reset the
   1400    // flags manually to the previous state after recursing.
   1401    old_flags = bm->compiler()->flags();
   1402    bm->compiler()->set_flags(flags());
   1403  }
   1404  if (action_type_ == BEGIN_POSITIVE_SUBMATCH) {
   1405    // We use the node after the lookaround to fill in the eats_at_least info
   1406    // so we have to use the same node to fill in the Boyer-Moore info.
   1407    success_node()->on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
   1408                                               not_at_start);
   1409  } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
   1410    // We don't use the node after a positive submatch success because it
   1411    // rewinds the position.  Since we returned 0 as the eats_at_least value for
   1412    // this node, we don't need to fill in any data.
   1413    on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
   1414  }
   1415  SaveBMInfo(bm, not_at_start, offset);
   1416  if (old_flags.has_value()) {
   1417    bm->compiler()->set_flags(*old_flags);
   1418  }
   1419 }
   1420 
   1421 void ActionNode::GetQuickCheckDetails(QuickCheckDetails* details,
   1422                                      RegExpCompiler* compiler, int filled_in,
   1423                                      bool not_at_start) {
   1424  if (action_type_ == SET_REGISTER_FOR_LOOP) {
   1425    on_success()->GetQuickCheckDetailsFromLoopEntry(details, compiler,
   1426                                                    filled_in, not_at_start);
   1427  } else if (action_type_ == BEGIN_POSITIVE_SUBMATCH) {
   1428    // We use the node after the lookaround to fill in the eats_at_least info
   1429    // so we have to use the same node to fill in the QuickCheck info.
   1430    success_node()->on_success()->GetQuickCheckDetails(details, compiler,
   1431                                                       filled_in, not_at_start);
   1432  } else if (action_type() != POSITIVE_SUBMATCH_SUCCESS) {
   1433    // We don't use the node after a positive submatch success because it
   1434    // rewinds the position.  Since we returned 0 as the eats_at_least value
   1435    // for this node, we don't need to fill in any data.
   1436    std::optional<RegExpFlags> old_flags;
   1437    if (action_type() == MODIFY_FLAGS) {
   1438      // It is not guaranteed that we hit the resetting modify flags node, as
   1439      // GetQuickCheckDetails doesn't travers the whole graph. Therefore we
   1440      // reset the flags manually to the previous state after recursing.
   1441      old_flags = compiler->flags();
   1442      compiler->set_flags(flags());
   1443    }
   1444    on_success()->GetQuickCheckDetails(details, compiler, filled_in,
   1445                                       not_at_start);
   1446    if (old_flags.has_value()) {
   1447      compiler->set_flags(*old_flags);
   1448    }
   1449  }
   1450 }
   1451 
   1452 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
   1453                                 BoyerMooreLookahead* bm, bool not_at_start) {
   1454  // Match the behaviour of EatsAtLeast on this node.
   1455  if (assertion_type() == AT_START && not_at_start) return;
   1456  on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
   1457  SaveBMInfo(bm, not_at_start, offset);
   1458 }
   1459 
   1460 void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
   1461    QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
   1462    bool not_at_start) {
   1463  RegExpNode* node = continue_node();
   1464  return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
   1465 }
   1466 
   1467 namespace {
   1468 
   1469 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
   1470 inline uint32_t SmearBitsRight(uint32_t v) {
   1471  v |= v >> 1;
   1472  v |= v >> 2;
   1473  v |= v >> 4;
   1474  v |= v >> 8;
   1475  v |= v >> 16;
   1476  return v;
   1477 }
   1478 
   1479 }  // namespace
   1480 
   1481 bool QuickCheckDetails::Rationalize(bool asc) {
   1482  bool found_useful_op = false;
   1483  const uint32_t char_mask = CharMask(asc);
   1484  mask_ = 0;
   1485  value_ = 0;
   1486  int char_shift = 0;
   1487  for (int i = 0; i < characters_; i++) {
   1488    Position* pos = &positions_[i];
   1489    if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
   1490      found_useful_op = true;
   1491    }
   1492    mask_ |= (pos->mask & char_mask) << char_shift;
   1493    value_ |= (pos->value & char_mask) << char_shift;
   1494    char_shift += asc ? 8 : 16;
   1495  }
   1496  return found_useful_op;
   1497 }
   1498 
   1499 uint32_t RegExpNode::EatsAtLeast(bool not_at_start) {
   1500  return not_at_start ? eats_at_least_.eats_at_least_from_not_start
   1501                      : eats_at_least_.eats_at_least_from_possibly_start;
   1502 }
   1503 
   1504 EatsAtLeastInfo RegExpNode::EatsAtLeastFromLoopEntry() {
   1505  // SET_REGISTER_FOR_LOOP is only used to initialize loop counters, and it
   1506  // implies that the following node must be a LoopChoiceNode. If we need to
   1507  // set registers to constant values for other reasons, we could introduce a
   1508  // new action type SET_REGISTER that doesn't imply anything about its
   1509  // successor.
   1510  UNREACHABLE();
   1511 }
   1512 
   1513 void RegExpNode::GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
   1514                                                   RegExpCompiler* compiler,
   1515                                                   int characters_filled_in,
   1516                                                   bool not_at_start) {
   1517  // See comment in RegExpNode::EatsAtLeastFromLoopEntry.
   1518  UNREACHABLE();
   1519 }
   1520 
   1521 EatsAtLeastInfo LoopChoiceNode::EatsAtLeastFromLoopEntry() {
   1522  DCHECK_EQ(alternatives_->length(), 2);  // There's just loop and continue.
   1523 
   1524  if (read_backward()) {
   1525    // The eats_at_least value is not used if reading backward. The
   1526    // EatsAtLeastPropagator should've zeroed it as well.
   1527    DCHECK_EQ(eats_at_least_info()->eats_at_least_from_possibly_start, 0);
   1528    DCHECK_EQ(eats_at_least_info()->eats_at_least_from_not_start, 0);
   1529    return {};
   1530  }
   1531 
   1532  // Figure out how much the loop body itself eats, not including anything in
   1533  // the continuation case. In general, the nodes in the loop body should report
   1534  // that they eat at least the number eaten by the continuation node, since any
   1535  // successful match in the loop body must also include the continuation node.
   1536  // However, in some cases involving positive lookaround, the loop body under-
   1537  // reports its appetite, so use saturated math here to avoid negative numbers.
   1538  // For this to work correctly, we explicitly need to use signed integers here.
   1539  uint8_t loop_body_from_not_start = base::saturated_cast<uint8_t>(
   1540      static_cast<int>(loop_node_->EatsAtLeast(true)) -
   1541      static_cast<int>(continue_node_->EatsAtLeast(true)));
   1542  uint8_t loop_body_from_possibly_start = base::saturated_cast<uint8_t>(
   1543      static_cast<int>(loop_node_->EatsAtLeast(false)) -
   1544      static_cast<int>(continue_node_->EatsAtLeast(true)));
   1545 
   1546  // Limit the number of loop iterations to avoid overflow in subsequent steps.
   1547  int loop_iterations = base::saturated_cast<uint8_t>(min_loop_iterations());
   1548 
   1549  EatsAtLeastInfo result;
   1550  result.eats_at_least_from_not_start =
   1551      base::saturated_cast<uint8_t>(loop_iterations * loop_body_from_not_start +
   1552                                    continue_node_->EatsAtLeast(true));
   1553  if (loop_iterations > 0 && loop_body_from_possibly_start > 0) {
   1554    // First loop iteration eats at least one, so all subsequent iterations
   1555    // and the after-loop chunk are guaranteed to not be at the start.
   1556    result.eats_at_least_from_possibly_start = base::saturated_cast<uint8_t>(
   1557        loop_body_from_possibly_start +
   1558        (loop_iterations - 1) * loop_body_from_not_start +
   1559        continue_node_->EatsAtLeast(true));
   1560  } else {
   1561    // Loop body might eat nothing, so only continue node contributes.
   1562    result.eats_at_least_from_possibly_start =
   1563        continue_node_->EatsAtLeast(false);
   1564  }
   1565  return result;
   1566 }
   1567 
   1568 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
   1569                                Trace* bounds_check_trace, Trace* trace,
   1570                                bool preload_has_checked_bounds,
   1571                                Label* on_possible_success,
   1572                                QuickCheckDetails* details,
   1573                                bool fall_through_on_failure,
   1574                                ChoiceNode* predecessor) {
   1575  DCHECK_NOT_NULL(predecessor);
   1576  if (details->characters() == 0) return false;
   1577  GetQuickCheckDetails(details, compiler, 0,
   1578                       trace->at_start() == Trace::FALSE_VALUE);
   1579  if (details->cannot_match()) return false;
   1580  if (!details->Rationalize(compiler->one_byte())) return false;
   1581  DCHECK(details->characters() == 1 ||
   1582         compiler->macro_assembler()->CanReadUnaligned());
   1583  uint32_t mask = details->mask();
   1584  uint32_t value = details->value();
   1585 
   1586  RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1587 
   1588  if (trace->characters_preloaded() != details->characters()) {
   1589    DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
   1590    // The bounds check is performed using the minimum number of characters
   1591    // any choice would eat, so if the bounds check fails, then none of the
   1592    // choices can succeed, so we can just immediately backtrack, rather
   1593    // than go to the next choice. The number of characters preloaded may be
   1594    // less than the number used for the bounds check.
   1595    int eats_at_least = predecessor->EatsAtLeast(
   1596        bounds_check_trace->at_start() == Trace::FALSE_VALUE);
   1597    DCHECK_GE(eats_at_least, details->characters());
   1598    assembler->LoadCurrentCharacter(
   1599        trace->cp_offset(), bounds_check_trace->backtrack(),
   1600        !preload_has_checked_bounds, details->characters(), eats_at_least);
   1601  }
   1602 
   1603  bool need_mask = true;
   1604 
   1605  if (details->characters() == 1) {
   1606    // If number of characters preloaded is 1 then we used a byte or 16 bit
   1607    // load so the value is already masked down.
   1608    const uint32_t char_mask = CharMask(compiler->one_byte());
   1609    if ((mask & char_mask) == char_mask) need_mask = false;
   1610    mask &= char_mask;
   1611  } else {
   1612    // For 2-character preloads in one-byte mode or 1-character preloads in
   1613    // two-byte mode we also use a 16 bit load with zero extend.
   1614    static const uint32_t kTwoByteMask = 0xFFFF;
   1615    static const uint32_t kFourByteMask = 0xFFFFFFFF;
   1616    if (details->characters() == 2 && compiler->one_byte()) {
   1617      if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
   1618    } else if (details->characters() == 1 && !compiler->one_byte()) {
   1619      if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
   1620    } else {
   1621      if (mask == kFourByteMask) need_mask = false;
   1622    }
   1623  }
   1624 
   1625  if (fall_through_on_failure) {
   1626    if (need_mask) {
   1627      assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
   1628    } else {
   1629      assembler->CheckCharacter(value, on_possible_success);
   1630    }
   1631  } else {
   1632    if (need_mask) {
   1633      assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
   1634    } else {
   1635      assembler->CheckNotCharacter(value, trace->backtrack());
   1636    }
   1637  }
   1638  return true;
   1639 }
   1640 
   1641 // Here is the meat of GetQuickCheckDetails (see also the comment on the
   1642 // super-class in the .h file).
   1643 //
   1644 // We iterate along the text object, building up for each character a
   1645 // mask and value that can be used to test for a quick failure to match.
   1646 // The masks and values for the positions will be combined into a single
   1647 // machine word for the current character width in order to be used in
   1648 // generating a quick check.
   1649 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
   1650                                    RegExpCompiler* compiler,
   1651                                    int characters_filled_in,
   1652                                    bool not_at_start) {
   1653  // Do not collect any quick check details if the text node reads backward,
   1654  // since it reads in the opposite direction than we use for quick checks.
   1655  if (read_backward()) return;
   1656  Isolate* isolate = compiler->macro_assembler()->isolate();
   1657  DCHECK(characters_filled_in < details->characters());
   1658  int characters = details->characters();
   1659  const uint32_t char_mask = CharMask(compiler->one_byte());
   1660  for (int k = 0; k < elements()->length(); k++) {
   1661    TextElement elm = elements()->at(k);
   1662    if (elm.text_type() == TextElement::ATOM) {
   1663      base::Vector<const base::uc16> quarks = elm.atom()->data();
   1664      for (int i = 0; i < characters && i < quarks.length(); i++) {
   1665        QuickCheckDetails::Position* pos =
   1666            details->positions(characters_filled_in);
   1667        base::uc16 c = quarks[i];
   1668        if (IsIgnoreCase(compiler->flags())) {
   1669          unibrow::uchar chars[4];
   1670          int length =
   1671              GetCaseIndependentLetters(isolate, c, compiler, chars, 4);
   1672          if (length == 0) {
   1673            // This can happen because all case variants are non-Latin1, but we
   1674            // know the input is Latin1.
   1675            details->set_cannot_match();
   1676            pos->determines_perfectly = false;
   1677            return;
   1678          }
   1679          if (length == 1) {
   1680            // This letter has no case equivalents, so it's nice and simple
   1681            // and the mask-compare will determine definitely whether we have
   1682            // a match at this character position.
   1683            pos->mask = char_mask;
   1684            pos->value = chars[0];
   1685            pos->determines_perfectly = true;
   1686          } else {
   1687            uint32_t common_bits = char_mask;
   1688            uint32_t bits = chars[0];
   1689            for (int j = 1; j < length; j++) {
   1690              uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
   1691              common_bits ^= differing_bits;
   1692              bits &= common_bits;
   1693            }
   1694            // If length is 2 and common bits has only one zero in it then
   1695            // our mask and compare instruction will determine definitely
   1696            // whether we have a match at this character position.  Otherwise
   1697            // it can only be an approximate check.
   1698            uint32_t one_zero = (common_bits | ~char_mask);
   1699            if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
   1700              pos->determines_perfectly = true;
   1701            }
   1702            pos->mask = common_bits;
   1703            pos->value = bits;
   1704          }
   1705        } else {
   1706          // Don't ignore case.  Nice simple case where the mask-compare will
   1707          // determine definitely whether we have a match at this character
   1708          // position.
   1709          if (c > char_mask) {
   1710            details->set_cannot_match();
   1711            pos->determines_perfectly = false;
   1712            return;
   1713          }
   1714          pos->mask = char_mask;
   1715          pos->value = c;
   1716          pos->determines_perfectly = true;
   1717        }
   1718        characters_filled_in++;
   1719        DCHECK(characters_filled_in <= details->characters());
   1720        if (characters_filled_in == details->characters()) {
   1721          return;
   1722        }
   1723      }
   1724    } else {
   1725      QuickCheckDetails::Position* pos =
   1726          details->positions(characters_filled_in);
   1727      RegExpClassRanges* tree = elm.class_ranges();
   1728      ZoneList<CharacterRange>* ranges = tree->ranges(zone());
   1729      if (tree->is_negated() || ranges->is_empty()) {
   1730        // A quick check uses multi-character mask and compare.  There is no
   1731        // useful way to incorporate a negative char class into this scheme
   1732        // so we just conservatively create a mask and value that will always
   1733        // succeed.
   1734        // Likewise for empty ranges (empty ranges can occur e.g. when
   1735        // compiling for one-byte subjects and impossible (non-one-byte) ranges
   1736        // have been removed).
   1737        pos->mask = 0;
   1738        pos->value = 0;
   1739      } else {
   1740        int first_range = 0;
   1741        while (ranges->at(first_range).from() > char_mask) {
   1742          first_range++;
   1743          if (first_range == ranges->length()) {
   1744            details->set_cannot_match();
   1745            pos->determines_perfectly = false;
   1746            return;
   1747          }
   1748        }
   1749        CharacterRange range = ranges->at(first_range);
   1750        const base::uc32 first_from = range.from();
   1751        const base::uc32 first_to =
   1752            (range.to() > char_mask) ? char_mask : range.to();
   1753        const uint32_t differing_bits = (first_from ^ first_to);
   1754        // A mask and compare is only perfect if the differing bits form a
   1755        // number like 00011111 with one single block of trailing 1s.
   1756        if ((differing_bits & (differing_bits + 1)) == 0 &&
   1757            first_from + differing_bits == first_to) {
   1758          pos->determines_perfectly = true;
   1759        }
   1760        uint32_t common_bits = ~SmearBitsRight(differing_bits);
   1761        uint32_t bits = (first_from & common_bits);
   1762        for (int i = first_range + 1; i < ranges->length(); i++) {
   1763          range = ranges->at(i);
   1764          const base::uc32 from = range.from();
   1765          if (from > char_mask) continue;
   1766          const base::uc32 to =
   1767              (range.to() > char_mask) ? char_mask : range.to();
   1768          // Here we are combining more ranges into the mask and compare
   1769          // value.  With each new range the mask becomes more sparse and
   1770          // so the chances of a false positive rise.  A character class
   1771          // with multiple ranges is assumed never to be equivalent to a
   1772          // mask and compare operation.
   1773          pos->determines_perfectly = false;
   1774          uint32_t new_common_bits = (from ^ to);
   1775          new_common_bits = ~SmearBitsRight(new_common_bits);
   1776          common_bits &= new_common_bits;
   1777          bits &= new_common_bits;
   1778          uint32_t new_differing_bits = (from & common_bits) ^ bits;
   1779          common_bits ^= new_differing_bits;
   1780          bits &= common_bits;
   1781        }
   1782        pos->mask = common_bits;
   1783        pos->value = bits;
   1784      }
   1785      characters_filled_in++;
   1786      DCHECK(characters_filled_in <= details->characters());
   1787      if (characters_filled_in == details->characters()) return;
   1788    }
   1789  }
   1790  DCHECK(characters_filled_in != details->characters());
   1791  if (!details->cannot_match()) {
   1792    on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in,
   1793                                       true);
   1794  }
   1795 }
   1796 
   1797 void QuickCheckDetails::Clear() {
   1798  for (int i = 0; i < characters_; i++) {
   1799    positions_[i].mask = 0;
   1800    positions_[i].value = 0;
   1801    positions_[i].determines_perfectly = false;
   1802  }
   1803  characters_ = 0;
   1804 }
   1805 
   1806 void QuickCheckDetails::Advance(int by, bool one_byte) {
   1807  if (by >= characters_ || by < 0) {
   1808    DCHECK_IMPLIES(by < 0, characters_ == 0);
   1809    Clear();
   1810    return;
   1811  }
   1812  DCHECK_LE(characters_ - by, 4);
   1813  DCHECK_LE(characters_, 4);
   1814  for (int i = 0; i < characters_ - by; i++) {
   1815    positions_[i] = positions_[by + i];
   1816  }
   1817  for (int i = characters_ - by; i < characters_; i++) {
   1818    positions_[i].mask = 0;
   1819    positions_[i].value = 0;
   1820    positions_[i].determines_perfectly = false;
   1821  }
   1822  characters_ -= by;
   1823  // We could change mask_ and value_ here but we would never advance unless
   1824  // they had already been used in a check and they won't be used again because
   1825  // it would gain us nothing.  So there's no point.
   1826 }
   1827 
   1828 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
   1829  DCHECK(characters_ == other->characters_);
   1830  if (other->cannot_match_) {
   1831    return;
   1832  }
   1833  if (cannot_match_) {
   1834    *this = *other;
   1835    return;
   1836  }
   1837  for (int i = from_index; i < characters_; i++) {
   1838    QuickCheckDetails::Position* pos = positions(i);
   1839    QuickCheckDetails::Position* other_pos = other->positions(i);
   1840    if (pos->mask != other_pos->mask || pos->value != other_pos->value ||
   1841        !other_pos->determines_perfectly) {
   1842      // Our mask-compare operation will be approximate unless we have the
   1843      // exact same operation on both sides of the alternation.
   1844      pos->determines_perfectly = false;
   1845    }
   1846    pos->mask &= other_pos->mask;
   1847    pos->value &= pos->mask;
   1848    other_pos->value &= pos->mask;
   1849    uint32_t differing_bits = (pos->value ^ other_pos->value);
   1850    pos->mask &= ~differing_bits;
   1851    pos->value &= pos->mask;
   1852  }
   1853 }
   1854 
   1855 class VisitMarker {
   1856 public:
   1857  explicit VisitMarker(NodeInfo* info) : info_(info) {
   1858    DCHECK(!info->visited);
   1859    info->visited = true;
   1860  }
   1861  ~VisitMarker() { info_->visited = false; }
   1862 
   1863 private:
   1864  NodeInfo* info_;
   1865 };
   1866 
   1867 // Temporarily sets traversed_loop_initialization_node_.
   1868 class LoopInitializationMarker {
   1869 public:
   1870  explicit LoopInitializationMarker(LoopChoiceNode* node) : node_(node) {
   1871    DCHECK(!node_->traversed_loop_initialization_node_);
   1872    node_->traversed_loop_initialization_node_ = true;
   1873  }
   1874  ~LoopInitializationMarker() {
   1875    DCHECK(node_->traversed_loop_initialization_node_);
   1876    node_->traversed_loop_initialization_node_ = false;
   1877  }
   1878  LoopInitializationMarker(const LoopInitializationMarker&) = delete;
   1879  LoopInitializationMarker& operator=(const LoopInitializationMarker&) = delete;
   1880 
   1881 private:
   1882  LoopChoiceNode* node_;
   1883 };
   1884 
   1885 // Temporarily decrements min_loop_iterations_.
   1886 class IterationDecrementer {
   1887 public:
   1888  explicit IterationDecrementer(LoopChoiceNode* node) : node_(node) {
   1889    DCHECK_GT(node_->min_loop_iterations_, 0);
   1890    --node_->min_loop_iterations_;
   1891  }
   1892  ~IterationDecrementer() { ++node_->min_loop_iterations_; }
   1893  IterationDecrementer(const IterationDecrementer&) = delete;
   1894  IterationDecrementer& operator=(const IterationDecrementer&) = delete;
   1895 
   1896 private:
   1897  LoopChoiceNode* node_;
   1898 };
   1899 
   1900 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpCompiler* compiler) {
   1901  if (info()->replacement_calculated) return replacement();
   1902  if (depth < 0) return this;
   1903  DCHECK(!info()->visited);
   1904  VisitMarker marker(info());
   1905  return FilterSuccessor(depth - 1, compiler);
   1906 }
   1907 
   1908 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth,
   1909                                           RegExpCompiler* compiler) {
   1910  RegExpNode* next = on_success_->FilterOneByte(depth - 1, compiler);
   1911  if (next == nullptr) return set_replacement(nullptr);
   1912  on_success_ = next;
   1913  return set_replacement(this);
   1914 }
   1915 
   1916 // We need to check for the following characters: 0x39C 0x3BC 0x178.
   1917 bool RangeContainsLatin1Equivalents(CharacterRange range) {
   1918  // TODO(dcarney): this could be a lot more efficient.
   1919  return range.Contains(0x039C) || range.Contains(0x03BC) ||
   1920         range.Contains(0x0178);
   1921 }
   1922 
   1923 namespace {
   1924 
   1925 bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
   1926  for (int i = 0; i < ranges->length(); i++) {
   1927    // TODO(dcarney): this could be a lot more efficient.
   1928    if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
   1929  }
   1930  return false;
   1931 }
   1932 
   1933 }  // namespace
   1934 
   1935 RegExpNode* TextNode::FilterOneByte(int depth, RegExpCompiler* compiler) {
   1936  RegExpFlags flags = compiler->flags();
   1937  if (info()->replacement_calculated) return replacement();
   1938  if (depth < 0) return this;
   1939  DCHECK(!info()->visited);
   1940  VisitMarker marker(info());
   1941  int element_count = elements()->length();
   1942  for (int i = 0; i < element_count; i++) {
   1943    TextElement elm = elements()->at(i);
   1944    if (elm.text_type() == TextElement::ATOM) {
   1945      base::Vector<const base::uc16> quarks = elm.atom()->data();
   1946      for (int j = 0; j < quarks.length(); j++) {
   1947        base::uc16 c = quarks[j];
   1948        if (!IsIgnoreCase(flags)) {
   1949          if (c > String::kMaxOneByteCharCode) return set_replacement(nullptr);
   1950        } else {
   1951          unibrow::uchar chars[4];
   1952          int length = GetCaseIndependentLetters(compiler->isolate(), c,
   1953                                                 compiler, chars, 4);
   1954          if (length == 0 || chars[0] > String::kMaxOneByteCharCode) {
   1955            return set_replacement(nullptr);
   1956          }
   1957        }
   1958      }
   1959    } else {
   1960      // A character class can also be impossible to match in one-byte mode.
   1961      DCHECK(elm.text_type() == TextElement::CLASS_RANGES);
   1962      RegExpClassRanges* cr = elm.class_ranges();
   1963      ZoneList<CharacterRange>* ranges = cr->ranges(zone());
   1964      CharacterRange::Canonicalize(ranges);
   1965      // Now they are in order so we only need to look at the first.
   1966      // If we are in non-Unicode case independent mode then we need
   1967      // to be a bit careful here, because the character classes have
   1968      // not been case-desugared yet, but there are characters and ranges
   1969      // that can become Latin-1 when case is considered.
   1970      int range_count = ranges->length();
   1971      if (cr->is_negated()) {
   1972        if (range_count != 0 && ranges->at(0).from() == 0 &&
   1973            ranges->at(0).to() >= String::kMaxOneByteCharCode) {
   1974          bool case_complications = !IsEitherUnicode(flags) &&
   1975                                    IsIgnoreCase(flags) &&
   1976                                    RangesContainLatin1Equivalents(ranges);
   1977          if (!case_complications) {
   1978            return set_replacement(nullptr);
   1979          }
   1980        }
   1981      } else {
   1982        if (range_count == 0 ||
   1983            ranges->at(0).from() > String::kMaxOneByteCharCode) {
   1984          bool case_complications = !IsEitherUnicode(flags) &&
   1985                                    IsIgnoreCase(flags) &&
   1986                                    RangesContainLatin1Equivalents(ranges);
   1987          if (!case_complications) {
   1988            return set_replacement(nullptr);
   1989          }
   1990        }
   1991      }
   1992    }
   1993  }
   1994  return FilterSuccessor(depth - 1, compiler);
   1995 }
   1996 
   1997 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) {
   1998  if (info()->replacement_calculated) return replacement();
   1999  if (depth < 0) return this;
   2000  if (info()->visited) return this;
   2001  {
   2002    VisitMarker marker(info());
   2003 
   2004    RegExpNode* continue_replacement =
   2005        continue_node_->FilterOneByte(depth - 1, compiler);
   2006    // If we can't continue after the loop then there is no sense in doing the
   2007    // loop.
   2008    if (continue_replacement == nullptr) return set_replacement(nullptr);
   2009  }
   2010 
   2011  return ChoiceNode::FilterOneByte(depth - 1, compiler);
   2012 }
   2013 
   2014 RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) {
   2015  if (info()->replacement_calculated) return replacement();
   2016  if (depth < 0) return this;
   2017  if (info()->visited) return this;
   2018  VisitMarker marker(info());
   2019  int choice_count = alternatives_->length();
   2020 
   2021  for (int i = 0; i < choice_count; i++) {
   2022    GuardedAlternative alternative = alternatives_->at(i);
   2023    if (alternative.guards() != nullptr &&
   2024        alternative.guards()->length() != 0) {
   2025      set_replacement(this);
   2026      return this;
   2027    }
   2028  }
   2029 
   2030  int surviving = 0;
   2031  RegExpNode* survivor = nullptr;
   2032  for (int i = 0; i < choice_count; i++) {
   2033    GuardedAlternative alternative = alternatives_->at(i);
   2034    RegExpNode* replacement =
   2035        alternative.node()->FilterOneByte(depth - 1, compiler);
   2036    DCHECK(replacement != this);  // No missing EMPTY_MATCH_CHECK.
   2037    if (replacement != nullptr) {
   2038      alternatives_->at(i).set_node(replacement);
   2039      surviving++;
   2040      survivor = replacement;
   2041    }
   2042  }
   2043  if (surviving < 2) return set_replacement(survivor);
   2044 
   2045  set_replacement(this);
   2046  if (surviving == choice_count) {
   2047    return this;
   2048  }
   2049  // Only some of the nodes survived the filtering.  We need to rebuild the
   2050  // alternatives list.
   2051  ZoneList<GuardedAlternative>* new_alternatives =
   2052      zone()->New<ZoneList<GuardedAlternative>>(surviving, zone());
   2053  for (int i = 0; i < choice_count; i++) {
   2054    RegExpNode* replacement =
   2055        alternatives_->at(i).node()->FilterOneByte(depth - 1, compiler);
   2056    if (replacement != nullptr) {
   2057      alternatives_->at(i).set_node(replacement);
   2058      new_alternatives->Add(alternatives_->at(i), zone());
   2059    }
   2060  }
   2061  alternatives_ = new_alternatives;
   2062  return this;
   2063 }
   2064 
   2065 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(
   2066    int depth, RegExpCompiler* compiler) {
   2067  if (info()->replacement_calculated) return replacement();
   2068  if (depth < 0) return this;
   2069  if (info()->visited) return this;
   2070  VisitMarker marker(info());
   2071  // Alternative 0 is the negative lookahead, alternative 1 is what comes
   2072  // afterwards.
   2073  RegExpNode* node = continue_node();
   2074  RegExpNode* replacement = node->FilterOneByte(depth - 1, compiler);
   2075  if (replacement == nullptr) return set_replacement(nullptr);
   2076  alternatives_->at(kContinueIndex).set_node(replacement);
   2077 
   2078  RegExpNode* neg_node = lookaround_node();
   2079  RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, compiler);
   2080  // If the negative lookahead is always going to fail then
   2081  // we don't need to check it.
   2082  if (neg_replacement == nullptr) return set_replacement(replacement);
   2083  alternatives_->at(kLookaroundIndex).set_node(neg_replacement);
   2084  return set_replacement(this);
   2085 }
   2086 
   2087 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
   2088                                          RegExpCompiler* compiler,
   2089                                          int characters_filled_in,
   2090                                          bool not_at_start) {
   2091  if (body_can_be_zero_length_ || info()->visited) return;
   2092  not_at_start = not_at_start || this->not_at_start();
   2093  DCHECK_EQ(alternatives_->length(), 2);  // There's just loop and continue.
   2094  if (traversed_loop_initialization_node_ && min_loop_iterations_ > 0 &&
   2095      loop_node_->EatsAtLeast(not_at_start) >
   2096          continue_node_->EatsAtLeast(true)) {
   2097    // Loop body is guaranteed to execute at least once, and consume characters
   2098    // when it does, meaning the only possible quick checks from this point
   2099    // begin with the loop body. We may recursively visit this LoopChoiceNode,
   2100    // but we temporarily decrease its minimum iteration counter so we know when
   2101    // to check the continue case.
   2102    IterationDecrementer next_iteration(this);
   2103    loop_node_->GetQuickCheckDetails(details, compiler, characters_filled_in,
   2104                                     not_at_start);
   2105  } else {
   2106    // Might not consume anything in the loop body, so treat it like a normal
   2107    // ChoiceNode (and don't recursively visit this node again).
   2108    VisitMarker marker(info());
   2109    ChoiceNode::GetQuickCheckDetails(details, compiler, characters_filled_in,
   2110                                     not_at_start);
   2111  }
   2112 }
   2113 
   2114 void LoopChoiceNode::GetQuickCheckDetailsFromLoopEntry(
   2115    QuickCheckDetails* details, RegExpCompiler* compiler,
   2116    int characters_filled_in, bool not_at_start) {
   2117  if (traversed_loop_initialization_node_) {
   2118    // We already entered this loop once, exited via its continuation node, and
   2119    // followed an outer loop's back-edge to before the loop entry point. We
   2120    // could try to reset the minimum iteration count to its starting value at
   2121    // this point, but that seems like more trouble than it's worth. It's safe
   2122    // to keep going with the current (possibly reduced) minimum iteration
   2123    // count.
   2124    GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
   2125  } else {
   2126    // We are entering a loop via its counter initialization action, meaning we
   2127    // are guaranteed to run the loop body at least some minimum number of times
   2128    // before running the continuation node. Set a flag so that this node knows
   2129    // (now and any times we visit it again recursively) that it was entered
   2130    // from the top.
   2131    LoopInitializationMarker marker(this);
   2132    GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
   2133  }
   2134 }
   2135 
   2136 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
   2137                                  BoyerMooreLookahead* bm, bool not_at_start) {
   2138  if (body_can_be_zero_length_ || budget <= 0) {
   2139    bm->SetRest(offset);
   2140    SaveBMInfo(bm, not_at_start, offset);
   2141    return;
   2142  }
   2143  ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
   2144  SaveBMInfo(bm, not_at_start, offset);
   2145 }
   2146 
   2147 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
   2148                                      RegExpCompiler* compiler,
   2149                                      int characters_filled_in,
   2150                                      bool not_at_start) {
   2151  not_at_start = (not_at_start || not_at_start_);
   2152  int choice_count = alternatives_->length();
   2153  DCHECK_LT(0, choice_count);
   2154  alternatives_->at(0).node()->GetQuickCheckDetails(
   2155      details, compiler, characters_filled_in, not_at_start);
   2156  for (int i = 1; i < choice_count; i++) {
   2157    QuickCheckDetails new_details(details->characters());
   2158    RegExpNode* node = alternatives_->at(i).node();
   2159    node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in,
   2160                               not_at_start);
   2161    // Here we merge the quick match details of the two branches.
   2162    details->Merge(&new_details, characters_filled_in);
   2163  }
   2164 }
   2165 
   2166 namespace {
   2167 
   2168 // Check for [0-9A-Z_a-z].
   2169 void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word,
   2170                   Label* non_word, bool fall_through_on_word) {
   2171  if (assembler->CheckSpecialClassRanges(
   2172          fall_through_on_word ? StandardCharacterSet::kWord
   2173                               : StandardCharacterSet::kNotWord,
   2174          fall_through_on_word ? non_word : word)) {
   2175    // Optimized implementation available.
   2176    return;
   2177  }
   2178  assembler->CheckCharacterGT('z', non_word);
   2179  assembler->CheckCharacterLT('0', non_word);
   2180  assembler->CheckCharacterGT('a' - 1, word);
   2181  assembler->CheckCharacterLT('9' + 1, word);
   2182  assembler->CheckCharacterLT('A', non_word);
   2183  assembler->CheckCharacterLT('Z' + 1, word);
   2184  if (fall_through_on_word) {
   2185    assembler->CheckNotCharacter('_', non_word);
   2186  } else {
   2187    assembler->CheckCharacter('_', word);
   2188  }
   2189 }
   2190 
   2191 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
   2192 // that matches newline or the start of input).
   2193 void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) {
   2194  RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2195 
   2196  // We will load the previous character into the current character register.
   2197  Trace new_trace(*trace);
   2198  new_trace.InvalidateCurrentCharacter();
   2199 
   2200  // A positive (> 0) cp_offset means we've already successfully matched a
   2201  // non-empty-width part of the pattern, and thus cannot be at or before the
   2202  // start of the subject string. We can thus skip both at-start and
   2203  // bounds-checks when loading the one-character lookbehind.
   2204  const bool may_be_at_or_before_subject_string_start =
   2205      new_trace.cp_offset() <= 0;
   2206 
   2207  Label ok;
   2208  if (may_be_at_or_before_subject_string_start) {
   2209    // The start of input counts as a newline in this context, so skip to ok if
   2210    // we are at the start.
   2211    assembler->CheckAtStart(new_trace.cp_offset(), &ok);
   2212  }
   2213 
   2214  // If we've already checked that we are not at the start of input, it's okay
   2215  // to load the previous character without bounds checks.
   2216  const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
   2217  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
   2218                                  new_trace.backtrack(), can_skip_bounds_check);
   2219  if (!assembler->CheckSpecialClassRanges(StandardCharacterSet::kLineTerminator,
   2220                                          new_trace.backtrack())) {
   2221    // Newline means \n, \r, 0x2028 or 0x2029.
   2222    if (!compiler->one_byte()) {
   2223      assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
   2224    }
   2225    assembler->CheckCharacter('\n', &ok);
   2226    assembler->CheckNotCharacter('\r', new_trace.backtrack());
   2227  }
   2228  assembler->Bind(&ok);
   2229  on_success->Emit(compiler, &new_trace);
   2230 }
   2231 
   2232 }  // namespace
   2233 
   2234 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
   2235 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
   2236  RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2237  Isolate* isolate = assembler->isolate();
   2238  Trace::TriBool next_is_word_character = Trace::UNKNOWN;
   2239  bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
   2240  BoyerMooreLookahead* lookahead = bm_info(not_at_start);
   2241  if (lookahead == nullptr) {
   2242    int eats_at_least =
   2243        std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(not_at_start));
   2244    if (eats_at_least >= 1) {
   2245      BoyerMooreLookahead* bm =
   2246          zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
   2247      FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
   2248      if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE;
   2249      if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
   2250    }
   2251  } else {
   2252    if (lookahead->at(0)->is_non_word())
   2253      next_is_word_character = Trace::FALSE_VALUE;
   2254    if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
   2255  }
   2256  bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
   2257  if (next_is_word_character == Trace::UNKNOWN) {
   2258    Label before_non_word;
   2259    Label before_word;
   2260    if (trace->characters_preloaded() != 1) {
   2261      assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
   2262    }
   2263    // Fall through on non-word.
   2264    EmitWordCheck(assembler, &before_word, &before_non_word, false);
   2265    // Next character is not a word character.
   2266    assembler->Bind(&before_non_word);
   2267    Label ok;
   2268    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
   2269    assembler->GoTo(&ok);
   2270 
   2271    assembler->Bind(&before_word);
   2272    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
   2273    assembler->Bind(&ok);
   2274  } else if (next_is_word_character == Trace::TRUE_VALUE) {
   2275    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
   2276  } else {
   2277    DCHECK(next_is_word_character == Trace::FALSE_VALUE);
   2278    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
   2279  }
   2280 }
   2281 
   2282 void AssertionNode::BacktrackIfPrevious(
   2283    RegExpCompiler* compiler, Trace* trace,
   2284    AssertionNode::IfPrevious backtrack_if_previous) {
   2285  RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2286  Trace new_trace(*trace);
   2287  new_trace.InvalidateCurrentCharacter();
   2288 
   2289  Label fall_through;
   2290  Label* non_word = backtrack_if_previous == kIsNonWord ? new_trace.backtrack()
   2291                                                        : &fall_through;
   2292  Label* word = backtrack_if_previous == kIsNonWord ? &fall_through
   2293                                                    : new_trace.backtrack();
   2294 
   2295  // A positive (> 0) cp_offset means we've already successfully matched a
   2296  // non-empty-width part of the pattern, and thus cannot be at or before the
   2297  // start of the subject string. We can thus skip both at-start and
   2298  // bounds-checks when loading the one-character lookbehind.
   2299  const bool may_be_at_or_before_subject_string_start =
   2300      new_trace.cp_offset() <= 0;
   2301 
   2302  if (may_be_at_or_before_subject_string_start) {
   2303    // The start of input counts as a non-word character, so the question is
   2304    // decided if we are at the start.
   2305    assembler->CheckAtStart(new_trace.cp_offset(), non_word);
   2306  }
   2307 
   2308  // If we've already checked that we are not at the start of input, it's okay
   2309  // to load the previous character without bounds checks.
   2310  const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
   2311  static_assert(Trace::kCPOffsetSlack == 1);
   2312  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, non_word,
   2313                                  can_skip_bounds_check);
   2314  EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
   2315 
   2316  assembler->Bind(&fall_through);
   2317  on_success()->Emit(compiler, &new_trace);
   2318 }
   2319 
   2320 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
   2321                                         RegExpCompiler* compiler,
   2322                                         int filled_in, bool not_at_start) {
   2323  if (assertion_type_ == AT_START && not_at_start) {
   2324    details->set_cannot_match();
   2325    return;
   2326  }
   2327  return on_success()->GetQuickCheckDetails(details, compiler, filled_in,
   2328                                            not_at_start);
   2329 }
   2330 
   2331 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   2332  RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2333  switch (assertion_type_) {
   2334    case AT_END: {
   2335      Label ok;
   2336      assembler->CheckPosition(trace->cp_offset(), &ok);
   2337      assembler->GoTo(trace->backtrack());
   2338      assembler->Bind(&ok);
   2339      break;
   2340    }
   2341    case AT_START: {
   2342      if (trace->at_start() == Trace::FALSE_VALUE) {
   2343        assembler->GoTo(trace->backtrack());
   2344        return;
   2345      }
   2346      if (trace->at_start() == Trace::UNKNOWN) {
   2347        assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
   2348        Trace at_start_trace = *trace;
   2349        at_start_trace.set_at_start(Trace::TRUE_VALUE);
   2350        on_success()->Emit(compiler, &at_start_trace);
   2351        return;
   2352      }
   2353    } break;
   2354    case AFTER_NEWLINE:
   2355      EmitHat(compiler, on_success(), trace);
   2356      return;
   2357    case AT_BOUNDARY:
   2358    case AT_NON_BOUNDARY: {
   2359      EmitBoundaryCheck(compiler, trace);
   2360      return;
   2361    }
   2362  }
   2363  on_success()->Emit(compiler, trace);
   2364 }
   2365 
   2366 namespace {
   2367 
   2368 bool DeterminedAlready(const QuickCheckDetails* quick_check, int offset) {
   2369  if (quick_check == nullptr) return false;
   2370  if (offset >= quick_check->characters()) return false;
   2371  return quick_check->positions(offset)->determines_perfectly;
   2372 }
   2373 
   2374 void UpdateBoundsCheck(int index, int* checked_up_to) {
   2375  if (index > *checked_up_to) {
   2376    *checked_up_to = index;
   2377  }
   2378 }
   2379 
   2380 }  // namespace
   2381 
   2382 // We call this repeatedly to generate code for each pass over the text node.
   2383 // The passes are in increasing order of difficulty because we hope one
   2384 // of the first passes will fail in which case we are saved the work of the
   2385 // later passes.  for example for the case independent regexp /%[asdfghjkl]a/
   2386 // we will check the '%' in the first pass, the case independent 'a' in the
   2387 // second pass and the character class in the last pass.
   2388 //
   2389 // The passes are done from right to left, so for example to test for /bar/
   2390 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
   2391 // and then a 'b' with offset 0.  This means we can avoid the end-of-input
   2392 // bounds check most of the time.  In the example we only need to check for
   2393 // end-of-input when loading the putative 'r'.
   2394 //
   2395 // A slight complication involves the fact that the first character may already
   2396 // be fetched into a register by the previous node.  In this case we want to
   2397 // do the test for that character first.  We do this in separate passes.  The
   2398 // 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
   2399 // pass has been performed then subsequent passes will have true in
   2400 // first_element_checked to indicate that that character does not need to be
   2401 // checked again.
   2402 //
   2403 // In addition to all this we are passed a Trace, which can
   2404 // contain an AlternativeGeneration object.  In this AlternativeGeneration
   2405 // object we can see details of any quick check that was already passed in
   2406 // order to get to the code we are now generating.  The quick check can involve
   2407 // loading characters, which means we do not need to recheck the bounds
   2408 // up to the limit the quick check already checked.  In addition the quick
   2409 // check can have involved a mask and compare operation which may simplify
   2410 // or obviate the need for further checks at some character positions.
   2411 void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
   2412                            bool preloaded, Trace* trace,
   2413                            bool first_element_checked, int* checked_up_to) {
   2414  RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2415  Isolate* isolate = assembler->isolate();
   2416  bool one_byte = compiler->one_byte();
   2417  Label* backtrack = trace->backtrack();
   2418  const QuickCheckDetails* quick_check = trace->quick_check_performed();
   2419  int element_count = elements()->length();
   2420  int backward_offset = read_backward() ? -Length() : 0;
   2421  for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
   2422    TextElement elm = elements()->at(i);
   2423    int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
   2424    if (elm.text_type() == TextElement::ATOM) {
   2425      base::Vector<const base::uc16> quarks = elm.atom()->data();
   2426      for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
   2427        if (first_element_checked && i == 0 && j == 0) continue;
   2428        if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
   2429        base::uc16 quark = quarks[j];
   2430        bool needs_bounds_check =
   2431            *checked_up_to < cp_offset + j || read_backward();
   2432        bool bounds_checked = false;
   2433        switch (pass) {
   2434          case NON_LATIN1_MATCH: {
   2435            DCHECK(one_byte);  // This pass is only done in one-byte mode.
   2436            if (IsIgnoreCase(compiler->flags())) {
   2437              // We are compiling for a one-byte subject, case independent mode.
   2438              // We have to check whether any of the case alternatives are in
   2439              // the one-byte range.
   2440              unibrow::uchar chars[4];
   2441              // Only returns characters that are in the one-byte range.
   2442              int length =
   2443                  GetCaseIndependentLetters(isolate, quark, compiler, chars, 4);
   2444              if (length == 0) {
   2445                assembler->GoTo(backtrack);
   2446                return;
   2447              }
   2448            } else {
   2449              // Case-dependent mode.
   2450              if (quark > String::kMaxOneByteCharCode) {
   2451                assembler->GoTo(backtrack);
   2452                return;
   2453              }
   2454            }
   2455            break;
   2456          }
   2457          case NON_LETTER_CHARACTER_MATCH:
   2458            bounds_checked =
   2459                EmitAtomNonLetter(isolate, compiler, quark, backtrack,
   2460                                  cp_offset + j, needs_bounds_check, preloaded);
   2461            break;
   2462          case SIMPLE_CHARACTER_MATCH:
   2463            bounds_checked = EmitSimpleCharacter(isolate, compiler, quark,
   2464                                                 backtrack, cp_offset + j,
   2465                                                 needs_bounds_check, preloaded);
   2466            break;
   2467          case CASE_CHARACTER_MATCH:
   2468            bounds_checked =
   2469                EmitAtomLetter(isolate, compiler, quark, backtrack,
   2470                               cp_offset + j, needs_bounds_check, preloaded);
   2471            break;
   2472          default:
   2473            break;
   2474        }
   2475        if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
   2476      }
   2477    } else {
   2478      DCHECK_EQ(TextElement::CLASS_RANGES, elm.text_type());
   2479      if (pass == CHARACTER_CLASS_MATCH) {
   2480        if (first_element_checked && i == 0) continue;
   2481        if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
   2482        RegExpClassRanges* cr = elm.class_ranges();
   2483        bool bounds_check = *checked_up_to < cp_offset || read_backward();
   2484        EmitClassRanges(assembler, cr, one_byte, backtrack, cp_offset,
   2485                        bounds_check, preloaded, zone());
   2486        UpdateBoundsCheck(cp_offset, checked_up_to);
   2487      }
   2488    }
   2489  }
   2490 }
   2491 
   2492 int TextNode::Length() {
   2493  TextElement elm = elements()->last();
   2494  DCHECK_LE(0, elm.cp_offset());
   2495  return elm.cp_offset() + elm.length();
   2496 }
   2497 
   2498 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
   2499                                             ZoneList<CharacterRange>* ranges,
   2500                                             bool read_backward,
   2501                                             RegExpNode* on_success) {
   2502  DCHECK_NOT_NULL(ranges);
   2503  // TODO(jgruber): There's no fundamental need to create this
   2504  // RegExpClassRanges; we could refactor to avoid the allocation.
   2505  return zone->New<TextNode>(zone->New<RegExpClassRanges>(zone, ranges),
   2506                             read_backward, on_success);
   2507 }
   2508 
   2509 TextNode* TextNode::CreateForSurrogatePair(
   2510    Zone* zone, CharacterRange lead, ZoneList<CharacterRange>* trail_ranges,
   2511    bool read_backward, RegExpNode* on_success) {
   2512  ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
   2513  if (lead.from() == lead.to()) {
   2514    ZoneList<base::uc16> lead_surrogate(1, zone);
   2515    lead_surrogate.Add(lead.from(), zone);
   2516    RegExpAtom* atom = zone->New<RegExpAtom>(lead_surrogate.ToConstVector());
   2517    elms->Add(TextElement::Atom(atom), zone);
   2518  } else {
   2519    ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
   2520    elms->Add(TextElement::ClassRanges(
   2521                  zone->New<RegExpClassRanges>(zone, lead_ranges)),
   2522              zone);
   2523  }
   2524  elms->Add(TextElement::ClassRanges(
   2525                zone->New<RegExpClassRanges>(zone, trail_ranges)),
   2526            zone);
   2527  return zone->New<TextNode>(elms, read_backward, on_success);
   2528 }
   2529 
   2530 TextNode* TextNode::CreateForSurrogatePair(
   2531    Zone* zone, ZoneList<CharacterRange>* lead_ranges, CharacterRange trail,
   2532    bool read_backward, RegExpNode* on_success) {
   2533  ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
   2534  ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
   2535  elms->Add(
   2536      TextElement::ClassRanges(zone->New<RegExpClassRanges>(zone, lead_ranges)),
   2537      zone);
   2538  elms->Add(TextElement::ClassRanges(
   2539                zone->New<RegExpClassRanges>(zone, trail_ranges)),
   2540            zone);
   2541  return zone->New<TextNode>(elms, read_backward, on_success);
   2542 }
   2543 
   2544 // This generates the code to match a text node.  A text node can contain
   2545 // straight character sequences (possibly to be matched in a case-independent
   2546 // way) and character classes.  For efficiency we do not do this in a single
   2547 // pass from left to right.  Instead we pass over the text node several times,
   2548 // emitting code for some character positions every time.  See the comment on
   2549 // TextEmitPass for details.
   2550 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   2551  LimitResult limit_result = LimitVersions(compiler, trace);
   2552  if (limit_result == DONE) return;
   2553  DCHECK(limit_result == CONTINUE);
   2554 
   2555  if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
   2556    compiler->SetRegExpTooBig();
   2557    return;
   2558  }
   2559 
   2560  if (compiler->one_byte()) {
   2561    int dummy = 0;
   2562    TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
   2563  }
   2564 
   2565  bool first_elt_done = false;
   2566  static_assert(Trace::kCPOffsetSlack == 1);
   2567  int bound_checked_to = trace->cp_offset() - 1;
   2568  bound_checked_to += trace->bound_checked_up_to();
   2569 
   2570  // If a character is preloaded into the current character register then
   2571  // check that first to save reloading it.
   2572  for (int twice = 0; twice < 2; twice++) {
   2573    bool is_preloaded_pass = twice == 0;
   2574    if (is_preloaded_pass && trace->characters_preloaded() != 1) continue;
   2575    if (IsIgnoreCase(compiler->flags())) {
   2576      TextEmitPass(compiler, NON_LETTER_CHARACTER_MATCH, is_preloaded_pass,
   2577                   trace, first_elt_done, &bound_checked_to);
   2578      TextEmitPass(compiler, CASE_CHARACTER_MATCH, is_preloaded_pass, trace,
   2579                   first_elt_done, &bound_checked_to);
   2580    } else {
   2581      TextEmitPass(compiler, SIMPLE_CHARACTER_MATCH, is_preloaded_pass, trace,
   2582                   first_elt_done, &bound_checked_to);
   2583    }
   2584    TextEmitPass(compiler, CHARACTER_CLASS_MATCH, is_preloaded_pass, trace,
   2585                 first_elt_done, &bound_checked_to);
   2586    first_elt_done = true;
   2587  }
   2588 
   2589  Trace successor_trace(*trace);
   2590  // If we advance backward, we may end up at the start.
   2591  successor_trace.AdvanceCurrentPositionInTrace(
   2592      read_backward() ? -Length() : Length(), compiler);
   2593  successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
   2594                                               : Trace::FALSE_VALUE);
   2595  RecursionCheck rc(compiler);
   2596  on_success()->Emit(compiler, &successor_trace);
   2597 }
   2598 
   2599 void Trace::InvalidateCurrentCharacter() { characters_preloaded_ = 0; }
   2600 
   2601 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
   2602  // We don't have an instruction for shifting the current character register
   2603  // down or for using a shifted value for anything so lets just forget that
   2604  // we preloaded any characters into it.
   2605  characters_preloaded_ = 0;
   2606  // Adjust the offsets of the quick check performed information.  This
   2607  // information is used to find out what we already determined about the
   2608  // characters by means of mask and compare.
   2609  quick_check_performed_.Advance(by, compiler->one_byte());
   2610  cp_offset_ += by;
   2611  static_assert(RegExpMacroAssembler::kMaxCPOffset ==
   2612                -RegExpMacroAssembler::kMinCPOffset);
   2613  if (std::abs(cp_offset_) + kCPOffsetSlack >
   2614      RegExpMacroAssembler::kMaxCPOffset) {
   2615    compiler->SetRegExpTooBig();
   2616    cp_offset_ = 0;
   2617  }
   2618  bound_checked_up_to_ = std::max(0, bound_checked_up_to_ - by);
   2619 }
   2620 
   2621 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte,
   2622                                   RegExpFlags flags) {
   2623  if (!IsIgnoreCase(flags)) return;
   2624 #ifdef V8_INTL_SUPPORT
   2625  // This is done in an earlier step when generating the nodes from the AST
   2626  // because we may have to split up into separate nodes.
   2627  if (NeedsUnicodeCaseEquivalents(flags)) return;
   2628 #endif
   2629 
   2630  int element_count = elements()->length();
   2631  for (int i = 0; i < element_count; i++) {
   2632    TextElement elm = elements()->at(i);
   2633    if (elm.text_type() == TextElement::CLASS_RANGES) {
   2634      RegExpClassRanges* cr = elm.class_ranges();
   2635      // None of the standard character classes is different in the case
   2636      // independent case and it slows us down if we don't know that.
   2637      if (cr->is_standard(zone())) continue;
   2638      ZoneList<CharacterRange>* ranges = cr->ranges(zone());
   2639      CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
   2640    }
   2641  }
   2642 }
   2643 
   2644 int TextNode::FixedLengthLoopLength() { return Length(); }
   2645 
   2646 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
   2647    RegExpCompiler* compiler) {
   2648  if (read_backward()) return nullptr;
   2649  if (elements()->length() != 1) return nullptr;
   2650  TextElement elm = elements()->at(0);
   2651  if (elm.text_type() != TextElement::CLASS_RANGES) return nullptr;
   2652  RegExpClassRanges* node = elm.class_ranges();
   2653  ZoneList<CharacterRange>* ranges = node->ranges(zone());
   2654  CharacterRange::Canonicalize(ranges);
   2655  if (node->is_negated()) {
   2656    return ranges->length() == 0 ? on_success() : nullptr;
   2657  }
   2658  if (ranges->length() != 1) return nullptr;
   2659  const base::uc32 max_char = MaxCodeUnit(compiler->one_byte());
   2660  return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
   2661 }
   2662 
   2663 // Finds the fixed match length of a sequence of nodes that goes from
   2664 // this alternative and back to this choice node.  If there are variable
   2665 // length nodes or other complications in the way then return a sentinel
   2666 // value indicating that a fixed length loop cannot be constructed.
   2667 int ChoiceNode::FixedLengthLoopLengthForAlternative(
   2668    GuardedAlternative* alternative) {
   2669  int length = 0;
   2670  RegExpNode* node = alternative->node();
   2671  // Later we will generate code for all these text nodes using recursion
   2672  // so we have to limit the max number.
   2673  int recursion_depth = 0;
   2674  while (node != this) {
   2675    if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
   2676      return kNodeIsTooComplexForFixedLengthLoops;
   2677    }
   2678    int node_length = node->FixedLengthLoopLength();
   2679    if (node_length == kNodeIsTooComplexForFixedLengthLoops) {
   2680      return kNodeIsTooComplexForFixedLengthLoops;
   2681    }
   2682    length += node_length;
   2683    node = node->AsSeqRegExpNode()->on_success();
   2684  }
   2685  if (read_backward()) {
   2686    length = -length;
   2687  }
   2688  // Check that we can jump by the whole text length. If not, return sentinel
   2689  // to indicate the we can't construct a fixed length loop.
   2690  if (length < RegExpMacroAssembler::kMinCPOffset ||
   2691      length > RegExpMacroAssembler::kMaxCPOffset) {
   2692    return kNodeIsTooComplexForFixedLengthLoops;
   2693  }
   2694  return length;
   2695 }
   2696 
   2697 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
   2698  DCHECK_NULL(loop_node_);
   2699  AddAlternative(alt);
   2700  loop_node_ = alt.node();
   2701 }
   2702 
   2703 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
   2704  DCHECK_NULL(continue_node_);
   2705  AddAlternative(alt);
   2706  continue_node_ = alt.node();
   2707 }
   2708 
   2709 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   2710  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   2711  if (trace->fixed_length_loop_state() != nullptr &&
   2712      trace->fixed_length_loop_state()->loop_choice_node() == this) {
   2713    // Back edge of fixed length optimized loop node graph.
   2714    int text_length =
   2715        FixedLengthLoopLengthForAlternative(&(alternatives_->at(0)));
   2716    DCHECK_NE(kNodeIsTooComplexForFixedLengthLoops, text_length);
   2717    // Update the counter-based backtracking info on the stack.  This is an
   2718    // optimization for fixed length loops (see below).
   2719    DCHECK(trace->cp_offset() == text_length);
   2720    macro_assembler->AdvanceCurrentPosition(text_length);
   2721    trace->fixed_length_loop_state()->GoToLoopTopLabel(macro_assembler);
   2722    return;
   2723  }
   2724  DCHECK_NULL(trace->fixed_length_loop_state());
   2725  if (!trace->is_trivial()) {
   2726    trace->Flush(compiler, this);
   2727    return;
   2728  }
   2729  ChoiceNode::Emit(compiler, trace);
   2730 }
   2731 
   2732 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
   2733                                           int eats_at_least) {
   2734  int preload_characters = std::min(4, eats_at_least);
   2735  DCHECK_LE(preload_characters, 4);
   2736  if (compiler->macro_assembler()->CanReadUnaligned()) {
   2737    bool one_byte = compiler->one_byte();
   2738    if (one_byte) {
   2739      // We can't preload 3 characters because there is no machine instruction
   2740      // to do that.  We can't just load 4 because we could be reading
   2741      // beyond the end of the string, which could cause a memory fault.
   2742      if (preload_characters == 3) preload_characters = 2;
   2743    } else {
   2744      if (preload_characters > 2) preload_characters = 2;
   2745    }
   2746  } else {
   2747    if (preload_characters > 1) preload_characters = 1;
   2748  }
   2749  return preload_characters;
   2750 }
   2751 
   2752 // This class is used when generating the alternatives in a choice node.  It
   2753 // records the way the alternative is being code generated.
   2754 class AlternativeGeneration : public Malloced {
   2755 public:
   2756  AlternativeGeneration()
   2757      : possible_success(),
   2758        expects_preload(false),
   2759        after(),
   2760        quick_check_details() {}
   2761  Label possible_success;
   2762  bool expects_preload;
   2763  Label after;
   2764  QuickCheckDetails quick_check_details;
   2765 };
   2766 
   2767 // Creates a list of AlternativeGenerations.  If the list has a reasonable
   2768 // size then it is on the stack, otherwise the excess is on the heap.
   2769 class AlternativeGenerationList {
   2770 public:
   2771  AlternativeGenerationList(int count, Zone* zone) : alt_gens_(count, zone) {
   2772    for (int i = 0; i < count && i < kAFew; i++) {
   2773      alt_gens_.Add(a_few_alt_gens_ + i, zone);
   2774    }
   2775    for (int i = kAFew; i < count; i++) {
   2776      alt_gens_.Add(new AlternativeGeneration(), zone);
   2777    }
   2778  }
   2779  ~AlternativeGenerationList() {
   2780    for (int i = kAFew; i < alt_gens_.length(); i++) {
   2781      delete alt_gens_[i];
   2782      alt_gens_[i] = nullptr;
   2783    }
   2784  }
   2785 
   2786  AlternativeGeneration* at(int i) { return alt_gens_[i]; }
   2787 
   2788 private:
   2789  static const int kAFew = 10;
   2790  ZoneList<AlternativeGeneration*> alt_gens_;
   2791  AlternativeGeneration a_few_alt_gens_[kAFew];
   2792 };
   2793 
   2794 void BoyerMoorePositionInfo::Set(int character) {
   2795  SetInterval(Interval(character, character));
   2796 }
   2797 
   2798 namespace {
   2799 
   2800 ContainedInLattice AddRange(ContainedInLattice containment, const int* ranges,
   2801                            int ranges_length, Interval new_range) {
   2802  DCHECK_EQ(1, ranges_length & 1);
   2803  DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
   2804  if (containment == kLatticeUnknown) return containment;
   2805  bool inside = false;
   2806  int last = 0;
   2807  for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
   2808    // Consider the range from last to ranges[i].
   2809    // We haven't got to the new range yet.
   2810    if (ranges[i] <= new_range.from()) continue;
   2811    // New range is wholly inside last-ranges[i].  Note that new_range.to() is
   2812    // inclusive, but the values in ranges are not.
   2813    if (last <= new_range.from() && new_range.to() < ranges[i]) {
   2814      return Combine(containment, inside ? kLatticeIn : kLatticeOut);
   2815    }
   2816    return kLatticeUnknown;
   2817  }
   2818  return containment;
   2819 }
   2820 
   2821 int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) {
   2822  static_assert(BoyerMoorePositionInfo::kMapSize ==
   2823                2 * kInt64Size * kBitsPerByte);
   2824 
   2825  // Slight fiddling is needed here, since the bitset is of length 128 while
   2826  // CountTrailingZeros requires an integral type and std::bitset can only
   2827  // convert to unsigned long long. So we handle the most- and least-significant
   2828  // bits separately.
   2829 
   2830  {
   2831    static constexpr BoyerMoorePositionInfo::Bitset mask(~uint64_t{0});
   2832    BoyerMoorePositionInfo::Bitset masked_bitset = bitset & mask;
   2833    static_assert(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong())));
   2834    uint64_t lsb = masked_bitset.to_ullong();
   2835    if (lsb != 0) return base::bits::CountTrailingZeros(lsb);
   2836  }
   2837 
   2838  {
   2839    BoyerMoorePositionInfo::Bitset masked_bitset = bitset >> 64;
   2840    uint64_t msb = masked_bitset.to_ullong();
   2841    if (msb != 0) return 64 + base::bits::CountTrailingZeros(msb);
   2842  }
   2843 
   2844  return -1;
   2845 }
   2846 
   2847 }  // namespace
   2848 
   2849 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
   2850  w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
   2851 
   2852  if (interval.size() >= kMapSize) {
   2853    map_count_ = kMapSize;
   2854    map_.set();
   2855    return;
   2856  }
   2857 
   2858  for (int i = interval.from(); i <= interval.to(); i++) {
   2859    int mod_character = (i & kMask);
   2860    if (!map_[mod_character]) {
   2861      map_count_++;
   2862      map_.set(mod_character);
   2863    }
   2864    if (map_count_ == kMapSize) return;
   2865  }
   2866 }
   2867 
   2868 void BoyerMoorePositionInfo::SetAll() {
   2869  w_ = kLatticeUnknown;
   2870  if (map_count_ != kMapSize) {
   2871    map_count_ = kMapSize;
   2872    map_.set();
   2873  }
   2874 }
   2875 
   2876 BoyerMooreLookahead::BoyerMooreLookahead(int length, RegExpCompiler* compiler,
   2877                                         Zone* zone)
   2878    : length_(length),
   2879      compiler_(compiler),
   2880      max_char_(MaxCodeUnit(compiler->one_byte())) {
   2881  bitmaps_ = zone->New<ZoneList<BoyerMoorePositionInfo*>>(length, zone);
   2882  for (int i = 0; i < length; i++) {
   2883    bitmaps_->Add(zone->New<BoyerMoorePositionInfo>(), zone);
   2884  }
   2885 }
   2886 
   2887 // Find the longest range of lookahead that has the fewest number of different
   2888 // characters that can occur at a given position.  Since we are optimizing two
   2889 // different parameters at once this is a tradeoff.
   2890 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
   2891  int biggest_points = 0;
   2892  // If more than 32 characters out of 128 can occur it is unlikely that we can
   2893  // be lucky enough to step forwards much of the time.
   2894  const int kMaxMax = 32;
   2895  for (int max_number_of_chars = 4; max_number_of_chars < kMaxMax;
   2896       max_number_of_chars *= 2) {
   2897    biggest_points =
   2898        FindBestInterval(max_number_of_chars, biggest_points, from, to);
   2899  }
   2900  if (biggest_points == 0) return false;
   2901  return true;
   2902 }
   2903 
   2904 // Find the highest-points range between 0 and length_ where the character
   2905 // information is not too vague.  'Too vague' means that there are more than
   2906 // max_number_of_chars that can occur at this position.  Calculates the number
   2907 // of points as the product of width-of-the-range and
   2908 // probability-of-finding-one-of-the-characters, where the probability is
   2909 // calculated using the frequency distribution of the sample subject string.
   2910 int BoyerMooreLookahead::FindBestInterval(int max_number_of_chars,
   2911                                          int old_biggest_points, int* from,
   2912                                          int* to) {
   2913  int biggest_points = old_biggest_points;
   2914  static const int kSize = RegExpMacroAssembler::kTableSize;
   2915  for (int i = 0; i < length_;) {
   2916    while (i < length_ && Count(i) > max_number_of_chars) i++;
   2917    if (i == length_) break;
   2918    int remembered_from = i;
   2919 
   2920    BoyerMoorePositionInfo::Bitset union_bitset;
   2921    for (; i < length_ && Count(i) <= max_number_of_chars; i++) {
   2922      union_bitset |= bitmaps_->at(i)->raw_bitset();
   2923    }
   2924 
   2925    int frequency = 0;
   2926 
   2927    // Iterate only over set bits.
   2928    int j;
   2929    while ((j = BitsetFirstSetBit(union_bitset)) != -1) {
   2930      DCHECK(union_bitset[j]);  // Sanity check.
   2931      // Add 1 to the frequency to give a small per-character boost for
   2932      // the cases where our sampling is not good enough and many
   2933      // characters have a frequency of zero.  This means the frequency
   2934      // can theoretically be up to 2*kSize though we treat it mostly as
   2935      // a fraction of kSize.
   2936      frequency += compiler_->frequency_collator()->Frequency(j) + 1;
   2937      union_bitset.reset(j);
   2938    }
   2939 
   2940    // We use the probability of skipping times the distance we are skipping to
   2941    // judge the effectiveness of this.  Actually we have a cut-off:  By
   2942    // dividing by 2 we switch off the skipping if the probability of skipping
   2943    // is less than 50%.  This is because the multibyte mask-and-compare
   2944    // skipping in quickcheck is more likely to do well on this case.
   2945    bool in_quickcheck_range =
   2946        ((i - remembered_from < 4) ||
   2947         (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
   2948    // Called 'probability' but it is only a rough estimate and can actually
   2949    // be outside the 0-kSize range.
   2950    int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
   2951    int points = (i - remembered_from) * probability;
   2952    if (points > biggest_points) {
   2953      *from = remembered_from;
   2954      *to = i - 1;
   2955      biggest_points = points;
   2956    }
   2957  }
   2958  return biggest_points;
   2959 }
   2960 
   2961 // Take all the characters that will not prevent a successful match if they
   2962 // occur in the subject string in the range between min_lookahead and
   2963 // max_lookahead (inclusive) measured from the current position.  If the
   2964 // character at max_lookahead offset is not one of these characters, then we
   2965 // can safely skip forwards by the number of characters in the range.
   2966 // nibble_table is only used for SIMD variants and encodes the same information
   2967 // as boolean_skip_table but in only 128 bits. It contains 16 bytes where the
   2968 // index into the table represent low nibbles of a character, and the stored
   2969 // byte is a bitset representing matching high nibbles. E.g. to store the
   2970 // character 'b' (0x62) in the nibble table, we set the 6th bit in row 2.
   2971 int BoyerMooreLookahead::GetSkipTable(
   2972    int min_lookahead, int max_lookahead,
   2973    DirectHandle<ByteArray> boolean_skip_table,
   2974    DirectHandle<ByteArray> nibble_table) {
   2975  const int kSkipArrayEntry = 0;
   2976  const int kDontSkipArrayEntry = 1;
   2977 
   2978  std::memset(boolean_skip_table->begin(), kSkipArrayEntry,
   2979              boolean_skip_table->length());
   2980  const bool fill_nibble_table = !nibble_table.is_null();
   2981  if (fill_nibble_table) {
   2982    std::memset(nibble_table->begin(), 0, nibble_table->length());
   2983  }
   2984 
   2985  for (int i = max_lookahead; i >= min_lookahead; i--) {
   2986    BoyerMoorePositionInfo::Bitset bitset = bitmaps_->at(i)->raw_bitset();
   2987 
   2988    // Iterate only over set bits.
   2989    int j;
   2990    while ((j = BitsetFirstSetBit(bitset)) != -1) {
   2991      DCHECK(bitset[j]);  // Sanity check.
   2992      boolean_skip_table->set(j, kDontSkipArrayEntry);
   2993      if (fill_nibble_table) {
   2994        int lo_nibble = j & 0x0f;
   2995        int hi_nibble = (j >> 4) & 0x07;
   2996        int row = nibble_table->get(lo_nibble);
   2997        row |= 1 << hi_nibble;
   2998        nibble_table->set(lo_nibble, row);
   2999      }
   3000      bitset.reset(j);
   3001    }
   3002  }
   3003 
   3004  const int skip = max_lookahead + 1 - min_lookahead;
   3005  return skip;
   3006 }
   3007 
   3008 // See comment above on the implementation of GetSkipTable.
   3009 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
   3010  const int kSize = RegExpMacroAssembler::kTableSize;
   3011 
   3012  int min_lookahead = 0;
   3013  int max_lookahead = 0;
   3014 
   3015  if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
   3016 
   3017  // Check if we only have a single non-empty position info, and that info
   3018  // contains precisely one character.
   3019  bool found_single_character = false;
   3020  int single_character = 0;
   3021  for (int i = max_lookahead; i >= min_lookahead; i--) {
   3022    BoyerMoorePositionInfo* map = bitmaps_->at(i);
   3023    if (map->map_count() == 0) continue;
   3024 
   3025    if (found_single_character || map->map_count() > 1) {
   3026      found_single_character = false;
   3027      break;
   3028    }
   3029 
   3030    DCHECK(!found_single_character);
   3031    DCHECK_EQ(map->map_count(), 1);
   3032 
   3033    found_single_character = true;
   3034    single_character = BitsetFirstSetBit(map->raw_bitset());
   3035 
   3036    DCHECK_NE(single_character, -1);
   3037  }
   3038 
   3039  int lookahead_width = max_lookahead + 1 - min_lookahead;
   3040 
   3041  if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
   3042    // The mask-compare can probably handle this better.
   3043    return;
   3044  }
   3045 
   3046  if (found_single_character) {
   3047    // TODO(pthier): Add vectorized version.
   3048    Label cont, again;
   3049    masm->Bind(&again);
   3050    masm->LoadCurrentCharacter(max_lookahead, &cont, true);
   3051    if (max_char_ > kSize) {
   3052      masm->CheckCharacterAfterAnd(single_character,
   3053                                   RegExpMacroAssembler::kTableMask, &cont);
   3054    } else {
   3055      masm->CheckCharacter(single_character, &cont);
   3056    }
   3057    masm->AdvanceCurrentPosition(lookahead_width);
   3058    masm->GoTo(&again);
   3059    masm->Bind(&cont);
   3060    return;
   3061  }
   3062 
   3063  Factory* factory = masm->isolate()->factory();
   3064  Handle<ByteArray> boolean_skip_table =
   3065      factory->NewByteArray(kSize, AllocationType::kOld);
   3066  Handle<ByteArray> nibble_table;
   3067  const int skip_distance = max_lookahead + 1 - min_lookahead;
   3068  if (masm->SkipUntilBitInTableUseSimd(skip_distance)) {
   3069    // The current implementation is tailored specifically for 128-bit tables.
   3070    static_assert(kSize == 128);
   3071    nibble_table =
   3072        factory->NewByteArray(kSize / kBitsPerByte, AllocationType::kOld);
   3073  }
   3074  GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table, nibble_table);
   3075  DCHECK_NE(0, skip_distance);
   3076 
   3077  masm->SkipUntilBitInTable(max_lookahead, boolean_skip_table, nibble_table,
   3078                            skip_distance);
   3079 }
   3080 
   3081 /* Code generation for choice nodes.
   3082 *
   3083 * We generate quick checks that do a mask and compare to eliminate a
   3084 * choice.  If the quick check succeeds then it jumps to the continuation to
   3085 * do slow checks and check subsequent nodes.  If it fails (the common case)
   3086 * it falls through to the next choice.
   3087 *
   3088 * Here is the desired flow graph.  Nodes directly below each other imply
   3089 * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
   3090 * 3 doesn't have a quick check so we have to call the slow check.
   3091 * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
   3092 * regexp continuation is generated directly after the Sn node, up to the
   3093 * next GoTo if we decide to reuse some already generated code.  Some
   3094 * nodes expect preload_characters to be preloaded into the current
   3095 * character register.  R nodes do this preloading.  Vertices are marked
   3096 * F for failures and S for success (possible success in the case of quick
   3097 * nodes).  L, V, < and > are used as arrow heads.
   3098 *
   3099 * ----------> R
   3100 *             |
   3101 *             V
   3102 *            Q1 -----> S1
   3103 *             |   S   /
   3104 *            F|      /
   3105 *             |    F/
   3106 *             |    /
   3107 *             |   R
   3108 *             |  /
   3109 *             V L
   3110 *            Q2 -----> S2
   3111 *             |   S   /
   3112 *            F|      /
   3113 *             |    F/
   3114 *             |    /
   3115 *             |   R
   3116 *             |  /
   3117 *             V L
   3118 *            S3
   3119 *             |
   3120 *            F|
   3121 *             |
   3122 *             R
   3123 *             |
   3124 * backtrack   V
   3125 * <----------Q4
   3126 *   \    F    |
   3127 *    \        |S
   3128 *     \   F   V
   3129 *      \-----S4
   3130 *
   3131 * For fixed length loops we push the current position, then generate the code
   3132 * that eats the input specially in EmitFixedLengthLoop.  The other choice (the
   3133 * continuation) is generated by the normal code in EmitChoices, and steps back
   3134 * in the input to the starting position when it fails to match.  The loop code
   3135 * looks like this (U is the unwind code that steps back in the fixed length
   3136 * loop).
   3137 *
   3138 *              _____
   3139 *             /     \
   3140 *             V     |
   3141 * ----------> S1    |
   3142 *            /|     |
   3143 *           / |S    |
   3144 *         F/  \_____/
   3145 *         /
   3146 *        |<-----
   3147 *        |      \
   3148 *        V       |S
   3149 *        Q2 ---> U----->backtrack
   3150 *        |  F   /
   3151 *       S|     /
   3152 *        V  F /
   3153 *        S2--/
   3154 */
   3155 
   3156 FixedLengthLoopState::FixedLengthLoopState(bool not_at_start,
   3157                                           ChoiceNode* loop_choice_node)
   3158    : loop_choice_node_(loop_choice_node) {
   3159  counter_backtrack_trace_.set_backtrack(&step_backwards_label_);
   3160  if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
   3161 }
   3162 
   3163 void FixedLengthLoopState::BindStepBackwardsLabel(
   3164    RegExpMacroAssembler* macro_assembler) {
   3165  macro_assembler->Bind(&step_backwards_label_);
   3166 }
   3167 
   3168 void FixedLengthLoopState::BindLoopTopLabel(
   3169    RegExpMacroAssembler* macro_assembler) {
   3170  macro_assembler->Bind(&loop_top_label_);
   3171 }
   3172 
   3173 void FixedLengthLoopState::GoToLoopTopLabel(
   3174    RegExpMacroAssembler* macro_assembler) {
   3175  macro_assembler->GoTo(&loop_top_label_);
   3176 }
   3177 
   3178 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
   3179 #ifdef DEBUG
   3180  int choice_count = alternatives_->length();
   3181  for (int i = 0; i < choice_count - 1; i++) {
   3182    GuardedAlternative alternative = alternatives_->at(i);
   3183    ZoneList<Guard*>* guards = alternative.guards();
   3184    int guard_count = (guards == nullptr) ? 0 : guards->length();
   3185    for (int j = 0; j < guard_count; j++) {
   3186      DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
   3187    }
   3188  }
   3189 #endif
   3190 }
   3191 
   3192 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace,
   3193                              PreloadState* state) {
   3194  if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
   3195    // Save some time by looking at most one machine word ahead.
   3196    state->eats_at_least_ =
   3197        EatsAtLeast(current_trace->at_start() == Trace::FALSE_VALUE);
   3198  }
   3199  state->preload_characters_ =
   3200      CalculatePreloadCharacters(compiler, state->eats_at_least_);
   3201 
   3202  state->preload_is_current_ =
   3203      (current_trace->characters_preloaded() == state->preload_characters_);
   3204  state->preload_has_checked_bounds_ = state->preload_is_current_;
   3205 }
   3206 
   3207 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   3208  int choice_count = alternatives_->length();
   3209 
   3210  if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
   3211    alternatives_->at(0).node()->Emit(compiler, trace);
   3212    return;
   3213  }
   3214 
   3215  AssertGuardsMentionRegisters(trace);
   3216 
   3217  LimitResult limit_result = LimitVersions(compiler, trace);
   3218  if (limit_result == DONE) return;
   3219  DCHECK(limit_result == CONTINUE);
   3220 
   3221  // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
   3222  // other choice nodes we only flush if we are out of code size budget.
   3223  if (trace->flush_budget() == 0 && trace->has_any_actions()) {
   3224    trace->Flush(compiler, this);
   3225    return;
   3226  }
   3227 
   3228  RecursionCheck rc(compiler);
   3229 
   3230  PreloadState preload;
   3231  preload.init();
   3232  // This must be outside the 'if' because the trace we use for what
   3233  // comes after the fixed_length_loop is inside it and needs the lifetime.
   3234  FixedLengthLoopState fixed_length_loop_state(not_at_start(), this);
   3235 
   3236  int text_length = FixedLengthLoopLengthForAlternative(&alternatives_->at(0));
   3237  AlternativeGenerationList alt_gens(choice_count, zone());
   3238 
   3239  if (choice_count > 1 && text_length != kNodeIsTooComplexForFixedLengthLoops) {
   3240    trace = EmitFixedLengthLoop(compiler, trace, &alt_gens, &preload,
   3241                                &fixed_length_loop_state, text_length);
   3242  } else {
   3243    preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
   3244 
   3245    EmitChoices(compiler, &alt_gens, 0, trace, &preload);
   3246  }
   3247 
   3248  // At this point we need to generate slow checks for the alternatives where
   3249  // the quick check was inlined.  We can recognize these because the associated
   3250  // label was bound.
   3251  int new_flush_budget = trace->flush_budget() / choice_count;
   3252  for (int i = 0; i < choice_count; i++) {
   3253    AlternativeGeneration* alt_gen = alt_gens.at(i);
   3254    Trace new_trace(*trace);
   3255    // If there are actions to be flushed we have to limit how many times
   3256    // they are flushed.  Take the budget of the parent trace and distribute
   3257    // it fairly amongst the children.
   3258    if (new_trace.has_any_actions()) {
   3259      new_trace.set_flush_budget(new_flush_budget);
   3260    }
   3261    bool next_expects_preload =
   3262        i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
   3263    EmitOutOfLineContinuation(compiler, &new_trace, alternatives_->at(i),
   3264                              alt_gen, preload.preload_characters_,
   3265                              next_expects_preload);
   3266  }
   3267 }
   3268 
   3269 Trace* ChoiceNode::EmitFixedLengthLoop(
   3270    RegExpCompiler* compiler, Trace* trace, AlternativeGenerationList* alt_gens,
   3271    PreloadState* preload, FixedLengthLoopState* fixed_length_loop_state,
   3272    int text_length) {
   3273  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   3274  // Here we have special handling for greedy loops containing only text nodes
   3275  // and other simple nodes.  We call these fixed length loops.  These are
   3276  // handled by pushing the current position on the stack and then incrementing
   3277  // the current position each time around the switch.  On backtrack we
   3278  // decrement the current position and check it against the pushed value.
   3279  // This avoids pushing backtrack information for each iteration of the loop,
   3280  // which could take up a lot of space.
   3281  DCHECK(trace->fixed_length_loop_state() == nullptr);
   3282  macro_assembler->PushCurrentPosition();
   3283  // This is the label for trying to match what comes after the greedy
   3284  // quantifier, either because the body of the quantifier failed, or because
   3285  // we have stepped back to try again with one iteration fewer.
   3286  Label after_body_match_attempt;
   3287  Trace fixed_length_match_trace;
   3288  if (not_at_start()) fixed_length_match_trace.set_at_start(Trace::FALSE_VALUE);
   3289  fixed_length_match_trace.set_backtrack(&after_body_match_attempt);
   3290  fixed_length_loop_state->BindLoopTopLabel(macro_assembler);
   3291  fixed_length_match_trace.set_fixed_length_loop_state(fixed_length_loop_state);
   3292  alternatives_->at(0).node()->Emit(compiler, &fixed_length_match_trace);
   3293  macro_assembler->Bind(&after_body_match_attempt);
   3294 
   3295  Trace* new_trace = fixed_length_loop_state->counter_backtrack_trace();
   3296 
   3297  // In a fixed length loop there is only one other choice, which is what
   3298  // comes after the greedy quantifer.  Try to match that now.
   3299  EmitChoices(compiler, alt_gens, 1, new_trace, preload);
   3300 
   3301  fixed_length_loop_state->BindStepBackwardsLabel(macro_assembler);
   3302  // If we have unwound to the bottom then backtrack.
   3303  macro_assembler->CheckFixedLengthLoop(trace->backtrack());
   3304  // Otherwise try the second priority at an earlier position.
   3305  macro_assembler->AdvanceCurrentPosition(-text_length);
   3306  macro_assembler->GoTo(&after_body_match_attempt);
   3307  return new_trace;
   3308 }
   3309 
   3310 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
   3311                                              Trace* trace) {
   3312  int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
   3313  if (alternatives_->length() != 2) return eats_at_least;
   3314 
   3315  GuardedAlternative alt1 = alternatives_->at(1);
   3316  if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
   3317    return eats_at_least;
   3318  }
   3319  RegExpNode* eats_anything_node = alt1.node();
   3320  if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
   3321    return eats_at_least;
   3322  }
   3323 
   3324  // Really we should be creating a new trace when we execute this function,
   3325  // but there is no need, because the code it generates cannot backtrack, and
   3326  // we always arrive here with a trivial trace (since it's the entry to a
   3327  // loop.  That also implies that there are no preloaded characters, which is
   3328  // good, because it means we won't be violating any assumptions by
   3329  // overwriting those characters with new load instructions.
   3330  DCHECK(trace->is_trivial());
   3331 
   3332  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   3333  Isolate* isolate = macro_assembler->isolate();
   3334  // At this point we know that we are at a non-greedy loop that will eat
   3335  // any character one at a time.  Any non-anchored regexp has such a
   3336  // loop prepended to it in order to find where it starts.  We look for
   3337  // a pattern of the form ...abc... where we can look 6 characters ahead
   3338  // and step forwards 3 if the character is not one of abc.  Abc need
   3339  // not be atoms, they can be any reasonably limited character class or
   3340  // small alternation.
   3341  BoyerMooreLookahead* bm = bm_info(false);
   3342  if (bm == nullptr) {
   3343    eats_at_least = std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(false));
   3344    if (eats_at_least >= 1) {
   3345      bm = zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
   3346      GuardedAlternative alt0 = alternatives_->at(0);
   3347      alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
   3348    }
   3349  }
   3350  if (bm != nullptr) {
   3351    bm->EmitSkipInstructions(macro_assembler);
   3352  }
   3353  return eats_at_least;
   3354 }
   3355 
   3356 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
   3357                             AlternativeGenerationList* alt_gens,
   3358                             int first_choice, Trace* trace,
   3359                             PreloadState* preload) {
   3360  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   3361  SetUpPreLoad(compiler, trace, preload);
   3362 
   3363  // For now we just call all choices one after the other.  The idea ultimately
   3364  // is to use the Dispatch table to try only the relevant ones.
   3365  int choice_count = alternatives_->length();
   3366 
   3367  int new_flush_budget = trace->flush_budget() / choice_count;
   3368 
   3369  for (int i = first_choice; i < choice_count; i++) {
   3370    bool is_last = i == choice_count - 1;
   3371    bool fall_through_on_failure = !is_last;
   3372    GuardedAlternative alternative = alternatives_->at(i);
   3373    AlternativeGeneration* alt_gen = alt_gens->at(i);
   3374    alt_gen->quick_check_details.set_characters(preload->preload_characters_);
   3375    ZoneList<Guard*>* guards = alternative.guards();
   3376    int guard_count = (guards == nullptr) ? 0 : guards->length();
   3377    Trace new_trace(*trace);
   3378    new_trace.set_characters_preloaded(
   3379        preload->preload_is_current_ ? preload->preload_characters_ : 0);
   3380    if (preload->preload_has_checked_bounds_) {
   3381      new_trace.set_bound_checked_up_to(preload->preload_characters_);
   3382    }
   3383    new_trace.quick_check_performed()->Clear();
   3384    if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
   3385    if (!is_last) {
   3386      new_trace.set_backtrack(&alt_gen->after);
   3387    }
   3388    alt_gen->expects_preload = preload->preload_is_current_;
   3389    bool generate_full_check_inline = false;
   3390    if (v8_flags.regexp_optimization &&
   3391        try_to_emit_quick_check_for_alternative(i == 0) &&
   3392        alternative.node()->EmitQuickCheck(
   3393            compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
   3394            &alt_gen->possible_success, &alt_gen->quick_check_details,
   3395            fall_through_on_failure, this)) {
   3396      // Quick check was generated for this choice.
   3397      preload->preload_is_current_ = true;
   3398      preload->preload_has_checked_bounds_ = true;
   3399      // If we generated the quick check to fall through on possible success,
   3400      // we now need to generate the full check inline.
   3401      if (!fall_through_on_failure) {
   3402        macro_assembler->Bind(&alt_gen->possible_success);
   3403        new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
   3404        new_trace.set_characters_preloaded(preload->preload_characters_);
   3405        new_trace.set_bound_checked_up_to(preload->preload_characters_);
   3406        generate_full_check_inline = true;
   3407      }
   3408    } else if (alt_gen->quick_check_details.cannot_match()) {
   3409      if (!fall_through_on_failure) {
   3410        macro_assembler->GoTo(trace->backtrack());
   3411      }
   3412      continue;
   3413    } else {
   3414      // No quick check was generated.  Put the full code here.
   3415      // If this is not the first choice then there could be slow checks from
   3416      // previous cases that go here when they fail.  There's no reason to
   3417      // insist that they preload characters since the slow check we are about
   3418      // to generate probably can't use it.
   3419      if (i != first_choice) {
   3420        alt_gen->expects_preload = false;
   3421        new_trace.InvalidateCurrentCharacter();
   3422      }
   3423      generate_full_check_inline = true;
   3424    }
   3425    if (generate_full_check_inline) {
   3426      if (new_trace.has_any_actions()) {
   3427        new_trace.set_flush_budget(new_flush_budget);
   3428      }
   3429      for (int j = 0; j < guard_count; j++) {
   3430        GenerateGuard(macro_assembler, guards->at(j), &new_trace);
   3431      }
   3432      alternative.node()->Emit(compiler, &new_trace);
   3433      preload->preload_is_current_ = false;
   3434    }
   3435    macro_assembler->Bind(&alt_gen->after);
   3436  }
   3437 }
   3438 
   3439 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
   3440                                           Trace* trace,
   3441                                           GuardedAlternative alternative,
   3442                                           AlternativeGeneration* alt_gen,
   3443                                           int preload_characters,
   3444                                           bool next_expects_preload) {
   3445  if (!alt_gen->possible_success.is_linked()) return;
   3446 
   3447  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   3448  macro_assembler->Bind(&alt_gen->possible_success);
   3449  Trace out_of_line_trace(*trace);
   3450  out_of_line_trace.set_characters_preloaded(preload_characters);
   3451  out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
   3452  if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
   3453  ZoneList<Guard*>* guards = alternative.guards();
   3454  int guard_count = (guards == nullptr) ? 0 : guards->length();
   3455  if (next_expects_preload) {
   3456    Label reload_current_char;
   3457    out_of_line_trace.set_backtrack(&reload_current_char);
   3458    for (int j = 0; j < guard_count; j++) {
   3459      GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
   3460    }
   3461    alternative.node()->Emit(compiler, &out_of_line_trace);
   3462    macro_assembler->Bind(&reload_current_char);
   3463    // Reload the current character, since the next quick check expects that.
   3464    // We don't need to check bounds here because we only get into this
   3465    // code through a quick check which already did the checked load.
   3466    macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
   3467                                          preload_characters);
   3468    macro_assembler->GoTo(&(alt_gen->after));
   3469  } else {
   3470    out_of_line_trace.set_backtrack(&(alt_gen->after));
   3471    for (int j = 0; j < guard_count; j++) {
   3472      GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
   3473    }
   3474    alternative.node()->Emit(compiler, &out_of_line_trace);
   3475  }
   3476 }
   3477 
   3478 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   3479  RegExpMacroAssembler* assembler = compiler->macro_assembler();
   3480  LimitResult limit_result = LimitVersions(compiler, trace);
   3481  if (limit_result == DONE) return;
   3482  DCHECK(limit_result == CONTINUE);
   3483 
   3484  RecursionCheck rc(compiler);
   3485 
   3486  switch (action_type_) {
   3487    // Start with the actions we know how to defer. These are just recorded in
   3488    // the new trace, no code is emitted right now.  (If we backtrack then we
   3489    // don't have to perform and undo these actions.)
   3490    case CLEAR_POSITION:
   3491    case RESTORE_POSITION:
   3492    case INCREMENT_REGISTER:
   3493    case SET_REGISTER_FOR_LOOP:
   3494    case CLEAR_CAPTURES: {
   3495      Trace new_trace = *trace;
   3496      new_trace.add_action(this);
   3497      on_success()->Emit(compiler, &new_trace);
   3498      break;
   3499    }
   3500    // We don't yet have the ability to defer these.
   3501    case BEGIN_POSITIVE_SUBMATCH:
   3502    case BEGIN_NEGATIVE_SUBMATCH:
   3503      if (!trace->is_trivial()) {
   3504        // Complex situation: Flush the trace state to the assembler and
   3505        // generate a generic version of this action.  This call will
   3506        // recurse back to the else clause here.
   3507        trace->Flush(compiler, this);
   3508      } else {
   3509        assembler->WriteCurrentPositionToRegister(
   3510            data_.u_submatch.current_position_register, 0);
   3511        assembler->WriteStackPointerToRegister(
   3512            data_.u_submatch.stack_pointer_register);
   3513        on_success()->Emit(compiler, trace);
   3514      }
   3515      break;
   3516    case EMPTY_MATCH_CHECK: {
   3517      int start_pos_reg = data_.u_empty_match_check.start_register;
   3518      int stored_pos = 0;
   3519      int rep_reg = data_.u_empty_match_check.repetition_register;
   3520      bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
   3521      bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
   3522      if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
   3523        // If we know we haven't advanced and there is no minimum we
   3524        // can just backtrack immediately.
   3525        assembler->GoTo(trace->backtrack());
   3526      } else if (know_dist && stored_pos < trace->cp_offset()) {
   3527        // If we know we've advanced we can generate the continuation
   3528        // immediately.
   3529        on_success()->Emit(compiler, trace);
   3530      } else if (!trace->is_trivial()) {
   3531        trace->Flush(compiler, this);
   3532      } else {
   3533        Label skip_empty_check;
   3534        // If we have a minimum number of repetitions we check the current
   3535        // number first and skip the empty check if it's not enough.
   3536        if (has_minimum) {
   3537          int limit = data_.u_empty_match_check.repetition_limit;
   3538          assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
   3539        }
   3540        // If the match is empty we bail out, otherwise we fall through
   3541        // to the on-success continuation.
   3542        assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
   3543                                   trace->backtrack());
   3544        assembler->Bind(&skip_empty_check);
   3545        on_success()->Emit(compiler, trace);
   3546      }
   3547      break;
   3548    }
   3549    case POSITIVE_SUBMATCH_SUCCESS: {
   3550      if (!trace->is_trivial()) {
   3551        trace->Flush(compiler, this, Trace::kFlushSuccess);
   3552        return;
   3553      }
   3554      assembler->ReadCurrentPositionFromRegister(
   3555          data_.u_submatch.current_position_register);
   3556      assembler->ReadStackPointerFromRegister(
   3557          data_.u_submatch.stack_pointer_register);
   3558      int clear_register_count = data_.u_submatch.clear_register_count;
   3559      if (clear_register_count == 0) {
   3560        on_success()->Emit(compiler, trace);
   3561        return;
   3562      }
   3563      int clear_registers_from = data_.u_submatch.clear_register_from;
   3564      Label clear_registers_backtrack;
   3565      Trace new_trace = *trace;
   3566      new_trace.set_backtrack(&clear_registers_backtrack);
   3567      on_success()->Emit(compiler, &new_trace);
   3568 
   3569      assembler->Bind(&clear_registers_backtrack);
   3570      int clear_registers_to = clear_registers_from + clear_register_count - 1;
   3571      assembler->ClearRegisters(clear_registers_from, clear_registers_to);
   3572 
   3573      DCHECK(trace->backtrack() == nullptr);
   3574      assembler->Backtrack();
   3575      return;
   3576    }
   3577    case MODIFY_FLAGS: {
   3578      compiler->set_flags(flags());
   3579      on_success()->Emit(compiler, trace);
   3580      break;
   3581    }
   3582    default:
   3583      UNREACHABLE();
   3584  }
   3585 }
   3586 
   3587 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   3588  RegExpMacroAssembler* assembler = compiler->macro_assembler();
   3589  if (!trace->is_trivial()) {
   3590    trace->Flush(compiler, this);
   3591    return;
   3592  }
   3593 
   3594  LimitResult limit_result = LimitVersions(compiler, trace);
   3595  if (limit_result == DONE) return;
   3596  DCHECK(limit_result == CONTINUE);
   3597 
   3598  RecursionCheck rc(compiler);
   3599 
   3600  DCHECK_EQ(start_reg_ + 1, end_reg_);
   3601  if (IsIgnoreCase(compiler->flags())) {
   3602    bool unicode = IsEitherUnicode(compiler->flags());
   3603    assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
   3604                                               unicode, trace->backtrack());
   3605  } else {
   3606    assembler->CheckNotBackReference(start_reg_, read_backward(),
   3607                                     trace->backtrack());
   3608  }
   3609  // We are going to advance backward, so we may end up at the start.
   3610  if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
   3611 
   3612  // Check that the back reference does not end inside a surrogate pair.
   3613  if (IsEitherUnicode(compiler->flags()) && !compiler->one_byte()) {
   3614    assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
   3615  }
   3616  on_success()->Emit(compiler, trace);
   3617 }
   3618 
   3619 void TextNode::CalculateOffsets() {
   3620  int element_count = elements()->length();
   3621  // Set up the offsets of the elements relative to the start.  This is a fixed
   3622  // quantity since a TextNode can only contain fixed-width things.
   3623  int cp_offset = 0;
   3624  for (int i = 0; i < element_count; i++) {
   3625    TextElement& elm = elements()->at(i);
   3626    elm.set_cp_offset(cp_offset);
   3627    cp_offset += elm.length();
   3628  }
   3629 }
   3630 
   3631 namespace {
   3632 
   3633 // Assertion propagation moves information about assertions such as
   3634 // \b to the affected nodes.  For instance, in /.\b./ information must
   3635 // be propagated to the first '.' that whatever follows needs to know
   3636 // if it matched a word or a non-word, and to the second '.' that it
   3637 // has to check if it succeeds a word or non-word.  In this case the
   3638 // result will be something like:
   3639 //
   3640 //   +-------+        +------------+
   3641 //   |   .   |        |      .     |
   3642 //   +-------+  --->  +------------+
   3643 //   | word? |        | check word |
   3644 //   +-------+        +------------+
   3645 class AssertionPropagator : public AllStatic {
   3646 public:
   3647  static void VisitText(TextNode* that) {}
   3648 
   3649  static void VisitAction(ActionNode* that) {
   3650    // If the next node is interested in what it follows then this node
   3651    // has to be interested too so it can pass the information on.
   3652    that->info()->AddFromFollowing(that->on_success()->info());
   3653  }
   3654 
   3655  static void VisitChoice(ChoiceNode* that, int i) {
   3656    // Anything the following nodes need to know has to be known by
   3657    // this node also, so it can pass it on.
   3658    that->info()->AddFromFollowing(that->alternatives()->at(i).node()->info());
   3659  }
   3660 
   3661  static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) {
   3662    that->info()->AddFromFollowing(that->continue_node()->info());
   3663  }
   3664 
   3665  static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {
   3666    that->info()->AddFromFollowing(that->loop_node()->info());
   3667  }
   3668 
   3669  static void VisitNegativeLookaroundChoiceLookaroundNode(
   3670      NegativeLookaroundChoiceNode* that) {
   3671    VisitChoice(that, NegativeLookaroundChoiceNode::kLookaroundIndex);
   3672  }
   3673 
   3674  static void VisitNegativeLookaroundChoiceContinueNode(
   3675      NegativeLookaroundChoiceNode* that) {
   3676    VisitChoice(that, NegativeLookaroundChoiceNode::kContinueIndex);
   3677  }
   3678 
   3679  static void VisitBackReference(BackReferenceNode* that) {}
   3680 
   3681  static void VisitAssertion(AssertionNode* that) {}
   3682 };
   3683 
   3684 // Propagates information about the minimum size of successful matches from
   3685 // successor nodes to their predecessors. Note that all eats_at_least values
   3686 // are initialized to zero before analysis.
   3687 class EatsAtLeastPropagator : public AllStatic {
   3688 public:
   3689  static void VisitText(TextNode* that) {
   3690    // The eats_at_least value is not used if reading backward.
   3691    if (!that->read_backward()) {
   3692      // We are not at the start after this node, and thus we can use the
   3693      // successor's eats_at_least_from_not_start value.
   3694      uint8_t eats_at_least = base::saturated_cast<uint8_t>(
   3695          that->Length() + that->on_success()
   3696                               ->eats_at_least_info()
   3697                               ->eats_at_least_from_not_start);
   3698      that->set_eats_at_least_info(EatsAtLeastInfo(eats_at_least));
   3699    }
   3700  }
   3701 
   3702  static void VisitAction(ActionNode* that) {
   3703    switch (that->action_type()) {
   3704      case ActionNode::BEGIN_POSITIVE_SUBMATCH: {
   3705        // For a begin positive submatch we propagate the eats_at_least
   3706        // data from the successor of the success node, ignoring the body of
   3707        // the lookahead, which eats nothing, since it is a zero-width
   3708        // assertion.
   3709        // TODO(chromium:42201836) This is better than discarding all
   3710        // information when there is a positive lookahead, but it loses some
   3711        // information that could be useful, since the body of the lookahead
   3712        // could tell us something about how close to the end of the string we
   3713        // are.
   3714        that->set_eats_at_least_info(
   3715            *that->success_node()->on_success()->eats_at_least_info());
   3716        break;
   3717      }
   3718      case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
   3719        // We do not propagate eats_at_least data through positive submatch
   3720        // success because it rewinds input.
   3721        DCHECK(that->eats_at_least_info()->IsZero());
   3722        break;
   3723      case ActionNode::SET_REGISTER_FOR_LOOP:
   3724        // SET_REGISTER_FOR_LOOP indicates a loop entry point, which means the
   3725        // loop body will run at least the minimum number of times before the
   3726        // continuation case can run.
   3727        that->set_eats_at_least_info(
   3728            that->on_success()->EatsAtLeastFromLoopEntry());
   3729        break;
   3730      case ActionNode::BEGIN_NEGATIVE_SUBMATCH:
   3731      default:
   3732        // Otherwise, the current node eats at least as much as its successor.
   3733        // Note: we can propagate eats_at_least data for BEGIN_NEGATIVE_SUBMATCH
   3734        // because NegativeLookaroundChoiceNode ignores its lookaround successor
   3735        // when computing eats-at-least and quick check information.
   3736        that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
   3737        break;
   3738    }
   3739  }
   3740 
   3741  static void VisitChoice(ChoiceNode* that, int i) {
   3742    // The minimum possible match from a choice node is the minimum of its
   3743    // successors.
   3744    EatsAtLeastInfo eats_at_least =
   3745        i == 0 ? EatsAtLeastInfo(UINT8_MAX) : *that->eats_at_least_info();
   3746    eats_at_least.SetMin(
   3747        *that->alternatives()->at(i).node()->eats_at_least_info());
   3748    that->set_eats_at_least_info(eats_at_least);
   3749  }
   3750 
   3751  static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) {
   3752    if (!that->read_backward()) {
   3753      that->set_eats_at_least_info(
   3754          *that->continue_node()->eats_at_least_info());
   3755    }
   3756  }
   3757 
   3758  static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {}
   3759 
   3760  static void VisitNegativeLookaroundChoiceLookaroundNode(
   3761      NegativeLookaroundChoiceNode* that) {}
   3762 
   3763  static void VisitNegativeLookaroundChoiceContinueNode(
   3764      NegativeLookaroundChoiceNode* that) {
   3765    that->set_eats_at_least_info(*that->continue_node()->eats_at_least_info());
   3766  }
   3767 
   3768  static void VisitBackReference(BackReferenceNode* that) {
   3769    if (!that->read_backward()) {
   3770      that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
   3771    }
   3772  }
   3773 
   3774  static void VisitAssertion(AssertionNode* that) {
   3775    EatsAtLeastInfo eats_at_least = *that->on_success()->eats_at_least_info();
   3776    if (that->assertion_type() == AssertionNode::AT_START) {
   3777      // If we know we are not at the start and we are asked "how many
   3778      // characters will you match if you succeed?" then we can answer anything
   3779      // since false implies false.  So let's just set the max answer
   3780      // (UINT8_MAX) since that won't prevent us from preloading a lot of
   3781      // characters for the other branches in the node graph.
   3782      eats_at_least.eats_at_least_from_not_start = UINT8_MAX;
   3783    }
   3784    that->set_eats_at_least_info(eats_at_least);
   3785  }
   3786 };
   3787 
   3788 }  // namespace
   3789 
   3790 // -------------------------------------------------------------------
   3791 // Analysis
   3792 
   3793 // Iterates the node graph and provides the opportunity for propagators to set
   3794 // values that depend on successor nodes.
   3795 template <typename... Propagators>
   3796 class Analysis : public NodeVisitor {
   3797 public:
   3798  Analysis(Isolate* isolate, bool is_one_byte, RegExpFlags flags)
   3799      : isolate_(isolate),
   3800        is_one_byte_(is_one_byte),
   3801        flags_(flags),
   3802        error_(RegExpError::kNone) {}
   3803 
   3804  void EnsureAnalyzed(RegExpNode* that) {
   3805    StackLimitCheck check(isolate());
   3806    if (check.HasOverflowed()) {
   3807      if (v8_flags.correctness_fuzzer_suppressions) {
   3808        FATAL("Analysis: Aborting on stack overflow");
   3809      }
   3810      fail(RegExpError::kAnalysisStackOverflow);
   3811      return;
   3812    }
   3813    if (that->info()->been_analyzed || that->info()->being_analyzed) return;
   3814    that->info()->being_analyzed = true;
   3815    that->Accept(this);
   3816    that->info()->being_analyzed = false;
   3817    that->info()->been_analyzed = true;
   3818  }
   3819 
   3820  bool has_failed() { return error_ != RegExpError::kNone; }
   3821  RegExpError error() {
   3822    DCHECK(error_ != RegExpError::kNone);
   3823    return error_;
   3824  }
   3825  void fail(RegExpError error) { error_ = error; }
   3826 
   3827  Isolate* isolate() const { return isolate_; }
   3828 
   3829  void VisitEnd(EndNode* that) override {
   3830    // nothing to do
   3831  }
   3832 
   3833 // Used to call the given static function on each propagator / variadic template
   3834 // argument.
   3835 #define STATIC_FOR_EACH(expr)       \
   3836  do {                              \
   3837    int dummy[] = {((expr), 0)...}; \
   3838    USE(dummy);                     \
   3839  } while (false)
   3840 
   3841  void VisitText(TextNode* that) override {
   3842    that->MakeCaseIndependent(isolate(), is_one_byte_, flags());
   3843    EnsureAnalyzed(that->on_success());
   3844    if (has_failed()) return;
   3845    that->CalculateOffsets();
   3846    STATIC_FOR_EACH(Propagators::VisitText(that));
   3847  }
   3848 
   3849  void VisitAction(ActionNode* that) override {
   3850    if (that->action_type() == ActionNode::MODIFY_FLAGS) {
   3851      set_flags(that->flags());
   3852    }
   3853    EnsureAnalyzed(that->on_success());
   3854    if (has_failed()) return;
   3855    STATIC_FOR_EACH(Propagators::VisitAction(that));
   3856  }
   3857 
   3858  void VisitChoice(ChoiceNode* that) override {
   3859    for (int i = 0; i < that->alternatives()->length(); i++) {
   3860      EnsureAnalyzed(that->alternatives()->at(i).node());
   3861      if (has_failed()) return;
   3862      STATIC_FOR_EACH(Propagators::VisitChoice(that, i));
   3863    }
   3864  }
   3865 
   3866  void VisitLoopChoice(LoopChoiceNode* that) override {
   3867    DCHECK_EQ(that->alternatives()->length(), 2);  // Just loop and continue.
   3868 
   3869    // First propagate all information from the continuation node.
   3870    // Due to the unusual visitation order, we need to manage the flags manually
   3871    // as if we were visiting the loop node before visiting the continuation.
   3872    RegExpFlags orig_flags = flags();
   3873 
   3874    EnsureAnalyzed(that->continue_node());
   3875    if (has_failed()) return;
   3876    // Propagators don't access global state (including flags), so we don't need
   3877    // to reset them here.
   3878    STATIC_FOR_EACH(Propagators::VisitLoopChoiceContinueNode(that));
   3879 
   3880    RegExpFlags continuation_flags = flags();
   3881 
   3882    // Check the loop last since it may need the value of this node
   3883    // to get a correct result.
   3884    set_flags(orig_flags);
   3885    EnsureAnalyzed(that->loop_node());
   3886    if (has_failed()) return;
   3887    // Propagators don't access global state (including flags), so we don't need
   3888    // to reset them here.
   3889    STATIC_FOR_EACH(Propagators::VisitLoopChoiceLoopNode(that));
   3890 
   3891    set_flags(continuation_flags);
   3892  }
   3893 
   3894  void VisitNegativeLookaroundChoice(
   3895      NegativeLookaroundChoiceNode* that) override {
   3896    DCHECK_EQ(that->alternatives()->length(), 2);  // Lookaround and continue.
   3897 
   3898    EnsureAnalyzed(that->lookaround_node());
   3899    if (has_failed()) return;
   3900    STATIC_FOR_EACH(
   3901        Propagators::VisitNegativeLookaroundChoiceLookaroundNode(that));
   3902 
   3903    EnsureAnalyzed(that->continue_node());
   3904    if (has_failed()) return;
   3905    STATIC_FOR_EACH(
   3906        Propagators::VisitNegativeLookaroundChoiceContinueNode(that));
   3907  }
   3908 
   3909  void VisitBackReference(BackReferenceNode* that) override {
   3910    EnsureAnalyzed(that->on_success());
   3911    if (has_failed()) return;
   3912    STATIC_FOR_EACH(Propagators::VisitBackReference(that));
   3913  }
   3914 
   3915  void VisitAssertion(AssertionNode* that) override {
   3916    EnsureAnalyzed(that->on_success());
   3917    if (has_failed()) return;
   3918    STATIC_FOR_EACH(Propagators::VisitAssertion(that));
   3919  }
   3920 
   3921 #undef STATIC_FOR_EACH
   3922 
   3923 private:
   3924  RegExpFlags flags() const { return flags_; }
   3925  void set_flags(RegExpFlags flags) { flags_ = flags; }
   3926 
   3927  Isolate* isolate_;
   3928  const bool is_one_byte_;
   3929  RegExpFlags flags_;
   3930  RegExpError error_;
   3931 
   3932  DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
   3933 };
   3934 
   3935 RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags,
   3936                          RegExpNode* node) {
   3937  Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(
   3938      isolate, is_one_byte, flags);
   3939  DCHECK_EQ(node->info()->been_analyzed, false);
   3940  analysis.EnsureAnalyzed(node);
   3941  DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone);
   3942  return analysis.has_failed() ? analysis.error() : RegExpError::kNone;
   3943 }
   3944 
   3945 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
   3946                                     BoyerMooreLookahead* bm,
   3947                                     bool not_at_start) {
   3948  // Working out the set of characters that a backreference can match is too
   3949  // hard, so we just say that any character can match.
   3950  bm->SetRest(offset);
   3951  SaveBMInfo(bm, not_at_start, offset);
   3952 }
   3953 
   3954 static_assert(BoyerMoorePositionInfo::kMapSize ==
   3955              RegExpMacroAssembler::kTableSize);
   3956 
   3957 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
   3958                              BoyerMooreLookahead* bm, bool not_at_start) {
   3959  ZoneList<GuardedAlternative>* alts = alternatives();
   3960  budget = (budget - 1) / alts->length();
   3961  for (int i = 0; i < alts->length(); i++) {
   3962    GuardedAlternative& alt = alts->at(i);
   3963    if (alt.guards() != nullptr && alt.guards()->length() != 0) {
   3964      bm->SetRest(offset);  // Give up trying to fill in info.
   3965      SaveBMInfo(bm, not_at_start, offset);
   3966      return;
   3967    }
   3968    alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
   3969  }
   3970  SaveBMInfo(bm, not_at_start, offset);
   3971 }
   3972 
   3973 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
   3974                            BoyerMooreLookahead* bm, bool not_at_start) {
   3975  if (initial_offset >= bm->length()) return;
   3976  if (read_backward()) return;
   3977  int offset = initial_offset;
   3978  int max_char = bm->max_char();
   3979  for (int i = 0; i < elements()->length(); i++) {
   3980    if (offset >= bm->length()) {
   3981      if (initial_offset == 0) set_bm_info(not_at_start, bm);
   3982      return;
   3983    }
   3984    TextElement text = elements()->at(i);
   3985    if (text.text_type() == TextElement::ATOM) {
   3986      RegExpAtom* atom = text.atom();
   3987      for (int j = 0; j < atom->length(); j++, offset++) {
   3988        if (offset >= bm->length()) {
   3989          if (initial_offset == 0) set_bm_info(not_at_start, bm);
   3990          return;
   3991        }
   3992        base::uc16 character = atom->data()[j];
   3993        if (IsIgnoreCase(bm->compiler()->flags())) {
   3994          unibrow::uchar chars[4];
   3995          int length = GetCaseIndependentLetters(isolate, character,
   3996                                                 bm->compiler(), chars, 4);
   3997          for (int k = 0; k < length; k++) {
   3998            bm->Set(offset, chars[k]);
   3999          }
   4000        } else {
   4001          if (character <= max_char) bm->Set(offset, character);
   4002        }
   4003      }
   4004    } else {
   4005      DCHECK_EQ(TextElement::CLASS_RANGES, text.text_type());
   4006      RegExpClassRanges* class_ranges = text.class_ranges();
   4007      ZoneList<CharacterRange>* ranges = class_ranges->ranges(zone());
   4008      if (class_ranges->is_negated()) {
   4009        bm->SetAll(offset);
   4010      } else {
   4011        for (int k = 0; k < ranges->length(); k++) {
   4012          CharacterRange& range = ranges->at(k);
   4013          if (static_cast<int>(range.from()) > max_char) continue;
   4014          int to = std::min(max_char, static_cast<int>(range.to()));
   4015          bm->SetInterval(offset, Interval(range.from(), to));
   4016        }
   4017      }
   4018      offset++;
   4019    }
   4020  }
   4021  if (offset >= bm->length()) {
   4022    if (initial_offset == 0) set_bm_info(not_at_start, bm);
   4023    return;
   4024  }
   4025  on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
   4026                             true);  // Not at start after a text node.
   4027  if (initial_offset == 0) set_bm_info(not_at_start, bm);
   4028 }
   4029 
   4030 RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
   4031    RegExpNode* on_success) {
   4032  DCHECK(!read_backward());
   4033  ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
   4034      zone(), CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
   4035  ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
   4036      zone(), CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
   4037 
   4038  ChoiceNode* optional_step_back = zone()->New<ChoiceNode>(2, zone());
   4039 
   4040  int stack_register = UnicodeLookaroundStackRegister();
   4041  int position_register = UnicodeLookaroundPositionRegister();
   4042  RegExpNode* step_back = TextNode::CreateForCharacterRanges(
   4043      zone(), lead_surrogates, true, on_success);
   4044  RegExpLookaround::Builder builder(true, step_back, stack_register,
   4045                                    position_register);
   4046  RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
   4047      zone(), trail_surrogates, false, builder.on_match_success());
   4048 
   4049  optional_step_back->AddAlternative(
   4050      GuardedAlternative(builder.ForMatch(match_trail)));
   4051  optional_step_back->AddAlternative(GuardedAlternative(on_success));
   4052 
   4053  return optional_step_back;
   4054 }
   4055 
   4056 RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
   4057                                             bool is_one_byte) {
   4058  // Wrap the body of the regexp in capture #0.
   4059  RegExpNode* captured_body =
   4060      RegExpCapture::ToNode(data->tree, 0, this, accept());
   4061  RegExpNode* node = captured_body;
   4062  if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags())) {
   4063    // Add a .*? at the beginning, outside the body capture, unless
   4064    // this expression is anchored at the beginning or sticky.
   4065    RegExpNode* loop_node = RegExpQuantifier::ToNode(
   4066        0, RegExpTree::kInfinity, false,
   4067        zone()->New<RegExpClassRanges>(StandardCharacterSet::kEverything), this,
   4068        captured_body, data->contains_anchor);
   4069 
   4070    if (data->contains_anchor) {
   4071      // Unroll loop once, to take care of the case that might start
   4072      // at the start of input.
   4073      ChoiceNode* first_step_node = zone()->New<ChoiceNode>(2, zone());
   4074      first_step_node->AddAlternative(GuardedAlternative(captured_body));
   4075      first_step_node->AddAlternative(GuardedAlternative(zone()->New<TextNode>(
   4076          zone()->New<RegExpClassRanges>(StandardCharacterSet::kEverything),
   4077          false, loop_node)));
   4078      node = first_step_node;
   4079    } else {
   4080      node = loop_node;
   4081    }
   4082  }
   4083  if (is_one_byte) {
   4084    node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, this);
   4085    // Do it again to propagate the new nodes to places where they were not
   4086    // put because they had not been calculated yet.
   4087    if (node != nullptr) {
   4088      node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, this);
   4089    }
   4090  } else if (IsEitherUnicode(flags()) &&
   4091             (IsGlobal(flags()) || IsSticky(flags()))) {
   4092    node = OptionallyStepBackToLeadSurrogate(node);
   4093  }
   4094 
   4095  if (node == nullptr) node = zone()->New<EndNode>(EndNode::BACKTRACK, zone());
   4096  // We can run out of registers during preprocessing. Indicate an error in case
   4097  // we do.
   4098  if (reg_exp_too_big_) {
   4099    data->error = RegExpError::kTooLarge;
   4100  }
   4101  return node;
   4102 }
   4103 
   4104 void RegExpCompiler::ToNodeCheckForStackOverflow() {
   4105  if (StackLimitCheck{isolate()}.HasOverflowed()) {
   4106    V8::FatalProcessOutOfMemory(isolate(), "RegExpCompiler");
   4107  }
   4108 }
   4109 
   4110 }  // namespace v8::internal
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE