tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

regexp-compiler.h (25026B)


      1 // Copyright 2019 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef V8_REGEXP_REGEXP_COMPILER_H_
      6 #define V8_REGEXP_REGEXP_COMPILER_H_
      7 
      8 #include <bitset>
      9 
     10 #include "irregexp/imported/regexp-nodes.h"
     11 
     12 namespace v8 {
     13 namespace internal {
     14 
     15 class DynamicBitSet;
     16 class Isolate;
     17 class FixedLengthLoopState;
     18 
     19 namespace regexp_compiler_constants {
     20 
     21 // The '2' variant is has inclusive from and exclusive to.
     22 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
     23 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
     24 constexpr base::uc32 kRangeEndMarker = 0x110000;
     25 constexpr int kSpaceRanges[] = {
     26    '\t',   '\r' + 1, ' ',    ' ' + 1, 0x00A0, 0x00A1, 0x1680,
     27    0x1681, 0x2000,   0x200B, 0x2028,  0x202A, 0x202F, 0x2030,
     28    0x205F, 0x2060,   0x3000, 0x3001,  0xFEFF, 0xFF00, kRangeEndMarker};
     29 constexpr int kSpaceRangeCount = arraysize(kSpaceRanges);
     30 
     31 constexpr int kWordRanges[] = {'0',     '9' + 1, 'A',     'Z' + 1,        '_',
     32                               '_' + 1, 'a',     'z' + 1, kRangeEndMarker};
     33 constexpr int kWordRangeCount = arraysize(kWordRanges);
     34 constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
     35 constexpr int kDigitRangeCount = arraysize(kDigitRanges);
     36 constexpr int kSurrogateRanges[] = {kLeadSurrogateStart,
     37                                    kLeadSurrogateStart + 1, kRangeEndMarker};
     38 constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges);
     39 constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D,         0x000E,
     40                                         0x2028, 0x202A, kRangeEndMarker};
     41 constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
     42 
     43 // More makes code generation slower, less makes V8 benchmark score lower.
     44 constexpr uint32_t kMaxLookaheadForBoyerMoore = 8;
     45 // In a 3-character pattern you can maximally step forwards 3 characters
     46 // at a time, which is not always enough to pay for the extra logic.
     47 constexpr uint32_t kPatternTooShortForBoyerMoore = 2;
     48 
     49 }  // namespace regexp_compiler_constants
     50 
     51 inline bool NeedsUnicodeCaseEquivalents(RegExpFlags flags) {
     52  // Both unicode (or unicode sets) and ignore_case flags are set. We need to
     53  // use ICU to find the closure over case equivalents.
     54  return IsEitherUnicode(flags) && IsIgnoreCase(flags);
     55 }
     56 
     57 // Details of a quick mask-compare check that can look ahead in the
     58 // input stream.
     59 class QuickCheckDetails {
     60 public:
     61  QuickCheckDetails()
     62      : characters_(0), mask_(0), value_(0), cannot_match_(false) {}
     63  explicit QuickCheckDetails(int characters)
     64      : characters_(characters), mask_(0), value_(0), cannot_match_(false) {}
     65  bool Rationalize(bool one_byte);
     66  // Merge in the information from another branch of an alternation.
     67  void Merge(QuickCheckDetails* other, int from_index);
     68  // Advance the current position by some amount.
     69  void Advance(int by, bool one_byte);
     70  void Clear();
     71  bool cannot_match() { return cannot_match_; }
     72  void set_cannot_match() { cannot_match_ = true; }
     73  struct Position {
     74    Position() : mask(0), value(0), determines_perfectly(false) {}
     75    base::uc32 mask;
     76    base::uc32 value;
     77    bool determines_perfectly;
     78  };
     79  int characters() const { return characters_; }
     80  void set_characters(int characters) { characters_ = characters; }
     81  Position* positions(int index) {
     82    DCHECK_LE(0, index);
     83    DCHECK_GT(characters_, index);
     84    return positions_ + index;
     85  }
     86  const Position* positions(int index) const {
     87    DCHECK_LE(0, index);
     88    DCHECK_GT(characters_, index);
     89    return positions_ + index;
     90  }
     91  uint32_t mask() { return mask_; }
     92  uint32_t value() { return value_; }
     93 
     94 private:
     95  // How many characters do we have quick check information from.  This is
     96  // the same for all branches of a choice node.
     97  int characters_;
     98  Position positions_[4];
     99  // These values are the condensate of the above array after Rationalize().
    100  uint32_t mask_;
    101  uint32_t value_;
    102  // If set to true, there is no way this quick check can match at all.
    103  // E.g., if it requires to be at the start of the input, and isn't.
    104  bool cannot_match_;
    105 };
    106 
    107 // Improve the speed that we scan for an initial point where a non-anchored
    108 // regexp can match by using a Boyer-Moore-like table. This is done by
    109 // identifying non-greedy non-capturing loops in the nodes that eat any
    110 // character one at a time.  For example in the middle of the regexp
    111 // /foo[\s\S]*?bar/ we find such a loop.  There is also such a loop implicitly
    112 // inserted at the start of any non-anchored regexp.
    113 //
    114 // When we have found such a loop we look ahead in the nodes to find the set of
    115 // characters that can come at given distances. For example for the regexp
    116 // /.?foo/ we know that there are at least 3 characters ahead of us, and the
    117 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
    118 // the lookahead info where the set of characters is reasonably constrained. In
    119 // our example this is from index 1 to 2 (0 is not constrained). We can now
    120 // look 3 characters ahead and if we don't find one of [f, o] (the union of
    121 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
    122 //
    123 // For Unicode input strings we do the same, but modulo 128.
    124 //
    125 // We also look at the first string fed to the regexp and use that to get a hint
    126 // of the character frequencies in the inputs. This affects the assessment of
    127 // whether the set of characters is 'reasonably constrained'.
    128 //
    129 // We also have another lookahead mechanism (called quick check in the code),
    130 // which uses a wide load of multiple characters followed by a mask and compare
    131 // to determine whether a match is possible at this point.
    132 enum ContainedInLattice {
    133  kNotYet = 0,
    134  kLatticeIn = 1,
    135  kLatticeOut = 2,
    136  kLatticeUnknown = 3  // Can also mean both in and out.
    137 };
    138 
    139 inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
    140  return static_cast<ContainedInLattice>(a | b);
    141 }
    142 
    143 class BoyerMoorePositionInfo : public ZoneObject {
    144 public:
    145  bool at(int i) const { return map_[i]; }
    146 
    147  static constexpr int kMapSize = 128;
    148  static constexpr int kMask = kMapSize - 1;
    149 
    150  int map_count() const { return map_count_; }
    151 
    152  void Set(int character);
    153  void SetInterval(const Interval& interval);
    154  void SetAll();
    155 
    156  bool is_non_word() { return w_ == kLatticeOut; }
    157  bool is_word() { return w_ == kLatticeIn; }
    158 
    159  using Bitset = std::bitset<kMapSize>;
    160  Bitset raw_bitset() const { return map_; }
    161 
    162 private:
    163  Bitset map_;
    164  int map_count_ = 0;               // Number of set bits in the map.
    165  ContainedInLattice w_ = kNotYet;  // The \w character class.
    166 };
    167 
    168 class BoyerMooreLookahead : public ZoneObject {
    169 public:
    170  BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
    171 
    172  int length() { return length_; }
    173  int max_char() { return max_char_; }
    174  RegExpCompiler* compiler() { return compiler_; }
    175 
    176  int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); }
    177 
    178  BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
    179 
    180  void Set(int map_number, int character) {
    181    if (character > max_char_) return;
    182    BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
    183    info->Set(character);
    184  }
    185 
    186  void SetInterval(int map_number, const Interval& interval) {
    187    if (interval.from() > max_char_) return;
    188    BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
    189    if (interval.to() > max_char_) {
    190      info->SetInterval(Interval(interval.from(), max_char_));
    191    } else {
    192      info->SetInterval(interval);
    193    }
    194  }
    195 
    196  void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); }
    197 
    198  void SetRest(int from_map) {
    199    for (int i = from_map; i < length_; i++) SetAll(i);
    200  }
    201  void EmitSkipInstructions(RegExpMacroAssembler* masm);
    202 
    203 private:
    204  // This is the value obtained by EatsAtLeast.  If we do not have at least this
    205  // many characters left in the sample string then the match is bound to fail.
    206  // Therefore it is OK to read a character this far ahead of the current match
    207  // point.
    208  int length_;
    209  RegExpCompiler* compiler_;
    210  // 0xff for Latin1, 0xffff for UTF-16.
    211  int max_char_;
    212  ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
    213 
    214  int GetSkipTable(
    215      int min_lookahead, int max_lookahead,
    216      DirectHandle<ByteArray> boolean_skip_table,
    217      DirectHandle<ByteArray> nibble_table = DirectHandle<ByteArray>{});
    218  bool FindWorthwhileInterval(int* from, int* to);
    219  int FindBestInterval(int max_number_of_chars, int old_biggest_points,
    220                       int* from, int* to);
    221 };
    222 
    223 // There are many ways to generate code for a node.  This class encapsulates
    224 // the current way we should be generating.  In other words it encapsulates
    225 // the current state of the code generator.  The effect of this is that we
    226 // generate code for paths that the matcher can take through the regular
    227 // expression.  A given node in the regexp can be code-generated several times
    228 // as it can be part of several traces.  For example for the regexp:
    229 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
    230 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace.  The code
    231 // to match foo is generated only once (the traces have a common prefix).  The
    232 // code to store the capture is deferred and generated (twice) after the places
    233 // where baz has been matched.
    234 class Trace {
    235 public:
    236  // A value for a property that is either known to be true, know to be false,
    237  // or not known.
    238  enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 };
    239 
    240  Trace()
    241      : cp_offset_(0),
    242        flush_budget_(100),  // Note: this is a 16 bit field.
    243        at_start_(UNKNOWN),
    244        has_any_actions_(false),
    245        action_(nullptr),
    246        backtrack_(nullptr),
    247        fixed_length_loop_state_(nullptr),
    248        characters_preloaded_(0),
    249        bound_checked_up_to_(0),
    250        next_(nullptr) {}
    251 
    252  Trace(const Trace& other) V8_NOEXCEPT
    253      : cp_offset_(other.cp_offset_),
    254        flush_budget_(other.flush_budget_),
    255        at_start_(other.at_start_),
    256        has_any_actions_(other.has_any_actions_),
    257        action_(nullptr),
    258        backtrack_(other.backtrack_),
    259        fixed_length_loop_state_(other.fixed_length_loop_state_),
    260        characters_preloaded_(other.characters_preloaded_),
    261        bound_checked_up_to_(other.bound_checked_up_to_),
    262        quick_check_performed_(other.quick_check_performed_),
    263        next_(&other) {}
    264 
    265  // End the trace.  This involves flushing the deferred actions in the trace
    266  // and pushing a backtrack location onto the backtrack stack.  Once this is
    267  // done we can start a new trace or go to one that has already been
    268  // generated.
    269  enum FlushMode {
    270    // Normal flush of the deferred actions, generates code for backtracking.
    271    kFlushFull,
    272    // Matching has succeeded, so current position and backtrack stack will be
    273    // ignored and need not be written.
    274    kFlushSuccess
    275  };
    276  void Flush(RegExpCompiler* compiler, RegExpNode* successor,
    277             FlushMode mode = kFlushFull);
    278 
    279  // Some callers add/subtract 1 from cp_offset, assuming that the result is
    280  // still valid. That's obviously not the case when our `cp_offset` is only
    281  // checked against kMinCPOffset/kMaxCPOffset, so we need to apply the some
    282  // slack.
    283  // TODO(jgruber): It would be better if all callers checked against limits
    284  // themselves when doing so; but unfortunately not all callers have
    285  // abort-compilation mechanisms.
    286  static constexpr int kCPOffsetSlack = 1;
    287  int cp_offset() const { return cp_offset_; }
    288 
    289  // Does any trace in the chain have an action?
    290  bool has_any_actions() const { return has_any_actions_; }
    291  // Does this particular trace object have an action?
    292  bool has_action() const { return action_ != nullptr; }
    293  ActionNode* action() const { return action_; }
    294  // A trivial trace is one that has no deferred actions or other state that
    295  // affects the assumptions used when generating code.  There is no recorded
    296  // backtrack location in a trivial trace, so with a trivial trace we will
    297  // generate code that, on a failure to match, gets the backtrack location
    298  // from the backtrack stack rather than using a direct jump instruction.  We
    299  // always start code generation with a trivial trace and non-trivial traces
    300  // are created as we emit code for nodes or add to the list of deferred
    301  // actions in the trace.  The location of the code generated for a node using
    302  // a trivial trace is recorded in a label in the node so that gotos can be
    303  // generated to that code.
    304  bool is_trivial() const {
    305    return backtrack_ == nullptr && !has_any_actions_ && cp_offset_ == 0 &&
    306           characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
    307           quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
    308  }
    309  TriBool at_start() const { return at_start_; }
    310  void set_at_start(TriBool at_start) { at_start_ = at_start; }
    311  Label* backtrack() const { return backtrack_; }
    312  FixedLengthLoopState* fixed_length_loop_state() const {
    313    return fixed_length_loop_state_;
    314  }
    315  int characters_preloaded() const { return characters_preloaded_; }
    316  int bound_checked_up_to() const { return bound_checked_up_to_; }
    317  int flush_budget() const { return flush_budget_; }
    318  QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
    319  bool mentions_reg(int reg) const;
    320  // Returns true if a deferred position store exists to the specified
    321  // register and stores the offset in the out-parameter.  Otherwise
    322  // returns false.
    323  bool GetStoredPosition(int reg, int* cp_offset) const;
    324  // These set methods and AdvanceCurrentPositionInTrace should be used only on
    325  // new traces - the intention is that traces are immutable after creation.
    326  void add_action(ActionNode* new_action) {
    327    DCHECK(action_ == nullptr);  // Otherwise we lose an action.
    328    action_ = new_action;
    329    has_any_actions_ = true;
    330  }
    331  void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
    332  void set_fixed_length_loop_state(FixedLengthLoopState* state) {
    333    fixed_length_loop_state_ = state;
    334  }
    335  void set_characters_preloaded(int count) { characters_preloaded_ = count; }
    336  void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
    337  void set_flush_budget(int to) {
    338    DCHECK(to <= UINT16_MAX);  // Flush-budget is 16 bit.
    339    flush_budget_ = to;
    340  }
    341  void set_quick_check_performed(QuickCheckDetails* d) {
    342    quick_check_performed_ = *d;
    343  }
    344  void InvalidateCurrentCharacter();
    345  void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
    346  const Trace* next() const { return next_; }
    347 
    348  class ConstIterator final {
    349   public:
    350    ConstIterator& operator++() {
    351      trace_ = trace_->next();
    352      return *this;
    353    }
    354    bool operator==(const ConstIterator& other) const {
    355      return trace_ == other.trace_;
    356    }
    357    bool operator!=(const ConstIterator& other) const {
    358      return !(*this == other);
    359    }
    360    const Trace* operator*() const { return trace_; }
    361 
    362   private:
    363    explicit ConstIterator(const Trace* trace) : trace_(trace) {}
    364 
    365    const Trace* trace_;
    366 
    367    friend class Trace;
    368  };
    369 
    370  ConstIterator begin() const { return ConstIterator(this); }
    371  ConstIterator end() const { return ConstIterator(nullptr); }
    372 
    373 private:
    374  int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone);
    375  void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register,
    376                              const DynamicBitSet& affected_registers,
    377                              DynamicBitSet* registers_to_pop,
    378                              DynamicBitSet* registers_to_clear, Zone* zone);
    379  void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register,
    380                                const DynamicBitSet& registers_to_pop,
    381                                const DynamicBitSet& registers_to_clear);
    382  int cp_offset_;
    383  uint16_t flush_budget_;
    384  TriBool at_start_ : 8;      // Whether we are at the start of the string.
    385  bool has_any_actions_ : 8;  // Whether any trace in the chain has an action.
    386  ActionNode* action_;
    387  Label* backtrack_;
    388  FixedLengthLoopState* fixed_length_loop_state_;
    389  int characters_preloaded_;
    390  int bound_checked_up_to_;
    391  QuickCheckDetails quick_check_performed_;
    392  const Trace* next_;
    393 };
    394 
    395 class FixedLengthLoopState {
    396 public:
    397  explicit FixedLengthLoopState(bool not_at_start,
    398                                ChoiceNode* loop_choice_node);
    399 
    400  void BindStepBackwardsLabel(RegExpMacroAssembler* macro_assembler);
    401  void BindLoopTopLabel(RegExpMacroAssembler* macro_assembler);
    402  void GoToLoopTopLabel(RegExpMacroAssembler* macro_assembler);
    403  ChoiceNode* loop_choice_node() const { return loop_choice_node_; }
    404  Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
    405 
    406 private:
    407  Label step_backwards_label_;
    408  Label loop_top_label_;
    409  ChoiceNode* loop_choice_node_;
    410  Trace counter_backtrack_trace_;
    411 };
    412 
    413 struct PreloadState {
    414  static const int kEatsAtLeastNotYetInitialized = -1;
    415  bool preload_is_current_;
    416  bool preload_has_checked_bounds_;
    417  int preload_characters_;
    418  int eats_at_least_;
    419  void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; }
    420 };
    421 
    422 // Analysis performs assertion propagation and computes eats_at_least_ values.
    423 // See the comments on AssertionPropagator and EatsAtLeastPropagator for more
    424 // details.
    425 RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags,
    426                          RegExpNode* node);
    427 
    428 class FrequencyCollator {
    429 public:
    430  FrequencyCollator() : total_samples_(0) {
    431    for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
    432      frequencies_[i] = CharacterFrequency(i);
    433    }
    434  }
    435 
    436  void CountCharacter(int character) {
    437    int index = (character & RegExpMacroAssembler::kTableMask);
    438    frequencies_[index].Increment();
    439    total_samples_++;
    440  }
    441 
    442  // Does not measure in percent, but rather per-128 (the table size from the
    443  // regexp macro assembler).
    444  int Frequency(int in_character) {
    445    DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
    446    if (total_samples_ < 1) return 1;  // Division by zero.
    447    int freq_in_per128 =
    448        (frequencies_[in_character].counter() * 128) / total_samples_;
    449    return freq_in_per128;
    450  }
    451 
    452 private:
    453  class CharacterFrequency {
    454   public:
    455    CharacterFrequency() : counter_(0), character_(-1) {}
    456    explicit CharacterFrequency(int character)
    457        : counter_(0), character_(character) {}
    458 
    459    void Increment() { counter_++; }
    460    int counter() { return counter_; }
    461    int character() { return character_; }
    462 
    463   private:
    464    int counter_;
    465    int character_;
    466  };
    467 
    468 private:
    469  CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
    470  int total_samples_;
    471 };
    472 
    473 class RegExpCompiler {
    474 public:
    475  RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
    476                 RegExpFlags flags, bool is_one_byte);
    477 
    478  int AllocateRegister() {
    479    if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
    480      reg_exp_too_big_ = true;
    481      return next_register_;
    482    }
    483    return next_register_++;
    484  }
    485 
    486  // Lookarounds to match lone surrogates for unicode character class matches
    487  // are never nested. We can therefore reuse registers.
    488  int UnicodeLookaroundStackRegister() {
    489    if (unicode_lookaround_stack_register_ == kNoRegister) {
    490      unicode_lookaround_stack_register_ = AllocateRegister();
    491    }
    492    return unicode_lookaround_stack_register_;
    493  }
    494 
    495  int UnicodeLookaroundPositionRegister() {
    496    if (unicode_lookaround_position_register_ == kNoRegister) {
    497      unicode_lookaround_position_register_ = AllocateRegister();
    498    }
    499    return unicode_lookaround_position_register_;
    500  }
    501 
    502  struct CompilationResult final {
    503    explicit CompilationResult(RegExpError err) : error(err) {}
    504    CompilationResult(DirectHandle<Object> code, int registers)
    505        : code(code), num_registers(registers) {}
    506 
    507    static CompilationResult RegExpTooBig() {
    508      return CompilationResult(RegExpError::kTooLarge);
    509    }
    510 
    511    bool Succeeded() const { return error == RegExpError::kNone; }
    512 
    513    const RegExpError error = RegExpError::kNone;
    514    DirectHandle<Object> code;
    515    int num_registers = 0;
    516  };
    517 
    518  CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler,
    519                             RegExpNode* start, int capture_count,
    520                             DirectHandle<String> pattern);
    521 
    522  // Preprocessing is the final step of node creation before analysis
    523  // and assembly. It includes:
    524  // - Wrapping the body of the regexp in capture 0.
    525  // - Inserting the implicit .* before/after the regexp if necessary.
    526  // - If the input is a one-byte string, filtering out nodes that can't match.
    527  // - Fixing up regexp matches that start within a surrogate pair.
    528  RegExpNode* PreprocessRegExp(RegExpCompileData* data, bool is_one_byte);
    529 
    530  // If the regexp matching starts within a surrogate pair, step back to the
    531  // lead surrogate and start matching from there.
    532  RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpNode* on_success);
    533 
    534  inline void AddWork(RegExpNode* node) {
    535    if (!node->on_work_list() && !node->label()->is_bound()) {
    536      node->set_on_work_list(true);
    537      work_list_->push_back(node);
    538    }
    539  }
    540 
    541  static const int kImplementationOffset = 0;
    542  static const int kNumberOfRegistersOffset = 0;
    543  static const int kCodeOffset = 1;
    544 
    545  RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
    546  EndNode* accept() { return accept_; }
    547 
    548 #if defined(V8_TARGET_OS_MACOS)
    549  // Looks like MacOS needs a lower recursion limit since "secondary threads"
    550  // get a smaller stack by default (512kB vs. 8MB).
    551  // See https://crbug.com/408820921.
    552  static constexpr int kMaxRecursion = 50;
    553 #else
    554  static constexpr int kMaxRecursion = 100;
    555 #endif
    556  inline int recursion_depth() { return recursion_depth_; }
    557  inline void IncrementRecursionDepth() { recursion_depth_++; }
    558  inline void DecrementRecursionDepth() { recursion_depth_--; }
    559 
    560  inline RegExpFlags flags() const { return flags_; }
    561  inline void set_flags(RegExpFlags flags) { flags_ = flags; }
    562 
    563  void SetRegExpTooBig() { reg_exp_too_big_ = true; }
    564 
    565  inline bool one_byte() { return one_byte_; }
    566  inline bool optimize() { return optimize_; }
    567  inline void set_optimize(bool value) { optimize_ = value; }
    568  inline bool limiting_recursion() { return limiting_recursion_; }
    569  inline void set_limiting_recursion(bool value) {
    570    limiting_recursion_ = value;
    571  }
    572  bool read_backward() { return read_backward_; }
    573  void set_read_backward(bool value) { read_backward_ = value; }
    574  FrequencyCollator* frequency_collator() { return &frequency_collator_; }
    575 
    576  int current_expansion_factor() { return current_expansion_factor_; }
    577  void set_current_expansion_factor(int value) {
    578    current_expansion_factor_ = value;
    579  }
    580 
    581  // The recursive nature of ToNode node generation means we may run into stack
    582  // overflow issues. We introduce periodic checks to detect these, and the
    583  // tick counter helps limit overhead of these checks.
    584  // TODO(jgruber): This is super hacky and should be replaced by an abort
    585  // mechanism or iterative node generation.
    586  void ToNodeMaybeCheckForStackOverflow() {
    587    if ((to_node_overflow_check_ticks_++ % 64 == 0)) {
    588      ToNodeCheckForStackOverflow();
    589    }
    590  }
    591  void ToNodeCheckForStackOverflow();
    592 
    593  Isolate* isolate() const { return isolate_; }
    594  Zone* zone() const { return zone_; }
    595 
    596  static const int kNoRegister = -1;
    597 
    598 private:
    599  EndNode* accept_;
    600  int next_register_;
    601  int unicode_lookaround_stack_register_;
    602  int unicode_lookaround_position_register_;
    603  ZoneVector<RegExpNode*>* work_list_;
    604  int recursion_depth_;
    605  RegExpFlags flags_;
    606  RegExpMacroAssembler* macro_assembler_;
    607  bool one_byte_;
    608  bool reg_exp_too_big_;
    609  bool limiting_recursion_;
    610  int to_node_overflow_check_ticks_ = 0;
    611  bool optimize_;
    612  bool read_backward_;
    613  int current_expansion_factor_;
    614  FrequencyCollator frequency_collator_;
    615  Isolate* isolate_;
    616  Zone* zone_;
    617 };
    618 
    619 // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
    620 class UnicodeRangeSplitter {
    621 public:
    622  V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList<CharacterRange>* base);
    623 
    624  static constexpr int kInitialSize = 8;
    625  using CharacterRangeVector = base::SmallVector<CharacterRange, kInitialSize>;
    626 
    627  const CharacterRangeVector* bmp() const { return &bmp_; }
    628  const CharacterRangeVector* lead_surrogates() const {
    629    return &lead_surrogates_;
    630  }
    631  const CharacterRangeVector* trail_surrogates() const {
    632    return &trail_surrogates_;
    633  }
    634  const CharacterRangeVector* non_bmp() const { return &non_bmp_; }
    635 
    636 private:
    637  void AddRange(CharacterRange range);
    638 
    639  CharacterRangeVector bmp_;
    640  CharacterRangeVector lead_surrogates_;
    641  CharacterRangeVector trail_surrogates_;
    642  CharacterRangeVector non_bmp_;
    643 };
    644 
    645 // We need to check for the following characters: 0x39C 0x3BC 0x178.
    646 // TODO(jgruber): Move to CharacterRange.
    647 bool RangeContainsLatin1Equivalents(CharacterRange range);
    648 
    649 }  // namespace internal
    650 }  // namespace v8
    651 
    652 #endif  // V8_REGEXP_REGEXP_COMPILER_H_