// regexp-compiler.h
1 // Copyright 2019 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_REGEXP_COMPILER_H_ 6 #define V8_REGEXP_REGEXP_COMPILER_H_ 7 8 #include <bitset> 9 10 #include "irregexp/imported/regexp-nodes.h" 11 12 namespace v8 { 13 namespace internal { 14 15 class DynamicBitSet; 16 class Isolate; 17 class FixedLengthLoopState; 18 19 namespace regexp_compiler_constants { 20 21 // The '2' variant is has inclusive from and exclusive to. 22 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12, 23 // which include WhiteSpace (7.2) or LineTerminator (7.3) values. 24 constexpr base::uc32 kRangeEndMarker = 0x110000; 25 constexpr int kSpaceRanges[] = { 26 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 27 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, 28 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker}; 29 constexpr int kSpaceRangeCount = arraysize(kSpaceRanges); 30 31 constexpr int kWordRanges[] = {'0', '9' + 1, 'A', 'Z' + 1, '_', 32 '_' + 1, 'a', 'z' + 1, kRangeEndMarker}; 33 constexpr int kWordRangeCount = arraysize(kWordRanges); 34 constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker}; 35 constexpr int kDigitRangeCount = arraysize(kDigitRanges); 36 constexpr int kSurrogateRanges[] = {kLeadSurrogateStart, 37 kLeadSurrogateStart + 1, kRangeEndMarker}; 38 constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges); 39 constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D, 0x000E, 40 0x2028, 0x202A, kRangeEndMarker}; 41 constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges); 42 43 // More makes code generation slower, less makes V8 benchmark score lower. 44 constexpr uint32_t kMaxLookaheadForBoyerMoore = 8; 45 // In a 3-character pattern you can maximally step forwards 3 characters 46 // at a time, which is not always enough to pay for the extra logic. 
47 constexpr uint32_t kPatternTooShortForBoyerMoore = 2; 48 49 } // namespace regexp_compiler_constants 50 51 inline bool NeedsUnicodeCaseEquivalents(RegExpFlags flags) { 52 // Both unicode (or unicode sets) and ignore_case flags are set. We need to 53 // use ICU to find the closure over case equivalents. 54 return IsEitherUnicode(flags) && IsIgnoreCase(flags); 55 } 56 57 // Details of a quick mask-compare check that can look ahead in the 58 // input stream. 59 class QuickCheckDetails { 60 public: 61 QuickCheckDetails() 62 : characters_(0), mask_(0), value_(0), cannot_match_(false) {} 63 explicit QuickCheckDetails(int characters) 64 : characters_(characters), mask_(0), value_(0), cannot_match_(false) {} 65 bool Rationalize(bool one_byte); 66 // Merge in the information from another branch of an alternation. 67 void Merge(QuickCheckDetails* other, int from_index); 68 // Advance the current position by some amount. 69 void Advance(int by, bool one_byte); 70 void Clear(); 71 bool cannot_match() { return cannot_match_; } 72 void set_cannot_match() { cannot_match_ = true; } 73 struct Position { 74 Position() : mask(0), value(0), determines_perfectly(false) {} 75 base::uc32 mask; 76 base::uc32 value; 77 bool determines_perfectly; 78 }; 79 int characters() const { return characters_; } 80 void set_characters(int characters) { characters_ = characters; } 81 Position* positions(int index) { 82 DCHECK_LE(0, index); 83 DCHECK_GT(characters_, index); 84 return positions_ + index; 85 } 86 const Position* positions(int index) const { 87 DCHECK_LE(0, index); 88 DCHECK_GT(characters_, index); 89 return positions_ + index; 90 } 91 uint32_t mask() { return mask_; } 92 uint32_t value() { return value_; } 93 94 private: 95 // How many characters do we have quick check information from. This is 96 // the same for all branches of a choice node. 97 int characters_; 98 Position positions_[4]; 99 // These values are the condensate of the above array after Rationalize(). 
100 uint32_t mask_; 101 uint32_t value_; 102 // If set to true, there is no way this quick check can match at all. 103 // E.g., if it requires to be at the start of the input, and isn't. 104 bool cannot_match_; 105 }; 106 107 // Improve the speed that we scan for an initial point where a non-anchored 108 // regexp can match by using a Boyer-Moore-like table. This is done by 109 // identifying non-greedy non-capturing loops in the nodes that eat any 110 // character one at a time. For example in the middle of the regexp 111 // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly 112 // inserted at the start of any non-anchored regexp. 113 // 114 // When we have found such a loop we look ahead in the nodes to find the set of 115 // characters that can come at given distances. For example for the regexp 116 // /.?foo/ we know that there are at least 3 characters ahead of us, and the 117 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in 118 // the lookahead info where the set of characters is reasonably constrained. In 119 // our example this is from index 1 to 2 (0 is not constrained). We can now 120 // look 3 characters ahead and if we don't find one of [f, o] (the union of 121 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2). 122 // 123 // For Unicode input strings we do the same, but modulo 128. 124 // 125 // We also look at the first string fed to the regexp and use that to get a hint 126 // of the character frequencies in the inputs. This affects the assessment of 127 // whether the set of characters is 'reasonably constrained'. 128 // 129 // We also have another lookahead mechanism (called quick check in the code), 130 // which uses a wide load of multiple characters followed by a mask and compare 131 // to determine whether a match is possible at this point. 
132 enum ContainedInLattice { 133 kNotYet = 0, 134 kLatticeIn = 1, 135 kLatticeOut = 2, 136 kLatticeUnknown = 3 // Can also mean both in and out. 137 }; 138 139 inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) { 140 return static_cast<ContainedInLattice>(a | b); 141 } 142 143 class BoyerMoorePositionInfo : public ZoneObject { 144 public: 145 bool at(int i) const { return map_[i]; } 146 147 static constexpr int kMapSize = 128; 148 static constexpr int kMask = kMapSize - 1; 149 150 int map_count() const { return map_count_; } 151 152 void Set(int character); 153 void SetInterval(const Interval& interval); 154 void SetAll(); 155 156 bool is_non_word() { return w_ == kLatticeOut; } 157 bool is_word() { return w_ == kLatticeIn; } 158 159 using Bitset = std::bitset<kMapSize>; 160 Bitset raw_bitset() const { return map_; } 161 162 private: 163 Bitset map_; 164 int map_count_ = 0; // Number of set bits in the map. 165 ContainedInLattice w_ = kNotYet; // The \w character class. 
166 }; 167 168 class BoyerMooreLookahead : public ZoneObject { 169 public: 170 BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone); 171 172 int length() { return length_; } 173 int max_char() { return max_char_; } 174 RegExpCompiler* compiler() { return compiler_; } 175 176 int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); } 177 178 BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); } 179 180 void Set(int map_number, int character) { 181 if (character > max_char_) return; 182 BoyerMoorePositionInfo* info = bitmaps_->at(map_number); 183 info->Set(character); 184 } 185 186 void SetInterval(int map_number, const Interval& interval) { 187 if (interval.from() > max_char_) return; 188 BoyerMoorePositionInfo* info = bitmaps_->at(map_number); 189 if (interval.to() > max_char_) { 190 info->SetInterval(Interval(interval.from(), max_char_)); 191 } else { 192 info->SetInterval(interval); 193 } 194 } 195 196 void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); } 197 198 void SetRest(int from_map) { 199 for (int i = from_map; i < length_; i++) SetAll(i); 200 } 201 void EmitSkipInstructions(RegExpMacroAssembler* masm); 202 203 private: 204 // This is the value obtained by EatsAtLeast. If we do not have at least this 205 // many characters left in the sample string then the match is bound to fail. 206 // Therefore it is OK to read a character this far ahead of the current match 207 // point. 208 int length_; 209 RegExpCompiler* compiler_; 210 // 0xff for Latin1, 0xffff for UTF-16. 
211 int max_char_; 212 ZoneList<BoyerMoorePositionInfo*>* bitmaps_; 213 214 int GetSkipTable( 215 int min_lookahead, int max_lookahead, 216 DirectHandle<ByteArray> boolean_skip_table, 217 DirectHandle<ByteArray> nibble_table = DirectHandle<ByteArray>{}); 218 bool FindWorthwhileInterval(int* from, int* to); 219 int FindBestInterval(int max_number_of_chars, int old_biggest_points, 220 int* from, int* to); 221 }; 222 223 // There are many ways to generate code for a node. This class encapsulates 224 // the current way we should be generating. In other words it encapsulates 225 // the current state of the code generator. The effect of this is that we 226 // generate code for paths that the matcher can take through the regular 227 // expression. A given node in the regexp can be code-generated several times 228 // as it can be part of several traces. For example for the regexp: 229 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part 230 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code 231 // to match foo is generated only once (the traces have a common prefix). The 232 // code to store the capture is deferred and generated (twice) after the places 233 // where baz has been matched. 234 class Trace { 235 public: 236 // A value for a property that is either known to be true, know to be false, 237 // or not known. 238 enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 }; 239 240 Trace() 241 : cp_offset_(0), 242 flush_budget_(100), // Note: this is a 16 bit field. 
243 at_start_(UNKNOWN), 244 has_any_actions_(false), 245 action_(nullptr), 246 backtrack_(nullptr), 247 fixed_length_loop_state_(nullptr), 248 characters_preloaded_(0), 249 bound_checked_up_to_(0), 250 next_(nullptr) {} 251 252 Trace(const Trace& other) V8_NOEXCEPT 253 : cp_offset_(other.cp_offset_), 254 flush_budget_(other.flush_budget_), 255 at_start_(other.at_start_), 256 has_any_actions_(other.has_any_actions_), 257 action_(nullptr), 258 backtrack_(other.backtrack_), 259 fixed_length_loop_state_(other.fixed_length_loop_state_), 260 characters_preloaded_(other.characters_preloaded_), 261 bound_checked_up_to_(other.bound_checked_up_to_), 262 quick_check_performed_(other.quick_check_performed_), 263 next_(&other) {} 264 265 // End the trace. This involves flushing the deferred actions in the trace 266 // and pushing a backtrack location onto the backtrack stack. Once this is 267 // done we can start a new trace or go to one that has already been 268 // generated. 269 enum FlushMode { 270 // Normal flush of the deferred actions, generates code for backtracking. 271 kFlushFull, 272 // Matching has succeeded, so current position and backtrack stack will be 273 // ignored and need not be written. 274 kFlushSuccess 275 }; 276 void Flush(RegExpCompiler* compiler, RegExpNode* successor, 277 FlushMode mode = kFlushFull); 278 279 // Some callers add/subtract 1 from cp_offset, assuming that the result is 280 // still valid. That's obviously not the case when our `cp_offset` is only 281 // checked against kMinCPOffset/kMaxCPOffset, so we need to apply the some 282 // slack. 283 // TODO(jgruber): It would be better if all callers checked against limits 284 // themselves when doing so; but unfortunately not all callers have 285 // abort-compilation mechanisms. 286 static constexpr int kCPOffsetSlack = 1; 287 int cp_offset() const { return cp_offset_; } 288 289 // Does any trace in the chain have an action? 
290 bool has_any_actions() const { return has_any_actions_; } 291 // Does this particular trace object have an action? 292 bool has_action() const { return action_ != nullptr; } 293 ActionNode* action() const { return action_; } 294 // A trivial trace is one that has no deferred actions or other state that 295 // affects the assumptions used when generating code. There is no recorded 296 // backtrack location in a trivial trace, so with a trivial trace we will 297 // generate code that, on a failure to match, gets the backtrack location 298 // from the backtrack stack rather than using a direct jump instruction. We 299 // always start code generation with a trivial trace and non-trivial traces 300 // are created as we emit code for nodes or add to the list of deferred 301 // actions in the trace. The location of the code generated for a node using 302 // a trivial trace is recorded in a label in the node so that gotos can be 303 // generated to that code. 304 bool is_trivial() const { 305 return backtrack_ == nullptr && !has_any_actions_ && cp_offset_ == 0 && 306 characters_preloaded_ == 0 && bound_checked_up_to_ == 0 && 307 quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN; 308 } 309 TriBool at_start() const { return at_start_; } 310 void set_at_start(TriBool at_start) { at_start_ = at_start; } 311 Label* backtrack() const { return backtrack_; } 312 FixedLengthLoopState* fixed_length_loop_state() const { 313 return fixed_length_loop_state_; 314 } 315 int characters_preloaded() const { return characters_preloaded_; } 316 int bound_checked_up_to() const { return bound_checked_up_to_; } 317 int flush_budget() const { return flush_budget_; } 318 QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; } 319 bool mentions_reg(int reg) const; 320 // Returns true if a deferred position store exists to the specified 321 // register and stores the offset in the out-parameter. Otherwise 322 // returns false. 
323 bool GetStoredPosition(int reg, int* cp_offset) const; 324 // These set methods and AdvanceCurrentPositionInTrace should be used only on 325 // new traces - the intention is that traces are immutable after creation. 326 void add_action(ActionNode* new_action) { 327 DCHECK(action_ == nullptr); // Otherwise we lose an action. 328 action_ = new_action; 329 has_any_actions_ = true; 330 } 331 void set_backtrack(Label* backtrack) { backtrack_ = backtrack; } 332 void set_fixed_length_loop_state(FixedLengthLoopState* state) { 333 fixed_length_loop_state_ = state; 334 } 335 void set_characters_preloaded(int count) { characters_preloaded_ = count; } 336 void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; } 337 void set_flush_budget(int to) { 338 DCHECK(to <= UINT16_MAX); // Flush-budget is 16 bit. 339 flush_budget_ = to; 340 } 341 void set_quick_check_performed(QuickCheckDetails* d) { 342 quick_check_performed_ = *d; 343 } 344 void InvalidateCurrentCharacter(); 345 void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler); 346 const Trace* next() const { return next_; } 347 348 class ConstIterator final { 349 public: 350 ConstIterator& operator++() { 351 trace_ = trace_->next(); 352 return *this; 353 } 354 bool operator==(const ConstIterator& other) const { 355 return trace_ == other.trace_; 356 } 357 bool operator!=(const ConstIterator& other) const { 358 return !(*this == other); 359 } 360 const Trace* operator*() const { return trace_; } 361 362 private: 363 explicit ConstIterator(const Trace* trace) : trace_(trace) {} 364 365 const Trace* trace_; 366 367 friend class Trace; 368 }; 369 370 ConstIterator begin() const { return ConstIterator(this); } 371 ConstIterator end() const { return ConstIterator(nullptr); } 372 373 private: 374 int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone); 375 void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register, 376 const DynamicBitSet& affected_registers, 377 
DynamicBitSet* registers_to_pop, 378 DynamicBitSet* registers_to_clear, Zone* zone); 379 void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register, 380 const DynamicBitSet& registers_to_pop, 381 const DynamicBitSet& registers_to_clear); 382 int cp_offset_; 383 uint16_t flush_budget_; 384 TriBool at_start_ : 8; // Whether we are at the start of the string. 385 bool has_any_actions_ : 8; // Whether any trace in the chain has an action. 386 ActionNode* action_; 387 Label* backtrack_; 388 FixedLengthLoopState* fixed_length_loop_state_; 389 int characters_preloaded_; 390 int bound_checked_up_to_; 391 QuickCheckDetails quick_check_performed_; 392 const Trace* next_; 393 }; 394 395 class FixedLengthLoopState { 396 public: 397 explicit FixedLengthLoopState(bool not_at_start, 398 ChoiceNode* loop_choice_node); 399 400 void BindStepBackwardsLabel(RegExpMacroAssembler* macro_assembler); 401 void BindLoopTopLabel(RegExpMacroAssembler* macro_assembler); 402 void GoToLoopTopLabel(RegExpMacroAssembler* macro_assembler); 403 ChoiceNode* loop_choice_node() const { return loop_choice_node_; } 404 Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; } 405 406 private: 407 Label step_backwards_label_; 408 Label loop_top_label_; 409 ChoiceNode* loop_choice_node_; 410 Trace counter_backtrack_trace_; 411 }; 412 413 struct PreloadState { 414 static const int kEatsAtLeastNotYetInitialized = -1; 415 bool preload_is_current_; 416 bool preload_has_checked_bounds_; 417 int preload_characters_; 418 int eats_at_least_; 419 void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; } 420 }; 421 422 // Analysis performs assertion propagation and computes eats_at_least_ values. 423 // See the comments on AssertionPropagator and EatsAtLeastPropagator for more 424 // details. 
425 RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags, 426 RegExpNode* node); 427 428 class FrequencyCollator { 429 public: 430 FrequencyCollator() : total_samples_(0) { 431 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { 432 frequencies_[i] = CharacterFrequency(i); 433 } 434 } 435 436 void CountCharacter(int character) { 437 int index = (character & RegExpMacroAssembler::kTableMask); 438 frequencies_[index].Increment(); 439 total_samples_++; 440 } 441 442 // Does not measure in percent, but rather per-128 (the table size from the 443 // regexp macro assembler). 444 int Frequency(int in_character) { 445 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character); 446 if (total_samples_ < 1) return 1; // Division by zero. 447 int freq_in_per128 = 448 (frequencies_[in_character].counter() * 128) / total_samples_; 449 return freq_in_per128; 450 } 451 452 private: 453 class CharacterFrequency { 454 public: 455 CharacterFrequency() : counter_(0), character_(-1) {} 456 explicit CharacterFrequency(int character) 457 : counter_(0), character_(character) {} 458 459 void Increment() { counter_++; } 460 int counter() { return counter_; } 461 int character() { return character_; } 462 463 private: 464 int counter_; 465 int character_; 466 }; 467 468 private: 469 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; 470 int total_samples_; 471 }; 472 473 class RegExpCompiler { 474 public: 475 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, 476 RegExpFlags flags, bool is_one_byte); 477 478 int AllocateRegister() { 479 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 480 reg_exp_too_big_ = true; 481 return next_register_; 482 } 483 return next_register_++; 484 } 485 486 // Lookarounds to match lone surrogates for unicode character class matches 487 // are never nested. We can therefore reuse registers. 
488 int UnicodeLookaroundStackRegister() { 489 if (unicode_lookaround_stack_register_ == kNoRegister) { 490 unicode_lookaround_stack_register_ = AllocateRegister(); 491 } 492 return unicode_lookaround_stack_register_; 493 } 494 495 int UnicodeLookaroundPositionRegister() { 496 if (unicode_lookaround_position_register_ == kNoRegister) { 497 unicode_lookaround_position_register_ = AllocateRegister(); 498 } 499 return unicode_lookaround_position_register_; 500 } 501 502 struct CompilationResult final { 503 explicit CompilationResult(RegExpError err) : error(err) {} 504 CompilationResult(DirectHandle<Object> code, int registers) 505 : code(code), num_registers(registers) {} 506 507 static CompilationResult RegExpTooBig() { 508 return CompilationResult(RegExpError::kTooLarge); 509 } 510 511 bool Succeeded() const { return error == RegExpError::kNone; } 512 513 const RegExpError error = RegExpError::kNone; 514 DirectHandle<Object> code; 515 int num_registers = 0; 516 }; 517 518 CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler, 519 RegExpNode* start, int capture_count, 520 DirectHandle<String> pattern); 521 522 // Preprocessing is the final step of node creation before analysis 523 // and assembly. It includes: 524 // - Wrapping the body of the regexp in capture 0. 525 // - Inserting the implicit .* before/after the regexp if necessary. 526 // - If the input is a one-byte string, filtering out nodes that can't match. 527 // - Fixing up regexp matches that start within a surrogate pair. 528 RegExpNode* PreprocessRegExp(RegExpCompileData* data, bool is_one_byte); 529 530 // If the regexp matching starts within a surrogate pair, step back to the 531 // lead surrogate and start matching from there. 
532 RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpNode* on_success); 533 534 inline void AddWork(RegExpNode* node) { 535 if (!node->on_work_list() && !node->label()->is_bound()) { 536 node->set_on_work_list(true); 537 work_list_->push_back(node); 538 } 539 } 540 541 static const int kImplementationOffset = 0; 542 static const int kNumberOfRegistersOffset = 0; 543 static const int kCodeOffset = 1; 544 545 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 546 EndNode* accept() { return accept_; } 547 548 #if defined(V8_TARGET_OS_MACOS) 549 // Looks like MacOS needs a lower recursion limit since "secondary threads" 550 // get a smaller stack by default (512kB vs. 8MB). 551 // See https://crbug.com/408820921. 552 static constexpr int kMaxRecursion = 50; 553 #else 554 static constexpr int kMaxRecursion = 100; 555 #endif 556 inline int recursion_depth() { return recursion_depth_; } 557 inline void IncrementRecursionDepth() { recursion_depth_++; } 558 inline void DecrementRecursionDepth() { recursion_depth_--; } 559 560 inline RegExpFlags flags() const { return flags_; } 561 inline void set_flags(RegExpFlags flags) { flags_ = flags; } 562 563 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 564 565 inline bool one_byte() { return one_byte_; } 566 inline bool optimize() { return optimize_; } 567 inline void set_optimize(bool value) { optimize_ = value; } 568 inline bool limiting_recursion() { return limiting_recursion_; } 569 inline void set_limiting_recursion(bool value) { 570 limiting_recursion_ = value; 571 } 572 bool read_backward() { return read_backward_; } 573 void set_read_backward(bool value) { read_backward_ = value; } 574 FrequencyCollator* frequency_collator() { return &frequency_collator_; } 575 576 int current_expansion_factor() { return current_expansion_factor_; } 577 void set_current_expansion_factor(int value) { 578 current_expansion_factor_ = value; 579 } 580 581 // The recursive nature of ToNode node generation means we may 
run into stack 582 // overflow issues. We introduce periodic checks to detect these, and the 583 // tick counter helps limit overhead of these checks. 584 // TODO(jgruber): This is super hacky and should be replaced by an abort 585 // mechanism or iterative node generation. 586 void ToNodeMaybeCheckForStackOverflow() { 587 if ((to_node_overflow_check_ticks_++ % 64 == 0)) { 588 ToNodeCheckForStackOverflow(); 589 } 590 } 591 void ToNodeCheckForStackOverflow(); 592 593 Isolate* isolate() const { return isolate_; } 594 Zone* zone() const { return zone_; } 595 596 static const int kNoRegister = -1; 597 598 private: 599 EndNode* accept_; 600 int next_register_; 601 int unicode_lookaround_stack_register_; 602 int unicode_lookaround_position_register_; 603 ZoneVector<RegExpNode*>* work_list_; 604 int recursion_depth_; 605 RegExpFlags flags_; 606 RegExpMacroAssembler* macro_assembler_; 607 bool one_byte_; 608 bool reg_exp_too_big_; 609 bool limiting_recursion_; 610 int to_node_overflow_check_ticks_ = 0; 611 bool optimize_; 612 bool read_backward_; 613 int current_expansion_factor_; 614 FrequencyCollator frequency_collator_; 615 Isolate* isolate_; 616 Zone* zone_; 617 }; 618 619 // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates. 
620 class UnicodeRangeSplitter { 621 public: 622 V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList<CharacterRange>* base); 623 624 static constexpr int kInitialSize = 8; 625 using CharacterRangeVector = base::SmallVector<CharacterRange, kInitialSize>; 626 627 const CharacterRangeVector* bmp() const { return &bmp_; } 628 const CharacterRangeVector* lead_surrogates() const { 629 return &lead_surrogates_; 630 } 631 const CharacterRangeVector* trail_surrogates() const { 632 return &trail_surrogates_; 633 } 634 const CharacterRangeVector* non_bmp() const { return &non_bmp_; } 635 636 private: 637 void AddRange(CharacterRange range); 638 639 CharacterRangeVector bmp_; 640 CharacterRangeVector lead_surrogates_; 641 CharacterRangeVector trail_surrogates_; 642 CharacterRangeVector non_bmp_; 643 }; 644 645 // We need to check for the following characters: 0x39C 0x3BC 0x178. 646 // TODO(jgruber): Move to CharacterRange. 647 bool RangeContainsLatin1Equivalents(CharacterRange range); 648 649 } // namespace internal 650 } // namespace v8 651 652 #endif // V8_REGEXP_REGEXP_COMPILER_H_