regexp-nodes.h (34020B)
1 // Copyright 2019 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_REGEXP_NODES_H_ 6 #define V8_REGEXP_REGEXP_NODES_H_ 7 8 #include "irregexp/imported/regexp-macro-assembler.h" 9 10 namespace v8 { 11 namespace internal { 12 13 class AlternativeGenerationList; 14 class BoyerMooreLookahead; 15 class FixedLengthLoopState; 16 class NodeVisitor; 17 class QuickCheckDetails; 18 class RegExpCompiler; 19 class SeqRegExpNode; 20 class Trace; 21 struct PreloadState; 22 23 #define FOR_EACH_NODE_TYPE(VISIT) \ 24 VISIT(End) \ 25 VISIT(Action) \ 26 VISIT(Choice) \ 27 VISIT(LoopChoice) \ 28 VISIT(NegativeLookaroundChoice) \ 29 VISIT(BackReference) \ 30 VISIT(Assertion) \ 31 VISIT(Text) 32 33 #define FORWARD_DECLARE(type) class type##Node; 34 FOR_EACH_NODE_TYPE(FORWARD_DECLARE) 35 #undef FORWARD_DECLARE 36 37 struct NodeInfo final { 38 NodeInfo() 39 : being_analyzed(false), 40 been_analyzed(false), 41 follows_word_interest(false), 42 follows_newline_interest(false), 43 follows_start_interest(false), 44 at_end(false), 45 visited(false), 46 replacement_calculated(false) {} 47 48 // Returns true if the interests and assumptions of this node 49 // matches the given one. 50 bool Matches(NodeInfo* that) { 51 return (at_end == that->at_end) && 52 (follows_word_interest == that->follows_word_interest) && 53 (follows_newline_interest == that->follows_newline_interest) && 54 (follows_start_interest == that->follows_start_interest); 55 } 56 57 // Updates the interests of this node given the interests of the 58 // node preceding it. 59 void AddFromPreceding(NodeInfo* that) { 60 at_end |= that->at_end; 61 follows_word_interest |= that->follows_word_interest; 62 follows_newline_interest |= that->follows_newline_interest; 63 follows_start_interest |= that->follows_start_interest; 64 } 65 66 bool HasLookbehind() { 67 return follows_word_interest || follows_newline_interest || 68 follows_start_interest; 69 } 70 71 // Sets the interests of this node to include the interests of the 72 // following node. 73 void AddFromFollowing(NodeInfo* that) { 74 follows_word_interest |= that->follows_word_interest; 75 follows_newline_interest |= that->follows_newline_interest; 76 follows_start_interest |= that->follows_start_interest; 77 } 78 79 void ResetCompilationState() { 80 being_analyzed = false; 81 been_analyzed = false; 82 } 83 84 bool being_analyzed : 1; 85 bool been_analyzed : 1; 86 87 // These bits are set of this node has to know what the preceding 88 // character was. 89 bool follows_word_interest : 1; 90 bool follows_newline_interest : 1; 91 bool follows_start_interest : 1; 92 93 bool at_end : 1; 94 bool visited : 1; 95 bool replacement_calculated : 1; 96 }; 97 98 struct EatsAtLeastInfo final { 99 EatsAtLeastInfo() : EatsAtLeastInfo(0) {} 100 explicit EatsAtLeastInfo(uint8_t eats) 101 : eats_at_least_from_possibly_start(eats), 102 eats_at_least_from_not_start(eats) {} 103 void SetMin(const EatsAtLeastInfo& other) { 104 if (other.eats_at_least_from_possibly_start < 105 eats_at_least_from_possibly_start) { 106 eats_at_least_from_possibly_start = 107 other.eats_at_least_from_possibly_start; 108 } 109 if (other.eats_at_least_from_not_start < eats_at_least_from_not_start) { 110 eats_at_least_from_not_start = other.eats_at_least_from_not_start; 111 } 112 } 113 114 bool IsZero() const { 115 return eats_at_least_from_possibly_start == 0 && 116 eats_at_least_from_not_start == 0; 117 } 118 119 // Any successful match starting from the current node will consume at least 120 // this many characters. This does not necessarily mean that there is a 121 // possible match with exactly this many characters, but we generally try to 122 // get this number as high as possible to allow for early exit on failure. 123 uint8_t eats_at_least_from_possibly_start; 124 125 // Like eats_at_least_from_possibly_start, but with the additional assumption 126 // that start-of-string assertions (^) can't match. This value is greater than 127 // or equal to eats_at_least_from_possibly_start. 128 uint8_t eats_at_least_from_not_start; 129 }; 130 131 class RegExpNode : public ZoneObject { 132 public: 133 explicit RegExpNode(Zone* zone) 134 : replacement_(nullptr), 135 on_work_list_(false), 136 trace_count_(0), 137 zone_(zone) { 138 bm_info_[0] = bm_info_[1] = nullptr; 139 } 140 virtual ~RegExpNode(); 141 virtual void Accept(NodeVisitor* visitor) = 0; 142 // Generates a goto to this node or actually generates the code at this point. 143 virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0; 144 // How many characters must this node consume at a minimum in order to 145 // succeed. The not_at_start argument is used to indicate that we know we are 146 // not at the start of the input. In this case anchored branches will always 147 // fail and can be ignored when determining how many characters are consumed 148 // on success. If this node has not been analyzed yet, EatsAtLeast returns 0. 149 uint32_t EatsAtLeast(bool not_at_start); 150 // Returns how many characters this node must consume in order to succeed, 151 // given that this is a LoopChoiceNode whose counter register is in a 152 // newly-initialized state at the current position in the generated code. For 153 // example, consider /a{6,8}/. Absent any extra information, the 154 // LoopChoiceNode for the repetition must report that it consumes at least 155 // zero characters, because it may have already looped several times. However, 156 // with a newly-initialized counter, it can report that it consumes at least 157 // six characters. 158 virtual EatsAtLeastInfo EatsAtLeastFromLoopEntry(); 159 // Emits some quick code that checks whether the preloaded characters match. 160 // Falls through on certain failure, jumps to the label on possible success. 161 // If the node cannot make a quick check it does nothing and returns false. 162 bool EmitQuickCheck(RegExpCompiler* compiler, Trace* bounds_check_trace, 163 Trace* trace, bool preload_has_checked_bounds, 164 Label* on_possible_success, 165 QuickCheckDetails* details_return, 166 bool fall_through_on_failure, ChoiceNode* predecessor); 167 // For a given number of characters this returns a mask and a value. The 168 // next n characters are anded with the mask and compared with the value. 169 // A comparison failure indicates the node cannot match the next n characters. 170 // A comparison success indicates the node may match. 171 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 172 RegExpCompiler* compiler, 173 int characters_filled_in, 174 bool not_at_start) = 0; 175 // Fills in quick check details for this node, given that this is a 176 // LoopChoiceNode whose counter register is in a newly-initialized state at 177 // the current position in the generated code. For example, consider /a{6,8}/. 178 // Absent any extra information, the LoopChoiceNode for the repetition cannot 179 // generate any useful quick check because a match might be the (empty) 180 // continuation node. However, with a newly-initialized counter, it can 181 // generate a quick check for several 'a' characters at once. 182 virtual void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details, 183 RegExpCompiler* compiler, 184 int characters_filled_in, 185 bool not_at_start); 186 static const int kNodeIsTooComplexForFixedLengthLoops = kMinInt; 187 virtual int FixedLengthLoopLength() { 188 return kNodeIsTooComplexForFixedLengthLoops; 189 } 190 // Only returns the successor for a text node of length 1 that matches any 191 // character and that has no guards on it. 192 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( 193 RegExpCompiler* compiler) { 194 return nullptr; 195 } 196 197 // Collects information on the possible code units (mod 128) that can match if 198 // we look forward. This is used for a Boyer-Moore-like string searching 199 // implementation. TODO(erikcorry): This should share more code with 200 // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit 201 // the number of nodes we are willing to look at in order to create this data. 202 static const int kRecursionBudget = 200; 203 bool KeepRecursing(RegExpCompiler* compiler); 204 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, 205 BoyerMooreLookahead* bm, bool not_at_start) { 206 UNREACHABLE(); 207 } 208 209 // If we know that the input is one-byte then there are some nodes that can 210 // never match. This method returns a node that can be substituted for 211 // itself, or nullptr if the node can never match. 212 virtual RegExpNode* FilterOneByte(int depth, RegExpCompiler* compiler) { 213 return this; 214 } 215 // Helper for FilterOneByte. 216 RegExpNode* replacement() { 217 DCHECK(info()->replacement_calculated); 218 return replacement_; 219 } 220 RegExpNode* set_replacement(RegExpNode* replacement) { 221 info()->replacement_calculated = true; 222 replacement_ = replacement; 223 return replacement; // For convenience. 224 } 225 226 // We want to avoid recalculating the lookahead info, so we store it on the 227 // node. Only info that is for this node is stored. We can tell that the 228 // info is for this node when offset == 0, so the information is calculated 229 // relative to this node. 230 void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) { 231 if (offset == 0) set_bm_info(not_at_start, bm); 232 } 233 234 Label* label() { return &label_; } 235 // If non-generic code is generated for a node (i.e. the node is not at the 236 // start of the trace) then it cannot be reused. This variable sets a limit 237 // on how often we allow that to happen before we insist on starting a new 238 // trace and generating generic code for a node that can be reused by flushing 239 // the deferred actions in the current trace and generating a goto. 240 static const int kMaxCopiesCodeGenerated = 10; 241 242 bool on_work_list() { return on_work_list_; } 243 void set_on_work_list(bool value) { on_work_list_ = value; } 244 245 NodeInfo* info() { return &info_; } 246 const EatsAtLeastInfo* eats_at_least_info() const { return &eats_at_least_; } 247 void set_eats_at_least_info(const EatsAtLeastInfo& eats_at_least) { 248 eats_at_least_ = eats_at_least; 249 } 250 251 // TODO(v8:10441): This is a hacky way to avoid exponential code size growth 252 // for very large choice nodes that can be generated by unicode property 253 // escapes. In order to avoid inlining (i.e. trace recursion), we pretend to 254 // have generated the maximum count of code copies already. 255 // We should instead fix this properly, e.g. by using the code size budget 256 // (flush_budget) or by generating property escape matches as calls to a C 257 // function. 258 void SetDoNotInline() { trace_count_ = kMaxCopiesCodeGenerated; } 259 260 BoyerMooreLookahead* bm_info(bool not_at_start) { 261 return bm_info_[not_at_start ? 1 : 0]; 262 } 263 264 #define DECLARE_CAST(type) \ 265 virtual type##Node* As##type##Node() { return nullptr; } 266 FOR_EACH_NODE_TYPE(DECLARE_CAST) 267 #undef DECLARE_CAST 268 269 virtual SeqRegExpNode* AsSeqRegExpNode() { return nullptr; } 270 271 Zone* zone() const { return zone_; } 272 273 protected: 274 enum LimitResult { DONE, CONTINUE }; 275 RegExpNode* replacement_; 276 277 LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace); 278 279 void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) { 280 bm_info_[not_at_start ? 1 : 0] = bm; 281 } 282 283 private: 284 static const int kFirstCharBudget = 10; 285 Label label_; 286 bool on_work_list_; 287 NodeInfo info_; 288 289 // Saved values for EatsAtLeast results, to avoid recomputation. Filled in 290 // during analysis (valid if info_.been_analyzed is true). 291 EatsAtLeastInfo eats_at_least_; 292 293 // This variable keeps track of how many times code has been generated for 294 // this node (in different traces). We don't keep track of where the 295 // generated code is located unless the code is generated at the start of 296 // a trace, in which case it is generic and can be reused by flushing the 297 // deferred operations in the current trace and generating a goto. 298 int trace_count_; 299 BoyerMooreLookahead* bm_info_[2]; 300 301 Zone* zone_; 302 }; 303 304 class SeqRegExpNode : public RegExpNode { 305 public: 306 explicit SeqRegExpNode(RegExpNode* on_success) 307 : RegExpNode(on_success->zone()), on_success_(on_success) {} 308 RegExpNode* on_success() { return on_success_; } 309 void set_on_success(RegExpNode* node) { on_success_ = node; } 310 RegExpNode* FilterOneByte(int depth, RegExpCompiler* compiler) override; 311 void FillInBMInfo(Isolate* isolate, int offset, int budget, 312 BoyerMooreLookahead* bm, bool not_at_start) override { 313 on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 314 if (offset == 0) set_bm_info(not_at_start, bm); 315 } 316 SeqRegExpNode* AsSeqRegExpNode() override { return this; } 317 318 protected: 319 RegExpNode* FilterSuccessor(int depth, RegExpCompiler* compiler); 320 321 private: 322 RegExpNode* on_success_; 323 }; 324 325 class ActionNode : public SeqRegExpNode { 326 public: 327 enum ActionType { 328 SET_REGISTER_FOR_LOOP, 329 INCREMENT_REGISTER, 330 CLEAR_POSITION, 331 RESTORE_POSITION, 332 BEGIN_POSITIVE_SUBMATCH, 333 BEGIN_NEGATIVE_SUBMATCH, 334 POSITIVE_SUBMATCH_SUCCESS, 335 EMPTY_MATCH_CHECK, 336 CLEAR_CAPTURES, 337 MODIFY_FLAGS 338 }; 339 static ActionNode* SetRegisterForLoop(int reg, int val, 340 RegExpNode* on_success); 341 static ActionNode* IncrementRegister(int reg, RegExpNode* on_success); 342 static ActionNode* ClearPosition(int reg, RegExpNode* on_success); 343 static ActionNode* RestorePosition(int reg, RegExpNode* on_success); 344 static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success); 345 static ActionNode* BeginPositiveSubmatch(int stack_pointer_reg, 346 int position_reg, RegExpNode* body, 347 ActionNode* success_node); 348 static ActionNode* BeginNegativeSubmatch(int stack_pointer_reg, 349 int position_reg, 350 RegExpNode* on_success); 351 static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg, 352 int restore_reg, 353 int clear_capture_count, 354 int clear_capture_from, 355 RegExpNode* on_success); 356 static ActionNode* EmptyMatchCheck(int start_register, 357 int repetition_register, 358 int repetition_limit, 359 RegExpNode* on_success); 360 static ActionNode* ModifyFlags(RegExpFlags flags, RegExpNode* on_success); 361 ActionNode* AsActionNode() override { return this; } 362 void Accept(NodeVisitor* visitor) override; 363 void Emit(RegExpCompiler* compiler, Trace* trace) override; 364 void GetQuickCheckDetails(QuickCheckDetails* details, 365 RegExpCompiler* compiler, int filled_in, 366 bool not_at_start) override; 367 void FillInBMInfo(Isolate* isolate, int offset, int budget, 368 BoyerMooreLookahead* bm, bool not_at_start) override; 369 ActionType action_type() const { return action_type_; } 370 // TODO(erikcorry): We should allow some action nodes in fixed length loops. 371 int FixedLengthLoopLength() override { 372 return kNodeIsTooComplexForFixedLengthLoops; 373 } 374 RegExpFlags flags() const { 375 DCHECK_EQ(action_type(), MODIFY_FLAGS); 376 return RegExpFlags{data_.u_modify_flags.flags}; 377 } 378 ActionNode* success_node() const { 379 DCHECK_EQ(action_type(), BEGIN_POSITIVE_SUBMATCH); 380 return data_.u_submatch.success_node; 381 } 382 383 bool Mentions(int reg) const { 384 return base::IsInRange(reg, register_from(), register_to()); 385 } 386 387 int value() const { 388 DCHECK(action_type() == SET_REGISTER_FOR_LOOP); 389 return data_.u_simple.value; 390 } 391 392 bool IsSimpleAction() const { 393 return action_type() == CLEAR_POSITION || 394 action_type() == RESTORE_POSITION || 395 action_type() == INCREMENT_REGISTER || 396 action_type() == SET_REGISTER_FOR_LOOP || 397 action_type() == CLEAR_CAPTURES; 398 } 399 400 int register_from() const { 401 DCHECK(IsSimpleAction()); 402 return data_.u_simple.register_from; 403 } 404 405 int register_to() const { return data_.u_simple.register_to; } 406 407 protected: 408 ActionNode(ActionType action_type, RegExpNode* on_success) 409 : SeqRegExpNode(on_success), action_type_(action_type) {} 410 411 ActionNode(ActionType action_type, RegExpNode* on_success, int from, 412 int to = -1, int value = 0) 413 : SeqRegExpNode(on_success), action_type_(action_type) { 414 data_.u_simple.register_from = from; 415 data_.u_simple.register_to = to == -1 ? from : to; 416 data_.u_simple.value = value; 417 DCHECK(IsSimpleAction()); 418 } 419 420 private: 421 union { 422 struct { 423 int register_from; 424 int register_to; 425 int value; 426 } u_simple; 427 struct { 428 int stack_pointer_register; 429 int current_position_register; 430 int clear_register_count; 431 int clear_register_from; 432 ActionNode* success_node; // Only used for positive submatch. 433 } u_submatch; 434 struct { 435 int start_register; 436 int repetition_register; 437 int repetition_limit; 438 } u_empty_match_check; 439 struct { 440 int flags; 441 } u_modify_flags; 442 } data_; 443 444 ActionType action_type_; 445 friend class DotPrinterImpl; 446 friend Zone; 447 }; 448 449 class TextNode : public SeqRegExpNode { 450 public: 451 TextNode(ZoneList<TextElement>* elms, bool read_backward, 452 RegExpNode* on_success) 453 : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {} 454 TextNode(RegExpClassRanges* that, bool read_backward, RegExpNode* on_success) 455 : SeqRegExpNode(on_success), 456 elms_(zone()->New<ZoneList<TextElement>>(1, zone())), 457 read_backward_(read_backward) { 458 elms_->Add(TextElement::ClassRanges(that), zone()); 459 } 460 // Create TextNode for a single character class for the given ranges. 461 static TextNode* CreateForCharacterRanges(Zone* zone, 462 ZoneList<CharacterRange>* ranges, 463 bool read_backward, 464 RegExpNode* on_success); 465 // Create TextNode for a surrogate pair (i.e. match a sequence of two uc16 466 // code unit ranges). 467 static TextNode* CreateForSurrogatePair( 468 Zone* zone, CharacterRange lead, ZoneList<CharacterRange>* trail_ranges, 469 bool read_backward, RegExpNode* on_success); 470 static TextNode* CreateForSurrogatePair(Zone* zone, 471 ZoneList<CharacterRange>* lead_ranges, 472 CharacterRange trail, 473 bool read_backward, 474 RegExpNode* on_success); 475 TextNode* AsTextNode() override { return this; } 476 void Accept(NodeVisitor* visitor) override; 477 void Emit(RegExpCompiler* compiler, Trace* trace) override; 478 void GetQuickCheckDetails(QuickCheckDetails* details, 479 RegExpCompiler* compiler, int characters_filled_in, 480 bool not_at_start) override; 481 ZoneList<TextElement>* elements() { return elms_; } 482 bool read_backward() { return read_backward_; } 483 void MakeCaseIndependent(Isolate* isolate, bool is_one_byte, 484 RegExpFlags flags); 485 int FixedLengthLoopLength() override; 486 RegExpNode* GetSuccessorOfOmnivorousTextNode( 487 RegExpCompiler* compiler) override; 488 void FillInBMInfo(Isolate* isolate, int offset, int budget, 489 BoyerMooreLookahead* bm, bool not_at_start) override; 490 void CalculateOffsets(); 491 RegExpNode* FilterOneByte(int depth, RegExpCompiler* compiler) override; 492 int Length(); 493 494 private: 495 enum TextEmitPassType { 496 NON_LATIN1_MATCH, // Check for characters that can never match. 497 SIMPLE_CHARACTER_MATCH, // Case-dependent single character check. 498 NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs. 499 CASE_CHARACTER_MATCH, // Case-independent single character check. 500 CHARACTER_CLASS_MATCH // Character class. 501 }; 502 void TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, 503 bool preloaded, Trace* trace, bool first_element_checked, 504 int* checked_up_to); 505 ZoneList<TextElement>* elms_; 506 bool read_backward_; 507 }; 508 509 class AssertionNode : public SeqRegExpNode { 510 public: 511 enum AssertionType { 512 AT_END, 513 AT_START, 514 AT_BOUNDARY, 515 AT_NON_BOUNDARY, 516 AFTER_NEWLINE 517 }; 518 static AssertionNode* AtEnd(RegExpNode* on_success) { 519 return on_success->zone()->New<AssertionNode>(AT_END, on_success); 520 } 521 static AssertionNode* AtStart(RegExpNode* on_success) { 522 return on_success->zone()->New<AssertionNode>(AT_START, on_success); 523 } 524 static AssertionNode* AtBoundary(RegExpNode* on_success) { 525 return on_success->zone()->New<AssertionNode>(AT_BOUNDARY, on_success); 526 } 527 static AssertionNode* AtNonBoundary(RegExpNode* on_success) { 528 return on_success->zone()->New<AssertionNode>(AT_NON_BOUNDARY, on_success); 529 } 530 static AssertionNode* AfterNewline(RegExpNode* on_success) { 531 return on_success->zone()->New<AssertionNode>(AFTER_NEWLINE, on_success); 532 } 533 AssertionNode* AsAssertionNode() override { return this; } 534 void Accept(NodeVisitor* visitor) override; 535 void Emit(RegExpCompiler* compiler, Trace* trace) override; 536 void GetQuickCheckDetails(QuickCheckDetails* details, 537 RegExpCompiler* compiler, int filled_in, 538 bool not_at_start) override; 539 void FillInBMInfo(Isolate* isolate, int offset, int budget, 540 BoyerMooreLookahead* bm, bool not_at_start) override; 541 AssertionType assertion_type() { return assertion_type_; } 542 543 private: 544 friend Zone; 545 546 void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace); 547 enum IfPrevious { kIsNonWord, kIsWord }; 548 void BacktrackIfPrevious(RegExpCompiler* compiler, Trace* trace, 549 IfPrevious backtrack_if_previous); 550 AssertionNode(AssertionType t, RegExpNode* on_success) 551 : SeqRegExpNode(on_success), assertion_type_(t) {} 552 AssertionType assertion_type_; 553 }; 554 555 class BackReferenceNode : public SeqRegExpNode { 556 public: 557 BackReferenceNode(int start_reg, int end_reg, bool read_backward, 558 RegExpNode* on_success) 559 : SeqRegExpNode(on_success), 560 start_reg_(start_reg), 561 end_reg_(end_reg), 562 read_backward_(read_backward) {} 563 BackReferenceNode* AsBackReferenceNode() override { return this; } 564 void Accept(NodeVisitor* visitor) override; 565 int start_register() { return start_reg_; } 566 int end_register() { return end_reg_; } 567 bool read_backward() { return read_backward_; } 568 void Emit(RegExpCompiler* compiler, Trace* trace) override; 569 void GetQuickCheckDetails(QuickCheckDetails* details, 570 RegExpCompiler* compiler, int characters_filled_in, 571 bool not_at_start) override { 572 return; 573 } 574 void FillInBMInfo(Isolate* isolate, int offset, int budget, 575 BoyerMooreLookahead* bm, bool not_at_start) override; 576 577 private: 578 int start_reg_; 579 int end_reg_; 580 bool read_backward_; 581 }; 582 583 class EndNode : public RegExpNode { 584 public: 585 enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS }; 586 EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {} 587 EndNode* AsEndNode() override { return this; } 588 void Accept(NodeVisitor* visitor) override; 589 void Emit(RegExpCompiler* compiler, Trace* trace) override; 590 void GetQuickCheckDetails(QuickCheckDetails* details, 591 RegExpCompiler* compiler, int characters_filled_in, 592 bool not_at_start) override { 593 // Returning 0 from EatsAtLeast should ensure we never get here. 594 UNREACHABLE(); 595 } 596 void FillInBMInfo(Isolate* isolate, int offset, int budget, 597 BoyerMooreLookahead* bm, bool not_at_start) override { 598 // Returning 0 from EatsAtLeast should ensure we never get here. 599 UNREACHABLE(); 600 } 601 602 private: 603 Action action_; 604 }; 605 606 class NegativeSubmatchSuccess : public EndNode { 607 public: 608 NegativeSubmatchSuccess(int stack_pointer_reg, int position_reg, 609 int clear_capture_count, int clear_capture_start, 610 Zone* zone) 611 : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone), 612 stack_pointer_register_(stack_pointer_reg), 613 current_position_register_(position_reg), 614 clear_capture_count_(clear_capture_count), 615 clear_capture_start_(clear_capture_start) {} 616 void Emit(RegExpCompiler* compiler, Trace* trace) override; 617 618 private: 619 int stack_pointer_register_; 620 int current_position_register_; 621 int clear_capture_count_; 622 int clear_capture_start_; 623 }; 624 625 class Guard : public ZoneObject { 626 public: 627 enum Relation { LT, GEQ }; 628 Guard(int reg, Relation op, int value) : reg_(reg), op_(op), value_(value) {} 629 int reg() { return reg_; } 630 Relation op() { return op_; } 631 int value() { return value_; } 632 633 private: 634 int reg_; 635 Relation op_; 636 int value_; 637 }; 638 639 class GuardedAlternative { 640 public: 641 explicit GuardedAlternative(RegExpNode* node) 642 : node_(node), guards_(nullptr) {} 643 void AddGuard(Guard* guard, Zone* zone); 644 RegExpNode* node() { return node_; } 645 void set_node(RegExpNode* node) { node_ = node; } 646 ZoneList<Guard*>* guards() { return guards_; } 647 648 private: 649 RegExpNode* node_; 650 ZoneList<Guard*>* guards_; 651 }; 652 653 class AlternativeGeneration; 654 655 class ChoiceNode : public RegExpNode { 656 public: 657 explicit ChoiceNode(int expected_size, Zone* zone) 658 : RegExpNode(zone), 659 alternatives_( 660 zone->New<ZoneList<GuardedAlternative>>(expected_size, zone)), 661 not_at_start_(false), 662 being_calculated_(false) {} 663 ChoiceNode* AsChoiceNode() override { return this; } 664 void Accept(NodeVisitor* visitor) override; 665 void AddAlternative(GuardedAlternative node) { 666 alternatives()->Add(node, zone()); 667 } 668 ZoneList<GuardedAlternative>* alternatives() { return alternatives_; } 669 void Emit(RegExpCompiler* compiler, Trace* trace) override; 670 void GetQuickCheckDetails(QuickCheckDetails* details, 671 RegExpCompiler* compiler, int characters_filled_in, 672 bool not_at_start) override; 673 void FillInBMInfo(Isolate* isolate, int offset, int budget, 674 BoyerMooreLookahead* bm, bool not_at_start) override; 675 676 bool being_calculated() { return being_calculated_; } 677 bool not_at_start() { return not_at_start_; } 678 void set_not_at_start() { not_at_start_ = true; } 679 void set_being_calculated(bool b) { being_calculated_ = b; } 680 virtual bool try_to_emit_quick_check_for_alternative(bool is_first) { 681 return true; 682 } 683 RegExpNode* FilterOneByte(int depth, RegExpCompiler* compiler) override; 684 virtual bool read_backward() { return false; } 685 686 protected: 687 int FixedLengthLoopLengthForAlternative(GuardedAlternative* alternative); 688 ZoneList<GuardedAlternative>* alternatives_; 689 690 private: 691 template <typename...> 692 friend class Analysis; 693 694 void GenerateGuard(RegExpMacroAssembler* macro_assembler, Guard* guard, 695 Trace* trace); 696 int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least); 697 void EmitOutOfLineContinuation(RegExpCompiler* compiler, Trace* trace, 698 GuardedAlternative alternative, 699 AlternativeGeneration* alt_gen, 700 int preload_characters, 701 bool next_expects_preload); 702 void SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace, 703 PreloadState* preloads); 704 void AssertGuardsMentionRegisters(Trace* trace); 705 int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace); 706 Trace* EmitFixedLengthLoop(RegExpCompiler* compiler, Trace* trace, 707 AlternativeGenerationList* alt_gens, 708 PreloadState* preloads, 709 FixedLengthLoopState* fixed_length_loop_state, 710 int text_length); 711 void EmitChoices(RegExpCompiler* compiler, 712 AlternativeGenerationList* alt_gens, int first_choice, 713 Trace* trace, PreloadState* preloads); 714 715 // If true, this node is never checked at the start of the input. 716 // Allows a new trace to start with at_start() set to false. 717 bool not_at_start_; 718 bool being_calculated_; 719 }; 720 721 class NegativeLookaroundChoiceNode : public ChoiceNode { 722 public: 723 explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail, 724 GuardedAlternative then_do_this, 725 Zone* zone) 726 : ChoiceNode(2, zone) { 727 AddAlternative(this_must_fail); 728 AddAlternative(then_do_this); 729 } 730 void GetQuickCheckDetails(QuickCheckDetails* details, 731 RegExpCompiler* compiler, int characters_filled_in, 732 bool not_at_start) override; 733 void FillInBMInfo(Isolate* isolate, int offset, int budget, 734 BoyerMooreLookahead* bm, bool not_at_start) override { 735 continue_node()->FillInBMInfo(isolate, offset, budget - 1, bm, 736 not_at_start); 737 if (offset == 0) set_bm_info(not_at_start, bm); 738 } 739 static constexpr int kLookaroundIndex = 0; 740 static constexpr int kContinueIndex = 1; 741 RegExpNode* lookaround_node() { 742 return alternatives()->at(kLookaroundIndex).node(); 743 } 744 RegExpNode* continue_node() { 745 return alternatives()->at(kContinueIndex).node(); 746 } 747 // For a negative lookahead we don't emit the quick check for the 748 // alternative that is expected to fail. This is because quick check code 749 // starts by loading enough characters for the alternative that takes fewest 750 // characters, but on a negative lookahead the negative branch did not take 751 // part in that calculation (EatsAtLeast) so the assumptions don't hold. 752 bool try_to_emit_quick_check_for_alternative(bool is_first) override { 753 return !is_first; 754 } 755 NegativeLookaroundChoiceNode* AsNegativeLookaroundChoiceNode() override { 756 return this; 757 } 758 void Accept(NodeVisitor* visitor) override; 759 RegExpNode* FilterOneByte(int depth, RegExpCompiler* compiler) override; 760 }; 761 762 class LoopChoiceNode : public ChoiceNode { 763 public: 764 LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, 765 int min_loop_iterations, Zone* zone) 766 : ChoiceNode(2, zone), 767 loop_node_(nullptr), 768 continue_node_(nullptr), 769 body_can_be_zero_length_(body_can_be_zero_length), 770 read_backward_(read_backward), 771 traversed_loop_initialization_node_(false), 772 min_loop_iterations_(min_loop_iterations) {} 773 void AddLoopAlternative(GuardedAlternative alt); 774 void AddContinueAlternative(GuardedAlternative alt); 775 void Emit(RegExpCompiler* compiler, Trace* trace) override; 776 void GetQuickCheckDetails(QuickCheckDetails* details, 777 RegExpCompiler* compiler, int characters_filled_in, 778 bool not_at_start) override; 779 void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details, 780 RegExpCompiler* compiler, 781 int characters_filled_in, 782 bool not_at_start) override; 783 void FillInBMInfo(Isolate* isolate, int offset, int budget, 784 BoyerMooreLookahead* bm, bool not_at_start) override; 785 EatsAtLeastInfo EatsAtLeastFromLoopEntry() override; 786 RegExpNode* loop_node() { return loop_node_; } 787 RegExpNode* continue_node() { return continue_node_; } 788 bool body_can_be_zero_length() { return body_can_be_zero_length_; } 789 int min_loop_iterations() const { return min_loop_iterations_; } 790 bool read_backward() override { return read_backward_; } 791 LoopChoiceNode* AsLoopChoiceNode() override { return this; } 792 void Accept(NodeVisitor* visitor) override; 793 RegExpNode* FilterOneByte(int depth, RegExpCompiler* compiler) override; 794 795 private: 796 // AddAlternative is made private for loop nodes because alternatives 797 // should not be added freely, we need to keep track of which node 798 // goes back to the node itself. 799 void AddAlternative(GuardedAlternative node) { 800 ChoiceNode::AddAlternative(node); 801 } 802 803 RegExpNode* loop_node_; 804 RegExpNode* continue_node_; 805 bool body_can_be_zero_length_; 806 bool read_backward_; 807 808 // Temporary marker set only while generating quick check details. Represents 809 // whether GetQuickCheckDetails traversed the initialization node for this 810 // loop's counter. If so, we may be able to generate stricter quick checks 811 // because we know the loop node must match at least min_loop_iterations_ 812 // times before the continuation node can match. 813 bool traversed_loop_initialization_node_; 814 815 // The minimum number of times the loop_node_ must match before the 816 // continue_node_ might be considered. This value can be temporarily decreased 817 // while generating quick check details, to represent the remaining iterations 818 // after the completed portion of the quick check details. 819 int min_loop_iterations_; 820 821 friend class IterationDecrementer; 822 friend class LoopInitializationMarker; 823 }; 824 825 class NodeVisitor { 826 public: 827 virtual ~NodeVisitor() = default; 828 #define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that) = 0; 829 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 830 #undef DECLARE_VISIT 831 }; 832 833 } // namespace internal 834 } // namespace v8 835 836 #endif // V8_REGEXP_REGEXP_NODES_H_