regexp-ast.h (25810B)
1 // Copyright 2016 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_REGEXP_AST_H_ 6 #define V8_REGEXP_REGEXP_AST_H_ 7 8 #include <optional> 9 10 #include "irregexp/RegExpShim.h" 11 12 #ifdef V8_INTL_SUPPORT 13 #include "unicode/uniset.h" 14 #endif // V8_INTL_SUPPORT 15 16 namespace v8::internal { 17 18 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ 19 VISIT(Disjunction) \ 20 VISIT(Alternative) \ 21 VISIT(Assertion) \ 22 VISIT(ClassRanges) \ 23 VISIT(ClassSetOperand) \ 24 VISIT(ClassSetExpression) \ 25 VISIT(Atom) \ 26 VISIT(Quantifier) \ 27 VISIT(Capture) \ 28 VISIT(Group) \ 29 VISIT(Lookaround) \ 30 VISIT(BackReference) \ 31 VISIT(Empty) \ 32 VISIT(Text) 33 34 #define FORWARD_DECLARE(Name) class RegExp##Name; 35 FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE) 36 #undef FORWARD_DECLARE 37 38 class RegExpCompiler; 39 class RegExpNode; 40 class RegExpTree; 41 42 class RegExpVisitor { 43 public: 44 virtual ~RegExpVisitor() = default; 45 #define MAKE_CASE(Name) \ 46 virtual void* Visit##Name(RegExp##Name*, void* data) = 0; 47 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) 48 #undef MAKE_CASE 49 }; 50 51 // A simple closed interval. 52 class Interval { 53 public: 54 Interval() : from_(kNone), to_(kNone - 1) {} // '- 1' for branchless size(). 55 Interval(int from, int to) : from_(from), to_(to) {} 56 Interval Union(Interval that) { 57 if (that.from_ == kNone) return *this; 58 if (from_ == kNone) return that; 59 return Interval(std::min(from_, that.from_), std::max(to_, that.to_)); 60 } 61 62 static Interval Empty() { return Interval(); } 63 64 bool Contains(int value) const { return (from_ <= value) && (value <= to_); } 65 bool is_empty() const { return from_ == kNone; } 66 int from() const { return from_; } 67 int to() const { return to_; } 68 int size() const { return to_ - from_ + 1; } 69 70 static constexpr int kNone = -1; 71 72 private: 73 int from_; 74 int to_; 75 }; 76 77 // Named standard character sets. 78 enum class StandardCharacterSet : char { 79 kWhitespace = 's', // Like /\s/. 80 kNotWhitespace = 'S', // Like /\S/. 81 kWord = 'w', // Like /\w/. 82 kNotWord = 'W', // Like /\W/. 83 kDigit = 'd', // Like /\d/. 84 kNotDigit = 'D', // Like /\D/. 85 kLineTerminator = 'n', // The inverse of /./. 86 kNotLineTerminator = '.', // Like /./. 87 kEverything = '*', // Matches every character, like /./s. 88 }; 89 90 // Represents code points (with values up to 0x10FFFF) in the range from from_ 91 // to to_, both ends are inclusive. 92 class CharacterRange { 93 public: 94 CharacterRange() = default; 95 // For compatibility with the CHECK_OK macro. 96 CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT 97 98 static inline CharacterRange Singleton(base::uc32 value) { 99 return CharacterRange(value, value); 100 } 101 static inline CharacterRange Range(base::uc32 from, base::uc32 to) { 102 DCHECK(0 <= from && to <= kMaxCodePoint); 103 DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to)); 104 return CharacterRange(from, to); 105 } 106 static inline CharacterRange Everything() { 107 return CharacterRange(0, kMaxCodePoint); 108 } 109 110 static inline ZoneList<CharacterRange>* List(Zone* zone, 111 CharacterRange range) { 112 ZoneList<CharacterRange>* list = 113 zone->New<ZoneList<CharacterRange>>(1, zone); 114 list->Add(range, zone); 115 return list; 116 } 117 118 // Add class escapes. Add case equivalent closure for \w and \W if necessary. 119 V8_EXPORT_PRIVATE static void AddClassEscape( 120 StandardCharacterSet standard_character_set, 121 ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents, 122 Zone* zone); 123 // Add case equivalents to ranges. Only used for /i, not for /ui or /vi, as 124 // the semantics for unicode mode are slightly different. 125 // See https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch Note 4. 126 V8_EXPORT_PRIVATE static void AddCaseEquivalents( 127 Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges, 128 bool is_one_byte); 129 // Add case equivalent code points to ranges. Only used for /ui and /vi, not 130 // for /i, as the semantics for non-unicode mode are slightly different. 131 // See https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch Note 4. 132 static void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, 133 Zone* zone); 134 135 bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; } 136 base::uc32 from() const { return from_; } 137 base::uc32 to() const { return to_; } 138 bool IsEverything(base::uc32 max) const { return from_ == 0 && to_ >= max; } 139 bool IsSingleton() const { return from_ == to_; } 140 141 // Whether a range list is in canonical form: Ranges ordered by from value, 142 // and ranges non-overlapping and non-adjacent. 143 V8_EXPORT_PRIVATE static bool IsCanonical( 144 const ZoneList<CharacterRange>* ranges); 145 // Convert range list to canonical form. The characters covered by the ranges 146 // will still be the same, but no character is in more than one range, and 147 // adjacent ranges are merged. The resulting list may be shorter than the 148 // original, but cannot be longer. 149 static void Canonicalize(ZoneList<CharacterRange>* ranges); 150 // Negate the contents of a character range in canonical form. 151 static void Negate(const ZoneList<CharacterRange>* src, 152 ZoneList<CharacterRange>* dst, Zone* zone); 153 // Intersect the contents of two character ranges in canonical form. 154 static void Intersect(const ZoneList<CharacterRange>* lhs, 155 const ZoneList<CharacterRange>* rhs, 156 ZoneList<CharacterRange>* dst, Zone* zone); 157 // Subtract the contents of |to_remove| from the contents of |src|. 158 static void Subtract(const ZoneList<CharacterRange>* src, 159 const ZoneList<CharacterRange>* to_remove, 160 ZoneList<CharacterRange>* dst, Zone* zone); 161 // Remove all ranges outside the one-byte range. 162 static void ClampToOneByte(ZoneList<CharacterRange>* ranges); 163 // Checks if two ranges (both need to be canonical) are equal. 164 static bool Equals(const ZoneList<CharacterRange>* lhs, 165 const ZoneList<CharacterRange>* rhs); 166 167 private: 168 CharacterRange(base::uc32 from, base::uc32 to) : from_(from), to_(to) {} 169 170 static constexpr int kMaxCodePoint = 0x10ffff; 171 172 base::uc32 from_ = 0; 173 base::uc32 to_ = 0; 174 }; 175 176 inline bool operator==(const CharacterRange& lhs, const CharacterRange& rhs) { 177 return lhs.from() == rhs.from() && lhs.to() == rhs.to(); 178 } 179 inline bool operator!=(const CharacterRange& lhs, const CharacterRange& rhs) { 180 return !operator==(lhs, rhs); 181 } 182 183 #define DECL_BOILERPLATE(Name) \ 184 void* Accept(RegExpVisitor* visitor, void* data) override; \ 185 RegExpNode* ToNodeImpl(RegExpCompiler* compiler, RegExpNode* on_success) \ 186 override; \ 187 RegExp##Name* As##Name() override; \ 188 bool Is##Name() override 189 190 class RegExpTree : public ZoneObject { 191 public: 192 static const int kInfinity = kMaxInt; 193 virtual ~RegExpTree() = default; 194 virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; 195 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); 196 virtual RegExpNode* ToNodeImpl(RegExpCompiler* compiler, 197 RegExpNode* on_success) = 0; 198 virtual bool IsTextElement() { return false; } 199 virtual bool IsAnchoredAtStart() { return false; } 200 virtual bool IsAnchoredAtEnd() { return false; } 201 virtual int min_match() = 0; 202 virtual int max_match() = 0; 203 // Returns the interval of registers used for captures within this 204 // expression. 205 virtual Interval CaptureRegisters() { return Interval::Empty(); } 206 virtual void AppendToText(RegExpText* text, Zone* zone); 207 V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os, Zone* zone); 208 #define MAKE_ASTYPE(Name) \ 209 virtual RegExp##Name* As##Name(); \ 210 virtual bool Is##Name(); 211 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) 212 #undef MAKE_ASTYPE 213 }; 214 215 class RegExpDisjunction final : public RegExpTree { 216 public: 217 explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives); 218 219 DECL_BOILERPLATE(Disjunction); 220 221 Interval CaptureRegisters() override; 222 bool IsAnchoredAtStart() override; 223 bool IsAnchoredAtEnd() override; 224 int min_match() override { return min_match_; } 225 int max_match() override { return max_match_; } 226 ZoneList<RegExpTree*>* alternatives() const { return alternatives_; } 227 228 private: 229 bool SortConsecutiveAtoms(RegExpCompiler* compiler); 230 void RationalizeConsecutiveAtoms(RegExpCompiler* compiler); 231 void FixSingleCharacterDisjunctions(RegExpCompiler* compiler); 232 ZoneList<RegExpTree*>* alternatives_; 233 int min_match_; 234 int max_match_; 235 }; 236 237 class RegExpAlternative final : public RegExpTree { 238 public: 239 explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes); 240 241 DECL_BOILERPLATE(Alternative); 242 243 Interval CaptureRegisters() override; 244 bool IsAnchoredAtStart() override; 245 bool IsAnchoredAtEnd() override; 246 int min_match() override { return min_match_; } 247 int max_match() override { return max_match_; } 248 ZoneList<RegExpTree*>* nodes() const { return nodes_; } 249 250 private: 251 ZoneList<RegExpTree*>* nodes_; 252 int min_match_; 253 int max_match_; 254 }; 255 256 class RegExpAssertion final : public RegExpTree { 257 public: 258 enum class Type { 259 START_OF_LINE = 0, 260 START_OF_INPUT = 1, 261 END_OF_LINE = 2, 262 END_OF_INPUT = 3, 263 BOUNDARY = 4, 264 NON_BOUNDARY = 5, 265 LAST_ASSERTION_TYPE = NON_BOUNDARY, 266 }; 267 explicit RegExpAssertion(Type type) : assertion_type_(type) {} 268 269 DECL_BOILERPLATE(Assertion); 270 271 bool IsAnchoredAtStart() override; 272 bool IsAnchoredAtEnd() override; 273 int min_match() override { return 0; } 274 int max_match() override { return 0; } 275 Type assertion_type() const { return assertion_type_; } 276 277 private: 278 const Type assertion_type_; 279 }; 280 281 class CharacterSet final { 282 public: 283 explicit CharacterSet(StandardCharacterSet standard_set_type) 284 : standard_set_type_(standard_set_type) {} 285 explicit CharacterSet(ZoneList<CharacterRange>* ranges) : ranges_(ranges) {} 286 287 ZoneList<CharacterRange>* ranges(Zone* zone); 288 StandardCharacterSet standard_set_type() const { 289 return standard_set_type_.value(); 290 } 291 void set_standard_set_type(StandardCharacterSet standard_set_type) { 292 standard_set_type_ = standard_set_type; 293 } 294 bool is_standard() const { return standard_set_type_.has_value(); } 295 V8_EXPORT_PRIVATE void Canonicalize(); 296 297 private: 298 ZoneList<CharacterRange>* ranges_ = nullptr; 299 std::optional<StandardCharacterSet> standard_set_type_; 300 }; 301 302 class RegExpClassRanges final : public RegExpTree { 303 public: 304 // NEGATED: The character class is negated and should match everything but 305 // the specified ranges. 306 // CONTAINS_SPLIT_SURROGATE: The character class contains part of a split 307 // surrogate and should not be unicode-desugared (crbug.com/641091). 308 // IS_CASE_FOLDED: If case folding is required (/i), it was already 309 // performed on individual ranges and should not be applied again. 310 enum Flag { 311 NEGATED = 1 << 0, 312 CONTAINS_SPLIT_SURROGATE = 1 << 1, 313 IS_CASE_FOLDED = 1 << 2, 314 }; 315 using ClassRangesFlags = base::Flags<Flag>; 316 317 RegExpClassRanges(Zone* zone, ZoneList<CharacterRange>* ranges, 318 ClassRangesFlags class_ranges_flags = ClassRangesFlags()) 319 : set_(ranges), class_ranges_flags_(class_ranges_flags) { 320 // Convert the empty set of ranges to the negated Everything() range. 321 if (ranges->is_empty()) { 322 ranges->Add(CharacterRange::Everything(), zone); 323 class_ranges_flags_ ^= NEGATED; 324 } 325 } 326 explicit RegExpClassRanges(StandardCharacterSet standard_set_type) 327 : set_(standard_set_type), class_ranges_flags_() {} 328 329 DECL_BOILERPLATE(ClassRanges); 330 331 bool IsTextElement() override { return true; } 332 int min_match() override { return 1; } 333 // The character class may match two code units for unicode regexps. 334 // TODO(yangguo): we should split this class for usage in TextElement, and 335 // make max_match() dependent on the character class content. 336 int max_match() override { return 2; } 337 338 void AppendToText(RegExpText* text, Zone* zone) override; 339 340 // TODO(lrn): Remove need for complex version if is_standard that 341 // recognizes a mangled standard set and just do { return set_.is_special(); } 342 bool is_standard(Zone* zone); 343 // Returns a value representing the standard character set if is_standard() 344 // returns true. 345 StandardCharacterSet standard_type() const { 346 return set_.standard_set_type(); 347 } 348 349 CharacterSet character_set() const { return set_; } 350 ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); } 351 352 bool is_negated() const { return (class_ranges_flags_ & NEGATED) != 0; } 353 bool contains_split_surrogate() const { 354 return (class_ranges_flags_ & CONTAINS_SPLIT_SURROGATE) != 0; 355 } 356 bool is_case_folded() const { 357 return (class_ranges_flags_ & IS_CASE_FOLDED) != 0; 358 } 359 360 private: 361 CharacterSet set_; 362 ClassRangesFlags class_ranges_flags_; 363 }; 364 365 struct CharacterClassStringLess { 366 bool operator()(base::Vector<const base::uc32> lhs, 367 base::Vector<const base::uc32> rhs) const { 368 // Longer strings first so we generate matches for the largest string 369 // possible. 370 if (lhs.length() != rhs.length()) { 371 return lhs.length() > rhs.length(); 372 } 373 for (int i = 0; i < lhs.length(); i++) { 374 if (lhs[i] != rhs[i]) { 375 return lhs[i] < rhs[i]; 376 } 377 } 378 return false; 379 } 380 }; 381 382 // A type used for strings as part of character classes (only possible in 383 // unicode sets mode). 384 // We use a ZoneMap instead of an UnorderedZoneMap because we need to match 385 // the longest alternatives first. By using a ZoneMap with the custom comparator 386 // we can avoid sorting before assembling the code. 387 // Strings are likely short (the largest string in current unicode properties 388 // consists of 10 code points). 389 using CharacterClassStrings = ZoneMap<base::Vector<const base::uc32>, 390 RegExpTree*, CharacterClassStringLess>; 391 392 // TODO(pthier): If we are sure we don't want to use icu::UnicodeSets 393 // (performance evaluation pending), this class can be merged with 394 // RegExpClassRanges. 395 class RegExpClassSetOperand final : public RegExpTree { 396 public: 397 RegExpClassSetOperand(ZoneList<CharacterRange>* ranges, 398 CharacterClassStrings* strings); 399 400 DECL_BOILERPLATE(ClassSetOperand); 401 402 bool IsTextElement() override { return true; } 403 int min_match() override { return min_match_; } 404 int max_match() override { return max_match_; } 405 406 void Union(RegExpClassSetOperand* other, Zone* zone); 407 void Intersect(RegExpClassSetOperand* other, 408 ZoneList<CharacterRange>* temp_ranges, Zone* zone); 409 void Subtract(RegExpClassSetOperand* other, 410 ZoneList<CharacterRange>* temp_ranges, Zone* zone); 411 412 bool has_strings() const { return strings_ != nullptr && !strings_->empty(); } 413 ZoneList<CharacterRange>* ranges() { return ranges_; } 414 CharacterClassStrings* strings() { 415 DCHECK_NOT_NULL(strings_); 416 return strings_; 417 } 418 419 private: 420 ZoneList<CharacterRange>* ranges_; 421 CharacterClassStrings* strings_; 422 int min_match_; 423 int max_match_; 424 }; 425 426 class RegExpClassSetExpression final : public RegExpTree { 427 public: 428 enum class OperationType { kUnion, kIntersection, kSubtraction }; 429 430 RegExpClassSetExpression(OperationType op, bool is_negated, 431 bool may_contain_strings, 432 ZoneList<RegExpTree*>* operands); 433 434 DECL_BOILERPLATE(ClassSetExpression); 435 436 // Create an empty class set expression (matches everything if |is_negated|, 437 // nothing otherwise). 438 static RegExpClassSetExpression* Empty(Zone* zone, bool is_negated); 439 440 bool IsTextElement() override { return true; } 441 int min_match() override { return 0; } 442 int max_match() override { return max_match_; } 443 444 OperationType operation() const { return operation_; } 445 bool is_negated() const { return is_negated_; } 446 bool may_contain_strings() const { return may_contain_strings_; } 447 const ZoneList<RegExpTree*>* operands() const { return operands_; } 448 ZoneList<RegExpTree*>* operands() { return operands_; } 449 450 private: 451 // Recursively evaluates the tree rooted at |root|, computing the valid 452 // CharacterRanges and strings after applying all set operations. 453 // The original tree will be modified by this method, so don't store pointers 454 // to inner nodes of the tree somewhere else! 455 // Modifying the tree in-place saves memory and speeds up multiple calls of 456 // the method (e.g. when unrolling quantifiers). 457 // |temp_ranges| is used for intermediate results, passed as parameter to 458 // avoid allocating new lists all the time. 459 static RegExpClassSetOperand* ComputeExpression( 460 RegExpTree* root, ZoneList<CharacterRange>* temp_ranges, Zone* zone); 461 462 const OperationType operation_; 463 bool is_negated_; 464 const bool may_contain_strings_; 465 ZoneList<RegExpTree*>* operands_ = nullptr; 466 int max_match_; 467 }; 468 469 class RegExpAtom final : public RegExpTree { 470 public: 471 explicit RegExpAtom(base::Vector<const base::uc16> data) : data_(data) {} 472 473 DECL_BOILERPLATE(Atom); 474 475 bool IsTextElement() override { return true; } 476 int min_match() override { return data_.length(); } 477 int max_match() override { return data_.length(); } 478 void AppendToText(RegExpText* text, Zone* zone) override; 479 480 base::Vector<const base::uc16> data() const { return data_; } 481 int length() const { return data_.length(); } 482 483 private: 484 base::Vector<const base::uc16> data_; 485 }; 486 487 class TextElement final { 488 public: 489 enum TextType { ATOM, CLASS_RANGES }; 490 491 static TextElement Atom(RegExpAtom* atom); 492 static TextElement ClassRanges(RegExpClassRanges* class_ranges); 493 494 int cp_offset() const { return cp_offset_; } 495 void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } 496 int length() const; 497 498 TextType text_type() const { return text_type_; } 499 500 RegExpTree* tree() const { return tree_; } 501 502 RegExpAtom* atom() const { 503 DCHECK(text_type() == ATOM); 504 return reinterpret_cast<RegExpAtom*>(tree()); 505 } 506 507 RegExpClassRanges* class_ranges() const { 508 DCHECK(text_type() == CLASS_RANGES); 509 return reinterpret_cast<RegExpClassRanges*>(tree()); 510 } 511 512 private: 513 TextElement(TextType text_type, RegExpTree* tree) 514 : cp_offset_(-1), text_type_(text_type), tree_(tree) {} 515 516 int cp_offset_; 517 TextType text_type_; 518 RegExpTree* tree_; 519 }; 520 521 class RegExpText final : public RegExpTree { 522 public: 523 explicit RegExpText(Zone* zone) : elements_(2, zone) {} 524 525 DECL_BOILERPLATE(Text); 526 527 bool IsTextElement() override { return true; } 528 int min_match() override { return length_; } 529 int max_match() override { return length_; } 530 void AppendToText(RegExpText* text, Zone* zone) override; 531 void AddElement(TextElement elm, Zone* zone) { 532 elements_.Add(elm, zone); 533 length_ += elm.length(); 534 } 535 ZoneList<TextElement>* elements() { return &elements_; } 536 537 bool StartsWithAtom() const; 538 RegExpAtom* FirstAtom() const; 539 540 private: 541 ZoneList<TextElement> elements_; 542 int length_ = 0; 543 }; 544 545 class RegExpQuantifier final : public RegExpTree { 546 public: 547 enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE }; 548 RegExpQuantifier(int min, int max, QuantifierType type, int index, 549 RegExpTree* body) 550 : body_(body), 551 min_(min), 552 max_(max), 553 quantifier_type_(type), 554 index_(index) { 555 if (min > 0 && body->min_match() > kInfinity / min) { 556 min_match_ = kInfinity; 557 } else { 558 min_match_ = min * body->min_match(); 559 } 560 if (max > 0 && body->max_match() > kInfinity / max) { 561 max_match_ = kInfinity; 562 } else { 563 max_match_ = max * body->max_match(); 564 } 565 } 566 567 DECL_BOILERPLATE(Quantifier); 568 569 static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body, 570 RegExpCompiler* compiler, RegExpNode* on_success, 571 bool not_at_start = false); 572 Interval CaptureRegisters() override; 573 int min_match() override { return min_match_; } 574 int max_match() override { return max_match_; } 575 int min() const { return min_; } 576 int max() const { return max_; } 577 QuantifierType quantifier_type() const { return quantifier_type_; } 578 int index() const { return index_; } 579 bool is_possessive() const { return quantifier_type_ == POSSESSIVE; } 580 bool is_non_greedy() const { return quantifier_type_ == NON_GREEDY; } 581 bool is_greedy() const { return quantifier_type_ == GREEDY; } 582 RegExpTree* body() const { return body_; } 583 584 private: 585 RegExpTree* body_; 586 int min_; 587 int max_; 588 int min_match_; 589 int max_match_; 590 QuantifierType quantifier_type_; 591 int index_; 592 }; 593 594 class RegExpCapture final : public RegExpTree { 595 public: 596 explicit RegExpCapture(int index) 597 : body_(nullptr), 598 index_(index), 599 min_match_(0), 600 max_match_(0), 601 name_(nullptr) {} 602 603 DECL_BOILERPLATE(Capture); 604 605 static RegExpNode* ToNode(RegExpTree* body, int index, 606 RegExpCompiler* compiler, RegExpNode* on_success); 607 bool IsAnchoredAtStart() override; 608 bool IsAnchoredAtEnd() override; 609 Interval CaptureRegisters() override; 610 int min_match() override { return min_match_; } 611 int max_match() override { return max_match_; } 612 RegExpTree* body() { return body_; } 613 void set_body(RegExpTree* body) { 614 body_ = body; 615 min_match_ = body->min_match(); 616 max_match_ = body->max_match(); 617 } 618 int index() const { return index_; } 619 const ZoneVector<base::uc16>* name() const { return name_; } 620 void set_name(const ZoneVector<base::uc16>* name) { name_ = name; } 621 static int StartRegister(int index) { return index * 2; } 622 static int EndRegister(int index) { return index * 2 + 1; } 623 624 private: 625 RegExpTree* body_ = nullptr; 626 int index_; 627 int min_match_ = 0; 628 int max_match_ = 0; 629 const ZoneVector<base::uc16>* name_ = nullptr; 630 }; 631 632 class RegExpGroup final : public RegExpTree { 633 public: 634 explicit RegExpGroup(RegExpTree* body, RegExpFlags flags) 635 : body_(body), 636 flags_(flags), 637 min_match_(body->min_match()), 638 max_match_(body->max_match()) {} 639 640 DECL_BOILERPLATE(Group); 641 642 bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); } 643 bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); } 644 int min_match() override { return min_match_; } 645 int max_match() override { return max_match_; } 646 Interval CaptureRegisters() override { return body_->CaptureRegisters(); } 647 RegExpTree* body() const { return body_; } 648 RegExpFlags flags() const { return flags_; } 649 650 private: 651 RegExpTree* body_; 652 const RegExpFlags flags_; 653 int min_match_; 654 int max_match_; 655 }; 656 657 class RegExpLookaround final : public RegExpTree { 658 public: 659 enum Type { LOOKAHEAD, LOOKBEHIND }; 660 661 RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count, 662 int capture_from, Type type, int index) 663 : body_(body), 664 is_positive_(is_positive), 665 capture_count_(capture_count), 666 capture_from_(capture_from), 667 type_(type), 668 index_(index) {} 669 670 DECL_BOILERPLATE(Lookaround); 671 672 Interval CaptureRegisters() override; 673 bool IsAnchoredAtStart() override; 674 int min_match() override { return 0; } 675 int max_match() override { return 0; } 676 RegExpTree* body() const { return body_; } 677 bool is_positive() const { return is_positive_; } 678 int capture_count() const { return capture_count_; } 679 int capture_from() const { return capture_from_; } 680 Type type() const { return type_; } 681 int index() const { return index_; } 682 683 class Builder { 684 public: 685 Builder(bool is_positive, RegExpNode* on_success, 686 int stack_pointer_register, int position_register, 687 int capture_register_count = 0, int capture_register_start = 0); 688 RegExpNode* on_match_success() const { return on_match_success_; } 689 RegExpNode* ForMatch(RegExpNode* match); 690 691 private: 692 bool is_positive_; 693 RegExpNode* on_match_success_; 694 RegExpNode* on_success_; 695 int stack_pointer_register_; 696 int position_register_; 697 }; 698 699 private: 700 RegExpTree* body_; 701 bool is_positive_; 702 int capture_count_; 703 int capture_from_; 704 Type type_; 705 int index_; 706 }; 707 708 class RegExpBackReference final : public RegExpTree { 709 public: 710 explicit RegExpBackReference(Zone* zone) : captures_(1, zone) {} 711 explicit RegExpBackReference(RegExpCapture* capture, Zone* zone) 712 : captures_(1, zone) { 713 captures_.Add(capture, zone); 714 } 715 716 DECL_BOILERPLATE(BackReference); 717 718 int min_match() override { return 0; } 719 // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite 720 // recursion, we give up. Ignorance is bliss. 721 int max_match() override { return kInfinity; } 722 const ZoneList<RegExpCapture*>* captures() const { return &captures_; } 723 void add_capture(RegExpCapture* capture, Zone* zone) { 724 captures_.Add(capture, zone); 725 } 726 const ZoneVector<base::uc16>* name() const { return name_; } 727 void set_name(const ZoneVector<base::uc16>* name) { name_ = name; } 728 729 private: 730 ZoneList<RegExpCapture*> captures_; 731 const ZoneVector<base::uc16>* name_ = nullptr; 732 }; 733 734 class RegExpEmpty final : public RegExpTree { 735 public: 736 DECL_BOILERPLATE(Empty); 737 int min_match() override { return 0; } 738 int max_match() override { return 0; } 739 }; 740 741 } // namespace v8::internal 742 743 #undef DECL_BOILERPLATE 744 745 #endif // V8_REGEXP_REGEXP_AST_H_