regexp-ast.cc (12448B)
1 // Copyright 2016 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "irregexp/imported/regexp-ast.h" 6 7 8 namespace v8 { 9 namespace internal { 10 11 #define MAKE_ACCEPT(Name) \ 12 void* RegExp##Name::Accept(RegExpVisitor* visitor, void* data) { \ 13 return visitor->Visit##Name(this, data); \ 14 } 15 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ACCEPT) 16 #undef MAKE_ACCEPT 17 18 #define MAKE_TYPE_CASE(Name) \ 19 RegExp##Name* RegExpTree::As##Name() { return nullptr; } \ 20 bool RegExpTree::Is##Name() { return false; } 21 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE) 22 #undef MAKE_TYPE_CASE 23 24 #define MAKE_TYPE_CASE(Name) \ 25 RegExp##Name* RegExp##Name::As##Name() { return this; } \ 26 bool RegExp##Name::Is##Name() { return true; } 27 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE) 28 #undef MAKE_TYPE_CASE 29 30 namespace { 31 32 Interval ListCaptureRegisters(ZoneList<RegExpTree*>* children) { 33 Interval result = Interval::Empty(); 34 for (int i = 0; i < children->length(); i++) 35 result = result.Union(children->at(i)->CaptureRegisters()); 36 return result; 37 } 38 39 } // namespace 40 41 Interval RegExpAlternative::CaptureRegisters() { 42 return ListCaptureRegisters(nodes()); 43 } 44 45 46 Interval RegExpDisjunction::CaptureRegisters() { 47 return ListCaptureRegisters(alternatives()); 48 } 49 50 51 Interval RegExpLookaround::CaptureRegisters() { 52 return body()->CaptureRegisters(); 53 } 54 55 56 Interval RegExpCapture::CaptureRegisters() { 57 Interval self(StartRegister(index()), EndRegister(index())); 58 return self.Union(body()->CaptureRegisters()); 59 } 60 61 62 Interval RegExpQuantifier::CaptureRegisters() { 63 return body()->CaptureRegisters(); 64 } 65 66 67 bool RegExpAssertion::IsAnchoredAtStart() { 68 return assertion_type() == RegExpAssertion::Type::START_OF_INPUT; 69 } 70 71 72 bool RegExpAssertion::IsAnchoredAtEnd() { 73 return assertion_type() == RegExpAssertion::Type::END_OF_INPUT; 74 } 75 76 77 bool RegExpAlternative::IsAnchoredAtStart() { 78 ZoneList<RegExpTree*>* nodes = this->nodes(); 79 for (int i = 0; i < nodes->length(); i++) { 80 RegExpTree* node = nodes->at(i); 81 if (node->IsAnchoredAtStart()) { 82 return true; 83 } 84 if (node->max_match() > 0) { 85 return false; 86 } 87 } 88 return false; 89 } 90 91 92 bool RegExpAlternative::IsAnchoredAtEnd() { 93 ZoneList<RegExpTree*>* nodes = this->nodes(); 94 for (int i = nodes->length() - 1; i >= 0; i--) { 95 RegExpTree* node = nodes->at(i); 96 if (node->IsAnchoredAtEnd()) { 97 return true; 98 } 99 if (node->max_match() > 0) { 100 return false; 101 } 102 } 103 return false; 104 } 105 106 107 bool RegExpDisjunction::IsAnchoredAtStart() { 108 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 109 for (int i = 0; i < alternatives->length(); i++) { 110 if (!alternatives->at(i)->IsAnchoredAtStart()) return false; 111 } 112 return true; 113 } 114 115 116 bool RegExpDisjunction::IsAnchoredAtEnd() { 117 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 118 for (int i = 0; i < alternatives->length(); i++) { 119 if (!alternatives->at(i)->IsAnchoredAtEnd()) return false; 120 } 121 return true; 122 } 123 124 125 bool RegExpLookaround::IsAnchoredAtStart() { 126 return is_positive() && type() == LOOKAHEAD && body()->IsAnchoredAtStart(); 127 } 128 129 130 bool RegExpCapture::IsAnchoredAtStart() { return body()->IsAnchoredAtStart(); } 131 132 133 bool RegExpCapture::IsAnchoredAtEnd() { return body()->IsAnchoredAtEnd(); } 134 135 namespace { 136 137 // Convert regular expression trees to a simple sexp representation. 138 // This representation should be different from the input grammar 139 // in as many cases as possible, to make it more difficult for incorrect 140 // parses to look as correct ones which is likely if the input and 141 // output formats are alike. 142 class RegExpUnparser final : public RegExpVisitor { 143 public: 144 RegExpUnparser(std::ostream& os, Zone* zone) : os_(os), zone_(zone) {} 145 void VisitCharacterRange(CharacterRange that); 146 #define MAKE_CASE(Name) void* Visit##Name(RegExp##Name*, void* data) override; 147 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) 148 #undef MAKE_CASE 149 private: 150 std::ostream& os_; 151 Zone* zone_; 152 }; 153 154 } // namespace 155 156 void* RegExpUnparser::VisitDisjunction(RegExpDisjunction* that, void* data) { 157 os_ << "(|"; 158 for (int i = 0; i < that->alternatives()->length(); i++) { 159 os_ << " "; 160 that->alternatives()->at(i)->Accept(this, data); 161 } 162 os_ << ")"; 163 return nullptr; 164 } 165 166 167 void* RegExpUnparser::VisitAlternative(RegExpAlternative* that, void* data) { 168 os_ << "(:"; 169 for (int i = 0; i < that->nodes()->length(); i++) { 170 os_ << " "; 171 that->nodes()->at(i)->Accept(this, data); 172 } 173 os_ << ")"; 174 return nullptr; 175 } 176 177 178 void RegExpUnparser::VisitCharacterRange(CharacterRange that) { 179 os_ << AsUC32(that.from()); 180 if (!that.IsSingleton()) { 181 os_ << "-" << AsUC32(that.to()); 182 } 183 } 184 185 void* RegExpUnparser::VisitClassRanges(RegExpClassRanges* that, void* data) { 186 if (that->is_negated()) os_ << "^"; 187 os_ << "["; 188 for (int i = 0; i < that->ranges(zone_)->length(); i++) { 189 if (i > 0) os_ << " "; 190 VisitCharacterRange(that->ranges(zone_)->at(i)); 191 } 192 os_ << "]"; 193 return nullptr; 194 } 195 196 void* RegExpUnparser::VisitClassSetOperand(RegExpClassSetOperand* that, 197 void* data) { 198 os_ << "!["; 199 for (int i = 0; i < that->ranges()->length(); i++) { 200 if (i > 0) os_ << " "; 201 VisitCharacterRange(that->ranges()->at(i)); 202 } 203 if (that->has_strings()) { 204 for (auto iter : *that->strings()) { 205 os_ << " '"; 206 os_ << std::string(iter.first.begin(), iter.first.end()); 207 os_ << "'"; 208 } 209 } 210 os_ << "]"; 211 return nullptr; 212 } 213 214 void* RegExpUnparser::VisitClassSetExpression(RegExpClassSetExpression* that, 215 void* data) { 216 switch (that->operation()) { 217 case RegExpClassSetExpression::OperationType::kUnion: 218 os_ << "++"; 219 break; 220 case RegExpClassSetExpression::OperationType::kIntersection: 221 os_ << "&&"; 222 break; 223 case RegExpClassSetExpression::OperationType::kSubtraction: 224 os_ << "--"; 225 break; 226 } 227 if (that->is_negated()) os_ << "^"; 228 os_ << "["; 229 for (int i = 0; i < that->operands()->length(); i++) { 230 if (i > 0) os_ << " "; 231 that->operands()->at(i)->Accept(this, data); 232 } 233 os_ << "]"; 234 return nullptr; 235 } 236 237 void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) { 238 switch (that->assertion_type()) { 239 case RegExpAssertion::Type::START_OF_INPUT: 240 os_ << "@^i"; 241 break; 242 case RegExpAssertion::Type::END_OF_INPUT: 243 os_ << "@$i"; 244 break; 245 case RegExpAssertion::Type::START_OF_LINE: 246 os_ << "@^l"; 247 break; 248 case RegExpAssertion::Type::END_OF_LINE: 249 os_ << "@$l"; 250 break; 251 case RegExpAssertion::Type::BOUNDARY: 252 os_ << "@b"; 253 break; 254 case RegExpAssertion::Type::NON_BOUNDARY: 255 os_ << "@B"; 256 break; 257 } 258 return nullptr; 259 } 260 261 262 void* RegExpUnparser::VisitAtom(RegExpAtom* that, void* data) { 263 os_ << "'"; 264 base::Vector<const base::uc16> chardata = that->data(); 265 for (int i = 0; i < chardata.length(); i++) { 266 os_ << AsUC16(chardata[i]); 267 } 268 os_ << "'"; 269 return nullptr; 270 } 271 272 273 void* RegExpUnparser::VisitText(RegExpText* that, void* data) { 274 if (that->elements()->length() == 1) { 275 that->elements()->at(0).tree()->Accept(this, data); 276 } else { 277 os_ << "(!"; 278 for (int i = 0; i < that->elements()->length(); i++) { 279 os_ << " "; 280 that->elements()->at(i).tree()->Accept(this, data); 281 } 282 os_ << ")"; 283 } 284 return nullptr; 285 } 286 287 288 void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) { 289 os_ << "(# " << that->min() << " "; 290 if (that->max() == RegExpTree::kInfinity) { 291 os_ << "- "; 292 } else { 293 os_ << that->max() << " "; 294 } 295 os_ << (that->is_greedy() ? "g " : that->is_possessive() ? "p " : "n "); 296 that->body()->Accept(this, data); 297 os_ << ")"; 298 return nullptr; 299 } 300 301 302 void* RegExpUnparser::VisitCapture(RegExpCapture* that, void* data) { 303 os_ << "(^ "; 304 that->body()->Accept(this, data); 305 os_ << ")"; 306 return nullptr; 307 } 308 309 void* RegExpUnparser::VisitGroup(RegExpGroup* that, void* data) { 310 os_ << "(?" << that->flags() << ": "; 311 that->body()->Accept(this, data); 312 os_ << ")"; 313 return nullptr; 314 } 315 316 void* RegExpUnparser::VisitLookaround(RegExpLookaround* that, void* data) { 317 os_ << "("; 318 os_ << (that->type() == RegExpLookaround::LOOKAHEAD ? "->" : "<-"); 319 os_ << (that->is_positive() ? " + " : " - "); 320 that->body()->Accept(this, data); 321 os_ << ")"; 322 return nullptr; 323 } 324 325 326 void* RegExpUnparser::VisitBackReference(RegExpBackReference* that, 327 void* data) { 328 os_ << "(<- " << that->captures()->first()->index(); 329 for (int i = 1; i < that->captures()->length(); ++i) { 330 os_ << "," << that->captures()->at(i)->index(); 331 } 332 os_ << ")"; 333 return nullptr; 334 } 335 336 337 void* RegExpUnparser::VisitEmpty(RegExpEmpty* that, void* data) { 338 os_ << '%'; 339 return nullptr; 340 } 341 342 std::ostream& RegExpTree::Print(std::ostream& os, Zone* zone) { 343 RegExpUnparser unparser(os, zone); 344 Accept(&unparser, nullptr); 345 return os; 346 } 347 348 RegExpDisjunction::RegExpDisjunction(ZoneList<RegExpTree*>* alternatives) 349 : alternatives_(alternatives) { 350 DCHECK_LT(1, alternatives->length()); 351 RegExpTree* first_alternative = alternatives->at(0); 352 min_match_ = first_alternative->min_match(); 353 max_match_ = first_alternative->max_match(); 354 for (int i = 1; i < alternatives->length(); i++) { 355 RegExpTree* alternative = alternatives->at(i); 356 min_match_ = std::min(min_match_, alternative->min_match()); 357 max_match_ = std::max(max_match_, alternative->max_match()); 358 } 359 } 360 361 namespace { 362 363 int IncreaseBy(int previous, int increase) { 364 if (RegExpTree::kInfinity - previous < increase) { 365 return RegExpTree::kInfinity; 366 } else { 367 return previous + increase; 368 } 369 } 370 371 } // namespace 372 373 RegExpAlternative::RegExpAlternative(ZoneList<RegExpTree*>* nodes) 374 : nodes_(nodes) { 375 DCHECK_LT(1, nodes->length()); 376 min_match_ = 0; 377 max_match_ = 0; 378 for (int i = 0; i < nodes->length(); i++) { 379 RegExpTree* node = nodes->at(i); 380 int node_min_match = node->min_match(); 381 min_match_ = IncreaseBy(min_match_, node_min_match); 382 int node_max_match = node->max_match(); 383 max_match_ = IncreaseBy(max_match_, node_max_match); 384 } 385 } 386 387 RegExpClassSetOperand::RegExpClassSetOperand(ZoneList<CharacterRange>* ranges, 388 CharacterClassStrings* strings) 389 : ranges_(ranges), strings_(strings) { 390 DCHECK_NOT_NULL(ranges); 391 min_match_ = 0; 392 max_match_ = 0; 393 if (!ranges->is_empty()) { 394 min_match_ = 1; 395 max_match_ = 2; 396 } 397 if (has_strings()) { 398 for (auto string : *strings) { 399 min_match_ = std::min(min_match_, string.second->min_match()); 400 max_match_ = std::max(max_match_, string.second->max_match()); 401 } 402 } 403 } 404 405 RegExpClassSetExpression::RegExpClassSetExpression( 406 OperationType op, bool is_negated, bool may_contain_strings, 407 ZoneList<RegExpTree*>* operands) 408 : operation_(op), 409 is_negated_(is_negated), 410 may_contain_strings_(may_contain_strings), 411 operands_(operands) { 412 DCHECK_NOT_NULL(operands); 413 if (is_negated) { 414 DCHECK(!may_contain_strings_); 415 // We don't know anything about max matches for negated classes. 416 // As there are no strings involved, assume that we can match a unicode 417 // character (2 code points). 418 max_match_ = 2; 419 } else { 420 max_match_ = 0; 421 for (auto operand : *operands) { 422 max_match_ = std::max(max_match_, operand->max_match()); 423 } 424 } 425 } 426 427 // static 428 RegExpClassSetExpression* RegExpClassSetExpression::Empty(Zone* zone, 429 bool is_negated) { 430 ZoneList<CharacterRange>* ranges = 431 zone->template New<ZoneList<CharacterRange>>(0, zone); 432 RegExpClassSetOperand* op = 433 zone->template New<RegExpClassSetOperand>(ranges, nullptr); 434 ZoneList<RegExpTree*>* operands = 435 zone->template New<ZoneList<RegExpTree*>>(1, zone); 436 operands->Add(op, zone); 437 return zone->template New<RegExpClassSetExpression>( 438 RegExpClassSetExpression::OperationType::kUnion, is_negated, false, 439 operands); 440 } 441 442 bool RegExpText::StartsWithAtom() const { 443 if (elements_.length() == 0) return false; 444 return elements_.at(0).text_type() == TextElement::ATOM; 445 } 446 447 RegExpAtom* RegExpText::FirstAtom() const { return elements_.at(0).atom(); } 448 449 } // namespace internal 450 } // namespace v8