tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

regexp-ast.cc (12448B)


      1 // Copyright 2016 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "irregexp/imported/regexp-ast.h"
      6 
      7 
      8 namespace v8 {
      9 namespace internal {
     10 
     11 #define MAKE_ACCEPT(Name)                                          \
     12  void* RegExp##Name::Accept(RegExpVisitor* visitor, void* data) { \
     13    return visitor->Visit##Name(this, data);                       \
     14  }
     15 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ACCEPT)
     16 #undef MAKE_ACCEPT
     17 
     18 #define MAKE_TYPE_CASE(Name)                               \
     19  RegExp##Name* RegExpTree::As##Name() { return nullptr; } \
     20  bool RegExpTree::Is##Name() { return false; }
     21 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
     22 #undef MAKE_TYPE_CASE
     23 
     24 #define MAKE_TYPE_CASE(Name)                              \
     25  RegExp##Name* RegExp##Name::As##Name() { return this; } \
     26  bool RegExp##Name::Is##Name() { return true; }
     27 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
     28 #undef MAKE_TYPE_CASE
     29 
     30 namespace {
     31 
     32 Interval ListCaptureRegisters(ZoneList<RegExpTree*>* children) {
     33  Interval result = Interval::Empty();
     34  for (int i = 0; i < children->length(); i++)
     35    result = result.Union(children->at(i)->CaptureRegisters());
     36  return result;
     37 }
     38 
     39 }  // namespace
     40 
     41 Interval RegExpAlternative::CaptureRegisters() {
     42  return ListCaptureRegisters(nodes());
     43 }
     44 
     45 
     46 Interval RegExpDisjunction::CaptureRegisters() {
     47  return ListCaptureRegisters(alternatives());
     48 }
     49 
     50 
     51 Interval RegExpLookaround::CaptureRegisters() {
     52  return body()->CaptureRegisters();
     53 }
     54 
     55 
     56 Interval RegExpCapture::CaptureRegisters() {
     57  Interval self(StartRegister(index()), EndRegister(index()));
     58  return self.Union(body()->CaptureRegisters());
     59 }
     60 
     61 
     62 Interval RegExpQuantifier::CaptureRegisters() {
     63  return body()->CaptureRegisters();
     64 }
     65 
     66 
     67 bool RegExpAssertion::IsAnchoredAtStart() {
     68  return assertion_type() == RegExpAssertion::Type::START_OF_INPUT;
     69 }
     70 
     71 
     72 bool RegExpAssertion::IsAnchoredAtEnd() {
     73  return assertion_type() == RegExpAssertion::Type::END_OF_INPUT;
     74 }
     75 
     76 
     77 bool RegExpAlternative::IsAnchoredAtStart() {
     78  ZoneList<RegExpTree*>* nodes = this->nodes();
     79  for (int i = 0; i < nodes->length(); i++) {
     80    RegExpTree* node = nodes->at(i);
     81    if (node->IsAnchoredAtStart()) {
     82      return true;
     83    }
     84    if (node->max_match() > 0) {
     85      return false;
     86    }
     87  }
     88  return false;
     89 }
     90 
     91 
     92 bool RegExpAlternative::IsAnchoredAtEnd() {
     93  ZoneList<RegExpTree*>* nodes = this->nodes();
     94  for (int i = nodes->length() - 1; i >= 0; i--) {
     95    RegExpTree* node = nodes->at(i);
     96    if (node->IsAnchoredAtEnd()) {
     97      return true;
     98    }
     99    if (node->max_match() > 0) {
    100      return false;
    101    }
    102  }
    103  return false;
    104 }
    105 
    106 
    107 bool RegExpDisjunction::IsAnchoredAtStart() {
    108  ZoneList<RegExpTree*>* alternatives = this->alternatives();
    109  for (int i = 0; i < alternatives->length(); i++) {
    110    if (!alternatives->at(i)->IsAnchoredAtStart()) return false;
    111  }
    112  return true;
    113 }
    114 
    115 
    116 bool RegExpDisjunction::IsAnchoredAtEnd() {
    117  ZoneList<RegExpTree*>* alternatives = this->alternatives();
    118  for (int i = 0; i < alternatives->length(); i++) {
    119    if (!alternatives->at(i)->IsAnchoredAtEnd()) return false;
    120  }
    121  return true;
    122 }
    123 
    124 
    125 bool RegExpLookaround::IsAnchoredAtStart() {
    126  return is_positive() && type() == LOOKAHEAD && body()->IsAnchoredAtStart();
    127 }
    128 
    129 
    130 bool RegExpCapture::IsAnchoredAtStart() { return body()->IsAnchoredAtStart(); }
    131 
    132 
    133 bool RegExpCapture::IsAnchoredAtEnd() { return body()->IsAnchoredAtEnd(); }
    134 
    135 namespace {
    136 
    137 // Convert regular expression trees to a simple sexp representation.
    138 // This representation should be different from the input grammar
    139 // in as many cases as possible, to make it more difficult for incorrect
    140 // parses to look as correct ones which is likely if the input and
    141 // output formats are alike.
    142 class RegExpUnparser final : public RegExpVisitor {
    143 public:
    144  RegExpUnparser(std::ostream& os, Zone* zone) : os_(os), zone_(zone) {}
    145  void VisitCharacterRange(CharacterRange that);
    146 #define MAKE_CASE(Name) void* Visit##Name(RegExp##Name*, void* data) override;
    147  FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
    148 #undef MAKE_CASE
    149 private:
    150  std::ostream& os_;
    151  Zone* zone_;
    152 };
    153 
    154 }  // namespace
    155 
    156 void* RegExpUnparser::VisitDisjunction(RegExpDisjunction* that, void* data) {
    157  os_ << "(|";
    158  for (int i = 0; i < that->alternatives()->length(); i++) {
    159    os_ << " ";
    160    that->alternatives()->at(i)->Accept(this, data);
    161  }
    162  os_ << ")";
    163  return nullptr;
    164 }
    165 
    166 
    167 void* RegExpUnparser::VisitAlternative(RegExpAlternative* that, void* data) {
    168  os_ << "(:";
    169  for (int i = 0; i < that->nodes()->length(); i++) {
    170    os_ << " ";
    171    that->nodes()->at(i)->Accept(this, data);
    172  }
    173  os_ << ")";
    174  return nullptr;
    175 }
    176 
    177 
    178 void RegExpUnparser::VisitCharacterRange(CharacterRange that) {
    179  os_ << AsUC32(that.from());
    180  if (!that.IsSingleton()) {
    181    os_ << "-" << AsUC32(that.to());
    182  }
    183 }
    184 
    185 void* RegExpUnparser::VisitClassRanges(RegExpClassRanges* that, void* data) {
    186  if (that->is_negated()) os_ << "^";
    187  os_ << "[";
    188  for (int i = 0; i < that->ranges(zone_)->length(); i++) {
    189    if (i > 0) os_ << " ";
    190    VisitCharacterRange(that->ranges(zone_)->at(i));
    191  }
    192  os_ << "]";
    193  return nullptr;
    194 }
    195 
    196 void* RegExpUnparser::VisitClassSetOperand(RegExpClassSetOperand* that,
    197                                           void* data) {
    198  os_ << "![";
    199  for (int i = 0; i < that->ranges()->length(); i++) {
    200    if (i > 0) os_ << " ";
    201    VisitCharacterRange(that->ranges()->at(i));
    202  }
    203  if (that->has_strings()) {
    204    for (auto iter : *that->strings()) {
    205      os_ << " '";
    206      os_ << std::string(iter.first.begin(), iter.first.end());
    207      os_ << "'";
    208    }
    209  }
    210  os_ << "]";
    211  return nullptr;
    212 }
    213 
    214 void* RegExpUnparser::VisitClassSetExpression(RegExpClassSetExpression* that,
    215                                              void* data) {
    216  switch (that->operation()) {
    217    case RegExpClassSetExpression::OperationType::kUnion:
    218      os_ << "++";
    219      break;
    220    case RegExpClassSetExpression::OperationType::kIntersection:
    221      os_ << "&&";
    222      break;
    223    case RegExpClassSetExpression::OperationType::kSubtraction:
    224      os_ << "--";
    225      break;
    226  }
    227  if (that->is_negated()) os_ << "^";
    228  os_ << "[";
    229  for (int i = 0; i < that->operands()->length(); i++) {
    230    if (i > 0) os_ << " ";
    231    that->operands()->at(i)->Accept(this, data);
    232  }
    233  os_ << "]";
    234  return nullptr;
    235 }
    236 
    237 void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) {
    238  switch (that->assertion_type()) {
    239    case RegExpAssertion::Type::START_OF_INPUT:
    240      os_ << "@^i";
    241      break;
    242    case RegExpAssertion::Type::END_OF_INPUT:
    243      os_ << "@$i";
    244      break;
    245    case RegExpAssertion::Type::START_OF_LINE:
    246      os_ << "@^l";
    247      break;
    248    case RegExpAssertion::Type::END_OF_LINE:
    249      os_ << "@$l";
    250      break;
    251    case RegExpAssertion::Type::BOUNDARY:
    252      os_ << "@b";
    253      break;
    254    case RegExpAssertion::Type::NON_BOUNDARY:
    255      os_ << "@B";
    256      break;
    257  }
    258  return nullptr;
    259 }
    260 
    261 
    262 void* RegExpUnparser::VisitAtom(RegExpAtom* that, void* data) {
    263  os_ << "'";
    264  base::Vector<const base::uc16> chardata = that->data();
    265  for (int i = 0; i < chardata.length(); i++) {
    266    os_ << AsUC16(chardata[i]);
    267  }
    268  os_ << "'";
    269  return nullptr;
    270 }
    271 
    272 
    273 void* RegExpUnparser::VisitText(RegExpText* that, void* data) {
    274  if (that->elements()->length() == 1) {
    275    that->elements()->at(0).tree()->Accept(this, data);
    276  } else {
    277    os_ << "(!";
    278    for (int i = 0; i < that->elements()->length(); i++) {
    279      os_ << " ";
    280      that->elements()->at(i).tree()->Accept(this, data);
    281    }
    282    os_ << ")";
    283  }
    284  return nullptr;
    285 }
    286 
    287 
    288 void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) {
    289  os_ << "(# " << that->min() << " ";
    290  if (that->max() == RegExpTree::kInfinity) {
    291    os_ << "- ";
    292  } else {
    293    os_ << that->max() << " ";
    294  }
    295  os_ << (that->is_greedy() ? "g " : that->is_possessive() ? "p " : "n ");
    296  that->body()->Accept(this, data);
    297  os_ << ")";
    298  return nullptr;
    299 }
    300 
    301 
    302 void* RegExpUnparser::VisitCapture(RegExpCapture* that, void* data) {
    303  os_ << "(^ ";
    304  that->body()->Accept(this, data);
    305  os_ << ")";
    306  return nullptr;
    307 }
    308 
    309 void* RegExpUnparser::VisitGroup(RegExpGroup* that, void* data) {
    310  os_ << "(?" << that->flags() << ": ";
    311  that->body()->Accept(this, data);
    312  os_ << ")";
    313  return nullptr;
    314 }
    315 
    316 void* RegExpUnparser::VisitLookaround(RegExpLookaround* that, void* data) {
    317  os_ << "(";
    318  os_ << (that->type() == RegExpLookaround::LOOKAHEAD ? "->" : "<-");
    319  os_ << (that->is_positive() ? " + " : " - ");
    320  that->body()->Accept(this, data);
    321  os_ << ")";
    322  return nullptr;
    323 }
    324 
    325 
    326 void* RegExpUnparser::VisitBackReference(RegExpBackReference* that,
    327                                         void* data) {
    328  os_ << "(<- " << that->captures()->first()->index();
    329  for (int i = 1; i < that->captures()->length(); ++i) {
    330    os_ << "," << that->captures()->at(i)->index();
    331  }
    332  os_ << ")";
    333  return nullptr;
    334 }
    335 
    336 
    337 void* RegExpUnparser::VisitEmpty(RegExpEmpty* that, void* data) {
    338  os_ << '%';
    339  return nullptr;
    340 }
    341 
    342 std::ostream& RegExpTree::Print(std::ostream& os, Zone* zone) {
    343  RegExpUnparser unparser(os, zone);
    344  Accept(&unparser, nullptr);
    345  return os;
    346 }
    347 
    348 RegExpDisjunction::RegExpDisjunction(ZoneList<RegExpTree*>* alternatives)
    349    : alternatives_(alternatives) {
    350  DCHECK_LT(1, alternatives->length());
    351  RegExpTree* first_alternative = alternatives->at(0);
    352  min_match_ = first_alternative->min_match();
    353  max_match_ = first_alternative->max_match();
    354  for (int i = 1; i < alternatives->length(); i++) {
    355    RegExpTree* alternative = alternatives->at(i);
    356    min_match_ = std::min(min_match_, alternative->min_match());
    357    max_match_ = std::max(max_match_, alternative->max_match());
    358  }
    359 }
    360 
    361 namespace {
    362 
    363 int IncreaseBy(int previous, int increase) {
    364  if (RegExpTree::kInfinity - previous < increase) {
    365    return RegExpTree::kInfinity;
    366  } else {
    367    return previous + increase;
    368  }
    369 }
    370 
    371 }  // namespace
    372 
    373 RegExpAlternative::RegExpAlternative(ZoneList<RegExpTree*>* nodes)
    374    : nodes_(nodes) {
    375  DCHECK_LT(1, nodes->length());
    376  min_match_ = 0;
    377  max_match_ = 0;
    378  for (int i = 0; i < nodes->length(); i++) {
    379    RegExpTree* node = nodes->at(i);
    380    int node_min_match = node->min_match();
    381    min_match_ = IncreaseBy(min_match_, node_min_match);
    382    int node_max_match = node->max_match();
    383    max_match_ = IncreaseBy(max_match_, node_max_match);
    384  }
    385 }
    386 
    387 RegExpClassSetOperand::RegExpClassSetOperand(ZoneList<CharacterRange>* ranges,
    388                                             CharacterClassStrings* strings)
    389    : ranges_(ranges), strings_(strings) {
    390  DCHECK_NOT_NULL(ranges);
    391  min_match_ = 0;
    392  max_match_ = 0;
    393  if (!ranges->is_empty()) {
    394    min_match_ = 1;
    395    max_match_ = 2;
    396  }
    397  if (has_strings()) {
    398    for (auto string : *strings) {
    399      min_match_ = std::min(min_match_, string.second->min_match());
    400      max_match_ = std::max(max_match_, string.second->max_match());
    401    }
    402  }
    403 }
    404 
    405 RegExpClassSetExpression::RegExpClassSetExpression(
    406    OperationType op, bool is_negated, bool may_contain_strings,
    407    ZoneList<RegExpTree*>* operands)
    408    : operation_(op),
    409      is_negated_(is_negated),
    410      may_contain_strings_(may_contain_strings),
    411      operands_(operands) {
    412  DCHECK_NOT_NULL(operands);
    413  if (is_negated) {
    414    DCHECK(!may_contain_strings_);
    415    // We don't know anything about max matches for negated classes.
    416    // As there are no strings involved, assume that we can match a unicode
    417    // character (2 code points).
    418    max_match_ = 2;
    419  } else {
    420    max_match_ = 0;
    421    for (auto operand : *operands) {
    422      max_match_ = std::max(max_match_, operand->max_match());
    423    }
    424  }
    425 }
    426 
    427 // static
    428 RegExpClassSetExpression* RegExpClassSetExpression::Empty(Zone* zone,
    429                                                          bool is_negated) {
    430  ZoneList<CharacterRange>* ranges =
    431      zone->template New<ZoneList<CharacterRange>>(0, zone);
    432  RegExpClassSetOperand* op =
    433      zone->template New<RegExpClassSetOperand>(ranges, nullptr);
    434  ZoneList<RegExpTree*>* operands =
    435      zone->template New<ZoneList<RegExpTree*>>(1, zone);
    436  operands->Add(op, zone);
    437  return zone->template New<RegExpClassSetExpression>(
    438      RegExpClassSetExpression::OperationType::kUnion, is_negated, false,
    439      operands);
    440 }
    441 
    442 bool RegExpText::StartsWithAtom() const {
    443  if (elements_.length() == 0) return false;
    444  return elements_.at(0).text_type() == TextElement::ATOM;
    445 }
    446 
    447 RegExpAtom* RegExpText::FirstAtom() const { return elements_.at(0).atom(); }
    448 
    449 }  // namespace internal
    450 }  // namespace v8