regexp-compiler.cc (163809B)
1 // Copyright 2019 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "irregexp/imported/regexp-compiler.h" 6 7 #include <optional> 8 9 #include "irregexp/imported/regexp-macro-assembler-arch.h" 10 11 #ifdef V8_INTL_SUPPORT 12 #include "irregexp/imported/special-case.h" 13 #include "unicode/locid.h" 14 #include "unicode/uniset.h" 15 #include "unicode/utypes.h" 16 #endif // V8_INTL_SUPPORT 17 18 namespace v8::internal { 19 20 using namespace regexp_compiler_constants; // NOLINT(build/namespaces) 21 22 // ------------------------------------------------------------------- 23 // Implementation of the Irregexp regular expression engine. 24 // 25 // The Irregexp regular expression engine is intended to be a complete 26 // implementation of ECMAScript regular expressions. It generates either 27 // bytecodes or native code. 28 29 // The Irregexp regexp engine is structured in three steps. 30 // 1) The parser generates an abstract syntax tree. See ast.cc. 31 // 2) From the AST a node network is created. The nodes are all 32 // subclasses of RegExpNode. The nodes represent states when 33 // executing a regular expression. Several optimizations are 34 // performed on the node network. 35 // 3) From the nodes we generate either byte codes or native code 36 // that can actually execute the regular expression (perform 37 // the search). The code generation step is described in more 38 // detail below. 39 40 // Code generation. 41 // 42 // The nodes are divided into four main categories. 43 // * Choice nodes 44 // These represent places where the regular expression can 45 // match in more than one way. For example on entry to an 46 // alternation (foo|bar) or a repetition (*, +, ? or {}). 47 // * Action nodes 48 // These represent places where some action should be 49 // performed. Examples include recording the current position 50 // in the input string to a register (in order to implement 51 // captures) or other actions on register for example in order 52 // to implement the counters needed for {} repetitions. 53 // * Matching nodes 54 // These attempt to match some element part of the input string. 55 // Examples of elements include character classes, plain strings 56 // or back references. 57 // * End nodes 58 // These are used to implement the actions required on finding 59 // a successful match or failing to find a match. 60 // 61 // The code generated (whether as byte codes or native code) maintains 62 // some state as it runs. This consists of the following elements: 63 // 64 // * The capture registers. Used for string captures. 65 // * Other registers. Used for counters etc. 66 // * The current position. 67 // * The stack of backtracking information. Used when a matching node 68 // fails to find a match and needs to try an alternative. 69 // 70 // Conceptual regular expression execution model: 71 // 72 // There is a simple conceptual model of regular expression execution 73 // which will be presented first. The actual code generated is a more 74 // efficient simulation of the simple conceptual model: 75 // 76 // * Choice nodes are implemented as follows: 77 // For each choice except the last { 78 // push current position 79 // push backtrack code location 80 // <generate code to test for choice> 81 // backtrack code location: 82 // pop current position 83 // } 84 // <generate code to test for last choice> 85 // 86 // * Actions nodes are generated as follows 87 // <push affected registers on backtrack stack> 88 // <generate code to perform action> 89 // push backtrack code location 90 // <generate code to test for following nodes> 91 // backtrack code location: 92 // <pop affected registers to restore their state> 93 // <pop backtrack location from stack and go to it> 94 // 95 // * Matching nodes are generated as follows: 96 // if input string matches at current position 97 // update current position 98 // <generate code to test for following nodes> 99 // else 100 // <pop backtrack location from stack and go to it> 101 // 102 // Thus it can be seen that the current position is saved and restored 103 // by the choice nodes, whereas the registers are saved and restored by 104 // by the action nodes that manipulate them. 105 // 106 // The other interesting aspect of this model is that nodes are generated 107 // at the point where they are needed by a recursive call to Emit(). If 108 // the node has already been code generated then the Emit() call will 109 // generate a jump to the previously generated code instead. In order to 110 // limit recursion it is possible for the Emit() function to put the node 111 // on a work list for later generation and instead generate a jump. The 112 // destination of the jump is resolved later when the code is generated. 113 // 114 // Actual regular expression code generation. 115 // 116 // Code generation is actually more complicated than the above. In order to 117 // improve the efficiency of the generated code some optimizations are 118 // performed 119 // 120 // * Choice nodes have 1-character lookahead. 121 // A choice node looks at the following character and eliminates some of 122 // the choices immediately based on that character. This is not yet 123 // implemented. 124 // * Simple greedy loops store reduced backtracking information. We call 125 // these fixed length loops 126 // A quantifier like /.*foo/m will greedily match the whole input. It will 127 // then need to backtrack to a point where it can match "foo". The naive 128 // implementation of this would push each character position onto the 129 // backtracking stack, then pop them off one by one. This would use space 130 // proportional to the length of the input string. However since the "." 131 // can only match in one way and always has a constant length (in this case 132 // of 1) it suffices to store the current position on the top of the stack 133 // once. Matching now becomes merely incrementing the current position and 134 // backtracking becomes decrementing the current position and checking the 135 // result against the stored current position. This is faster and saves 136 // space. 137 // * The current state is virtualized. 138 // This is used to defer expensive operations until it is clear that they 139 // are needed and to generate code for a node more than once, allowing 140 // specialized an efficient versions of the code to be created. This is 141 // explained in the section below. 142 // 143 // Execution state virtualization. 144 // 145 // Instead of emitting code, nodes that manipulate the state can record their 146 // manipulation in an object called the Trace. The Trace object can record a 147 // current position offset, an optional backtrack code location on the top of 148 // the virtualized backtrack stack and some register changes. When a node is 149 // to be emitted it can flush the Trace or update it. Flushing the Trace 150 // will emit code to bring the actual state into line with the virtual state. 151 // Avoiding flushing the state can postpone some work (e.g. updates of capture 152 // registers). Postponing work can save time when executing the regular 153 // expression since it may be found that the work never has to be done as a 154 // failure to match can occur. In addition it is much faster to jump to a 155 // known backtrack code location than it is to pop an unknown backtrack 156 // location from the stack and jump there. 157 // 158 // The virtual state found in the Trace affects code generation. For example 159 // the virtual state contains the difference between the actual current 160 // position and the virtual current position, and matching code needs to use 161 // this offset to attempt a match in the correct location of the input 162 // string. Therefore code generated for a non-trivial trace is specialized 163 // to that trace. The code generator therefore has the ability to generate 164 // code for each node several times. In order to limit the size of the 165 // generated code there is an arbitrary limit on how many specialized sets of 166 // code may be generated for a given node. If the limit is reached, the 167 // trace is flushed and a generic version of the code for a node is emitted. 168 // This is subsequently used for that node. The code emitted for non-generic 169 // trace is not recorded in the node and so it cannot currently be reused in 170 // the event that code generation is requested for an identical trace. 171 172 namespace { 173 174 constexpr base::uc32 MaxCodeUnit(const bool one_byte) { 175 static_assert(String::kMaxOneByteCharCodeU <= 176 std::numeric_limits<uint16_t>::max()); 177 static_assert(String::kMaxUtf16CodeUnitU <= 178 std::numeric_limits<uint16_t>::max()); 179 return one_byte ? String::kMaxOneByteCharCodeU : String::kMaxUtf16CodeUnitU; 180 } 181 182 constexpr uint32_t CharMask(const bool one_byte) { 183 static_assert(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1)); 184 static_assert(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1)); 185 return MaxCodeUnit(one_byte); 186 } 187 188 } // namespace 189 190 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { UNREACHABLE(); } 191 192 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) { 193 text->AddElement(TextElement::Atom(this), zone); 194 } 195 196 void RegExpClassRanges::AppendToText(RegExpText* text, Zone* zone) { 197 text->AddElement(TextElement::ClassRanges(this), zone); 198 } 199 200 void RegExpText::AppendToText(RegExpText* text, Zone* zone) { 201 for (int i = 0; i < elements()->length(); i++) 202 text->AddElement(elements()->at(i), zone); 203 } 204 205 TextElement TextElement::Atom(RegExpAtom* atom) { 206 return TextElement(ATOM, atom); 207 } 208 209 TextElement TextElement::ClassRanges(RegExpClassRanges* class_ranges) { 210 return TextElement(CLASS_RANGES, class_ranges); 211 } 212 213 int TextElement::length() const { 214 switch (text_type()) { 215 case ATOM: 216 return atom()->length(); 217 218 case CLASS_RANGES: 219 return 1; 220 } 221 UNREACHABLE(); 222 } 223 224 class RecursionCheck { 225 public: 226 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { 227 compiler->IncrementRecursionDepth(); 228 } 229 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } 230 231 private: 232 RegExpCompiler* compiler_; 233 }; 234 235 // Attempts to compile the regexp using an Irregexp code generator. Returns 236 // a fixed array or a null handle depending on whether it succeeded. 237 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, 238 RegExpFlags flags, bool one_byte) 239 : next_register_(JSRegExp::RegistersForCaptureCount(capture_count)), 240 unicode_lookaround_stack_register_(kNoRegister), 241 unicode_lookaround_position_register_(kNoRegister), 242 work_list_(nullptr), 243 recursion_depth_(0), 244 flags_(flags), 245 one_byte_(one_byte), 246 reg_exp_too_big_(false), 247 limiting_recursion_(false), 248 optimize_(v8_flags.regexp_optimization), 249 read_backward_(false), 250 current_expansion_factor_(1), 251 frequency_collator_(), 252 isolate_(isolate), 253 zone_(zone) { 254 accept_ = zone->New<EndNode>(EndNode::ACCEPT, zone); 255 DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1); 256 } 257 258 RegExpCompiler::CompilationResult RegExpCompiler::Assemble( 259 Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start, 260 int capture_count, DirectHandle<String> pattern) { 261 macro_assembler_ = macro_assembler; 262 263 ZoneVector<RegExpNode*> work_list(zone()); 264 work_list_ = &work_list; 265 Label fail; 266 macro_assembler_->PushBacktrack(&fail); 267 Trace new_trace; 268 start->Emit(this, &new_trace); 269 macro_assembler_->BindJumpTarget(&fail); 270 macro_assembler_->Fail(); 271 while (!work_list.empty()) { 272 RegExpNode* node = work_list.back(); 273 work_list.pop_back(); 274 node->set_on_work_list(false); 275 if (!node->label()->is_bound()) node->Emit(this, &new_trace); 276 } 277 if (reg_exp_too_big_) { 278 if (v8_flags.correctness_fuzzer_suppressions) { 279 FATAL("Aborting on excess zone allocation"); 280 } 281 macro_assembler_->AbortedCodeGeneration(); 282 return CompilationResult::RegExpTooBig(); 283 } 284 285 DirectHandle<HeapObject> code = macro_assembler_->GetCode(pattern, flags_); 286 isolate->IncreaseTotalRegexpCodeGenerated(code); 287 work_list_ = nullptr; 288 289 return {code, next_register_}; 290 } 291 292 bool Trace::mentions_reg(int reg) const { 293 for (auto trace : *this) { 294 if (trace->has_action() && trace->action()->Mentions(reg)) return true; 295 } 296 return false; 297 } 298 299 bool Trace::GetStoredPosition(int reg, int* cp_offset) const { 300 DCHECK_EQ(0, *cp_offset); 301 for (auto trace : *this) { 302 if (trace->has_action() && trace->action()->Mentions(reg)) { 303 if (trace->action_->action_type() == ActionNode::CLEAR_POSITION || 304 trace->action_->action_type() == ActionNode::RESTORE_POSITION) { 305 *cp_offset = trace->next_->cp_offset(); 306 return true; 307 } else { 308 return false; 309 } 310 } 311 } 312 return false; 313 } 314 315 // A (dynamically-sized) set of unsigned integers that behaves especially well 316 // on small integers (< kFirstLimit). May do zone-allocation. 317 class DynamicBitSet : public ZoneObject { 318 public: 319 V8_EXPORT_PRIVATE bool Get(unsigned value) const { 320 if (value < kFirstLimit) { 321 return (first_ & (1 << value)) != 0; 322 } else if (remaining_ == nullptr) { 323 return false; 324 } else { 325 return remaining_->Contains(value); 326 } 327 } 328 329 // Destructively set a value in this set. 330 void Set(unsigned value, Zone* zone) { 331 if (value < kFirstLimit) { 332 first_ |= (1 << value); 333 } else { 334 if (remaining_ == nullptr) 335 remaining_ = zone->New<ZoneList<unsigned>>(1, zone); 336 if (remaining_->is_empty() || !remaining_->Contains(value)) 337 remaining_->Add(value, zone); 338 } 339 } 340 341 private: 342 static constexpr unsigned kFirstLimit = 32; 343 344 uint32_t first_ = 0; 345 ZoneList<unsigned>* remaining_ = nullptr; 346 }; 347 348 int Trace::FindAffectedRegisters(DynamicBitSet* affected_registers, 349 Zone* zone) { 350 int max_register = RegExpCompiler::kNoRegister; 351 for (auto trace : *this) { 352 if (ActionNode* action = trace->action_) { 353 int to = action->register_to(); 354 for (int i = action->register_from(); i <= to; i++) { 355 affected_registers->Set(i, zone); 356 } 357 if (to > max_register) max_register = to; 358 } 359 } 360 return max_register; 361 } 362 363 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, 364 int max_register, 365 const DynamicBitSet& registers_to_pop, 366 const DynamicBitSet& registers_to_clear) { 367 for (int reg = max_register; reg >= 0; reg--) { 368 if (registers_to_pop.Get(reg)) { 369 assembler->PopRegister(reg); 370 } else if (registers_to_clear.Get(reg)) { 371 int clear_to = reg; 372 while (reg > 0 && registers_to_clear.Get(reg - 1)) { 373 reg--; 374 } 375 assembler->ClearRegisters(reg, clear_to); 376 } 377 } 378 } 379 380 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, 381 int max_register, 382 const DynamicBitSet& affected_registers, 383 DynamicBitSet* registers_to_pop, 384 DynamicBitSet* registers_to_clear, 385 Zone* zone) { 386 // Count pushes performed to force a stack limit check occasionally. 387 int pushes = 0; 388 389 for (int reg = 0; reg <= max_register; reg++) { 390 if (!affected_registers.Get(reg)) continue; 391 392 // The chronologically first deferred action in the trace 393 // is used to infer the action needed to restore a register 394 // to its previous state (or not, if it's safe to ignore it). 395 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; 396 DeferredActionUndoType undo_action = IGNORE; 397 398 int value = 0; 399 bool absolute = false; 400 bool clear = false; 401 static const int kNoStore = kMinInt; 402 int store_position = kNoStore; 403 // This is a little tricky because we are scanning the actions in reverse 404 // historical order (newest first). 405 for (auto trace : *this) { 406 ActionNode* action = trace->action_; 407 if (!action) continue; 408 if (action->Mentions(reg)) { 409 switch (action->action_type()) { 410 case ActionNode::SET_REGISTER_FOR_LOOP: { 411 if (!absolute) { 412 value += action->value(); 413 absolute = true; 414 } 415 // SET_REGISTER_FOR_LOOP is only used for newly introduced loop 416 // counters. They can have a significant previous value if they 417 // occur in a loop. TODO(lrn): Propagate this information, so 418 // we can set undo_action to IGNORE if we know there is no value to 419 // restore. 420 undo_action = RESTORE; 421 DCHECK_EQ(store_position, kNoStore); 422 DCHECK(!clear); 423 break; 424 } 425 case ActionNode::INCREMENT_REGISTER: 426 if (!absolute) { 427 value++; 428 } 429 DCHECK_EQ(store_position, kNoStore); 430 DCHECK(!clear); 431 undo_action = RESTORE; 432 break; 433 case ActionNode::CLEAR_POSITION: 434 case ActionNode::RESTORE_POSITION: { 435 if (!clear && store_position == kNoStore) { 436 store_position = trace->next()->cp_offset(); 437 } 438 439 // For captures we know that stores and clears alternate. 440 // Other register, are never cleared, and if the occur 441 // inside a loop, they might be assigned more than once. 442 if (reg <= 1) { 443 // Registers zero and one, aka "capture zero", is 444 // always set correctly if we succeed. There is no 445 // need to undo a setting on backtrack, because we 446 // will set it again or fail. 447 undo_action = IGNORE; 448 } else { 449 if (action->action_type() == ActionNode::CLEAR_POSITION) { 450 undo_action = CLEAR; 451 } else { 452 undo_action = RESTORE; 453 } 454 } 455 DCHECK(!absolute); 456 DCHECK_EQ(value, 0); 457 break; 458 } 459 case ActionNode::CLEAR_CAPTURES: { 460 // Since we're scanning in reverse order, if we've already 461 // set the position we have to ignore historically earlier 462 // clearing operations. 463 if (store_position == kNoStore) { 464 clear = true; 465 } 466 undo_action = RESTORE; 467 DCHECK(!absolute); 468 DCHECK_EQ(value, 0); 469 break; 470 } 471 default: 472 UNREACHABLE(); 473 } 474 } 475 } 476 // Prepare for the undo-action (e.g., push if it's going to be popped). 477 if (undo_action == RESTORE) { 478 pushes++; 479 RegExpMacroAssembler::StackCheckFlag stack_check = 480 RegExpMacroAssembler::kNoStackLimitCheck; 481 DCHECK_GT(assembler->stack_limit_slack_slot_count(), 0); 482 if (pushes == assembler->stack_limit_slack_slot_count()) { 483 stack_check = RegExpMacroAssembler::kCheckStackLimit; 484 pushes = 0; 485 } 486 487 assembler->PushRegister(reg, stack_check); 488 registers_to_pop->Set(reg, zone); 489 } else if (undo_action == CLEAR) { 490 registers_to_clear->Set(reg, zone); 491 } 492 // Perform the chronologically last action (or accumulated increment) 493 // for the register. 494 if (store_position != kNoStore) { 495 assembler->WriteCurrentPositionToRegister(reg, store_position); 496 } else if (clear) { 497 assembler->ClearRegisters(reg, reg); 498 } else if (absolute) { 499 assembler->SetRegister(reg, value); 500 } else if (value != 0) { 501 assembler->AdvanceRegister(reg, value); 502 } 503 } 504 } 505 506 // This is called as we come into a loop choice node and some other tricky 507 // nodes. It normalizes the state of the code generator to ensure we can 508 // generate generic code. If the mode indicates that we are in a success 509 // situation then don't push anything, because the stack is about to be 510 // discarded, and also don't update the current position. 511 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor, 512 Trace::FlushMode mode) { 513 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 514 515 DCHECK(!is_trivial()); 516 517 // Normally we don't need to update the current position register if we are 518 // about to stop because we had a successful match, but the global mode 519 // requires the current position register to be updated so it can start the 520 // next match. TODO(erikcorry): Perhaps it should use capture register 1 521 // instead. 522 bool update_current_position = 523 cp_offset_ != 0 && (mode != kFlushSuccess || assembler->global()); 524 525 if (!has_any_actions() && (backtrack() == nullptr || mode == kFlushSuccess)) { 526 // Here we just have some deferred cp advances to fix and we are back to 527 // a normal situation. We may also have to forget some information gained 528 // through a quick check that was already performed. 529 if (update_current_position) assembler->AdvanceCurrentPosition(cp_offset_); 530 // Create a new trivial state and generate the node with that. 531 Trace new_state; 532 successor->Emit(compiler, &new_state); 533 return; 534 } 535 536 // Generate deferred actions here along with code to undo them again. 537 DynamicBitSet affected_registers; 538 539 if (backtrack() != nullptr && mode != kFlushSuccess) { 540 // Here we have a concrete backtrack location. These are set up by choice 541 // nodes and so they indicate that we have a deferred save of the current 542 // position which we may need to emit here. 543 assembler->PushCurrentPosition(); 544 } 545 546 int max_register = 547 FindAffectedRegisters(&affected_registers, compiler->zone()); 548 DynamicBitSet registers_to_pop; 549 DynamicBitSet registers_to_clear; 550 PerformDeferredActions(assembler, max_register, affected_registers, 551 ®isters_to_pop, ®isters_to_clear, 552 compiler->zone()); 553 if (update_current_position) assembler->AdvanceCurrentPosition(cp_offset_); 554 555 if (mode == kFlushSuccess) { 556 Trace new_state; 557 successor->Emit(compiler, &new_state); 558 return; 559 } 560 561 // Create a new trivial state and generate the node with that. 562 Label undo; 563 assembler->PushBacktrack(&undo); 564 if (successor->KeepRecursing(compiler)) { 565 Trace new_state; 566 successor->Emit(compiler, &new_state); 567 } else { 568 compiler->AddWork(successor); 569 assembler->GoTo(successor->label()); 570 } 571 572 // On backtrack we need to restore state. 573 assembler->BindJumpTarget(&undo); 574 RestoreAffectedRegisters(assembler, max_register, registers_to_pop, 575 registers_to_clear); 576 if (backtrack() == nullptr) { 577 assembler->Backtrack(); 578 } else { 579 assembler->PopCurrentPosition(); 580 assembler->GoTo(backtrack()); 581 } 582 } 583 584 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { 585 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 586 587 // Omit flushing the trace. We discard the entire stack frame anyway. 588 589 if (!label()->is_bound()) { 590 // We are completely independent of the trace, since we ignore it, 591 // so this code can be used as the generic version. 592 assembler->Bind(label()); 593 } 594 595 // Throw away everything on the backtrack stack since the start 596 // of the negative submatch and restore the character position. 597 assembler->ReadCurrentPositionFromRegister(current_position_register_); 598 assembler->ReadStackPointerFromRegister(stack_pointer_register_); 599 if (clear_capture_count_ > 0) { 600 // Clear any captures that might have been performed during the success 601 // of the body of the negative look-ahead. 602 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; 603 assembler->ClearRegisters(clear_capture_start_, clear_capture_end); 604 } 605 // Now that we have unwound the stack we find at the top of the stack the 606 // backtrack that the BeginNegativeSubmatch node got. 607 assembler->Backtrack(); 608 } 609 610 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { 611 if (!trace->is_trivial()) { 612 DCHECK(action_ == ACCEPT); 613 trace->Flush(compiler, this, Trace::kFlushSuccess); 614 return; 615 } 616 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 617 if (!label()->is_bound()) { 618 assembler->Bind(label()); 619 } 620 switch (action_) { 621 case ACCEPT: 622 assembler->Succeed(); 623 return; 624 case BACKTRACK: 625 assembler->GoTo(trace->backtrack()); 626 return; 627 case NEGATIVE_SUBMATCH_SUCCESS: 628 // This case is handled in a different virtual method. 629 UNREACHABLE(); 630 } 631 UNIMPLEMENTED(); 632 } 633 634 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) { 635 if (guards_ == nullptr) guards_ = zone->New<ZoneList<Guard*>>(1, zone); 636 guards_->Add(guard, zone); 637 } 638 639 ActionNode* ActionNode::SetRegisterForLoop(int reg, int val, 640 RegExpNode* on_success) { 641 return on_success->zone()->New<ActionNode>(SET_REGISTER_FOR_LOOP, on_success, 642 reg, reg, val); 643 } 644 645 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { 646 return on_success->zone()->New<ActionNode>(INCREMENT_REGISTER, on_success, 647 reg); 648 } 649 650 ActionNode* ActionNode::ClearPosition(int reg, RegExpNode* on_success) { 651 return on_success->zone()->New<ActionNode>(CLEAR_POSITION, on_success, reg); 652 } 653 654 ActionNode* ActionNode::RestorePosition(int reg, RegExpNode* on_success) { 655 return on_success->zone()->New<ActionNode>(RESTORE_POSITION, on_success, reg); 656 } 657 658 ActionNode* ActionNode::ClearCaptures(Interval range, RegExpNode* on_success) { 659 return on_success->zone()->New<ActionNode>(CLEAR_CAPTURES, on_success, 660 range.from(), range.to()); 661 } 662 663 ActionNode* ActionNode::BeginPositiveSubmatch(int stack_reg, int position_reg, 664 RegExpNode* body, 665 ActionNode* success_node) { 666 ActionNode* result = 667 body->zone()->New<ActionNode>(BEGIN_POSITIVE_SUBMATCH, body); 668 result->data_.u_submatch.stack_pointer_register = stack_reg; 669 result->data_.u_submatch.current_position_register = position_reg; 670 result->data_.u_submatch.success_node = success_node; 671 return result; 672 } 673 674 ActionNode* ActionNode::BeginNegativeSubmatch(int stack_reg, int position_reg, 675 RegExpNode* on_success) { 676 ActionNode* result = 677 on_success->zone()->New<ActionNode>(BEGIN_NEGATIVE_SUBMATCH, on_success); 678 result->data_.u_submatch.stack_pointer_register = stack_reg; 679 result->data_.u_submatch.current_position_register = position_reg; 680 return result; 681 } 682 683 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, int position_reg, 684 int clear_register_count, 685 int clear_register_from, 686 RegExpNode* on_success) { 687 ActionNode* result = on_success->zone()->New<ActionNode>( 688 POSITIVE_SUBMATCH_SUCCESS, on_success); 689 result->data_.u_submatch.stack_pointer_register = stack_reg; 690 result->data_.u_submatch.current_position_register = position_reg; 691 result->data_.u_submatch.clear_register_count = clear_register_count; 692 result->data_.u_submatch.clear_register_from = clear_register_from; 693 return result; 694 } 695 696 ActionNode* ActionNode::EmptyMatchCheck(int start_register, 697 int repetition_register, 698 int repetition_limit, 699 RegExpNode* on_success) { 700 ActionNode* result = 701 on_success->zone()->New<ActionNode>(EMPTY_MATCH_CHECK, on_success); 702 result->data_.u_empty_match_check.start_register = start_register; 703 result->data_.u_empty_match_check.repetition_register = repetition_register; 704 result->data_.u_empty_match_check.repetition_limit = repetition_limit; 705 return result; 706 } 707 708 ActionNode* ActionNode::ModifyFlags(RegExpFlags flags, RegExpNode* on_success) { 709 ActionNode* result = 710 on_success->zone()->New<ActionNode>(MODIFY_FLAGS, on_success); 711 result->data_.u_modify_flags.flags = flags; 712 return result; 713 } 714 715 #define DEFINE_ACCEPT(Type) \ 716 void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); } 717 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) 718 #undef DEFINE_ACCEPT 719 720 // ------------------------------------------------------------------- 721 // Emit code. 722 723 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, 724 Guard* guard, Trace* trace) { 725 switch (guard->op()) { 726 case Guard::LT: 727 DCHECK(!trace->mentions_reg(guard->reg())); 728 macro_assembler->IfRegisterGE(guard->reg(), guard->value(), 729 trace->backtrack()); 730 break; 731 case Guard::GEQ: 732 DCHECK(!trace->mentions_reg(guard->reg())); 733 macro_assembler->IfRegisterLT(guard->reg(), guard->value(), 734 trace->backtrack()); 735 break; 736 } 737 } 738 739 namespace { 740 741 #ifdef DEBUG 742 bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) { 743 static_assert(sizeof(unibrow::uchar) == 4); 744 for (int i = 0; i < length; i++) { 745 if (chars[i] > String::kMaxUtf16CodeUnit) return false; 746 } 747 return true; 748 } 749 #endif // DEBUG 750 751 // Returns the number of characters in the equivalence class, omitting those 752 // that cannot occur in the source string because it is Latin1. This is called 753 // both for unicode modes /ui and /vi, and also for legacy case independent 754 // mode /i. In the case of Unicode modes we handled surrogate pair expansions 755 // earlier so at this point it's all about single-code-unit expansions. 756 int GetCaseIndependentLetters(Isolate* isolate, base::uc16 character, 757 RegExpCompiler* compiler, unibrow::uchar* letters, 758 int letter_length) { 759 bool one_byte_subject = compiler->one_byte(); 760 bool unicode = IsEitherUnicode(compiler->flags()); 761 static const base::uc16 kMaxAscii = 0x7f; 762 if (!unicode && character <= kMaxAscii) { 763 // Fast case for common characters. 764 base::uc16 upper = character & ~0x20; 765 if ('A' <= upper && upper <= 'Z') { 766 letters[0] = upper; 767 letters[1] = upper | 0x20; 768 return 2; 769 } 770 letters[0] = character; 771 return 1; 772 } 773 #ifdef V8_INTL_SUPPORT 774 775 if (!unicode && RegExpCaseFolding::IgnoreSet().contains(character)) { 776 if (one_byte_subject && character > String::kMaxOneByteCharCode) { 777 // This function promises not to return a character that is impossible 778 // for the subject encoding. 779 return 0; 780 } 781 letters[0] = character; 782 DCHECK(ContainsOnlyUtf16CodeUnits(letters, 1)); 783 return 1; 784 } 785 bool in_special_add_set = 786 RegExpCaseFolding::SpecialAddSet().contains(character); 787 788 icu::UnicodeSet set; 789 set.add(character); 790 set = set.closeOver(unicode ? USET_SIMPLE_CASE_INSENSITIVE 791 : USET_CASE_INSENSITIVE); 792 793 UChar32 canon = 0; 794 if (in_special_add_set && !unicode) { 795 canon = RegExpCaseFolding::Canonicalize(character); 796 } 797 798 int32_t range_count = set.getRangeCount(); 799 int items = 0; 800 for (int32_t i = 0; i < range_count; i++) { 801 UChar32 start = set.getRangeStart(i); 802 UChar32 end = set.getRangeEnd(i); 803 CHECK(end - start + items <= letter_length); 804 for (UChar32 cu = start; cu <= end; cu++) { 805 if (one_byte_subject && cu > String::kMaxOneByteCharCode) continue; 806 if (!unicode && in_special_add_set && 807 RegExpCaseFolding::Canonicalize(cu) != canon) { 808 continue; 809 } 810 letters[items++] = static_cast<unibrow::uchar>(cu); 811 } 812 } 813 DCHECK(ContainsOnlyUtf16CodeUnits(letters, items)); 814 return items; 815 #else 816 int length = 817 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 818 // Unibrow returns 0 or 1 for characters where case independence is 819 // trivial. 820 if (length == 0) { 821 letters[0] = character; 822 length = 1; 823 } 824 825 if (one_byte_subject) { 826 int new_length = 0; 827 for (int i = 0; i < length; i++) { 828 if (letters[i] <= String::kMaxOneByteCharCode) { 829 letters[new_length++] = letters[i]; 830 } 831 } 832 length = new_length; 833 } 834 835 DCHECK(ContainsOnlyUtf16CodeUnits(letters, length)); 836 return length; 837 #endif // V8_INTL_SUPPORT 838 } 839 840 inline bool EmitSimpleCharacter(Isolate* isolate, RegExpCompiler* compiler, 841 base::uc16 c, Label* on_failure, int cp_offset, 842 bool check, bool preloaded) { 843 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 844 bool bound_checked = false; 845 if (!preloaded) { 846 assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 847 bound_checked = true; 848 } 849 assembler->CheckNotCharacter(c, on_failure); 850 return bound_checked; 851 } 852 853 // Only emits non-letters (things that don't have case). Only used for case 854 // independent matches. 855 inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler, 856 base::uc16 c, Label* on_failure, int cp_offset, 857 bool check, bool preloaded) { 858 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 859 bool one_byte = compiler->one_byte(); 860 unibrow::uchar chars[4]; 861 int length = GetCaseIndependentLetters(isolate, c, compiler, chars, 4); 862 if (length < 1) { 863 // This can't match. Must be an one-byte subject and a non-one-byte 864 // character. We do not need to do anything since the one-byte pass 865 // already handled this. 866 CHECK(one_byte); 867 return false; // Bounds not checked. 868 } 869 bool checked = false; 870 // We handle the length > 1 case in a later pass. 871 if (length == 1) { 872 // GetCaseIndependentLetters promises not to return characters that can't 873 // match because of the subject encoding. This case is already handled by 874 // the one-byte pass. 875 CHECK_IMPLIES(one_byte, chars[0] <= String::kMaxOneByteCharCodeU); 876 if (!preloaded) { 877 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 878 checked = check; 879 } 880 macro_assembler->CheckNotCharacter(chars[0], on_failure); 881 } 882 return checked; 883 } 884 885 bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 886 bool one_byte, base::uc16 c1, base::uc16 c2, 887 Label* on_failure) { 888 const uint32_t char_mask = CharMask(one_byte); 889 base::uc16 exor = c1 ^ c2; 890 // Check whether exor has only one bit set. 891 if (((exor - 1) & exor) == 0) { 892 // If c1 and c2 differ only by one bit. 893 // Ecma262UnCanonicalize always gives the highest number last. 894 DCHECK(c2 > c1); 895 base::uc16 mask = char_mask ^ exor; 896 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); 897 return true; 898 } 899 DCHECK(c2 > c1); 900 base::uc16 diff = c2 - c1; 901 if (((diff - 1) & diff) == 0 && c1 >= diff) { 902 // If the characters differ by 2^n but don't differ by one bit then 903 // subtract the difference from the found character, then do the or 904 // trick. We avoid the theoretical case where negative numbers are 905 // involved in order to simplify code generation. 906 base::uc16 mask = char_mask ^ diff; 907 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask, 908 on_failure); 909 return true; 910 } 911 return false; 912 } 913 914 // Only emits letters (things that have case). Only used for case independent 915 // matches. 916 inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler, 917 base::uc16 c, Label* on_failure, int cp_offset, 918 bool check, bool preloaded) { 919 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 920 bool one_byte = compiler->one_byte(); 921 unibrow::uchar chars[4]; 922 int length = GetCaseIndependentLetters(isolate, c, compiler, chars, 4); 923 // The 0 and 1 case are handled by earlier passes. 924 if (length <= 1) return false; 925 // We may not need to check against the end of the input string 926 // if this character lies before a character that matched. 927 if (!preloaded) { 928 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 929 } 930 Label ok; 931 switch (length) { 932 case 2: { 933 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0], 934 chars[1], on_failure)) { 935 } else { 936 macro_assembler->CheckCharacter(chars[0], &ok); 937 macro_assembler->CheckNotCharacter(chars[1], on_failure); 938 macro_assembler->Bind(&ok); 939 } 940 break; 941 } 942 case 4: 943 macro_assembler->CheckCharacter(chars[3], &ok); 944 [[fallthrough]]; 945 case 3: 946 macro_assembler->CheckCharacter(chars[0], &ok); 947 macro_assembler->CheckCharacter(chars[1], &ok); 948 macro_assembler->CheckNotCharacter(chars[2], on_failure); 949 macro_assembler->Bind(&ok); 950 break; 951 default: 952 UNREACHABLE(); 953 } 954 return true; 955 } 956 957 void EmitBoundaryTest(RegExpMacroAssembler* masm, int border, 958 Label* fall_through, Label* above_or_equal, 959 Label* below) { 960 if (below != fall_through) { 961 masm->CheckCharacterLT(border, below); 962 if (above_or_equal != fall_through) masm->GoTo(above_or_equal); 963 } else { 964 masm->CheckCharacterGT(border - 1, above_or_equal); 965 } 966 } 967 968 void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first, int last, 969 Label* fall_through, Label* in_range, 970 Label* out_of_range) { 971 if (in_range == fall_through) { 972 if (first == last) { 973 masm->CheckNotCharacter(first, out_of_range); 974 } else { 975 masm->CheckCharacterNotInRange(first, last, out_of_range); 976 } 977 } else { 978 if (first == last) { 979 masm->CheckCharacter(first, in_range); 980 } else { 981 masm->CheckCharacterInRange(first, last, in_range); 982 } 983 if (out_of_range != fall_through) masm->GoTo(out_of_range); 984 } 985 } 986 987 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. 988 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. 989 void EmitUseLookupTable(RegExpMacroAssembler* masm, 990 ZoneList<base::uc32>* ranges, uint32_t start_index, 991 uint32_t end_index, base::uc32 min_char, 992 Label* fall_through, Label* even_label, 993 Label* odd_label) { 994 static const uint32_t kSize = RegExpMacroAssembler::kTableSize; 995 static const uint32_t kMask = RegExpMacroAssembler::kTableMask; 996 997 base::uc32 base = (min_char & ~kMask); 998 USE(base); 999 1000 // Assert that everything is on one kTableSize page. 1001 for (uint32_t i = start_index; i <= end_index; i++) { 1002 DCHECK_EQ(ranges->at(i) & ~kMask, base); 1003 } 1004 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base); 1005 1006 char templ[kSize]; 1007 Label* on_bit_set; 1008 Label* on_bit_clear; 1009 int bit; 1010 if (even_label == fall_through) { 1011 on_bit_set = odd_label; 1012 on_bit_clear = even_label; 1013 bit = 1; 1014 } else { 1015 on_bit_set = even_label; 1016 on_bit_clear = odd_label; 1017 bit = 0; 1018 } 1019 for (uint32_t i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; 1020 i++) { 1021 templ[i] = bit; 1022 } 1023 uint32_t j = 0; 1024 bit ^= 1; 1025 for (uint32_t i = start_index; i < end_index; i++) { 1026 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) { 1027 templ[j] = bit; 1028 } 1029 bit ^= 1; 1030 } 1031 for (uint32_t i = j; i < kSize; i++) { 1032 templ[i] = bit; 1033 } 1034 Factory* factory = masm->isolate()->factory(); 1035 // TODO(erikcorry): Cache these. 1036 Handle<ByteArray> ba = factory->NewByteArray(kSize, AllocationType::kOld); 1037 for (uint32_t i = 0; i < kSize; i++) { 1038 ba->set(i, templ[i]); 1039 } 1040 masm->CheckBitInTable(ba, on_bit_set); 1041 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); 1042 } 1043 1044 void CutOutRange(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges, 1045 uint32_t start_index, uint32_t end_index, uint32_t cut_index, 1046 Label* even_label, Label* odd_label) { 1047 bool odd = (((cut_index - start_index) & 1) == 1); 1048 Label* in_range_label = odd ? odd_label : even_label; 1049 Label dummy; 1050 EmitDoubleBoundaryTest(masm, ranges->at(cut_index), 1051 ranges->at(cut_index + 1) - 1, &dummy, in_range_label, 1052 &dummy); 1053 DCHECK(!dummy.is_linked()); 1054 // Cut out the single range by rewriting the array. This creates a new 1055 // range that is a merger of the two ranges on either side of the one we 1056 // are cutting out. The oddity of the labels is preserved. 1057 for (uint32_t j = cut_index; j > start_index; j--) { 1058 ranges->at(j) = ranges->at(j - 1); 1059 } 1060 for (uint32_t j = cut_index + 1; j < end_index; j++) { 1061 ranges->at(j) = ranges->at(j + 1); 1062 } 1063 } 1064 1065 // Unicode case. Split the search space into kSize spaces that are handled 1066 // with recursion. 1067 void SplitSearchSpace(ZoneList<base::uc32>* ranges, uint32_t start_index, 1068 uint32_t end_index, uint32_t* new_start_index, 1069 uint32_t* new_end_index, base::uc32* border) { 1070 static const uint32_t kSize = RegExpMacroAssembler::kTableSize; 1071 static const uint32_t kMask = RegExpMacroAssembler::kTableMask; 1072 1073 base::uc32 first = ranges->at(start_index); 1074 base::uc32 last = ranges->at(end_index) - 1; 1075 1076 *new_start_index = start_index; 1077 *border = (ranges->at(start_index) & ~kMask) + kSize; 1078 while (*new_start_index < end_index) { 1079 if (ranges->at(*new_start_index) > *border) break; 1080 (*new_start_index)++; 1081 } 1082 // new_start_index is the index of the first edge that is beyond the 1083 // current kSize space. 1084 1085 // For very large search spaces we do a binary chop search of the non-Latin1 1086 // space instead of just going to the end of the current kSize space. The 1087 // heuristics are complicated a little by the fact that any 128-character 1088 // encoding space can be quickly tested with a table lookup, so we don't 1089 // wish to do binary chop search at a smaller granularity than that. A 1090 // 128-character space can take up a lot of space in the ranges array if, 1091 // for example, we only want to match every second character (eg. the lower 1092 // case characters on some Unicode pages). 1093 uint32_t binary_chop_index = (end_index + start_index) / 2; 1094 // The first test ensures that we get to the code that handles the Latin1 1095 // range with a single not-taken branch, speeding up this important 1096 // character range (even non-Latin1 charset-based text has spaces and 1097 // punctuation). 1098 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case. 1099 end_index - start_index > (*new_start_index - start_index) * 2 && 1100 last - first > kSize * 2 && binary_chop_index > *new_start_index && 1101 ranges->at(binary_chop_index) >= first + 2 * kSize) { 1102 uint32_t scan_forward_for_section_border = binary_chop_index; 1103 uint32_t new_border = (ranges->at(binary_chop_index) | kMask) + 1; 1104 1105 while (scan_forward_for_section_border < end_index) { 1106 if (ranges->at(scan_forward_for_section_border) > new_border) { 1107 *new_start_index = scan_forward_for_section_border; 1108 *border = new_border; 1109 break; 1110 } 1111 scan_forward_for_section_border++; 1112 } 1113 } 1114 1115 DCHECK(*new_start_index > start_index); 1116 *new_end_index = *new_start_index - 1; 1117 if (ranges->at(*new_end_index) == *border) { 1118 (*new_end_index)--; 1119 } 1120 if (*border >= ranges->at(end_index)) { 1121 *border = ranges->at(end_index); 1122 *new_start_index = end_index; // Won't be used. 1123 *new_end_index = end_index - 1; 1124 } 1125 } 1126 1127 // Gets a series of segment boundaries representing a character class. If the 1128 // character is in the range between an even and an odd boundary (counting from 1129 // start_index) then go to even_label, otherwise go to odd_label. We already 1130 // know that the character is in the range of min_char to max_char inclusive. 1131 // Either label can be nullptr indicating backtracking. Either label can also 1132 // be equal to the fall_through label. 1133 void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges, 1134 uint32_t start_index, uint32_t end_index, 1135 base::uc32 min_char, base::uc32 max_char, 1136 Label* fall_through, Label* even_label, 1137 Label* odd_label) { 1138 DCHECK_LE(min_char, String::kMaxUtf16CodeUnit); 1139 DCHECK_LE(max_char, String::kMaxUtf16CodeUnit); 1140 1141 base::uc32 first = ranges->at(start_index); 1142 base::uc32 last = ranges->at(end_index) - 1; 1143 1144 DCHECK_LT(min_char, first); 1145 1146 // Just need to test if the character is before or on-or-after 1147 // a particular character. 1148 if (start_index == end_index) { 1149 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label); 1150 return; 1151 } 1152 1153 // Another almost trivial case: There is one interval in the middle that is 1154 // different from the end intervals. 1155 if (start_index + 1 == end_index) { 1156 EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label, 1157 odd_label); 1158 return; 1159 } 1160 1161 // It's not worth using table lookup if there are very few intervals in the 1162 // character class. 1163 if (end_index - start_index <= 6) { 1164 // It is faster to test for individual characters, so we look for those 1165 // first, then try arbitrary ranges in the second round. 1166 static uint32_t kNoCutIndex = -1; 1167 uint32_t cut = kNoCutIndex; 1168 for (uint32_t i = start_index; i < end_index; i++) { 1169 if (ranges->at(i) == ranges->at(i + 1) - 1) { 1170 cut = i; 1171 break; 1172 } 1173 } 1174 if (cut == kNoCutIndex) cut = start_index; 1175 CutOutRange(masm, ranges, start_index, end_index, cut, even_label, 1176 odd_label); 1177 DCHECK_GE(end_index - start_index, 2); 1178 GenerateBranches(masm, ranges, start_index + 1, end_index - 1, min_char, 1179 max_char, fall_through, even_label, odd_label); 1180 return; 1181 } 1182 1183 // If there are a lot of intervals in the regexp, then we will use tables to 1184 // determine whether the character is inside or outside the character class. 1185 static const int kBits = RegExpMacroAssembler::kTableSizeBits; 1186 1187 if ((max_char >> kBits) == (min_char >> kBits)) { 1188 EmitUseLookupTable(masm, ranges, start_index, end_index, min_char, 1189 fall_through, even_label, odd_label); 1190 return; 1191 } 1192 1193 if ((min_char >> kBits) != first >> kBits) { 1194 masm->CheckCharacterLT(first, odd_label); 1195 GenerateBranches(masm, ranges, start_index + 1, end_index, first, max_char, 1196 fall_through, odd_label, even_label); 1197 return; 1198 } 1199 1200 uint32_t new_start_index = 0; 1201 uint32_t new_end_index = 0; 1202 base::uc32 border = 0; 1203 1204 SplitSearchSpace(ranges, start_index, end_index, &new_start_index, 1205 &new_end_index, &border); 1206 1207 Label handle_rest; 1208 Label* above = &handle_rest; 1209 if (border == last + 1) { 1210 // We didn't find any section that started after the limit, so everything 1211 // above the border is one of the terminal labels. 1212 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label; 1213 DCHECK(new_end_index == end_index - 1); 1214 } 1215 1216 DCHECK_LE(start_index, new_end_index); 1217 DCHECK_LE(new_start_index, end_index); 1218 DCHECK_LT(start_index, new_start_index); 1219 DCHECK_LT(new_end_index, end_index); 1220 DCHECK(new_end_index + 1 == new_start_index || 1221 (new_end_index + 2 == new_start_index && 1222 border == ranges->at(new_end_index + 1))); 1223 DCHECK_LT(min_char, border - 1); 1224 DCHECK_LT(border, max_char); 1225 DCHECK_LT(ranges->at(new_end_index), border); 1226 DCHECK(border < ranges->at(new_start_index) || 1227 (border == ranges->at(new_start_index) && 1228 new_start_index == end_index && new_end_index == end_index - 1 && 1229 border == last + 1)); 1230 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1)); 1231 1232 masm->CheckCharacterGT(border - 1, above); 1233 Label dummy; 1234 GenerateBranches(masm, ranges, start_index, new_end_index, min_char, 1235 border - 1, &dummy, even_label, odd_label); 1236 if (handle_rest.is_linked()) { 1237 masm->Bind(&handle_rest); 1238 bool flip = (new_start_index & 1) != (start_index & 1); 1239 GenerateBranches(masm, ranges, new_start_index, end_index, border, max_char, 1240 &dummy, flip ? odd_label : even_label, 1241 flip ? even_label : odd_label); 1242 } 1243 } 1244 1245 void EmitClassRanges(RegExpMacroAssembler* macro_assembler, 1246 RegExpClassRanges* cr, bool one_byte, Label* on_failure, 1247 int cp_offset, bool check_offset, bool preloaded, 1248 Zone* zone) { 1249 ZoneList<CharacterRange>* ranges = cr->ranges(zone); 1250 CharacterRange::Canonicalize(ranges); 1251 1252 // Now that all processing (like case-insensitivity) is done, clamp the 1253 // ranges to the set of ranges that may actually occur in the subject string. 1254 if (one_byte) CharacterRange::ClampToOneByte(ranges); 1255 1256 const int ranges_length = ranges->length(); 1257 if (ranges_length == 0) { 1258 if (!cr->is_negated()) { 1259 macro_assembler->GoTo(on_failure); 1260 } 1261 if (check_offset) { 1262 macro_assembler->CheckPosition(cp_offset, on_failure); 1263 } 1264 return; 1265 } 1266 1267 const base::uc32 max_char = MaxCodeUnit(one_byte); 1268 if (ranges_length == 1 && ranges->at(0).IsEverything(max_char)) { 1269 if (cr->is_negated()) { 1270 macro_assembler->GoTo(on_failure); 1271 } else { 1272 // This is a common case hit by non-anchored expressions. 1273 if (check_offset) { 1274 macro_assembler->CheckPosition(cp_offset, on_failure); 1275 } 1276 } 1277 return; 1278 } 1279 1280 if (!preloaded) { 1281 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 1282 } 1283 1284 if (cr->is_standard(zone) && macro_assembler->CheckSpecialClassRanges( 1285 cr->standard_type(), on_failure)) { 1286 return; 1287 } 1288 1289 static constexpr int kMaxRangesForInlineBranchGeneration = 16; 1290 if (ranges_length > kMaxRangesForInlineBranchGeneration) { 1291 // For large range sets, emit a more compact instruction sequence to avoid 1292 // a potentially problematic increase in code size. 1293 // Note the flipped logic below (we check InRange if negated, NotInRange if 1294 // not negated); this is necessary since the method falls through on 1295 // failure whereas we want to fall through on success. 1296 if (cr->is_negated()) { 1297 if (macro_assembler->CheckCharacterInRangeArray(ranges, on_failure)) { 1298 return; 1299 } 1300 } else { 1301 if (macro_assembler->CheckCharacterNotInRangeArray(ranges, on_failure)) { 1302 return; 1303 } 1304 } 1305 } 1306 1307 // Generate a flat list of range boundaries for consumption by 1308 // GenerateBranches. See the comment on that function for how the list should 1309 // be structured 1310 ZoneList<base::uc32>* range_boundaries = 1311 zone->New<ZoneList<base::uc32>>(ranges_length * 2, zone); 1312 1313 bool zeroth_entry_is_failure = !cr->is_negated(); 1314 1315 for (int i = 0; i < ranges_length; i++) { 1316 CharacterRange& range = ranges->at(i); 1317 if (range.from() == 0) { 1318 DCHECK_EQ(i, 0); 1319 zeroth_entry_is_failure = !zeroth_entry_is_failure; 1320 } else { 1321 range_boundaries->Add(range.from(), zone); 1322 } 1323 // `+ 1` to convert from inclusive to exclusive `to`. 1324 // [from, to] == [from, to+1[. 1325 range_boundaries->Add(range.to() + 1, zone); 1326 } 1327 int end_index = range_boundaries->length() - 1; 1328 if (range_boundaries->at(end_index) > max_char) { 1329 end_index--; 1330 } 1331 1332 Label fall_through; 1333 GenerateBranches(macro_assembler, range_boundaries, 1334 0, // start_index. 1335 end_index, 1336 0, // min_char. 1337 max_char, &fall_through, 1338 zeroth_entry_is_failure ? &fall_through : on_failure, 1339 zeroth_entry_is_failure ? on_failure : &fall_through); 1340 macro_assembler->Bind(&fall_through); 1341 } 1342 1343 } // namespace 1344 1345 RegExpNode::~RegExpNode() = default; 1346 1347 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 1348 Trace* trace) { 1349 // If we are generating a fixed length loop then don't stop and don't reuse 1350 // code. 1351 if (trace->fixed_length_loop_state() != nullptr) { 1352 return CONTINUE; 1353 } 1354 1355 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1356 if (trace->is_trivial()) { 1357 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) { 1358 // If a generic version is already scheduled to be generated or we have 1359 // recursed too deeply then just generate a jump to that code. 1360 macro_assembler->GoTo(&label_); 1361 // This will queue it up for generation of a generic version if it hasn't 1362 // already been queued. 1363 compiler->AddWork(this); 1364 return DONE; 1365 } 1366 // Generate generic version of the node and bind the label for later use. 1367 macro_assembler->Bind(&label_); 1368 return CONTINUE; 1369 } 1370 1371 // We are being asked to make a non-generic version. Keep track of how many 1372 // non-generic versions we generate so as not to overdo it. 1373 trace_count_++; 1374 if (KeepRecursing(compiler) && compiler->optimize() && 1375 trace_count_ < kMaxCopiesCodeGenerated) { 1376 return CONTINUE; 1377 } 1378 1379 // If we get here code has been generated for this node too many times or 1380 // recursion is too deep. Time to switch to a generic version. The code for 1381 // generic versions above can handle deep recursion properly. 1382 bool was_limiting = compiler->limiting_recursion(); 1383 compiler->set_limiting_recursion(true); 1384 trace->Flush(compiler, this); 1385 compiler->set_limiting_recursion(was_limiting); 1386 return DONE; 1387 } 1388 1389 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) { 1390 return !compiler->limiting_recursion() && 1391 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion; 1392 } 1393 1394 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 1395 BoyerMooreLookahead* bm, bool not_at_start) { 1396 std::optional<RegExpFlags> old_flags; 1397 if (action_type_ == MODIFY_FLAGS) { 1398 // It is not guaranteed that we hit the resetting modify flags node, due to 1399 // recursion budget limitation for filling in BMInfo. Therefore we reset the 1400 // flags manually to the previous state after recursing. 1401 old_flags = bm->compiler()->flags(); 1402 bm->compiler()->set_flags(flags()); 1403 } 1404 if (action_type_ == BEGIN_POSITIVE_SUBMATCH) { 1405 // We use the node after the lookaround to fill in the eats_at_least info 1406 // so we have to use the same node to fill in the Boyer-Moore info. 1407 success_node()->on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, 1408 not_at_start); 1409 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) { 1410 // We don't use the node after a positive submatch success because it 1411 // rewinds the position. Since we returned 0 as the eats_at_least value for 1412 // this node, we don't need to fill in any data. 1413 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 1414 } 1415 SaveBMInfo(bm, not_at_start, offset); 1416 if (old_flags.has_value()) { 1417 bm->compiler()->set_flags(*old_flags); 1418 } 1419 } 1420 1421 void ActionNode::GetQuickCheckDetails(QuickCheckDetails* details, 1422 RegExpCompiler* compiler, int filled_in, 1423 bool not_at_start) { 1424 if (action_type_ == SET_REGISTER_FOR_LOOP) { 1425 on_success()->GetQuickCheckDetailsFromLoopEntry(details, compiler, 1426 filled_in, not_at_start); 1427 } else if (action_type_ == BEGIN_POSITIVE_SUBMATCH) { 1428 // We use the node after the lookaround to fill in the eats_at_least info 1429 // so we have to use the same node to fill in the QuickCheck info. 1430 success_node()->on_success()->GetQuickCheckDetails(details, compiler, 1431 filled_in, not_at_start); 1432 } else if (action_type() != POSITIVE_SUBMATCH_SUCCESS) { 1433 // We don't use the node after a positive submatch success because it 1434 // rewinds the position. Since we returned 0 as the eats_at_least value 1435 // for this node, we don't need to fill in any data. 1436 std::optional<RegExpFlags> old_flags; 1437 if (action_type() == MODIFY_FLAGS) { 1438 // It is not guaranteed that we hit the resetting modify flags node, as 1439 // GetQuickCheckDetails doesn't travers the whole graph. Therefore we 1440 // reset the flags manually to the previous state after recursing. 1441 old_flags = compiler->flags(); 1442 compiler->set_flags(flags()); 1443 } 1444 on_success()->GetQuickCheckDetails(details, compiler, filled_in, 1445 not_at_start); 1446 if (old_flags.has_value()) { 1447 compiler->set_flags(*old_flags); 1448 } 1449 } 1450 } 1451 1452 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 1453 BoyerMooreLookahead* bm, bool not_at_start) { 1454 // Match the behaviour of EatsAtLeast on this node. 1455 if (assertion_type() == AT_START && not_at_start) return; 1456 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 1457 SaveBMInfo(bm, not_at_start, offset); 1458 } 1459 1460 void NegativeLookaroundChoiceNode::GetQuickCheckDetails( 1461 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in, 1462 bool not_at_start) { 1463 RegExpNode* node = continue_node(); 1464 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); 1465 } 1466 1467 namespace { 1468 1469 // Takes the left-most 1-bit and smears it out, setting all bits to its right. 1470 inline uint32_t SmearBitsRight(uint32_t v) { 1471 v |= v >> 1; 1472 v |= v >> 2; 1473 v |= v >> 4; 1474 v |= v >> 8; 1475 v |= v >> 16; 1476 return v; 1477 } 1478 1479 } // namespace 1480 1481 bool QuickCheckDetails::Rationalize(bool asc) { 1482 bool found_useful_op = false; 1483 const uint32_t char_mask = CharMask(asc); 1484 mask_ = 0; 1485 value_ = 0; 1486 int char_shift = 0; 1487 for (int i = 0; i < characters_; i++) { 1488 Position* pos = &positions_[i]; 1489 if ((pos->mask & String::kMaxOneByteCharCode) != 0) { 1490 found_useful_op = true; 1491 } 1492 mask_ |= (pos->mask & char_mask) << char_shift; 1493 value_ |= (pos->value & char_mask) << char_shift; 1494 char_shift += asc ? 8 : 16; 1495 } 1496 return found_useful_op; 1497 } 1498 1499 uint32_t RegExpNode::EatsAtLeast(bool not_at_start) { 1500 return not_at_start ? eats_at_least_.eats_at_least_from_not_start 1501 : eats_at_least_.eats_at_least_from_possibly_start; 1502 } 1503 1504 EatsAtLeastInfo RegExpNode::EatsAtLeastFromLoopEntry() { 1505 // SET_REGISTER_FOR_LOOP is only used to initialize loop counters, and it 1506 // implies that the following node must be a LoopChoiceNode. If we need to 1507 // set registers to constant values for other reasons, we could introduce a 1508 // new action type SET_REGISTER that doesn't imply anything about its 1509 // successor. 1510 UNREACHABLE(); 1511 } 1512 1513 void RegExpNode::GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details, 1514 RegExpCompiler* compiler, 1515 int characters_filled_in, 1516 bool not_at_start) { 1517 // See comment in RegExpNode::EatsAtLeastFromLoopEntry. 1518 UNREACHABLE(); 1519 } 1520 1521 EatsAtLeastInfo LoopChoiceNode::EatsAtLeastFromLoopEntry() { 1522 DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue. 1523 1524 if (read_backward()) { 1525 // The eats_at_least value is not used if reading backward. The 1526 // EatsAtLeastPropagator should've zeroed it as well. 1527 DCHECK_EQ(eats_at_least_info()->eats_at_least_from_possibly_start, 0); 1528 DCHECK_EQ(eats_at_least_info()->eats_at_least_from_not_start, 0); 1529 return {}; 1530 } 1531 1532 // Figure out how much the loop body itself eats, not including anything in 1533 // the continuation case. In general, the nodes in the loop body should report 1534 // that they eat at least the number eaten by the continuation node, since any 1535 // successful match in the loop body must also include the continuation node. 1536 // However, in some cases involving positive lookaround, the loop body under- 1537 // reports its appetite, so use saturated math here to avoid negative numbers. 1538 // For this to work correctly, we explicitly need to use signed integers here. 1539 uint8_t loop_body_from_not_start = base::saturated_cast<uint8_t>( 1540 static_cast<int>(loop_node_->EatsAtLeast(true)) - 1541 static_cast<int>(continue_node_->EatsAtLeast(true))); 1542 uint8_t loop_body_from_possibly_start = base::saturated_cast<uint8_t>( 1543 static_cast<int>(loop_node_->EatsAtLeast(false)) - 1544 static_cast<int>(continue_node_->EatsAtLeast(true))); 1545 1546 // Limit the number of loop iterations to avoid overflow in subsequent steps. 1547 int loop_iterations = base::saturated_cast<uint8_t>(min_loop_iterations()); 1548 1549 EatsAtLeastInfo result; 1550 result.eats_at_least_from_not_start = 1551 base::saturated_cast<uint8_t>(loop_iterations * loop_body_from_not_start + 1552 continue_node_->EatsAtLeast(true)); 1553 if (loop_iterations > 0 && loop_body_from_possibly_start > 0) { 1554 // First loop iteration eats at least one, so all subsequent iterations 1555 // and the after-loop chunk are guaranteed to not be at the start. 1556 result.eats_at_least_from_possibly_start = base::saturated_cast<uint8_t>( 1557 loop_body_from_possibly_start + 1558 (loop_iterations - 1) * loop_body_from_not_start + 1559 continue_node_->EatsAtLeast(true)); 1560 } else { 1561 // Loop body might eat nothing, so only continue node contributes. 1562 result.eats_at_least_from_possibly_start = 1563 continue_node_->EatsAtLeast(false); 1564 } 1565 return result; 1566 } 1567 1568 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, 1569 Trace* bounds_check_trace, Trace* trace, 1570 bool preload_has_checked_bounds, 1571 Label* on_possible_success, 1572 QuickCheckDetails* details, 1573 bool fall_through_on_failure, 1574 ChoiceNode* predecessor) { 1575 DCHECK_NOT_NULL(predecessor); 1576 if (details->characters() == 0) return false; 1577 GetQuickCheckDetails(details, compiler, 0, 1578 trace->at_start() == Trace::FALSE_VALUE); 1579 if (details->cannot_match()) return false; 1580 if (!details->Rationalize(compiler->one_byte())) return false; 1581 DCHECK(details->characters() == 1 || 1582 compiler->macro_assembler()->CanReadUnaligned()); 1583 uint32_t mask = details->mask(); 1584 uint32_t value = details->value(); 1585 1586 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1587 1588 if (trace->characters_preloaded() != details->characters()) { 1589 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset()); 1590 // The bounds check is performed using the minimum number of characters 1591 // any choice would eat, so if the bounds check fails, then none of the 1592 // choices can succeed, so we can just immediately backtrack, rather 1593 // than go to the next choice. The number of characters preloaded may be 1594 // less than the number used for the bounds check. 1595 int eats_at_least = predecessor->EatsAtLeast( 1596 bounds_check_trace->at_start() == Trace::FALSE_VALUE); 1597 DCHECK_GE(eats_at_least, details->characters()); 1598 assembler->LoadCurrentCharacter( 1599 trace->cp_offset(), bounds_check_trace->backtrack(), 1600 !preload_has_checked_bounds, details->characters(), eats_at_least); 1601 } 1602 1603 bool need_mask = true; 1604 1605 if (details->characters() == 1) { 1606 // If number of characters preloaded is 1 then we used a byte or 16 bit 1607 // load so the value is already masked down. 1608 const uint32_t char_mask = CharMask(compiler->one_byte()); 1609 if ((mask & char_mask) == char_mask) need_mask = false; 1610 mask &= char_mask; 1611 } else { 1612 // For 2-character preloads in one-byte mode or 1-character preloads in 1613 // two-byte mode we also use a 16 bit load with zero extend. 1614 static const uint32_t kTwoByteMask = 0xFFFF; 1615 static const uint32_t kFourByteMask = 0xFFFFFFFF; 1616 if (details->characters() == 2 && compiler->one_byte()) { 1617 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false; 1618 } else if (details->characters() == 1 && !compiler->one_byte()) { 1619 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false; 1620 } else { 1621 if (mask == kFourByteMask) need_mask = false; 1622 } 1623 } 1624 1625 if (fall_through_on_failure) { 1626 if (need_mask) { 1627 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); 1628 } else { 1629 assembler->CheckCharacter(value, on_possible_success); 1630 } 1631 } else { 1632 if (need_mask) { 1633 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); 1634 } else { 1635 assembler->CheckNotCharacter(value, trace->backtrack()); 1636 } 1637 } 1638 return true; 1639 } 1640 1641 // Here is the meat of GetQuickCheckDetails (see also the comment on the 1642 // super-class in the .h file). 1643 // 1644 // We iterate along the text object, building up for each character a 1645 // mask and value that can be used to test for a quick failure to match. 1646 // The masks and values for the positions will be combined into a single 1647 // machine word for the current character width in order to be used in 1648 // generating a quick check. 1649 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 1650 RegExpCompiler* compiler, 1651 int characters_filled_in, 1652 bool not_at_start) { 1653 // Do not collect any quick check details if the text node reads backward, 1654 // since it reads in the opposite direction than we use for quick checks. 1655 if (read_backward()) return; 1656 Isolate* isolate = compiler->macro_assembler()->isolate(); 1657 DCHECK(characters_filled_in < details->characters()); 1658 int characters = details->characters(); 1659 const uint32_t char_mask = CharMask(compiler->one_byte()); 1660 for (int k = 0; k < elements()->length(); k++) { 1661 TextElement elm = elements()->at(k); 1662 if (elm.text_type() == TextElement::ATOM) { 1663 base::Vector<const base::uc16> quarks = elm.atom()->data(); 1664 for (int i = 0; i < characters && i < quarks.length(); i++) { 1665 QuickCheckDetails::Position* pos = 1666 details->positions(characters_filled_in); 1667 base::uc16 c = quarks[i]; 1668 if (IsIgnoreCase(compiler->flags())) { 1669 unibrow::uchar chars[4]; 1670 int length = 1671 GetCaseIndependentLetters(isolate, c, compiler, chars, 4); 1672 if (length == 0) { 1673 // This can happen because all case variants are non-Latin1, but we 1674 // know the input is Latin1. 1675 details->set_cannot_match(); 1676 pos->determines_perfectly = false; 1677 return; 1678 } 1679 if (length == 1) { 1680 // This letter has no case equivalents, so it's nice and simple 1681 // and the mask-compare will determine definitely whether we have 1682 // a match at this character position. 1683 pos->mask = char_mask; 1684 pos->value = chars[0]; 1685 pos->determines_perfectly = true; 1686 } else { 1687 uint32_t common_bits = char_mask; 1688 uint32_t bits = chars[0]; 1689 for (int j = 1; j < length; j++) { 1690 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); 1691 common_bits ^= differing_bits; 1692 bits &= common_bits; 1693 } 1694 // If length is 2 and common bits has only one zero in it then 1695 // our mask and compare instruction will determine definitely 1696 // whether we have a match at this character position. Otherwise 1697 // it can only be an approximate check. 1698 uint32_t one_zero = (common_bits | ~char_mask); 1699 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 1700 pos->determines_perfectly = true; 1701 } 1702 pos->mask = common_bits; 1703 pos->value = bits; 1704 } 1705 } else { 1706 // Don't ignore case. Nice simple case where the mask-compare will 1707 // determine definitely whether we have a match at this character 1708 // position. 1709 if (c > char_mask) { 1710 details->set_cannot_match(); 1711 pos->determines_perfectly = false; 1712 return; 1713 } 1714 pos->mask = char_mask; 1715 pos->value = c; 1716 pos->determines_perfectly = true; 1717 } 1718 characters_filled_in++; 1719 DCHECK(characters_filled_in <= details->characters()); 1720 if (characters_filled_in == details->characters()) { 1721 return; 1722 } 1723 } 1724 } else { 1725 QuickCheckDetails::Position* pos = 1726 details->positions(characters_filled_in); 1727 RegExpClassRanges* tree = elm.class_ranges(); 1728 ZoneList<CharacterRange>* ranges = tree->ranges(zone()); 1729 if (tree->is_negated() || ranges->is_empty()) { 1730 // A quick check uses multi-character mask and compare. There is no 1731 // useful way to incorporate a negative char class into this scheme 1732 // so we just conservatively create a mask and value that will always 1733 // succeed. 1734 // Likewise for empty ranges (empty ranges can occur e.g. when 1735 // compiling for one-byte subjects and impossible (non-one-byte) ranges 1736 // have been removed). 1737 pos->mask = 0; 1738 pos->value = 0; 1739 } else { 1740 int first_range = 0; 1741 while (ranges->at(first_range).from() > char_mask) { 1742 first_range++; 1743 if (first_range == ranges->length()) { 1744 details->set_cannot_match(); 1745 pos->determines_perfectly = false; 1746 return; 1747 } 1748 } 1749 CharacterRange range = ranges->at(first_range); 1750 const base::uc32 first_from = range.from(); 1751 const base::uc32 first_to = 1752 (range.to() > char_mask) ? char_mask : range.to(); 1753 const uint32_t differing_bits = (first_from ^ first_to); 1754 // A mask and compare is only perfect if the differing bits form a 1755 // number like 00011111 with one single block of trailing 1s. 1756 if ((differing_bits & (differing_bits + 1)) == 0 && 1757 first_from + differing_bits == first_to) { 1758 pos->determines_perfectly = true; 1759 } 1760 uint32_t common_bits = ~SmearBitsRight(differing_bits); 1761 uint32_t bits = (first_from & common_bits); 1762 for (int i = first_range + 1; i < ranges->length(); i++) { 1763 range = ranges->at(i); 1764 const base::uc32 from = range.from(); 1765 if (from > char_mask) continue; 1766 const base::uc32 to = 1767 (range.to() > char_mask) ? char_mask : range.to(); 1768 // Here we are combining more ranges into the mask and compare 1769 // value. With each new range the mask becomes more sparse and 1770 // so the chances of a false positive rise. A character class 1771 // with multiple ranges is assumed never to be equivalent to a 1772 // mask and compare operation. 1773 pos->determines_perfectly = false; 1774 uint32_t new_common_bits = (from ^ to); 1775 new_common_bits = ~SmearBitsRight(new_common_bits); 1776 common_bits &= new_common_bits; 1777 bits &= new_common_bits; 1778 uint32_t new_differing_bits = (from & common_bits) ^ bits; 1779 common_bits ^= new_differing_bits; 1780 bits &= common_bits; 1781 } 1782 pos->mask = common_bits; 1783 pos->value = bits; 1784 } 1785 characters_filled_in++; 1786 DCHECK(characters_filled_in <= details->characters()); 1787 if (characters_filled_in == details->characters()) return; 1788 } 1789 } 1790 DCHECK(characters_filled_in != details->characters()); 1791 if (!details->cannot_match()) { 1792 on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in, 1793 true); 1794 } 1795 } 1796 1797 void QuickCheckDetails::Clear() { 1798 for (int i = 0; i < characters_; i++) { 1799 positions_[i].mask = 0; 1800 positions_[i].value = 0; 1801 positions_[i].determines_perfectly = false; 1802 } 1803 characters_ = 0; 1804 } 1805 1806 void QuickCheckDetails::Advance(int by, bool one_byte) { 1807 if (by >= characters_ || by < 0) { 1808 DCHECK_IMPLIES(by < 0, characters_ == 0); 1809 Clear(); 1810 return; 1811 } 1812 DCHECK_LE(characters_ - by, 4); 1813 DCHECK_LE(characters_, 4); 1814 for (int i = 0; i < characters_ - by; i++) { 1815 positions_[i] = positions_[by + i]; 1816 } 1817 for (int i = characters_ - by; i < characters_; i++) { 1818 positions_[i].mask = 0; 1819 positions_[i].value = 0; 1820 positions_[i].determines_perfectly = false; 1821 } 1822 characters_ -= by; 1823 // We could change mask_ and value_ here but we would never advance unless 1824 // they had already been used in a check and they won't be used again because 1825 // it would gain us nothing. So there's no point. 1826 } 1827 1828 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { 1829 DCHECK(characters_ == other->characters_); 1830 if (other->cannot_match_) { 1831 return; 1832 } 1833 if (cannot_match_) { 1834 *this = *other; 1835 return; 1836 } 1837 for (int i = from_index; i < characters_; i++) { 1838 QuickCheckDetails::Position* pos = positions(i); 1839 QuickCheckDetails::Position* other_pos = other->positions(i); 1840 if (pos->mask != other_pos->mask || pos->value != other_pos->value || 1841 !other_pos->determines_perfectly) { 1842 // Our mask-compare operation will be approximate unless we have the 1843 // exact same operation on both sides of the alternation. 1844 pos->determines_perfectly = false; 1845 } 1846 pos->mask &= other_pos->mask; 1847 pos->value &= pos->mask; 1848 other_pos->value &= pos->mask; 1849 uint32_t differing_bits = (pos->value ^ other_pos->value); 1850 pos->mask &= ~differing_bits; 1851 pos->value &= pos->mask; 1852 } 1853 } 1854 1855 class VisitMarker { 1856 public: 1857 explicit VisitMarker(NodeInfo* info) : info_(info) { 1858 DCHECK(!info->visited); 1859 info->visited = true; 1860 } 1861 ~VisitMarker() { info_->visited = false; } 1862 1863 private: 1864 NodeInfo* info_; 1865 }; 1866 1867 // Temporarily sets traversed_loop_initialization_node_. 1868 class LoopInitializationMarker { 1869 public: 1870 explicit LoopInitializationMarker(LoopChoiceNode* node) : node_(node) { 1871 DCHECK(!node_->traversed_loop_initialization_node_); 1872 node_->traversed_loop_initialization_node_ = true; 1873 } 1874 ~LoopInitializationMarker() { 1875 DCHECK(node_->traversed_loop_initialization_node_); 1876 node_->traversed_loop_initialization_node_ = false; 1877 } 1878 LoopInitializationMarker(const LoopInitializationMarker&) = delete; 1879 LoopInitializationMarker& operator=(const LoopInitializationMarker&) = delete; 1880 1881 private: 1882 LoopChoiceNode* node_; 1883 }; 1884 1885 // Temporarily decrements min_loop_iterations_. 1886 class IterationDecrementer { 1887 public: 1888 explicit IterationDecrementer(LoopChoiceNode* node) : node_(node) { 1889 DCHECK_GT(node_->min_loop_iterations_, 0); 1890 --node_->min_loop_iterations_; 1891 } 1892 ~IterationDecrementer() { ++node_->min_loop_iterations_; } 1893 IterationDecrementer(const IterationDecrementer&) = delete; 1894 IterationDecrementer& operator=(const IterationDecrementer&) = delete; 1895 1896 private: 1897 LoopChoiceNode* node_; 1898 }; 1899 1900 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpCompiler* compiler) { 1901 if (info()->replacement_calculated) return replacement(); 1902 if (depth < 0) return this; 1903 DCHECK(!info()->visited); 1904 VisitMarker marker(info()); 1905 return FilterSuccessor(depth - 1, compiler); 1906 } 1907 1908 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, 1909 RegExpCompiler* compiler) { 1910 RegExpNode* next = on_success_->FilterOneByte(depth - 1, compiler); 1911 if (next == nullptr) return set_replacement(nullptr); 1912 on_success_ = next; 1913 return set_replacement(this); 1914 } 1915 1916 // We need to check for the following characters: 0x39C 0x3BC 0x178. 1917 bool RangeContainsLatin1Equivalents(CharacterRange range) { 1918 // TODO(dcarney): this could be a lot more efficient. 1919 return range.Contains(0x039C) || range.Contains(0x03BC) || 1920 range.Contains(0x0178); 1921 } 1922 1923 namespace { 1924 1925 bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { 1926 for (int i = 0; i < ranges->length(); i++) { 1927 // TODO(dcarney): this could be a lot more efficient. 1928 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; 1929 } 1930 return false; 1931 } 1932 1933 } // namespace 1934 1935 RegExpNode* TextNode::FilterOneByte(int depth, RegExpCompiler* compiler) { 1936 RegExpFlags flags = compiler->flags(); 1937 if (info()->replacement_calculated) return replacement(); 1938 if (depth < 0) return this; 1939 DCHECK(!info()->visited); 1940 VisitMarker marker(info()); 1941 int element_count = elements()->length(); 1942 for (int i = 0; i < element_count; i++) { 1943 TextElement elm = elements()->at(i); 1944 if (elm.text_type() == TextElement::ATOM) { 1945 base::Vector<const base::uc16> quarks = elm.atom()->data(); 1946 for (int j = 0; j < quarks.length(); j++) { 1947 base::uc16 c = quarks[j]; 1948 if (!IsIgnoreCase(flags)) { 1949 if (c > String::kMaxOneByteCharCode) return set_replacement(nullptr); 1950 } else { 1951 unibrow::uchar chars[4]; 1952 int length = GetCaseIndependentLetters(compiler->isolate(), c, 1953 compiler, chars, 4); 1954 if (length == 0 || chars[0] > String::kMaxOneByteCharCode) { 1955 return set_replacement(nullptr); 1956 } 1957 } 1958 } 1959 } else { 1960 // A character class can also be impossible to match in one-byte mode. 1961 DCHECK(elm.text_type() == TextElement::CLASS_RANGES); 1962 RegExpClassRanges* cr = elm.class_ranges(); 1963 ZoneList<CharacterRange>* ranges = cr->ranges(zone()); 1964 CharacterRange::Canonicalize(ranges); 1965 // Now they are in order so we only need to look at the first. 1966 // If we are in non-Unicode case independent mode then we need 1967 // to be a bit careful here, because the character classes have 1968 // not been case-desugared yet, but there are characters and ranges 1969 // that can become Latin-1 when case is considered. 1970 int range_count = ranges->length(); 1971 if (cr->is_negated()) { 1972 if (range_count != 0 && ranges->at(0).from() == 0 && 1973 ranges->at(0).to() >= String::kMaxOneByteCharCode) { 1974 bool case_complications = !IsEitherUnicode(flags) && 1975 IsIgnoreCase(flags) && 1976 RangesContainLatin1Equivalents(ranges); 1977 if (!case_complications) { 1978 return set_replacement(nullptr); 1979 } 1980 } 1981 } else { 1982 if (range_count == 0 || 1983 ranges->at(0).from() > String::kMaxOneByteCharCode) { 1984 bool case_complications = !IsEitherUnicode(flags) && 1985 IsIgnoreCase(flags) && 1986 RangesContainLatin1Equivalents(ranges); 1987 if (!case_complications) { 1988 return set_replacement(nullptr); 1989 } 1990 } 1991 } 1992 } 1993 } 1994 return FilterSuccessor(depth - 1, compiler); 1995 } 1996 1997 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) { 1998 if (info()->replacement_calculated) return replacement(); 1999 if (depth < 0) return this; 2000 if (info()->visited) return this; 2001 { 2002 VisitMarker marker(info()); 2003 2004 RegExpNode* continue_replacement = 2005 continue_node_->FilterOneByte(depth - 1, compiler); 2006 // If we can't continue after the loop then there is no sense in doing the 2007 // loop. 2008 if (continue_replacement == nullptr) return set_replacement(nullptr); 2009 } 2010 2011 return ChoiceNode::FilterOneByte(depth - 1, compiler); 2012 } 2013 2014 RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) { 2015 if (info()->replacement_calculated) return replacement(); 2016 if (depth < 0) return this; 2017 if (info()->visited) return this; 2018 VisitMarker marker(info()); 2019 int choice_count = alternatives_->length(); 2020 2021 for (int i = 0; i < choice_count; i++) { 2022 GuardedAlternative alternative = alternatives_->at(i); 2023 if (alternative.guards() != nullptr && 2024 alternative.guards()->length() != 0) { 2025 set_replacement(this); 2026 return this; 2027 } 2028 } 2029 2030 int surviving = 0; 2031 RegExpNode* survivor = nullptr; 2032 for (int i = 0; i < choice_count; i++) { 2033 GuardedAlternative alternative = alternatives_->at(i); 2034 RegExpNode* replacement = 2035 alternative.node()->FilterOneByte(depth - 1, compiler); 2036 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. 2037 if (replacement != nullptr) { 2038 alternatives_->at(i).set_node(replacement); 2039 surviving++; 2040 survivor = replacement; 2041 } 2042 } 2043 if (surviving < 2) return set_replacement(survivor); 2044 2045 set_replacement(this); 2046 if (surviving == choice_count) { 2047 return this; 2048 } 2049 // Only some of the nodes survived the filtering. We need to rebuild the 2050 // alternatives list. 2051 ZoneList<GuardedAlternative>* new_alternatives = 2052 zone()->New<ZoneList<GuardedAlternative>>(surviving, zone()); 2053 for (int i = 0; i < choice_count; i++) { 2054 RegExpNode* replacement = 2055 alternatives_->at(i).node()->FilterOneByte(depth - 1, compiler); 2056 if (replacement != nullptr) { 2057 alternatives_->at(i).set_node(replacement); 2058 new_alternatives->Add(alternatives_->at(i), zone()); 2059 } 2060 } 2061 alternatives_ = new_alternatives; 2062 return this; 2063 } 2064 2065 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte( 2066 int depth, RegExpCompiler* compiler) { 2067 if (info()->replacement_calculated) return replacement(); 2068 if (depth < 0) return this; 2069 if (info()->visited) return this; 2070 VisitMarker marker(info()); 2071 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2072 // afterwards. 2073 RegExpNode* node = continue_node(); 2074 RegExpNode* replacement = node->FilterOneByte(depth - 1, compiler); 2075 if (replacement == nullptr) return set_replacement(nullptr); 2076 alternatives_->at(kContinueIndex).set_node(replacement); 2077 2078 RegExpNode* neg_node = lookaround_node(); 2079 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, compiler); 2080 // If the negative lookahead is always going to fail then 2081 // we don't need to check it. 2082 if (neg_replacement == nullptr) return set_replacement(replacement); 2083 alternatives_->at(kLookaroundIndex).set_node(neg_replacement); 2084 return set_replacement(this); 2085 } 2086 2087 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2088 RegExpCompiler* compiler, 2089 int characters_filled_in, 2090 bool not_at_start) { 2091 if (body_can_be_zero_length_ || info()->visited) return; 2092 not_at_start = not_at_start || this->not_at_start(); 2093 DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue. 2094 if (traversed_loop_initialization_node_ && min_loop_iterations_ > 0 && 2095 loop_node_->EatsAtLeast(not_at_start) > 2096 continue_node_->EatsAtLeast(true)) { 2097 // Loop body is guaranteed to execute at least once, and consume characters 2098 // when it does, meaning the only possible quick checks from this point 2099 // begin with the loop body. We may recursively visit this LoopChoiceNode, 2100 // but we temporarily decrease its minimum iteration counter so we know when 2101 // to check the continue case. 2102 IterationDecrementer next_iteration(this); 2103 loop_node_->GetQuickCheckDetails(details, compiler, characters_filled_in, 2104 not_at_start); 2105 } else { 2106 // Might not consume anything in the loop body, so treat it like a normal 2107 // ChoiceNode (and don't recursively visit this node again). 2108 VisitMarker marker(info()); 2109 ChoiceNode::GetQuickCheckDetails(details, compiler, characters_filled_in, 2110 not_at_start); 2111 } 2112 } 2113 2114 void LoopChoiceNode::GetQuickCheckDetailsFromLoopEntry( 2115 QuickCheckDetails* details, RegExpCompiler* compiler, 2116 int characters_filled_in, bool not_at_start) { 2117 if (traversed_loop_initialization_node_) { 2118 // We already entered this loop once, exited via its continuation node, and 2119 // followed an outer loop's back-edge to before the loop entry point. We 2120 // could try to reset the minimum iteration count to its starting value at 2121 // this point, but that seems like more trouble than it's worth. It's safe 2122 // to keep going with the current (possibly reduced) minimum iteration 2123 // count. 2124 GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start); 2125 } else { 2126 // We are entering a loop via its counter initialization action, meaning we 2127 // are guaranteed to run the loop body at least some minimum number of times 2128 // before running the continuation node. Set a flag so that this node knows 2129 // (now and any times we visit it again recursively) that it was entered 2130 // from the top. 2131 LoopInitializationMarker marker(this); 2132 GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start); 2133 } 2134 } 2135 2136 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2137 BoyerMooreLookahead* bm, bool not_at_start) { 2138 if (body_can_be_zero_length_ || budget <= 0) { 2139 bm->SetRest(offset); 2140 SaveBMInfo(bm, not_at_start, offset); 2141 return; 2142 } 2143 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2144 SaveBMInfo(bm, not_at_start, offset); 2145 } 2146 2147 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2148 RegExpCompiler* compiler, 2149 int characters_filled_in, 2150 bool not_at_start) { 2151 not_at_start = (not_at_start || not_at_start_); 2152 int choice_count = alternatives_->length(); 2153 DCHECK_LT(0, choice_count); 2154 alternatives_->at(0).node()->GetQuickCheckDetails( 2155 details, compiler, characters_filled_in, not_at_start); 2156 for (int i = 1; i < choice_count; i++) { 2157 QuickCheckDetails new_details(details->characters()); 2158 RegExpNode* node = alternatives_->at(i).node(); 2159 node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in, 2160 not_at_start); 2161 // Here we merge the quick match details of the two branches. 2162 details->Merge(&new_details, characters_filled_in); 2163 } 2164 } 2165 2166 namespace { 2167 2168 // Check for [0-9A-Z_a-z]. 2169 void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word, 2170 Label* non_word, bool fall_through_on_word) { 2171 if (assembler->CheckSpecialClassRanges( 2172 fall_through_on_word ? StandardCharacterSet::kWord 2173 : StandardCharacterSet::kNotWord, 2174 fall_through_on_word ? non_word : word)) { 2175 // Optimized implementation available. 2176 return; 2177 } 2178 assembler->CheckCharacterGT('z', non_word); 2179 assembler->CheckCharacterLT('0', non_word); 2180 assembler->CheckCharacterGT('a' - 1, word); 2181 assembler->CheckCharacterLT('9' + 1, word); 2182 assembler->CheckCharacterLT('A', non_word); 2183 assembler->CheckCharacterLT('Z' + 1, word); 2184 if (fall_through_on_word) { 2185 assembler->CheckNotCharacter('_', non_word); 2186 } else { 2187 assembler->CheckCharacter('_', word); 2188 } 2189 } 2190 2191 // Emit the code to check for a ^ in multiline mode (1-character lookbehind 2192 // that matches newline or the start of input). 2193 void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) { 2194 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2195 2196 // We will load the previous character into the current character register. 2197 Trace new_trace(*trace); 2198 new_trace.InvalidateCurrentCharacter(); 2199 2200 // A positive (> 0) cp_offset means we've already successfully matched a 2201 // non-empty-width part of the pattern, and thus cannot be at or before the 2202 // start of the subject string. We can thus skip both at-start and 2203 // bounds-checks when loading the one-character lookbehind. 2204 const bool may_be_at_or_before_subject_string_start = 2205 new_trace.cp_offset() <= 0; 2206 2207 Label ok; 2208 if (may_be_at_or_before_subject_string_start) { 2209 // The start of input counts as a newline in this context, so skip to ok if 2210 // we are at the start. 2211 assembler->CheckAtStart(new_trace.cp_offset(), &ok); 2212 } 2213 2214 // If we've already checked that we are not at the start of input, it's okay 2215 // to load the previous character without bounds checks. 2216 const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start; 2217 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, 2218 new_trace.backtrack(), can_skip_bounds_check); 2219 if (!assembler->CheckSpecialClassRanges(StandardCharacterSet::kLineTerminator, 2220 new_trace.backtrack())) { 2221 // Newline means \n, \r, 0x2028 or 0x2029. 2222 if (!compiler->one_byte()) { 2223 assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok); 2224 } 2225 assembler->CheckCharacter('\n', &ok); 2226 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 2227 } 2228 assembler->Bind(&ok); 2229 on_success->Emit(compiler, &new_trace); 2230 } 2231 2232 } // namespace 2233 2234 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 2235 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { 2236 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2237 Isolate* isolate = assembler->isolate(); 2238 Trace::TriBool next_is_word_character = Trace::UNKNOWN; 2239 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE); 2240 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 2241 if (lookahead == nullptr) { 2242 int eats_at_least = 2243 std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(not_at_start)); 2244 if (eats_at_least >= 1) { 2245 BoyerMooreLookahead* bm = 2246 zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone()); 2247 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start); 2248 if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE; 2249 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; 2250 } 2251 } else { 2252 if (lookahead->at(0)->is_non_word()) 2253 next_is_word_character = Trace::FALSE_VALUE; 2254 if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; 2255 } 2256 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY); 2257 if (next_is_word_character == Trace::UNKNOWN) { 2258 Label before_non_word; 2259 Label before_word; 2260 if (trace->characters_preloaded() != 1) { 2261 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 2262 } 2263 // Fall through on non-word. 2264 EmitWordCheck(assembler, &before_word, &before_non_word, false); 2265 // Next character is not a word character. 2266 assembler->Bind(&before_non_word); 2267 Label ok; 2268 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 2269 assembler->GoTo(&ok); 2270 2271 assembler->Bind(&before_word); 2272 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 2273 assembler->Bind(&ok); 2274 } else if (next_is_word_character == Trace::TRUE_VALUE) { 2275 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 2276 } else { 2277 DCHECK(next_is_word_character == Trace::FALSE_VALUE); 2278 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 2279 } 2280 } 2281 2282 void AssertionNode::BacktrackIfPrevious( 2283 RegExpCompiler* compiler, Trace* trace, 2284 AssertionNode::IfPrevious backtrack_if_previous) { 2285 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2286 Trace new_trace(*trace); 2287 new_trace.InvalidateCurrentCharacter(); 2288 2289 Label fall_through; 2290 Label* non_word = backtrack_if_previous == kIsNonWord ? new_trace.backtrack() 2291 : &fall_through; 2292 Label* word = backtrack_if_previous == kIsNonWord ? &fall_through 2293 : new_trace.backtrack(); 2294 2295 // A positive (> 0) cp_offset means we've already successfully matched a 2296 // non-empty-width part of the pattern, and thus cannot be at or before the 2297 // start of the subject string. We can thus skip both at-start and 2298 // bounds-checks when loading the one-character lookbehind. 2299 const bool may_be_at_or_before_subject_string_start = 2300 new_trace.cp_offset() <= 0; 2301 2302 if (may_be_at_or_before_subject_string_start) { 2303 // The start of input counts as a non-word character, so the question is 2304 // decided if we are at the start. 2305 assembler->CheckAtStart(new_trace.cp_offset(), non_word); 2306 } 2307 2308 // If we've already checked that we are not at the start of input, it's okay 2309 // to load the previous character without bounds checks. 2310 const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start; 2311 static_assert(Trace::kCPOffsetSlack == 1); 2312 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, non_word, 2313 can_skip_bounds_check); 2314 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); 2315 2316 assembler->Bind(&fall_through); 2317 on_success()->Emit(compiler, &new_trace); 2318 } 2319 2320 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 2321 RegExpCompiler* compiler, 2322 int filled_in, bool not_at_start) { 2323 if (assertion_type_ == AT_START && not_at_start) { 2324 details->set_cannot_match(); 2325 return; 2326 } 2327 return on_success()->GetQuickCheckDetails(details, compiler, filled_in, 2328 not_at_start); 2329 } 2330 2331 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2332 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2333 switch (assertion_type_) { 2334 case AT_END: { 2335 Label ok; 2336 assembler->CheckPosition(trace->cp_offset(), &ok); 2337 assembler->GoTo(trace->backtrack()); 2338 assembler->Bind(&ok); 2339 break; 2340 } 2341 case AT_START: { 2342 if (trace->at_start() == Trace::FALSE_VALUE) { 2343 assembler->GoTo(trace->backtrack()); 2344 return; 2345 } 2346 if (trace->at_start() == Trace::UNKNOWN) { 2347 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack()); 2348 Trace at_start_trace = *trace; 2349 at_start_trace.set_at_start(Trace::TRUE_VALUE); 2350 on_success()->Emit(compiler, &at_start_trace); 2351 return; 2352 } 2353 } break; 2354 case AFTER_NEWLINE: 2355 EmitHat(compiler, on_success(), trace); 2356 return; 2357 case AT_BOUNDARY: 2358 case AT_NON_BOUNDARY: { 2359 EmitBoundaryCheck(compiler, trace); 2360 return; 2361 } 2362 } 2363 on_success()->Emit(compiler, trace); 2364 } 2365 2366 namespace { 2367 2368 bool DeterminedAlready(const QuickCheckDetails* quick_check, int offset) { 2369 if (quick_check == nullptr) return false; 2370 if (offset >= quick_check->characters()) return false; 2371 return quick_check->positions(offset)->determines_perfectly; 2372 } 2373 2374 void UpdateBoundsCheck(int index, int* checked_up_to) { 2375 if (index > *checked_up_to) { 2376 *checked_up_to = index; 2377 } 2378 } 2379 2380 } // namespace 2381 2382 // We call this repeatedly to generate code for each pass over the text node. 2383 // The passes are in increasing order of difficulty because we hope one 2384 // of the first passes will fail in which case we are saved the work of the 2385 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ 2386 // we will check the '%' in the first pass, the case independent 'a' in the 2387 // second pass and the character class in the last pass. 2388 // 2389 // The passes are done from right to left, so for example to test for /bar/ 2390 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 2391 // and then a 'b' with offset 0. This means we can avoid the end-of-input 2392 // bounds check most of the time. In the example we only need to check for 2393 // end-of-input when loading the putative 'r'. 2394 // 2395 // A slight complication involves the fact that the first character may already 2396 // be fetched into a register by the previous node. In this case we want to 2397 // do the test for that character first. We do this in separate passes. The 2398 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a 2399 // pass has been performed then subsequent passes will have true in 2400 // first_element_checked to indicate that that character does not need to be 2401 // checked again. 2402 // 2403 // In addition to all this we are passed a Trace, which can 2404 // contain an AlternativeGeneration object. In this AlternativeGeneration 2405 // object we can see details of any quick check that was already passed in 2406 // order to get to the code we are now generating. The quick check can involve 2407 // loading characters, which means we do not need to recheck the bounds 2408 // up to the limit the quick check already checked. In addition the quick 2409 // check can have involved a mask and compare operation which may simplify 2410 // or obviate the need for further checks at some character positions. 2411 void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, 2412 bool preloaded, Trace* trace, 2413 bool first_element_checked, int* checked_up_to) { 2414 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2415 Isolate* isolate = assembler->isolate(); 2416 bool one_byte = compiler->one_byte(); 2417 Label* backtrack = trace->backtrack(); 2418 const QuickCheckDetails* quick_check = trace->quick_check_performed(); 2419 int element_count = elements()->length(); 2420 int backward_offset = read_backward() ? -Length() : 0; 2421 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 2422 TextElement elm = elements()->at(i); 2423 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset; 2424 if (elm.text_type() == TextElement::ATOM) { 2425 base::Vector<const base::uc16> quarks = elm.atom()->data(); 2426 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 2427 if (first_element_checked && i == 0 && j == 0) continue; 2428 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; 2429 base::uc16 quark = quarks[j]; 2430 bool needs_bounds_check = 2431 *checked_up_to < cp_offset + j || read_backward(); 2432 bool bounds_checked = false; 2433 switch (pass) { 2434 case NON_LATIN1_MATCH: { 2435 DCHECK(one_byte); // This pass is only done in one-byte mode. 2436 if (IsIgnoreCase(compiler->flags())) { 2437 // We are compiling for a one-byte subject, case independent mode. 2438 // We have to check whether any of the case alternatives are in 2439 // the one-byte range. 2440 unibrow::uchar chars[4]; 2441 // Only returns characters that are in the one-byte range. 2442 int length = 2443 GetCaseIndependentLetters(isolate, quark, compiler, chars, 4); 2444 if (length == 0) { 2445 assembler->GoTo(backtrack); 2446 return; 2447 } 2448 } else { 2449 // Case-dependent mode. 2450 if (quark > String::kMaxOneByteCharCode) { 2451 assembler->GoTo(backtrack); 2452 return; 2453 } 2454 } 2455 break; 2456 } 2457 case NON_LETTER_CHARACTER_MATCH: 2458 bounds_checked = 2459 EmitAtomNonLetter(isolate, compiler, quark, backtrack, 2460 cp_offset + j, needs_bounds_check, preloaded); 2461 break; 2462 case SIMPLE_CHARACTER_MATCH: 2463 bounds_checked = EmitSimpleCharacter(isolate, compiler, quark, 2464 backtrack, cp_offset + j, 2465 needs_bounds_check, preloaded); 2466 break; 2467 case CASE_CHARACTER_MATCH: 2468 bounds_checked = 2469 EmitAtomLetter(isolate, compiler, quark, backtrack, 2470 cp_offset + j, needs_bounds_check, preloaded); 2471 break; 2472 default: 2473 break; 2474 } 2475 if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 2476 } 2477 } else { 2478 DCHECK_EQ(TextElement::CLASS_RANGES, elm.text_type()); 2479 if (pass == CHARACTER_CLASS_MATCH) { 2480 if (first_element_checked && i == 0) continue; 2481 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; 2482 RegExpClassRanges* cr = elm.class_ranges(); 2483 bool bounds_check = *checked_up_to < cp_offset || read_backward(); 2484 EmitClassRanges(assembler, cr, one_byte, backtrack, cp_offset, 2485 bounds_check, preloaded, zone()); 2486 UpdateBoundsCheck(cp_offset, checked_up_to); 2487 } 2488 } 2489 } 2490 } 2491 2492 int TextNode::Length() { 2493 TextElement elm = elements()->last(); 2494 DCHECK_LE(0, elm.cp_offset()); 2495 return elm.cp_offset() + elm.length(); 2496 } 2497 2498 TextNode* TextNode::CreateForCharacterRanges(Zone* zone, 2499 ZoneList<CharacterRange>* ranges, 2500 bool read_backward, 2501 RegExpNode* on_success) { 2502 DCHECK_NOT_NULL(ranges); 2503 // TODO(jgruber): There's no fundamental need to create this 2504 // RegExpClassRanges; we could refactor to avoid the allocation. 2505 return zone->New<TextNode>(zone->New<RegExpClassRanges>(zone, ranges), 2506 read_backward, on_success); 2507 } 2508 2509 TextNode* TextNode::CreateForSurrogatePair( 2510 Zone* zone, CharacterRange lead, ZoneList<CharacterRange>* trail_ranges, 2511 bool read_backward, RegExpNode* on_success) { 2512 ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone); 2513 if (lead.from() == lead.to()) { 2514 ZoneList<base::uc16> lead_surrogate(1, zone); 2515 lead_surrogate.Add(lead.from(), zone); 2516 RegExpAtom* atom = zone->New<RegExpAtom>(lead_surrogate.ToConstVector()); 2517 elms->Add(TextElement::Atom(atom), zone); 2518 } else { 2519 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead); 2520 elms->Add(TextElement::ClassRanges( 2521 zone->New<RegExpClassRanges>(zone, lead_ranges)), 2522 zone); 2523 } 2524 elms->Add(TextElement::ClassRanges( 2525 zone->New<RegExpClassRanges>(zone, trail_ranges)), 2526 zone); 2527 return zone->New<TextNode>(elms, read_backward, on_success); 2528 } 2529 2530 TextNode* TextNode::CreateForSurrogatePair( 2531 Zone* zone, ZoneList<CharacterRange>* lead_ranges, CharacterRange trail, 2532 bool read_backward, RegExpNode* on_success) { 2533 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail); 2534 ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone); 2535 elms->Add( 2536 TextElement::ClassRanges(zone->New<RegExpClassRanges>(zone, lead_ranges)), 2537 zone); 2538 elms->Add(TextElement::ClassRanges( 2539 zone->New<RegExpClassRanges>(zone, trail_ranges)), 2540 zone); 2541 return zone->New<TextNode>(elms, read_backward, on_success); 2542 } 2543 2544 // This generates the code to match a text node. A text node can contain 2545 // straight character sequences (possibly to be matched in a case-independent 2546 // way) and character classes. For efficiency we do not do this in a single 2547 // pass from left to right. Instead we pass over the text node several times, 2548 // emitting code for some character positions every time. See the comment on 2549 // TextEmitPass for details. 2550 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2551 LimitResult limit_result = LimitVersions(compiler, trace); 2552 if (limit_result == DONE) return; 2553 DCHECK(limit_result == CONTINUE); 2554 2555 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 2556 compiler->SetRegExpTooBig(); 2557 return; 2558 } 2559 2560 if (compiler->one_byte()) { 2561 int dummy = 0; 2562 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy); 2563 } 2564 2565 bool first_elt_done = false; 2566 static_assert(Trace::kCPOffsetSlack == 1); 2567 int bound_checked_to = trace->cp_offset() - 1; 2568 bound_checked_to += trace->bound_checked_up_to(); 2569 2570 // If a character is preloaded into the current character register then 2571 // check that first to save reloading it. 2572 for (int twice = 0; twice < 2; twice++) { 2573 bool is_preloaded_pass = twice == 0; 2574 if (is_preloaded_pass && trace->characters_preloaded() != 1) continue; 2575 if (IsIgnoreCase(compiler->flags())) { 2576 TextEmitPass(compiler, NON_LETTER_CHARACTER_MATCH, is_preloaded_pass, 2577 trace, first_elt_done, &bound_checked_to); 2578 TextEmitPass(compiler, CASE_CHARACTER_MATCH, is_preloaded_pass, trace, 2579 first_elt_done, &bound_checked_to); 2580 } else { 2581 TextEmitPass(compiler, SIMPLE_CHARACTER_MATCH, is_preloaded_pass, trace, 2582 first_elt_done, &bound_checked_to); 2583 } 2584 TextEmitPass(compiler, CHARACTER_CLASS_MATCH, is_preloaded_pass, trace, 2585 first_elt_done, &bound_checked_to); 2586 first_elt_done = true; 2587 } 2588 2589 Trace successor_trace(*trace); 2590 // If we advance backward, we may end up at the start. 2591 successor_trace.AdvanceCurrentPositionInTrace( 2592 read_backward() ? -Length() : Length(), compiler); 2593 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN 2594 : Trace::FALSE_VALUE); 2595 RecursionCheck rc(compiler); 2596 on_success()->Emit(compiler, &successor_trace); 2597 } 2598 2599 void Trace::InvalidateCurrentCharacter() { characters_preloaded_ = 0; } 2600 2601 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { 2602 // We don't have an instruction for shifting the current character register 2603 // down or for using a shifted value for anything so lets just forget that 2604 // we preloaded any characters into it. 2605 characters_preloaded_ = 0; 2606 // Adjust the offsets of the quick check performed information. This 2607 // information is used to find out what we already determined about the 2608 // characters by means of mask and compare. 2609 quick_check_performed_.Advance(by, compiler->one_byte()); 2610 cp_offset_ += by; 2611 static_assert(RegExpMacroAssembler::kMaxCPOffset == 2612 -RegExpMacroAssembler::kMinCPOffset); 2613 if (std::abs(cp_offset_) + kCPOffsetSlack > 2614 RegExpMacroAssembler::kMaxCPOffset) { 2615 compiler->SetRegExpTooBig(); 2616 cp_offset_ = 0; 2617 } 2618 bound_checked_up_to_ = std::max(0, bound_checked_up_to_ - by); 2619 } 2620 2621 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte, 2622 RegExpFlags flags) { 2623 if (!IsIgnoreCase(flags)) return; 2624 #ifdef V8_INTL_SUPPORT 2625 // This is done in an earlier step when generating the nodes from the AST 2626 // because we may have to split up into separate nodes. 2627 if (NeedsUnicodeCaseEquivalents(flags)) return; 2628 #endif 2629 2630 int element_count = elements()->length(); 2631 for (int i = 0; i < element_count; i++) { 2632 TextElement elm = elements()->at(i); 2633 if (elm.text_type() == TextElement::CLASS_RANGES) { 2634 RegExpClassRanges* cr = elm.class_ranges(); 2635 // None of the standard character classes is different in the case 2636 // independent case and it slows us down if we don't know that. 2637 if (cr->is_standard(zone())) continue; 2638 ZoneList<CharacterRange>* ranges = cr->ranges(zone()); 2639 CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte); 2640 } 2641 } 2642 } 2643 2644 int TextNode::FixedLengthLoopLength() { return Length(); } 2645 2646 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( 2647 RegExpCompiler* compiler) { 2648 if (read_backward()) return nullptr; 2649 if (elements()->length() != 1) return nullptr; 2650 TextElement elm = elements()->at(0); 2651 if (elm.text_type() != TextElement::CLASS_RANGES) return nullptr; 2652 RegExpClassRanges* node = elm.class_ranges(); 2653 ZoneList<CharacterRange>* ranges = node->ranges(zone()); 2654 CharacterRange::Canonicalize(ranges); 2655 if (node->is_negated()) { 2656 return ranges->length() == 0 ? on_success() : nullptr; 2657 } 2658 if (ranges->length() != 1) return nullptr; 2659 const base::uc32 max_char = MaxCodeUnit(compiler->one_byte()); 2660 return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr; 2661 } 2662 2663 // Finds the fixed match length of a sequence of nodes that goes from 2664 // this alternative and back to this choice node. If there are variable 2665 // length nodes or other complications in the way then return a sentinel 2666 // value indicating that a fixed length loop cannot be constructed. 2667 int ChoiceNode::FixedLengthLoopLengthForAlternative( 2668 GuardedAlternative* alternative) { 2669 int length = 0; 2670 RegExpNode* node = alternative->node(); 2671 // Later we will generate code for all these text nodes using recursion 2672 // so we have to limit the max number. 2673 int recursion_depth = 0; 2674 while (node != this) { 2675 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { 2676 return kNodeIsTooComplexForFixedLengthLoops; 2677 } 2678 int node_length = node->FixedLengthLoopLength(); 2679 if (node_length == kNodeIsTooComplexForFixedLengthLoops) { 2680 return kNodeIsTooComplexForFixedLengthLoops; 2681 } 2682 length += node_length; 2683 node = node->AsSeqRegExpNode()->on_success(); 2684 } 2685 if (read_backward()) { 2686 length = -length; 2687 } 2688 // Check that we can jump by the whole text length. If not, return sentinel 2689 // to indicate the we can't construct a fixed length loop. 2690 if (length < RegExpMacroAssembler::kMinCPOffset || 2691 length > RegExpMacroAssembler::kMaxCPOffset) { 2692 return kNodeIsTooComplexForFixedLengthLoops; 2693 } 2694 return length; 2695 } 2696 2697 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { 2698 DCHECK_NULL(loop_node_); 2699 AddAlternative(alt); 2700 loop_node_ = alt.node(); 2701 } 2702 2703 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { 2704 DCHECK_NULL(continue_node_); 2705 AddAlternative(alt); 2706 continue_node_ = alt.node(); 2707 } 2708 2709 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2710 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2711 if (trace->fixed_length_loop_state() != nullptr && 2712 trace->fixed_length_loop_state()->loop_choice_node() == this) { 2713 // Back edge of fixed length optimized loop node graph. 2714 int text_length = 2715 FixedLengthLoopLengthForAlternative(&(alternatives_->at(0))); 2716 DCHECK_NE(kNodeIsTooComplexForFixedLengthLoops, text_length); 2717 // Update the counter-based backtracking info on the stack. This is an 2718 // optimization for fixed length loops (see below). 2719 DCHECK(trace->cp_offset() == text_length); 2720 macro_assembler->AdvanceCurrentPosition(text_length); 2721 trace->fixed_length_loop_state()->GoToLoopTopLabel(macro_assembler); 2722 return; 2723 } 2724 DCHECK_NULL(trace->fixed_length_loop_state()); 2725 if (!trace->is_trivial()) { 2726 trace->Flush(compiler, this); 2727 return; 2728 } 2729 ChoiceNode::Emit(compiler, trace); 2730 } 2731 2732 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, 2733 int eats_at_least) { 2734 int preload_characters = std::min(4, eats_at_least); 2735 DCHECK_LE(preload_characters, 4); 2736 if (compiler->macro_assembler()->CanReadUnaligned()) { 2737 bool one_byte = compiler->one_byte(); 2738 if (one_byte) { 2739 // We can't preload 3 characters because there is no machine instruction 2740 // to do that. We can't just load 4 because we could be reading 2741 // beyond the end of the string, which could cause a memory fault. 2742 if (preload_characters == 3) preload_characters = 2; 2743 } else { 2744 if (preload_characters > 2) preload_characters = 2; 2745 } 2746 } else { 2747 if (preload_characters > 1) preload_characters = 1; 2748 } 2749 return preload_characters; 2750 } 2751 2752 // This class is used when generating the alternatives in a choice node. It 2753 // records the way the alternative is being code generated. 2754 class AlternativeGeneration : public Malloced { 2755 public: 2756 AlternativeGeneration() 2757 : possible_success(), 2758 expects_preload(false), 2759 after(), 2760 quick_check_details() {} 2761 Label possible_success; 2762 bool expects_preload; 2763 Label after; 2764 QuickCheckDetails quick_check_details; 2765 }; 2766 2767 // Creates a list of AlternativeGenerations. If the list has a reasonable 2768 // size then it is on the stack, otherwise the excess is on the heap. 2769 class AlternativeGenerationList { 2770 public: 2771 AlternativeGenerationList(int count, Zone* zone) : alt_gens_(count, zone) { 2772 for (int i = 0; i < count && i < kAFew; i++) { 2773 alt_gens_.Add(a_few_alt_gens_ + i, zone); 2774 } 2775 for (int i = kAFew; i < count; i++) { 2776 alt_gens_.Add(new AlternativeGeneration(), zone); 2777 } 2778 } 2779 ~AlternativeGenerationList() { 2780 for (int i = kAFew; i < alt_gens_.length(); i++) { 2781 delete alt_gens_[i]; 2782 alt_gens_[i] = nullptr; 2783 } 2784 } 2785 2786 AlternativeGeneration* at(int i) { return alt_gens_[i]; } 2787 2788 private: 2789 static const int kAFew = 10; 2790 ZoneList<AlternativeGeneration*> alt_gens_; 2791 AlternativeGeneration a_few_alt_gens_[kAFew]; 2792 }; 2793 2794 void BoyerMoorePositionInfo::Set(int character) { 2795 SetInterval(Interval(character, character)); 2796 } 2797 2798 namespace { 2799 2800 ContainedInLattice AddRange(ContainedInLattice containment, const int* ranges, 2801 int ranges_length, Interval new_range) { 2802 DCHECK_EQ(1, ranges_length & 1); 2803 DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]); 2804 if (containment == kLatticeUnknown) return containment; 2805 bool inside = false; 2806 int last = 0; 2807 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { 2808 // Consider the range from last to ranges[i]. 2809 // We haven't got to the new range yet. 2810 if (ranges[i] <= new_range.from()) continue; 2811 // New range is wholly inside last-ranges[i]. Note that new_range.to() is 2812 // inclusive, but the values in ranges are not. 2813 if (last <= new_range.from() && new_range.to() < ranges[i]) { 2814 return Combine(containment, inside ? kLatticeIn : kLatticeOut); 2815 } 2816 return kLatticeUnknown; 2817 } 2818 return containment; 2819 } 2820 2821 int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) { 2822 static_assert(BoyerMoorePositionInfo::kMapSize == 2823 2 * kInt64Size * kBitsPerByte); 2824 2825 // Slight fiddling is needed here, since the bitset is of length 128 while 2826 // CountTrailingZeros requires an integral type and std::bitset can only 2827 // convert to unsigned long long. So we handle the most- and least-significant 2828 // bits separately. 2829 2830 { 2831 static constexpr BoyerMoorePositionInfo::Bitset mask(~uint64_t{0}); 2832 BoyerMoorePositionInfo::Bitset masked_bitset = bitset & mask; 2833 static_assert(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong()))); 2834 uint64_t lsb = masked_bitset.to_ullong(); 2835 if (lsb != 0) return base::bits::CountTrailingZeros(lsb); 2836 } 2837 2838 { 2839 BoyerMoorePositionInfo::Bitset masked_bitset = bitset >> 64; 2840 uint64_t msb = masked_bitset.to_ullong(); 2841 if (msb != 0) return 64 + base::bits::CountTrailingZeros(msb); 2842 } 2843 2844 return -1; 2845 } 2846 2847 } // namespace 2848 2849 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { 2850 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); 2851 2852 if (interval.size() >= kMapSize) { 2853 map_count_ = kMapSize; 2854 map_.set(); 2855 return; 2856 } 2857 2858 for (int i = interval.from(); i <= interval.to(); i++) { 2859 int mod_character = (i & kMask); 2860 if (!map_[mod_character]) { 2861 map_count_++; 2862 map_.set(mod_character); 2863 } 2864 if (map_count_ == kMapSize) return; 2865 } 2866 } 2867 2868 void BoyerMoorePositionInfo::SetAll() { 2869 w_ = kLatticeUnknown; 2870 if (map_count_ != kMapSize) { 2871 map_count_ = kMapSize; 2872 map_.set(); 2873 } 2874 } 2875 2876 BoyerMooreLookahead::BoyerMooreLookahead(int length, RegExpCompiler* compiler, 2877 Zone* zone) 2878 : length_(length), 2879 compiler_(compiler), 2880 max_char_(MaxCodeUnit(compiler->one_byte())) { 2881 bitmaps_ = zone->New<ZoneList<BoyerMoorePositionInfo*>>(length, zone); 2882 for (int i = 0; i < length; i++) { 2883 bitmaps_->Add(zone->New<BoyerMoorePositionInfo>(), zone); 2884 } 2885 } 2886 2887 // Find the longest range of lookahead that has the fewest number of different 2888 // characters that can occur at a given position. Since we are optimizing two 2889 // different parameters at once this is a tradeoff. 2890 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { 2891 int biggest_points = 0; 2892 // If more than 32 characters out of 128 can occur it is unlikely that we can 2893 // be lucky enough to step forwards much of the time. 2894 const int kMaxMax = 32; 2895 for (int max_number_of_chars = 4; max_number_of_chars < kMaxMax; 2896 max_number_of_chars *= 2) { 2897 biggest_points = 2898 FindBestInterval(max_number_of_chars, biggest_points, from, to); 2899 } 2900 if (biggest_points == 0) return false; 2901 return true; 2902 } 2903 2904 // Find the highest-points range between 0 and length_ where the character 2905 // information is not too vague. 'Too vague' means that there are more than 2906 // max_number_of_chars that can occur at this position. Calculates the number 2907 // of points as the product of width-of-the-range and 2908 // probability-of-finding-one-of-the-characters, where the probability is 2909 // calculated using the frequency distribution of the sample subject string. 2910 int BoyerMooreLookahead::FindBestInterval(int max_number_of_chars, 2911 int old_biggest_points, int* from, 2912 int* to) { 2913 int biggest_points = old_biggest_points; 2914 static const int kSize = RegExpMacroAssembler::kTableSize; 2915 for (int i = 0; i < length_;) { 2916 while (i < length_ && Count(i) > max_number_of_chars) i++; 2917 if (i == length_) break; 2918 int remembered_from = i; 2919 2920 BoyerMoorePositionInfo::Bitset union_bitset; 2921 for (; i < length_ && Count(i) <= max_number_of_chars; i++) { 2922 union_bitset |= bitmaps_->at(i)->raw_bitset(); 2923 } 2924 2925 int frequency = 0; 2926 2927 // Iterate only over set bits. 2928 int j; 2929 while ((j = BitsetFirstSetBit(union_bitset)) != -1) { 2930 DCHECK(union_bitset[j]); // Sanity check. 2931 // Add 1 to the frequency to give a small per-character boost for 2932 // the cases where our sampling is not good enough and many 2933 // characters have a frequency of zero. This means the frequency 2934 // can theoretically be up to 2*kSize though we treat it mostly as 2935 // a fraction of kSize. 2936 frequency += compiler_->frequency_collator()->Frequency(j) + 1; 2937 union_bitset.reset(j); 2938 } 2939 2940 // We use the probability of skipping times the distance we are skipping to 2941 // judge the effectiveness of this. Actually we have a cut-off: By 2942 // dividing by 2 we switch off the skipping if the probability of skipping 2943 // is less than 50%. This is because the multibyte mask-and-compare 2944 // skipping in quickcheck is more likely to do well on this case. 2945 bool in_quickcheck_range = 2946 ((i - remembered_from < 4) || 2947 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2)); 2948 // Called 'probability' but it is only a rough estimate and can actually 2949 // be outside the 0-kSize range. 2950 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; 2951 int points = (i - remembered_from) * probability; 2952 if (points > biggest_points) { 2953 *from = remembered_from; 2954 *to = i - 1; 2955 biggest_points = points; 2956 } 2957 } 2958 return biggest_points; 2959 } 2960 2961 // Take all the characters that will not prevent a successful match if they 2962 // occur in the subject string in the range between min_lookahead and 2963 // max_lookahead (inclusive) measured from the current position. If the 2964 // character at max_lookahead offset is not one of these characters, then we 2965 // can safely skip forwards by the number of characters in the range. 2966 // nibble_table is only used for SIMD variants and encodes the same information 2967 // as boolean_skip_table but in only 128 bits. It contains 16 bytes where the 2968 // index into the table represent low nibbles of a character, and the stored 2969 // byte is a bitset representing matching high nibbles. E.g. to store the 2970 // character 'b' (0x62) in the nibble table, we set the 6th bit in row 2. 2971 int BoyerMooreLookahead::GetSkipTable( 2972 int min_lookahead, int max_lookahead, 2973 DirectHandle<ByteArray> boolean_skip_table, 2974 DirectHandle<ByteArray> nibble_table) { 2975 const int kSkipArrayEntry = 0; 2976 const int kDontSkipArrayEntry = 1; 2977 2978 std::memset(boolean_skip_table->begin(), kSkipArrayEntry, 2979 boolean_skip_table->length()); 2980 const bool fill_nibble_table = !nibble_table.is_null(); 2981 if (fill_nibble_table) { 2982 std::memset(nibble_table->begin(), 0, nibble_table->length()); 2983 } 2984 2985 for (int i = max_lookahead; i >= min_lookahead; i--) { 2986 BoyerMoorePositionInfo::Bitset bitset = bitmaps_->at(i)->raw_bitset(); 2987 2988 // Iterate only over set bits. 2989 int j; 2990 while ((j = BitsetFirstSetBit(bitset)) != -1) { 2991 DCHECK(bitset[j]); // Sanity check. 2992 boolean_skip_table->set(j, kDontSkipArrayEntry); 2993 if (fill_nibble_table) { 2994 int lo_nibble = j & 0x0f; 2995 int hi_nibble = (j >> 4) & 0x07; 2996 int row = nibble_table->get(lo_nibble); 2997 row |= 1 << hi_nibble; 2998 nibble_table->set(lo_nibble, row); 2999 } 3000 bitset.reset(j); 3001 } 3002 } 3003 3004 const int skip = max_lookahead + 1 - min_lookahead; 3005 return skip; 3006 } 3007 3008 // See comment above on the implementation of GetSkipTable. 3009 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { 3010 const int kSize = RegExpMacroAssembler::kTableSize; 3011 3012 int min_lookahead = 0; 3013 int max_lookahead = 0; 3014 3015 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return; 3016 3017 // Check if we only have a single non-empty position info, and that info 3018 // contains precisely one character. 3019 bool found_single_character = false; 3020 int single_character = 0; 3021 for (int i = max_lookahead; i >= min_lookahead; i--) { 3022 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3023 if (map->map_count() == 0) continue; 3024 3025 if (found_single_character || map->map_count() > 1) { 3026 found_single_character = false; 3027 break; 3028 } 3029 3030 DCHECK(!found_single_character); 3031 DCHECK_EQ(map->map_count(), 1); 3032 3033 found_single_character = true; 3034 single_character = BitsetFirstSetBit(map->raw_bitset()); 3035 3036 DCHECK_NE(single_character, -1); 3037 } 3038 3039 int lookahead_width = max_lookahead + 1 - min_lookahead; 3040 3041 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { 3042 // The mask-compare can probably handle this better. 3043 return; 3044 } 3045 3046 if (found_single_character) { 3047 // TODO(pthier): Add vectorized version. 3048 Label cont, again; 3049 masm->Bind(&again); 3050 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3051 if (max_char_ > kSize) { 3052 masm->CheckCharacterAfterAnd(single_character, 3053 RegExpMacroAssembler::kTableMask, &cont); 3054 } else { 3055 masm->CheckCharacter(single_character, &cont); 3056 } 3057 masm->AdvanceCurrentPosition(lookahead_width); 3058 masm->GoTo(&again); 3059 masm->Bind(&cont); 3060 return; 3061 } 3062 3063 Factory* factory = masm->isolate()->factory(); 3064 Handle<ByteArray> boolean_skip_table = 3065 factory->NewByteArray(kSize, AllocationType::kOld); 3066 Handle<ByteArray> nibble_table; 3067 const int skip_distance = max_lookahead + 1 - min_lookahead; 3068 if (masm->SkipUntilBitInTableUseSimd(skip_distance)) { 3069 // The current implementation is tailored specifically for 128-bit tables. 3070 static_assert(kSize == 128); 3071 nibble_table = 3072 factory->NewByteArray(kSize / kBitsPerByte, AllocationType::kOld); 3073 } 3074 GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table, nibble_table); 3075 DCHECK_NE(0, skip_distance); 3076 3077 masm->SkipUntilBitInTable(max_lookahead, boolean_skip_table, nibble_table, 3078 skip_distance); 3079 } 3080 3081 /* Code generation for choice nodes. 3082 * 3083 * We generate quick checks that do a mask and compare to eliminate a 3084 * choice. If the quick check succeeds then it jumps to the continuation to 3085 * do slow checks and check subsequent nodes. If it fails (the common case) 3086 * it falls through to the next choice. 3087 * 3088 * Here is the desired flow graph. Nodes directly below each other imply 3089 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative 3090 * 3 doesn't have a quick check so we have to call the slow check. 3091 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire 3092 * regexp continuation is generated directly after the Sn node, up to the 3093 * next GoTo if we decide to reuse some already generated code. Some 3094 * nodes expect preload_characters to be preloaded into the current 3095 * character register. R nodes do this preloading. Vertices are marked 3096 * F for failures and S for success (possible success in the case of quick 3097 * nodes). L, V, < and > are used as arrow heads. 3098 * 3099 * ----------> R 3100 * | 3101 * V 3102 * Q1 -----> S1 3103 * | S / 3104 * F| / 3105 * | F/ 3106 * | / 3107 * | R 3108 * | / 3109 * V L 3110 * Q2 -----> S2 3111 * | S / 3112 * F| / 3113 * | F/ 3114 * | / 3115 * | R 3116 * | / 3117 * V L 3118 * S3 3119 * | 3120 * F| 3121 * | 3122 * R 3123 * | 3124 * backtrack V 3125 * <----------Q4 3126 * \ F | 3127 * \ |S 3128 * \ F V 3129 * \-----S4 3130 * 3131 * For fixed length loops we push the current position, then generate the code 3132 * that eats the input specially in EmitFixedLengthLoop. The other choice (the 3133 * continuation) is generated by the normal code in EmitChoices, and steps back 3134 * in the input to the starting position when it fails to match. The loop code 3135 * looks like this (U is the unwind code that steps back in the fixed length 3136 * loop). 3137 * 3138 * _____ 3139 * / \ 3140 * V | 3141 * ----------> S1 | 3142 * /| | 3143 * / |S | 3144 * F/ \_____/ 3145 * / 3146 * |<----- 3147 * | \ 3148 * V |S 3149 * Q2 ---> U----->backtrack 3150 * | F / 3151 * S| / 3152 * V F / 3153 * S2--/ 3154 */ 3155 3156 FixedLengthLoopState::FixedLengthLoopState(bool not_at_start, 3157 ChoiceNode* loop_choice_node) 3158 : loop_choice_node_(loop_choice_node) { 3159 counter_backtrack_trace_.set_backtrack(&step_backwards_label_); 3160 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE); 3161 } 3162 3163 void FixedLengthLoopState::BindStepBackwardsLabel( 3164 RegExpMacroAssembler* macro_assembler) { 3165 macro_assembler->Bind(&step_backwards_label_); 3166 } 3167 3168 void FixedLengthLoopState::BindLoopTopLabel( 3169 RegExpMacroAssembler* macro_assembler) { 3170 macro_assembler->Bind(&loop_top_label_); 3171 } 3172 3173 void FixedLengthLoopState::GoToLoopTopLabel( 3174 RegExpMacroAssembler* macro_assembler) { 3175 macro_assembler->GoTo(&loop_top_label_); 3176 } 3177 3178 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) { 3179 #ifdef DEBUG 3180 int choice_count = alternatives_->length(); 3181 for (int i = 0; i < choice_count - 1; i++) { 3182 GuardedAlternative alternative = alternatives_->at(i); 3183 ZoneList<Guard*>* guards = alternative.guards(); 3184 int guard_count = (guards == nullptr) ? 0 : guards->length(); 3185 for (int j = 0; j < guard_count; j++) { 3186 DCHECK(!trace->mentions_reg(guards->at(j)->reg())); 3187 } 3188 } 3189 #endif 3190 } 3191 3192 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace, 3193 PreloadState* state) { 3194 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) { 3195 // Save some time by looking at most one machine word ahead. 3196 state->eats_at_least_ = 3197 EatsAtLeast(current_trace->at_start() == Trace::FALSE_VALUE); 3198 } 3199 state->preload_characters_ = 3200 CalculatePreloadCharacters(compiler, state->eats_at_least_); 3201 3202 state->preload_is_current_ = 3203 (current_trace->characters_preloaded() == state->preload_characters_); 3204 state->preload_has_checked_bounds_ = state->preload_is_current_; 3205 } 3206 3207 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3208 int choice_count = alternatives_->length(); 3209 3210 if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) { 3211 alternatives_->at(0).node()->Emit(compiler, trace); 3212 return; 3213 } 3214 3215 AssertGuardsMentionRegisters(trace); 3216 3217 LimitResult limit_result = LimitVersions(compiler, trace); 3218 if (limit_result == DONE) return; 3219 DCHECK(limit_result == CONTINUE); 3220 3221 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for 3222 // other choice nodes we only flush if we are out of code size budget. 3223 if (trace->flush_budget() == 0 && trace->has_any_actions()) { 3224 trace->Flush(compiler, this); 3225 return; 3226 } 3227 3228 RecursionCheck rc(compiler); 3229 3230 PreloadState preload; 3231 preload.init(); 3232 // This must be outside the 'if' because the trace we use for what 3233 // comes after the fixed_length_loop is inside it and needs the lifetime. 3234 FixedLengthLoopState fixed_length_loop_state(not_at_start(), this); 3235 3236 int text_length = FixedLengthLoopLengthForAlternative(&alternatives_->at(0)); 3237 AlternativeGenerationList alt_gens(choice_count, zone()); 3238 3239 if (choice_count > 1 && text_length != kNodeIsTooComplexForFixedLengthLoops) { 3240 trace = EmitFixedLengthLoop(compiler, trace, &alt_gens, &preload, 3241 &fixed_length_loop_state, text_length); 3242 } else { 3243 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace); 3244 3245 EmitChoices(compiler, &alt_gens, 0, trace, &preload); 3246 } 3247 3248 // At this point we need to generate slow checks for the alternatives where 3249 // the quick check was inlined. We can recognize these because the associated 3250 // label was bound. 3251 int new_flush_budget = trace->flush_budget() / choice_count; 3252 for (int i = 0; i < choice_count; i++) { 3253 AlternativeGeneration* alt_gen = alt_gens.at(i); 3254 Trace new_trace(*trace); 3255 // If there are actions to be flushed we have to limit how many times 3256 // they are flushed. Take the budget of the parent trace and distribute 3257 // it fairly amongst the children. 3258 if (new_trace.has_any_actions()) { 3259 new_trace.set_flush_budget(new_flush_budget); 3260 } 3261 bool next_expects_preload = 3262 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload; 3263 EmitOutOfLineContinuation(compiler, &new_trace, alternatives_->at(i), 3264 alt_gen, preload.preload_characters_, 3265 next_expects_preload); 3266 } 3267 } 3268 3269 Trace* ChoiceNode::EmitFixedLengthLoop( 3270 RegExpCompiler* compiler, Trace* trace, AlternativeGenerationList* alt_gens, 3271 PreloadState* preload, FixedLengthLoopState* fixed_length_loop_state, 3272 int text_length) { 3273 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3274 // Here we have special handling for greedy loops containing only text nodes 3275 // and other simple nodes. We call these fixed length loops. These are 3276 // handled by pushing the current position on the stack and then incrementing 3277 // the current position each time around the switch. On backtrack we 3278 // decrement the current position and check it against the pushed value. 3279 // This avoids pushing backtrack information for each iteration of the loop, 3280 // which could take up a lot of space. 3281 DCHECK(trace->fixed_length_loop_state() == nullptr); 3282 macro_assembler->PushCurrentPosition(); 3283 // This is the label for trying to match what comes after the greedy 3284 // quantifier, either because the body of the quantifier failed, or because 3285 // we have stepped back to try again with one iteration fewer. 3286 Label after_body_match_attempt; 3287 Trace fixed_length_match_trace; 3288 if (not_at_start()) fixed_length_match_trace.set_at_start(Trace::FALSE_VALUE); 3289 fixed_length_match_trace.set_backtrack(&after_body_match_attempt); 3290 fixed_length_loop_state->BindLoopTopLabel(macro_assembler); 3291 fixed_length_match_trace.set_fixed_length_loop_state(fixed_length_loop_state); 3292 alternatives_->at(0).node()->Emit(compiler, &fixed_length_match_trace); 3293 macro_assembler->Bind(&after_body_match_attempt); 3294 3295 Trace* new_trace = fixed_length_loop_state->counter_backtrack_trace(); 3296 3297 // In a fixed length loop there is only one other choice, which is what 3298 // comes after the greedy quantifer. Try to match that now. 3299 EmitChoices(compiler, alt_gens, 1, new_trace, preload); 3300 3301 fixed_length_loop_state->BindStepBackwardsLabel(macro_assembler); 3302 // If we have unwound to the bottom then backtrack. 3303 macro_assembler->CheckFixedLengthLoop(trace->backtrack()); 3304 // Otherwise try the second priority at an earlier position. 3305 macro_assembler->AdvanceCurrentPosition(-text_length); 3306 macro_assembler->GoTo(&after_body_match_attempt); 3307 return new_trace; 3308 } 3309 3310 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, 3311 Trace* trace) { 3312 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized; 3313 if (alternatives_->length() != 2) return eats_at_least; 3314 3315 GuardedAlternative alt1 = alternatives_->at(1); 3316 if (alt1.guards() != nullptr && alt1.guards()->length() != 0) { 3317 return eats_at_least; 3318 } 3319 RegExpNode* eats_anything_node = alt1.node(); 3320 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) { 3321 return eats_at_least; 3322 } 3323 3324 // Really we should be creating a new trace when we execute this function, 3325 // but there is no need, because the code it generates cannot backtrack, and 3326 // we always arrive here with a trivial trace (since it's the entry to a 3327 // loop. That also implies that there are no preloaded characters, which is 3328 // good, because it means we won't be violating any assumptions by 3329 // overwriting those characters with new load instructions. 3330 DCHECK(trace->is_trivial()); 3331 3332 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3333 Isolate* isolate = macro_assembler->isolate(); 3334 // At this point we know that we are at a non-greedy loop that will eat 3335 // any character one at a time. Any non-anchored regexp has such a 3336 // loop prepended to it in order to find where it starts. We look for 3337 // a pattern of the form ...abc... where we can look 6 characters ahead 3338 // and step forwards 3 if the character is not one of abc. Abc need 3339 // not be atoms, they can be any reasonably limited character class or 3340 // small alternation. 3341 BoyerMooreLookahead* bm = bm_info(false); 3342 if (bm == nullptr) { 3343 eats_at_least = std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(false)); 3344 if (eats_at_least >= 1) { 3345 bm = zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone()); 3346 GuardedAlternative alt0 = alternatives_->at(0); 3347 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false); 3348 } 3349 } 3350 if (bm != nullptr) { 3351 bm->EmitSkipInstructions(macro_assembler); 3352 } 3353 return eats_at_least; 3354 } 3355 3356 void ChoiceNode::EmitChoices(RegExpCompiler* compiler, 3357 AlternativeGenerationList* alt_gens, 3358 int first_choice, Trace* trace, 3359 PreloadState* preload) { 3360 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3361 SetUpPreLoad(compiler, trace, preload); 3362 3363 // For now we just call all choices one after the other. The idea ultimately 3364 // is to use the Dispatch table to try only the relevant ones. 3365 int choice_count = alternatives_->length(); 3366 3367 int new_flush_budget = trace->flush_budget() / choice_count; 3368 3369 for (int i = first_choice; i < choice_count; i++) { 3370 bool is_last = i == choice_count - 1; 3371 bool fall_through_on_failure = !is_last; 3372 GuardedAlternative alternative = alternatives_->at(i); 3373 AlternativeGeneration* alt_gen = alt_gens->at(i); 3374 alt_gen->quick_check_details.set_characters(preload->preload_characters_); 3375 ZoneList<Guard*>* guards = alternative.guards(); 3376 int guard_count = (guards == nullptr) ? 0 : guards->length(); 3377 Trace new_trace(*trace); 3378 new_trace.set_characters_preloaded( 3379 preload->preload_is_current_ ? preload->preload_characters_ : 0); 3380 if (preload->preload_has_checked_bounds_) { 3381 new_trace.set_bound_checked_up_to(preload->preload_characters_); 3382 } 3383 new_trace.quick_check_performed()->Clear(); 3384 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE); 3385 if (!is_last) { 3386 new_trace.set_backtrack(&alt_gen->after); 3387 } 3388 alt_gen->expects_preload = preload->preload_is_current_; 3389 bool generate_full_check_inline = false; 3390 if (v8_flags.regexp_optimization && 3391 try_to_emit_quick_check_for_alternative(i == 0) && 3392 alternative.node()->EmitQuickCheck( 3393 compiler, trace, &new_trace, preload->preload_has_checked_bounds_, 3394 &alt_gen->possible_success, &alt_gen->quick_check_details, 3395 fall_through_on_failure, this)) { 3396 // Quick check was generated for this choice. 3397 preload->preload_is_current_ = true; 3398 preload->preload_has_checked_bounds_ = true; 3399 // If we generated the quick check to fall through on possible success, 3400 // we now need to generate the full check inline. 3401 if (!fall_through_on_failure) { 3402 macro_assembler->Bind(&alt_gen->possible_success); 3403 new_trace.set_quick_check_performed(&alt_gen->quick_check_details); 3404 new_trace.set_characters_preloaded(preload->preload_characters_); 3405 new_trace.set_bound_checked_up_to(preload->preload_characters_); 3406 generate_full_check_inline = true; 3407 } 3408 } else if (alt_gen->quick_check_details.cannot_match()) { 3409 if (!fall_through_on_failure) { 3410 macro_assembler->GoTo(trace->backtrack()); 3411 } 3412 continue; 3413 } else { 3414 // No quick check was generated. Put the full code here. 3415 // If this is not the first choice then there could be slow checks from 3416 // previous cases that go here when they fail. There's no reason to 3417 // insist that they preload characters since the slow check we are about 3418 // to generate probably can't use it. 3419 if (i != first_choice) { 3420 alt_gen->expects_preload = false; 3421 new_trace.InvalidateCurrentCharacter(); 3422 } 3423 generate_full_check_inline = true; 3424 } 3425 if (generate_full_check_inline) { 3426 if (new_trace.has_any_actions()) { 3427 new_trace.set_flush_budget(new_flush_budget); 3428 } 3429 for (int j = 0; j < guard_count; j++) { 3430 GenerateGuard(macro_assembler, guards->at(j), &new_trace); 3431 } 3432 alternative.node()->Emit(compiler, &new_trace); 3433 preload->preload_is_current_ = false; 3434 } 3435 macro_assembler->Bind(&alt_gen->after); 3436 } 3437 } 3438 3439 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, 3440 Trace* trace, 3441 GuardedAlternative alternative, 3442 AlternativeGeneration* alt_gen, 3443 int preload_characters, 3444 bool next_expects_preload) { 3445 if (!alt_gen->possible_success.is_linked()) return; 3446 3447 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3448 macro_assembler->Bind(&alt_gen->possible_success); 3449 Trace out_of_line_trace(*trace); 3450 out_of_line_trace.set_characters_preloaded(preload_characters); 3451 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); 3452 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE); 3453 ZoneList<Guard*>* guards = alternative.guards(); 3454 int guard_count = (guards == nullptr) ? 0 : guards->length(); 3455 if (next_expects_preload) { 3456 Label reload_current_char; 3457 out_of_line_trace.set_backtrack(&reload_current_char); 3458 for (int j = 0; j < guard_count; j++) { 3459 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 3460 } 3461 alternative.node()->Emit(compiler, &out_of_line_trace); 3462 macro_assembler->Bind(&reload_current_char); 3463 // Reload the current character, since the next quick check expects that. 3464 // We don't need to check bounds here because we only get into this 3465 // code through a quick check which already did the checked load. 3466 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false, 3467 preload_characters); 3468 macro_assembler->GoTo(&(alt_gen->after)); 3469 } else { 3470 out_of_line_trace.set_backtrack(&(alt_gen->after)); 3471 for (int j = 0; j < guard_count; j++) { 3472 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 3473 } 3474 alternative.node()->Emit(compiler, &out_of_line_trace); 3475 } 3476 } 3477 3478 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3479 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3480 LimitResult limit_result = LimitVersions(compiler, trace); 3481 if (limit_result == DONE) return; 3482 DCHECK(limit_result == CONTINUE); 3483 3484 RecursionCheck rc(compiler); 3485 3486 switch (action_type_) { 3487 // Start with the actions we know how to defer. These are just recorded in 3488 // the new trace, no code is emitted right now. (If we backtrack then we 3489 // don't have to perform and undo these actions.) 3490 case CLEAR_POSITION: 3491 case RESTORE_POSITION: 3492 case INCREMENT_REGISTER: 3493 case SET_REGISTER_FOR_LOOP: 3494 case CLEAR_CAPTURES: { 3495 Trace new_trace = *trace; 3496 new_trace.add_action(this); 3497 on_success()->Emit(compiler, &new_trace); 3498 break; 3499 } 3500 // We don't yet have the ability to defer these. 3501 case BEGIN_POSITIVE_SUBMATCH: 3502 case BEGIN_NEGATIVE_SUBMATCH: 3503 if (!trace->is_trivial()) { 3504 // Complex situation: Flush the trace state to the assembler and 3505 // generate a generic version of this action. This call will 3506 // recurse back to the else clause here. 3507 trace->Flush(compiler, this); 3508 } else { 3509 assembler->WriteCurrentPositionToRegister( 3510 data_.u_submatch.current_position_register, 0); 3511 assembler->WriteStackPointerToRegister( 3512 data_.u_submatch.stack_pointer_register); 3513 on_success()->Emit(compiler, trace); 3514 } 3515 break; 3516 case EMPTY_MATCH_CHECK: { 3517 int start_pos_reg = data_.u_empty_match_check.start_register; 3518 int stored_pos = 0; 3519 int rep_reg = data_.u_empty_match_check.repetition_register; 3520 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); 3521 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); 3522 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { 3523 // If we know we haven't advanced and there is no minimum we 3524 // can just backtrack immediately. 3525 assembler->GoTo(trace->backtrack()); 3526 } else if (know_dist && stored_pos < trace->cp_offset()) { 3527 // If we know we've advanced we can generate the continuation 3528 // immediately. 3529 on_success()->Emit(compiler, trace); 3530 } else if (!trace->is_trivial()) { 3531 trace->Flush(compiler, this); 3532 } else { 3533 Label skip_empty_check; 3534 // If we have a minimum number of repetitions we check the current 3535 // number first and skip the empty check if it's not enough. 3536 if (has_minimum) { 3537 int limit = data_.u_empty_match_check.repetition_limit; 3538 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); 3539 } 3540 // If the match is empty we bail out, otherwise we fall through 3541 // to the on-success continuation. 3542 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, 3543 trace->backtrack()); 3544 assembler->Bind(&skip_empty_check); 3545 on_success()->Emit(compiler, trace); 3546 } 3547 break; 3548 } 3549 case POSITIVE_SUBMATCH_SUCCESS: { 3550 if (!trace->is_trivial()) { 3551 trace->Flush(compiler, this, Trace::kFlushSuccess); 3552 return; 3553 } 3554 assembler->ReadCurrentPositionFromRegister( 3555 data_.u_submatch.current_position_register); 3556 assembler->ReadStackPointerFromRegister( 3557 data_.u_submatch.stack_pointer_register); 3558 int clear_register_count = data_.u_submatch.clear_register_count; 3559 if (clear_register_count == 0) { 3560 on_success()->Emit(compiler, trace); 3561 return; 3562 } 3563 int clear_registers_from = data_.u_submatch.clear_register_from; 3564 Label clear_registers_backtrack; 3565 Trace new_trace = *trace; 3566 new_trace.set_backtrack(&clear_registers_backtrack); 3567 on_success()->Emit(compiler, &new_trace); 3568 3569 assembler->Bind(&clear_registers_backtrack); 3570 int clear_registers_to = clear_registers_from + clear_register_count - 1; 3571 assembler->ClearRegisters(clear_registers_from, clear_registers_to); 3572 3573 DCHECK(trace->backtrack() == nullptr); 3574 assembler->Backtrack(); 3575 return; 3576 } 3577 case MODIFY_FLAGS: { 3578 compiler->set_flags(flags()); 3579 on_success()->Emit(compiler, trace); 3580 break; 3581 } 3582 default: 3583 UNREACHABLE(); 3584 } 3585 } 3586 3587 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3588 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3589 if (!trace->is_trivial()) { 3590 trace->Flush(compiler, this); 3591 return; 3592 } 3593 3594 LimitResult limit_result = LimitVersions(compiler, trace); 3595 if (limit_result == DONE) return; 3596 DCHECK(limit_result == CONTINUE); 3597 3598 RecursionCheck rc(compiler); 3599 3600 DCHECK_EQ(start_reg_ + 1, end_reg_); 3601 if (IsIgnoreCase(compiler->flags())) { 3602 bool unicode = IsEitherUnicode(compiler->flags()); 3603 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), 3604 unicode, trace->backtrack()); 3605 } else { 3606 assembler->CheckNotBackReference(start_reg_, read_backward(), 3607 trace->backtrack()); 3608 } 3609 // We are going to advance backward, so we may end up at the start. 3610 if (read_backward()) trace->set_at_start(Trace::UNKNOWN); 3611 3612 // Check that the back reference does not end inside a surrogate pair. 3613 if (IsEitherUnicode(compiler->flags()) && !compiler->one_byte()) { 3614 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack()); 3615 } 3616 on_success()->Emit(compiler, trace); 3617 } 3618 3619 void TextNode::CalculateOffsets() { 3620 int element_count = elements()->length(); 3621 // Set up the offsets of the elements relative to the start. This is a fixed 3622 // quantity since a TextNode can only contain fixed-width things. 3623 int cp_offset = 0; 3624 for (int i = 0; i < element_count; i++) { 3625 TextElement& elm = elements()->at(i); 3626 elm.set_cp_offset(cp_offset); 3627 cp_offset += elm.length(); 3628 } 3629 } 3630 3631 namespace { 3632 3633 // Assertion propagation moves information about assertions such as 3634 // \b to the affected nodes. For instance, in /.\b./ information must 3635 // be propagated to the first '.' that whatever follows needs to know 3636 // if it matched a word or a non-word, and to the second '.' that it 3637 // has to check if it succeeds a word or non-word. In this case the 3638 // result will be something like: 3639 // 3640 // +-------+ +------------+ 3641 // | . | | . | 3642 // +-------+ ---> +------------+ 3643 // | word? | | check word | 3644 // +-------+ +------------+ 3645 class AssertionPropagator : public AllStatic { 3646 public: 3647 static void VisitText(TextNode* that) {} 3648 3649 static void VisitAction(ActionNode* that) { 3650 // If the next node is interested in what it follows then this node 3651 // has to be interested too so it can pass the information on. 3652 that->info()->AddFromFollowing(that->on_success()->info()); 3653 } 3654 3655 static void VisitChoice(ChoiceNode* that, int i) { 3656 // Anything the following nodes need to know has to be known by 3657 // this node also, so it can pass it on. 3658 that->info()->AddFromFollowing(that->alternatives()->at(i).node()->info()); 3659 } 3660 3661 static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) { 3662 that->info()->AddFromFollowing(that->continue_node()->info()); 3663 } 3664 3665 static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) { 3666 that->info()->AddFromFollowing(that->loop_node()->info()); 3667 } 3668 3669 static void VisitNegativeLookaroundChoiceLookaroundNode( 3670 NegativeLookaroundChoiceNode* that) { 3671 VisitChoice(that, NegativeLookaroundChoiceNode::kLookaroundIndex); 3672 } 3673 3674 static void VisitNegativeLookaroundChoiceContinueNode( 3675 NegativeLookaroundChoiceNode* that) { 3676 VisitChoice(that, NegativeLookaroundChoiceNode::kContinueIndex); 3677 } 3678 3679 static void VisitBackReference(BackReferenceNode* that) {} 3680 3681 static void VisitAssertion(AssertionNode* that) {} 3682 }; 3683 3684 // Propagates information about the minimum size of successful matches from 3685 // successor nodes to their predecessors. Note that all eats_at_least values 3686 // are initialized to zero before analysis. 3687 class EatsAtLeastPropagator : public AllStatic { 3688 public: 3689 static void VisitText(TextNode* that) { 3690 // The eats_at_least value is not used if reading backward. 3691 if (!that->read_backward()) { 3692 // We are not at the start after this node, and thus we can use the 3693 // successor's eats_at_least_from_not_start value. 3694 uint8_t eats_at_least = base::saturated_cast<uint8_t>( 3695 that->Length() + that->on_success() 3696 ->eats_at_least_info() 3697 ->eats_at_least_from_not_start); 3698 that->set_eats_at_least_info(EatsAtLeastInfo(eats_at_least)); 3699 } 3700 } 3701 3702 static void VisitAction(ActionNode* that) { 3703 switch (that->action_type()) { 3704 case ActionNode::BEGIN_POSITIVE_SUBMATCH: { 3705 // For a begin positive submatch we propagate the eats_at_least 3706 // data from the successor of the success node, ignoring the body of 3707 // the lookahead, which eats nothing, since it is a zero-width 3708 // assertion. 3709 // TODO(chromium:42201836) This is better than discarding all 3710 // information when there is a positive lookahead, but it loses some 3711 // information that could be useful, since the body of the lookahead 3712 // could tell us something about how close to the end of the string we 3713 // are. 3714 that->set_eats_at_least_info( 3715 *that->success_node()->on_success()->eats_at_least_info()); 3716 break; 3717 } 3718 case ActionNode::POSITIVE_SUBMATCH_SUCCESS: 3719 // We do not propagate eats_at_least data through positive submatch 3720 // success because it rewinds input. 3721 DCHECK(that->eats_at_least_info()->IsZero()); 3722 break; 3723 case ActionNode::SET_REGISTER_FOR_LOOP: 3724 // SET_REGISTER_FOR_LOOP indicates a loop entry point, which means the 3725 // loop body will run at least the minimum number of times before the 3726 // continuation case can run. 3727 that->set_eats_at_least_info( 3728 that->on_success()->EatsAtLeastFromLoopEntry()); 3729 break; 3730 case ActionNode::BEGIN_NEGATIVE_SUBMATCH: 3731 default: 3732 // Otherwise, the current node eats at least as much as its successor. 3733 // Note: we can propagate eats_at_least data for BEGIN_NEGATIVE_SUBMATCH 3734 // because NegativeLookaroundChoiceNode ignores its lookaround successor 3735 // when computing eats-at-least and quick check information. 3736 that->set_eats_at_least_info(*that->on_success()->eats_at_least_info()); 3737 break; 3738 } 3739 } 3740 3741 static void VisitChoice(ChoiceNode* that, int i) { 3742 // The minimum possible match from a choice node is the minimum of its 3743 // successors. 3744 EatsAtLeastInfo eats_at_least = 3745 i == 0 ? EatsAtLeastInfo(UINT8_MAX) : *that->eats_at_least_info(); 3746 eats_at_least.SetMin( 3747 *that->alternatives()->at(i).node()->eats_at_least_info()); 3748 that->set_eats_at_least_info(eats_at_least); 3749 } 3750 3751 static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) { 3752 if (!that->read_backward()) { 3753 that->set_eats_at_least_info( 3754 *that->continue_node()->eats_at_least_info()); 3755 } 3756 } 3757 3758 static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {} 3759 3760 static void VisitNegativeLookaroundChoiceLookaroundNode( 3761 NegativeLookaroundChoiceNode* that) {} 3762 3763 static void VisitNegativeLookaroundChoiceContinueNode( 3764 NegativeLookaroundChoiceNode* that) { 3765 that->set_eats_at_least_info(*that->continue_node()->eats_at_least_info()); 3766 } 3767 3768 static void VisitBackReference(BackReferenceNode* that) { 3769 if (!that->read_backward()) { 3770 that->set_eats_at_least_info(*that->on_success()->eats_at_least_info()); 3771 } 3772 } 3773 3774 static void VisitAssertion(AssertionNode* that) { 3775 EatsAtLeastInfo eats_at_least = *that->on_success()->eats_at_least_info(); 3776 if (that->assertion_type() == AssertionNode::AT_START) { 3777 // If we know we are not at the start and we are asked "how many 3778 // characters will you match if you succeed?" then we can answer anything 3779 // since false implies false. So let's just set the max answer 3780 // (UINT8_MAX) since that won't prevent us from preloading a lot of 3781 // characters for the other branches in the node graph. 3782 eats_at_least.eats_at_least_from_not_start = UINT8_MAX; 3783 } 3784 that->set_eats_at_least_info(eats_at_least); 3785 } 3786 }; 3787 3788 } // namespace 3789 3790 // ------------------------------------------------------------------- 3791 // Analysis 3792 3793 // Iterates the node graph and provides the opportunity for propagators to set 3794 // values that depend on successor nodes. 3795 template <typename... Propagators> 3796 class Analysis : public NodeVisitor { 3797 public: 3798 Analysis(Isolate* isolate, bool is_one_byte, RegExpFlags flags) 3799 : isolate_(isolate), 3800 is_one_byte_(is_one_byte), 3801 flags_(flags), 3802 error_(RegExpError::kNone) {} 3803 3804 void EnsureAnalyzed(RegExpNode* that) { 3805 StackLimitCheck check(isolate()); 3806 if (check.HasOverflowed()) { 3807 if (v8_flags.correctness_fuzzer_suppressions) { 3808 FATAL("Analysis: Aborting on stack overflow"); 3809 } 3810 fail(RegExpError::kAnalysisStackOverflow); 3811 return; 3812 } 3813 if (that->info()->been_analyzed || that->info()->being_analyzed) return; 3814 that->info()->being_analyzed = true; 3815 that->Accept(this); 3816 that->info()->being_analyzed = false; 3817 that->info()->been_analyzed = true; 3818 } 3819 3820 bool has_failed() { return error_ != RegExpError::kNone; } 3821 RegExpError error() { 3822 DCHECK(error_ != RegExpError::kNone); 3823 return error_; 3824 } 3825 void fail(RegExpError error) { error_ = error; } 3826 3827 Isolate* isolate() const { return isolate_; } 3828 3829 void VisitEnd(EndNode* that) override { 3830 // nothing to do 3831 } 3832 3833 // Used to call the given static function on each propagator / variadic template 3834 // argument. 3835 #define STATIC_FOR_EACH(expr) \ 3836 do { \ 3837 int dummy[] = {((expr), 0)...}; \ 3838 USE(dummy); \ 3839 } while (false) 3840 3841 void VisitText(TextNode* that) override { 3842 that->MakeCaseIndependent(isolate(), is_one_byte_, flags()); 3843 EnsureAnalyzed(that->on_success()); 3844 if (has_failed()) return; 3845 that->CalculateOffsets(); 3846 STATIC_FOR_EACH(Propagators::VisitText(that)); 3847 } 3848 3849 void VisitAction(ActionNode* that) override { 3850 if (that->action_type() == ActionNode::MODIFY_FLAGS) { 3851 set_flags(that->flags()); 3852 } 3853 EnsureAnalyzed(that->on_success()); 3854 if (has_failed()) return; 3855 STATIC_FOR_EACH(Propagators::VisitAction(that)); 3856 } 3857 3858 void VisitChoice(ChoiceNode* that) override { 3859 for (int i = 0; i < that->alternatives()->length(); i++) { 3860 EnsureAnalyzed(that->alternatives()->at(i).node()); 3861 if (has_failed()) return; 3862 STATIC_FOR_EACH(Propagators::VisitChoice(that, i)); 3863 } 3864 } 3865 3866 void VisitLoopChoice(LoopChoiceNode* that) override { 3867 DCHECK_EQ(that->alternatives()->length(), 2); // Just loop and continue. 3868 3869 // First propagate all information from the continuation node. 3870 // Due to the unusual visitation order, we need to manage the flags manually 3871 // as if we were visiting the loop node before visiting the continuation. 3872 RegExpFlags orig_flags = flags(); 3873 3874 EnsureAnalyzed(that->continue_node()); 3875 if (has_failed()) return; 3876 // Propagators don't access global state (including flags), so we don't need 3877 // to reset them here. 3878 STATIC_FOR_EACH(Propagators::VisitLoopChoiceContinueNode(that)); 3879 3880 RegExpFlags continuation_flags = flags(); 3881 3882 // Check the loop last since it may need the value of this node 3883 // to get a correct result. 3884 set_flags(orig_flags); 3885 EnsureAnalyzed(that->loop_node()); 3886 if (has_failed()) return; 3887 // Propagators don't access global state (including flags), so we don't need 3888 // to reset them here. 3889 STATIC_FOR_EACH(Propagators::VisitLoopChoiceLoopNode(that)); 3890 3891 set_flags(continuation_flags); 3892 } 3893 3894 void VisitNegativeLookaroundChoice( 3895 NegativeLookaroundChoiceNode* that) override { 3896 DCHECK_EQ(that->alternatives()->length(), 2); // Lookaround and continue. 3897 3898 EnsureAnalyzed(that->lookaround_node()); 3899 if (has_failed()) return; 3900 STATIC_FOR_EACH( 3901 Propagators::VisitNegativeLookaroundChoiceLookaroundNode(that)); 3902 3903 EnsureAnalyzed(that->continue_node()); 3904 if (has_failed()) return; 3905 STATIC_FOR_EACH( 3906 Propagators::VisitNegativeLookaroundChoiceContinueNode(that)); 3907 } 3908 3909 void VisitBackReference(BackReferenceNode* that) override { 3910 EnsureAnalyzed(that->on_success()); 3911 if (has_failed()) return; 3912 STATIC_FOR_EACH(Propagators::VisitBackReference(that)); 3913 } 3914 3915 void VisitAssertion(AssertionNode* that) override { 3916 EnsureAnalyzed(that->on_success()); 3917 if (has_failed()) return; 3918 STATIC_FOR_EACH(Propagators::VisitAssertion(that)); 3919 } 3920 3921 #undef STATIC_FOR_EACH 3922 3923 private: 3924 RegExpFlags flags() const { return flags_; } 3925 void set_flags(RegExpFlags flags) { flags_ = flags; } 3926 3927 Isolate* isolate_; 3928 const bool is_one_byte_; 3929 RegExpFlags flags_; 3930 RegExpError error_; 3931 3932 DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis); 3933 }; 3934 3935 RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags, 3936 RegExpNode* node) { 3937 Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis( 3938 isolate, is_one_byte, flags); 3939 DCHECK_EQ(node->info()->been_analyzed, false); 3940 analysis.EnsureAnalyzed(node); 3941 DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone); 3942 return analysis.has_failed() ? analysis.error() : RegExpError::kNone; 3943 } 3944 3945 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 3946 BoyerMooreLookahead* bm, 3947 bool not_at_start) { 3948 // Working out the set of characters that a backreference can match is too 3949 // hard, so we just say that any character can match. 3950 bm->SetRest(offset); 3951 SaveBMInfo(bm, not_at_start, offset); 3952 } 3953 3954 static_assert(BoyerMoorePositionInfo::kMapSize == 3955 RegExpMacroAssembler::kTableSize); 3956 3957 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 3958 BoyerMooreLookahead* bm, bool not_at_start) { 3959 ZoneList<GuardedAlternative>* alts = alternatives(); 3960 budget = (budget - 1) / alts->length(); 3961 for (int i = 0; i < alts->length(); i++) { 3962 GuardedAlternative& alt = alts->at(i); 3963 if (alt.guards() != nullptr && alt.guards()->length() != 0) { 3964 bm->SetRest(offset); // Give up trying to fill in info. 3965 SaveBMInfo(bm, not_at_start, offset); 3966 return; 3967 } 3968 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start); 3969 } 3970 SaveBMInfo(bm, not_at_start, offset); 3971 } 3972 3973 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, 3974 BoyerMooreLookahead* bm, bool not_at_start) { 3975 if (initial_offset >= bm->length()) return; 3976 if (read_backward()) return; 3977 int offset = initial_offset; 3978 int max_char = bm->max_char(); 3979 for (int i = 0; i < elements()->length(); i++) { 3980 if (offset >= bm->length()) { 3981 if (initial_offset == 0) set_bm_info(not_at_start, bm); 3982 return; 3983 } 3984 TextElement text = elements()->at(i); 3985 if (text.text_type() == TextElement::ATOM) { 3986 RegExpAtom* atom = text.atom(); 3987 for (int j = 0; j < atom->length(); j++, offset++) { 3988 if (offset >= bm->length()) { 3989 if (initial_offset == 0) set_bm_info(not_at_start, bm); 3990 return; 3991 } 3992 base::uc16 character = atom->data()[j]; 3993 if (IsIgnoreCase(bm->compiler()->flags())) { 3994 unibrow::uchar chars[4]; 3995 int length = GetCaseIndependentLetters(isolate, character, 3996 bm->compiler(), chars, 4); 3997 for (int k = 0; k < length; k++) { 3998 bm->Set(offset, chars[k]); 3999 } 4000 } else { 4001 if (character <= max_char) bm->Set(offset, character); 4002 } 4003 } 4004 } else { 4005 DCHECK_EQ(TextElement::CLASS_RANGES, text.text_type()); 4006 RegExpClassRanges* class_ranges = text.class_ranges(); 4007 ZoneList<CharacterRange>* ranges = class_ranges->ranges(zone()); 4008 if (class_ranges->is_negated()) { 4009 bm->SetAll(offset); 4010 } else { 4011 for (int k = 0; k < ranges->length(); k++) { 4012 CharacterRange& range = ranges->at(k); 4013 if (static_cast<int>(range.from()) > max_char) continue; 4014 int to = std::min(max_char, static_cast<int>(range.to())); 4015 bm->SetInterval(offset, Interval(range.from(), to)); 4016 } 4017 } 4018 offset++; 4019 } 4020 } 4021 if (offset >= bm->length()) { 4022 if (initial_offset == 0) set_bm_info(not_at_start, bm); 4023 return; 4024 } 4025 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, 4026 true); // Not at start after a text node. 4027 if (initial_offset == 0) set_bm_info(not_at_start, bm); 4028 } 4029 4030 RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate( 4031 RegExpNode* on_success) { 4032 DCHECK(!read_backward()); 4033 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( 4034 zone(), CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); 4035 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( 4036 zone(), CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); 4037 4038 ChoiceNode* optional_step_back = zone()->New<ChoiceNode>(2, zone()); 4039 4040 int stack_register = UnicodeLookaroundStackRegister(); 4041 int position_register = UnicodeLookaroundPositionRegister(); 4042 RegExpNode* step_back = TextNode::CreateForCharacterRanges( 4043 zone(), lead_surrogates, true, on_success); 4044 RegExpLookaround::Builder builder(true, step_back, stack_register, 4045 position_register); 4046 RegExpNode* match_trail = TextNode::CreateForCharacterRanges( 4047 zone(), trail_surrogates, false, builder.on_match_success()); 4048 4049 optional_step_back->AddAlternative( 4050 GuardedAlternative(builder.ForMatch(match_trail))); 4051 optional_step_back->AddAlternative(GuardedAlternative(on_success)); 4052 4053 return optional_step_back; 4054 } 4055 4056 RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data, 4057 bool is_one_byte) { 4058 // Wrap the body of the regexp in capture #0. 4059 RegExpNode* captured_body = 4060 RegExpCapture::ToNode(data->tree, 0, this, accept()); 4061 RegExpNode* node = captured_body; 4062 if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags())) { 4063 // Add a .*? at the beginning, outside the body capture, unless 4064 // this expression is anchored at the beginning or sticky. 4065 RegExpNode* loop_node = RegExpQuantifier::ToNode( 4066 0, RegExpTree::kInfinity, false, 4067 zone()->New<RegExpClassRanges>(StandardCharacterSet::kEverything), this, 4068 captured_body, data->contains_anchor); 4069 4070 if (data->contains_anchor) { 4071 // Unroll loop once, to take care of the case that might start 4072 // at the start of input. 4073 ChoiceNode* first_step_node = zone()->New<ChoiceNode>(2, zone()); 4074 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 4075 first_step_node->AddAlternative(GuardedAlternative(zone()->New<TextNode>( 4076 zone()->New<RegExpClassRanges>(StandardCharacterSet::kEverything), 4077 false, loop_node))); 4078 node = first_step_node; 4079 } else { 4080 node = loop_node; 4081 } 4082 } 4083 if (is_one_byte) { 4084 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, this); 4085 // Do it again to propagate the new nodes to places where they were not 4086 // put because they had not been calculated yet. 4087 if (node != nullptr) { 4088 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, this); 4089 } 4090 } else if (IsEitherUnicode(flags()) && 4091 (IsGlobal(flags()) || IsSticky(flags()))) { 4092 node = OptionallyStepBackToLeadSurrogate(node); 4093 } 4094 4095 if (node == nullptr) node = zone()->New<EndNode>(EndNode::BACKTRACK, zone()); 4096 // We can run out of registers during preprocessing. Indicate an error in case 4097 // we do. 4098 if (reg_exp_too_big_) { 4099 data->error = RegExpError::kTooLarge; 4100 } 4101 return node; 4102 } 4103 4104 void RegExpCompiler::ToNodeCheckForStackOverflow() { 4105 if (StackLimitCheck{isolate()}.HasOverflowed()) { 4106 V8::FatalProcessOutOfMemory(isolate(), "RegExpCompiler"); 4107 } 4108 } 4109 4110 } // namespace v8::internal