regexp-interpreter.cc (44042B)
// Copyright 2011 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// A simple interpreter for the Irregexp byte code.

#include "irregexp/imported/regexp-interpreter.h"

#include "irregexp/imported/regexp-bytecodes.h"
#include "irregexp/imported/regexp-macro-assembler.h"
#include "irregexp/imported/regexp-stack.h"  // For kMaximumStackSize.
#include "irregexp/imported/regexp.h"

#ifdef V8_INTL_SUPPORT
#include "unicode/uchar.h"
#endif  // V8_INTL_SUPPORT

// Use token threaded dispatch iff the compiler supports computed gotos and the
// build argument v8_enable_regexp_interpreter_threaded_dispatch was set.
#if V8_HAS_COMPUTED_GOTO && \
    defined(V8_ENABLE_REGEXP_INTERPRETER_THREADED_DISPATCH)
#define V8_USE_COMPUTED_GOTO 1
#endif  // V8_HAS_COMPUTED_GOTO

namespace v8 {
namespace internal {

namespace {

// Case-insensitive comparison of a back-reference against the current input
// position for two-byte (UC16) subjects. `from` is the start of the captured
// text, `current` the position to compare against, `len` the capture length
// in characters. Delegates the actual folding to the macro-assembler's
// unicode or non-unicode comparison helper; both return 1 on a match.
bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len,
                          base::Vector<const base::uc16> subject,
                          bool unicode) {
  Address offset_a =
      reinterpret_cast<Address>(const_cast<base::uc16*>(&subject.at(from)));
  Address offset_b =
      reinterpret_cast<Address>(const_cast<base::uc16*>(&subject.at(current)));
  // Length is in bytes, not characters.
  size_t length = len * base::kUC16Size;

  bool result = unicode
                    ? RegExpMacroAssembler::CaseInsensitiveCompareUnicode(
                          offset_a, offset_b, length, isolate)
                    : RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(
                          offset_a, offset_b, length, isolate);
  return result == 1;
}

// One-byte (Latin-1) overload of the above. Compares character-by-character
// with ASCII/Latin-1 case folding done inline.
bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len,
                          base::Vector<const uint8_t> subject, bool unicode) {
  // For Latin1 characters the unicode flag makes no difference.
  for (int i = 0; i < len; i++) {
    unsigned int old_char = subject[from++];
    unsigned int new_char = subject[current++];
    if (old_char == new_char) continue;
    // Convert both characters to lower case. OR-ing in 0x20 maps an
    // upper-case ASCII/Latin-1 letter to its lower-case counterpart (and is
    // harmless to compare with afterwards, since both sides get the same
    // treatment).
    old_char |= 0x20;
    new_char |= 0x20;
    if (old_char != new_char) return false;
    // The folded characters are equal but the raw ones were not: this is only
    // a real case-insensitive match if the character is actually a letter.
    // Both range checks rely on unsigned wraparound: a value below the range
    // start becomes huge and fails the <= test.
    // Not letters in the ASCII range and Latin-1 range.
    // (224..254 is the Latin-1 lower-case letter block; 247 (0xF7) is the
    // division sign, which is not a letter and has no case pair.)
    if (!(old_char - 'a' <= 'z' - 'a') &&
        !(old_char - 224 <= 254 - 224 && old_char != 247)) {
      return false;
    }
  }
  return true;
}

#ifdef DEBUG
// Prints one trace line (pc, stack depth, current position, current char) and
// disassembles the bytecode at `pc` when --trace-regexp-bytecodes is set.
void MaybeTraceInterpreter(const uint8_t* code_base, const uint8_t* pc,
                           int stack_depth, int current_position,
                           uint32_t current_char, int bytecode_length,
                           const char* bytecode_name) {
  if (v8_flags.trace_regexp_bytecodes) {
    // NOTE(review): std::isprint takes an int whose value must fit in
    // unsigned char (or be EOF); current_char can exceed 0xFF for two-byte
    // subjects — confirm this is acceptable for debug-only tracing.
    const bool printable = std::isprint(current_char);
    const char* format =
        printable
            ? "pc = %02x, sp = %d, curpos = %d, curchar = %08x (%c), bc = "
            : "pc = %02x, sp = %d, curpos = %d, curchar = %08x .%c., bc = ";
    PrintF(format, pc - code_base, stack_depth, current_position, current_char,
           printable ? current_char : '.');

    RegExpBytecodeDisassembleSingle(code_base, pc);
  }
}
#endif  // DEBUG

// Raw loads from the bytecode stream. Alignment is guaranteed by the bytecode
// layout and checked in debug builds.
int32_t Load32Aligned(const uint8_t* pc) {
  DCHECK_EQ(0, reinterpret_cast<intptr_t>(pc) & 3);
  return *reinterpret_cast<const int32_t*>(pc);
}

uint32_t Load16AlignedUnsigned(const uint8_t* pc) {
  DCHECK_EQ(0, reinterpret_cast<intptr_t>(pc) & 1);
  return *reinterpret_cast<const uint16_t*>(pc);
}

int32_t Load16AlignedSigned(const uint8_t* pc) {
  DCHECK_EQ(0, reinterpret_cast<intptr_t>(pc) & 1);
  return *reinterpret_cast<const int16_t*>(pc);
}

// Helpers to access the packed argument. Takes the 32 bits containing the
// current bytecode, where the 8 LSB contain the bytecode and the rest contains
// a packed 24-bit argument.
// TODO(jgruber): Specify signed-ness in bytecode signature declarations, and
// police restrictions during bytecode generation.
// Sign-extending extraction: the arithmetic right shift propagates the top
// bit of the 24-bit argument into the result's high bits.
int32_t LoadPacked24Signed(int32_t bytecode_and_packed_arg) {
  return bytecode_and_packed_arg >> BYTECODE_SHIFT;
}
// Zero-extending extraction of the same 24-bit payload.
uint32_t LoadPacked24Unsigned(int32_t bytecode_and_packed_arg) {
  return static_cast<uint32_t>(bytecode_and_packed_arg) >> BYTECODE_SHIFT;
}

// A simple abstraction over the backtracking stack used by the interpreter.
//
// Despite the name 'backtracking' stack, it's actually used as a generic stack
// that stores both program counters (= offsets into the bytecode) and generic
// integer values.
class BacktrackStack {
 public:
  BacktrackStack() = default;
  BacktrackStack(const BacktrackStack&) = delete;
  BacktrackStack& operator=(const BacktrackStack&) = delete;

  // Pushes `v` and reports whether the stack is still within its size limit.
  // Note the element is appended even when the limit is exceeded; callers
  // abort interpretation on a false return, so the extra element is harmless.
  V8_WARN_UNUSED_RESULT bool push(int v) {
    data_.emplace_back(v);
    return (static_cast<int>(data_.size()) <= kMaxSize);
  }
  // SBXCHECK (sandbox-hardened check) guards against popping an empty stack
  // driven by corrupted bytecode.
  int peek() const {
    SBXCHECK(!data_.empty());
    return data_.back();
  }
  int pop() {
    int v = peek();
    data_.pop_back();
    return v;
  }

  // The 'sp' is the index of the first empty element in the stack.
  int sp() const { return static_cast<int>(data_.size()); }
  // Truncates the stack back to a previously-saved sp; never grows it.
  void set_sp(uint32_t new_sp) {
    DCHECK_LE(new_sp, sp());
    data_.resize(new_sp);
  }

 private:
  // Semi-arbitrary. Should be large enough for common cases to remain in the
  // static stack-allocated backing store, but small enough not to waste space.
  static constexpr int kStaticCapacity = 64;

  using ValueT = int;
  base::SmallVector<ValueT, kStaticCapacity> data_;

  static constexpr int kMaxSize =
      RegExpStack::kMaximumStackSize / sizeof(ValueT);
};

// Registers used during interpreter execution. These consist of output
// registers in indices [0, output_register_count[ which will contain matcher
// results as a {start,end} index tuple for each capture (where the whole match
// counts as implicit capture 0); and internal registers in indices
// [output_register_count, total_register_count[.
class InterpreterRegisters {
 public:
  using RegisterT = int;

  InterpreterRegisters(int total_register_count, RegisterT* output_registers,
                       int output_register_count)
      : registers_(total_register_count),
        output_registers_(output_registers),
        total_register_count_(total_register_count),
        output_register_count_(output_register_count) {
    // TODO(jgruber): Use int32_t consistently for registers. Currently, CSA
    // uses int32_t while runtime uses int.
    static_assert(sizeof(int) == sizeof(int32_t));
    SBXCHECK_GE(output_register_count, 2);  // At least 2 for the match itself.
    SBXCHECK_GE(total_register_count, output_register_count);
    SBXCHECK_LE(total_register_count, RegExpMacroAssembler::kMaxRegisterCount);
    DCHECK_NOT_NULL(output_registers);

    // Initialize the output register region to -1 signifying 'no match'.
    std::memset(registers_.data(), -1,
                output_register_count * sizeof(RegisterT));
    USE(total_register_count_);
  }

  // Bounds-checked (sandbox-hardened) register access; register indices come
  // from the bytecode stream and are therefore untrusted.
  const RegisterT& operator[](size_t index) const {
    SBXCHECK_LT(index, total_register_count_);
    return registers_[index];
  }
  RegisterT& operator[](size_t index) {
    SBXCHECK_LT(index, total_register_count_);
    return registers_[index];
  }

  // Publishes the first output_register_count_ registers to the caller-owned
  // output buffer (done only on a successful match, see BYTECODE(SUCCEED)).
  void CopyToOutputRegisters() {
    MemCopy(output_registers_, registers_.data(),
            output_register_count_ * sizeof(RegisterT));
  }

 private:
  static constexpr int kStaticCapacity = 64;  // Arbitrary.
  base::SmallVector<RegisterT, kStaticCapacity> registers_;
  RegisterT* const output_registers_;
  const int total_register_count_;
  const int output_register_count_;
};

// Throws a real stack-overflow exception. Only legal when called from the
// runtime (JS-originated calls must not allocate here).
IrregexpInterpreter::Result ThrowStackOverflow(Isolate* isolate,
                                               RegExp::CallOrigin call_origin) {
  CHECK(call_origin == RegExp::CallOrigin::kFromRuntime);
  // We abort interpreter execution after the stack overflow is thrown, and thus
  // allow allocation here despite the outer DisallowGarbageCollectionScope.
  AllowGarbageCollection yes_gc;
  isolate->StackOverflow();
  return IrregexpInterpreter::EXCEPTION;
}

// Only throws if called from the runtime, otherwise just returns the EXCEPTION
// status code.
IrregexpInterpreter::Result MaybeThrowStackOverflow(
    Isolate* isolate, RegExp::CallOrigin call_origin) {
  if (call_origin == RegExp::CallOrigin::kFromRuntime) {
    return ThrowStackOverflow(isolate, call_origin);
  } else {
    return IrregexpInterpreter::EXCEPTION;
  }
}

// Re-derives the raw (unhandlified) code/subject pointers after a potential
// GC: if the byte array moved, the interpreter's pc is rebased onto the new
// code start, and the subject character vector is refreshed from the
// (possibly moved) flat string.
template <typename Char>
void UpdateCodeAndSubjectReferences(
    Isolate* isolate, DirectHandle<TrustedByteArray> code_array,
    DirectHandle<String> subject_string,
    Tagged<TrustedByteArray>* code_array_out, const uint8_t** code_base_out,
    const uint8_t** pc_out, Tagged<String>* subject_string_out,
    base::Vector<const Char>* subject_string_vector_out) {
  DisallowGarbageCollection no_gc;

  if (*code_base_out != code_array->begin()) {
    *code_array_out = *code_array;
    const intptr_t pc_offset = *pc_out - *code_base_out;
    DCHECK_GT(pc_offset, 0);
    *code_base_out = code_array->begin();
    *pc_out = *code_base_out + pc_offset;
  }

  DCHECK(subject_string->IsFlat());
  *subject_string_out = *subject_string;
  *subject_string_vector_out = subject_string->GetCharVector<Char>(no_gc);
}

// Runs all pending interrupts and updates unhandlified object references if
// necessary.
// Returns SUCCESS when interpretation may simply continue, RETRY when the
// match must be restarted from scratch (e.g. the subject's representation
// changed under us), and EXCEPTION on stack overflow / thrown interrupt.
template <typename Char>
IrregexpInterpreter::Result HandleInterrupts(
    Isolate* isolate, RegExp::CallOrigin call_origin,
    Tagged<TrustedByteArray>* code_array_out,
    Tagged<String>* subject_string_out, const uint8_t** code_base_out,
    base::Vector<const Char>* subject_string_vector_out,
    const uint8_t** pc_out) {
  DisallowGarbageCollection no_gc;

  StackLimitCheck check(isolate);
  bool js_has_overflowed = check.JsHasOverflowed();

  if (call_origin == RegExp::CallOrigin::kFromJs) {
    // Direct calls from JavaScript can be interrupted in two ways:
    // 1. A real stack overflow, in which case we let the caller throw the
    //    exception.
    // 2. The stack guard was used to interrupt execution for another purpose,
    //    forcing the call through the runtime system.
    if (js_has_overflowed) {
      return IrregexpInterpreter::EXCEPTION;
    } else if (check.InterruptRequested()) {
      return IrregexpInterpreter::RETRY;
    }
  } else {
    DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime);
    // Prepare for possible GC: wrap the raw Tagged pointers in handles so
    // they are updated if the objects move.
    HandleScope handles(isolate);
    DirectHandle<TrustedByteArray> code_handle(*code_array_out, isolate);
    DirectHandle<String> subject_handle(*subject_string_out, isolate);

    if (js_has_overflowed) {
      return ThrowStackOverflow(isolate, call_origin);
    } else if (check.InterruptRequested()) {
      // Remember the representation before running interrupts, which may
      // trigger a GC / string migration.
      const bool was_one_byte =
          String::IsOneByteRepresentationUnderneath(*subject_string_out);
      Tagged<Object> result;
      {
        AllowGarbageCollection yes_gc;
        result = isolate->stack_guard()->HandleInterrupts();
      }
      if (IsExceptionHole(result, isolate)) {
        return IrregexpInterpreter::EXCEPTION;
      }

      // If we changed between a LATIN1 and a UC16 string, we need to
      // restart regexp matching with the appropriate template instantiation of
      // RawMatch.
      if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
          was_one_byte) {
        return IrregexpInterpreter::RETRY;
      }

      UpdateCodeAndSubjectReferences(
          isolate, code_handle, subject_handle, code_array_out, code_base_out,
          pc_out, subject_string_out, subject_string_vector_out);
    }
  }

  return IrregexpInterpreter::SUCCESS;
}

// Tests the bit for (current_char & kTableMask) in a bytecode-embedded
// bitmap: byte index is the masked char divided by 8, bit index its low 3
// bits.
bool CheckBitInTable(const uint32_t current_char, const uint8_t* const table) {
  int mask = RegExpMacroAssembler::kTableMask;
  int b = table[(current_char & mask) >> kBitsPerByteLog2];
  int bit = (current_char & (kBitsPerByte - 1));
  return (b & (1 << bit)) != 0;
}

// Returns true iff 0 <= index < length.
// (A negative index wraps to a huge uintptr_t and fails the comparison.)
bool IndexIsInBounds(int index, int length) {
  DCHECK_GE(length, 0);
  return static_cast<uintptr_t>(index) < static_cast<uintptr_t>(length);
}

// If computed gotos are supported by the compiler, we can get addresses to
// labels directly in C/C++. Every bytecode handler has its own label and we
// store the addresses in a dispatch table indexed by bytecode. To execute the
// next handler we simply jump (goto) directly to its address.
#if V8_USE_COMPUTED_GOTO
#define BC_LABEL(name) BC_##name:
#define DECODE()                                                   \
  do {                                                             \
    next_insn = Load32Aligned(next_pc);                            \
    next_handler_addr = dispatch_table[next_insn & BYTECODE_MASK]; \
  } while (false)
#define DISPATCH()  \
  pc = next_pc;     \
  insn = next_insn; \
  goto* next_handler_addr
// Without computed goto support, we fall back to a simple switch-based
// dispatch (A large switch statement inside a loop with a case for every
// bytecode).
345 #else // V8_USE_COMPUTED_GOTO 346 #define BC_LABEL(name) case BC_##name: 347 #define DECODE() next_insn = Load32Aligned(next_pc) 348 #define DISPATCH() \ 349 pc = next_pc; \ 350 insn = next_insn; \ 351 goto switch_dispatch_continuation 352 #endif // V8_USE_COMPUTED_GOTO 353 354 // ADVANCE/SET_PC_FROM_OFFSET are separated from DISPATCH, because ideally some 355 // instructions can be executed between ADVANCE/SET_PC_FROM_OFFSET and DISPATCH. 356 // We want those two macros as far apart as possible, because the goto in 357 // DISPATCH is dependent on a memory load in ADVANCE/SET_PC_FROM_OFFSET. If we 358 // don't hit the cache and have to fetch the next handler address from physical 359 // memory, instructions between ADVANCE/SET_PC_FROM_OFFSET and DISPATCH can 360 // potentially be executed unconditionally, reducing memory stall. 361 #define ADVANCE(name) \ 362 next_pc = pc + RegExpBytecodeLength(BC_##name); \ 363 DECODE() 364 #define SET_PC_FROM_OFFSET(offset) \ 365 next_pc = code_base + offset; \ 366 DECODE() 367 368 // Current position mutations. 
369 #define SET_CURRENT_POSITION(value) \ 370 do { \ 371 current = (value); \ 372 DCHECK(base::IsInRange(current, 0, subject.length())); \ 373 } while (false) 374 #define ADVANCE_CURRENT_POSITION(by) SET_CURRENT_POSITION(current + (by)) 375 376 #ifdef DEBUG 377 #define BYTECODE(name) \ 378 BC_LABEL(name) \ 379 MaybeTraceInterpreter(code_base, pc, backtrack_stack.sp(), current, \ 380 current_char, RegExpBytecodeLength(BC_##name), #name); 381 #else 382 #define BYTECODE(name) BC_LABEL(name) 383 #endif // DEBUG 384 385 template <typename Char> 386 IrregexpInterpreter::Result RawMatch( 387 Isolate* isolate, Tagged<TrustedByteArray>* code_array, 388 Tagged<String>* subject_string, base::Vector<const Char> subject, 389 int* output_registers, int output_register_count, int total_register_count, 390 int current, uint32_t current_char, RegExp::CallOrigin call_origin, 391 const uint32_t backtrack_limit) { 392 DisallowGarbageCollection no_gc; 393 394 #if V8_USE_COMPUTED_GOTO 395 396 // We have to make sure that no OOB access to the dispatch table is possible and 397 // all values are valid label addresses. 398 // Otherwise jumps to arbitrary addresses could potentially happen. 399 // This is ensured as follows: 400 // Every index to the dispatch table gets masked using BYTECODE_MASK in 401 // DECODE(). This way we can only get values between 0 (only the least 402 // significant byte of an integer is used) and kRegExpPaddedBytecodeCount - 1 403 // (BYTECODE_MASK is defined to be exactly this value). 404 // All entries from kRegExpBytecodeCount to kRegExpPaddedBytecodeCount have to 405 // be filled with BREAKs (invalid operation). 406 407 // Fill dispatch table from last defined bytecode up to the next power of two 408 // with BREAK (invalid operation). 
409 // TODO(pthier): Find a way to fill up automatically (at compile time) 410 // 59 real bytecodes -> 5 fillers 411 #define BYTECODE_FILLER_ITERATOR(V) \ 412 V(BREAK) /* 1 */ \ 413 V(BREAK) /* 2 */ \ 414 V(BREAK) /* 3 */ \ 415 V(BREAK) /* 4 */ \ 416 V(BREAK) /* 5 */ 417 418 #define COUNT(...) +1 419 static constexpr int kRegExpBytecodeFillerCount = 420 BYTECODE_FILLER_ITERATOR(COUNT); 421 #undef COUNT 422 423 // Make sure kRegExpPaddedBytecodeCount is actually the closest possible power 424 // of two. 425 DCHECK_EQ(kRegExpPaddedBytecodeCount, 426 base::bits::RoundUpToPowerOfTwo32(kRegExpBytecodeCount)); 427 428 // Make sure every bytecode we get by using BYTECODE_MASK is well defined. 429 static_assert(kRegExpBytecodeCount <= kRegExpPaddedBytecodeCount); 430 static_assert(kRegExpBytecodeCount + kRegExpBytecodeFillerCount == 431 kRegExpPaddedBytecodeCount); 432 433 #define DECLARE_DISPATCH_TABLE_ENTRY(name, ...) &&BC_##name, 434 static const void* const dispatch_table[kRegExpPaddedBytecodeCount] = { 435 BYTECODE_ITERATOR(DECLARE_DISPATCH_TABLE_ENTRY) 436 BYTECODE_FILLER_ITERATOR(DECLARE_DISPATCH_TABLE_ENTRY)}; 437 #undef DECLARE_DISPATCH_TABLE_ENTRY 438 #undef BYTECODE_FILLER_ITERATOR 439 440 #endif // V8_USE_COMPUTED_GOTO 441 442 const uint8_t* pc = (*code_array)->begin(); 443 const uint8_t* code_base = pc; 444 445 InterpreterRegisters registers(total_register_count, output_registers, 446 output_register_count); 447 BacktrackStack backtrack_stack; 448 449 uint32_t backtrack_count = 0; 450 451 #ifdef DEBUG 452 if (v8_flags.trace_regexp_bytecodes) { 453 PrintF("\n\nStart bytecode interpreter\n\n"); 454 } 455 #endif 456 457 while (true) { 458 const uint8_t* next_pc = pc; 459 int32_t insn; 460 int32_t next_insn; 461 #if V8_USE_COMPUTED_GOTO 462 const void* next_handler_addr; 463 DECODE(); 464 DISPATCH(); 465 #else 466 insn = Load32Aligned(pc); 467 switch (insn & BYTECODE_MASK) { 468 #endif // V8_USE_COMPUTED_GOTO 469 BYTECODE(BREAK) { UNREACHABLE(); } 470 
BYTECODE(PUSH_CP) { 471 ADVANCE(PUSH_CP); 472 if (!backtrack_stack.push(current)) { 473 return MaybeThrowStackOverflow(isolate, call_origin); 474 } 475 DISPATCH(); 476 } 477 BYTECODE(PUSH_BT) { 478 ADVANCE(PUSH_BT); 479 if (!backtrack_stack.push(Load32Aligned(pc + 4))) { 480 return MaybeThrowStackOverflow(isolate, call_origin); 481 } 482 DISPATCH(); 483 } 484 BYTECODE(PUSH_REGISTER) { 485 ADVANCE(PUSH_REGISTER); 486 if (!backtrack_stack.push(registers[LoadPacked24Unsigned(insn)])) { 487 return MaybeThrowStackOverflow(isolate, call_origin); 488 } 489 DISPATCH(); 490 } 491 BYTECODE(SET_REGISTER) { 492 ADVANCE(SET_REGISTER); 493 registers[LoadPacked24Unsigned(insn)] = Load32Aligned(pc + 4); 494 DISPATCH(); 495 } 496 BYTECODE(ADVANCE_REGISTER) { 497 ADVANCE(ADVANCE_REGISTER); 498 registers[LoadPacked24Unsigned(insn)] += Load32Aligned(pc + 4); 499 DISPATCH(); 500 } 501 BYTECODE(SET_REGISTER_TO_CP) { 502 ADVANCE(SET_REGISTER_TO_CP); 503 registers[LoadPacked24Unsigned(insn)] = current + Load32Aligned(pc + 4); 504 DISPATCH(); 505 } 506 BYTECODE(SET_CP_TO_REGISTER) { 507 ADVANCE(SET_CP_TO_REGISTER); 508 SET_CURRENT_POSITION(registers[LoadPacked24Unsigned(insn)]); 509 DISPATCH(); 510 } 511 BYTECODE(SET_REGISTER_TO_SP) { 512 ADVANCE(SET_REGISTER_TO_SP); 513 registers[LoadPacked24Unsigned(insn)] = backtrack_stack.sp(); 514 DISPATCH(); 515 } 516 BYTECODE(SET_SP_TO_REGISTER) { 517 ADVANCE(SET_SP_TO_REGISTER); 518 backtrack_stack.set_sp(registers[LoadPacked24Unsigned(insn)]); 519 DISPATCH(); 520 } 521 BYTECODE(POP_CP) { 522 ADVANCE(POP_CP); 523 SET_CURRENT_POSITION(backtrack_stack.pop()); 524 DISPATCH(); 525 } 526 BYTECODE(POP_BT) { 527 static_assert(JSRegExp::kNoBacktrackLimit == 0); 528 if (++backtrack_count == backtrack_limit) { 529 int return_code = LoadPacked24Signed(insn); 530 return static_cast<IrregexpInterpreter::Result>(return_code); 531 } 532 533 IrregexpInterpreter::Result return_code = 534 HandleInterrupts(isolate, call_origin, code_array, subject_string, 535 
&code_base, &subject, &pc); 536 if (return_code != IrregexpInterpreter::SUCCESS) return return_code; 537 538 SET_PC_FROM_OFFSET(backtrack_stack.pop()); 539 DISPATCH(); 540 } 541 BYTECODE(POP_REGISTER) { 542 ADVANCE(POP_REGISTER); 543 registers[LoadPacked24Unsigned(insn)] = backtrack_stack.pop(); 544 DISPATCH(); 545 } 546 BYTECODE(FAIL) { 547 isolate->counters()->regexp_backtracks()->AddSample( 548 static_cast<int>(backtrack_count)); 549 return IrregexpInterpreter::FAILURE; 550 } 551 BYTECODE(SUCCEED) { 552 isolate->counters()->regexp_backtracks()->AddSample( 553 static_cast<int>(backtrack_count)); 554 registers.CopyToOutputRegisters(); 555 return IrregexpInterpreter::SUCCESS; 556 } 557 BYTECODE(ADVANCE_CP) { 558 ADVANCE(ADVANCE_CP); 559 ADVANCE_CURRENT_POSITION(LoadPacked24Signed(insn)); 560 DISPATCH(); 561 } 562 BYTECODE(GOTO) { 563 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 564 DISPATCH(); 565 } 566 BYTECODE(ADVANCE_CP_AND_GOTO) { 567 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 568 ADVANCE_CURRENT_POSITION(LoadPacked24Signed(insn)); 569 DISPATCH(); 570 } 571 BYTECODE(CHECK_FIXED_LENGTH) { 572 if (current == backtrack_stack.peek()) { 573 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 574 backtrack_stack.pop(); 575 } else { 576 ADVANCE(CHECK_FIXED_LENGTH); 577 } 578 DISPATCH(); 579 } 580 BYTECODE(LOAD_CURRENT_CHAR) { 581 int pos = current + LoadPacked24Signed(insn); 582 if (pos >= subject.length() || pos < 0) { 583 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 584 } else { 585 ADVANCE(LOAD_CURRENT_CHAR); 586 current_char = subject[pos]; 587 } 588 DISPATCH(); 589 } 590 BYTECODE(LOAD_CURRENT_CHAR_UNCHECKED) { 591 ADVANCE(LOAD_CURRENT_CHAR_UNCHECKED); 592 int pos = current + LoadPacked24Signed(insn); 593 current_char = subject[pos]; 594 DISPATCH(); 595 } 596 BYTECODE(LOAD_2_CURRENT_CHARS) { 597 int pos = current + LoadPacked24Signed(insn); 598 if (pos + 2 > subject.length() || pos < 0) { 599 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 600 } else { 601 
ADVANCE(LOAD_2_CURRENT_CHARS); 602 Char next = subject[pos + 1]; 603 current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char)))); 604 } 605 DISPATCH(); 606 } 607 BYTECODE(LOAD_2_CURRENT_CHARS_UNCHECKED) { 608 ADVANCE(LOAD_2_CURRENT_CHARS_UNCHECKED); 609 int pos = current + LoadPacked24Signed(insn); 610 Char next = subject[pos + 1]; 611 current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char)))); 612 DISPATCH(); 613 } 614 BYTECODE(LOAD_4_CURRENT_CHARS) { 615 DCHECK_EQ(1, sizeof(Char)); 616 int pos = current + LoadPacked24Signed(insn); 617 if (pos + 4 > subject.length() || pos < 0) { 618 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 619 } else { 620 ADVANCE(LOAD_4_CURRENT_CHARS); 621 Char next1 = subject[pos + 1]; 622 Char next2 = subject[pos + 2]; 623 Char next3 = subject[pos + 3]; 624 current_char = 625 (subject[pos] | (next1 << 8) | (next2 << 16) | (next3 << 24)); 626 } 627 DISPATCH(); 628 } 629 BYTECODE(LOAD_4_CURRENT_CHARS_UNCHECKED) { 630 ADVANCE(LOAD_4_CURRENT_CHARS_UNCHECKED); 631 DCHECK_EQ(1, sizeof(Char)); 632 int pos = current + LoadPacked24Signed(insn); 633 Char next1 = subject[pos + 1]; 634 Char next2 = subject[pos + 2]; 635 Char next3 = subject[pos + 3]; 636 current_char = 637 (subject[pos] | (next1 << 8) | (next2 << 16) | (next3 << 24)); 638 DISPATCH(); 639 } 640 BYTECODE(CHECK_4_CHARS) { 641 uint32_t c = Load32Aligned(pc + 4); 642 if (c == current_char) { 643 SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); 644 } else { 645 ADVANCE(CHECK_4_CHARS); 646 } 647 DISPATCH(); 648 } 649 BYTECODE(CHECK_CHAR) { 650 uint32_t c = LoadPacked24Unsigned(insn); 651 if (c == current_char) { 652 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 653 } else { 654 ADVANCE(CHECK_CHAR); 655 } 656 DISPATCH(); 657 } 658 BYTECODE(CHECK_NOT_4_CHARS) { 659 uint32_t c = Load32Aligned(pc + 4); 660 if (c != current_char) { 661 SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); 662 } else { 663 ADVANCE(CHECK_NOT_4_CHARS); 664 } 665 DISPATCH(); 666 } 667 BYTECODE(CHECK_NOT_CHAR) { 
668 uint32_t c = LoadPacked24Unsigned(insn); 669 if (c != current_char) { 670 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 671 } else { 672 ADVANCE(CHECK_NOT_CHAR); 673 } 674 DISPATCH(); 675 } 676 BYTECODE(AND_CHECK_4_CHARS) { 677 uint32_t c = Load32Aligned(pc + 4); 678 if (c == (current_char & Load32Aligned(pc + 8))) { 679 SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); 680 } else { 681 ADVANCE(AND_CHECK_4_CHARS); 682 } 683 DISPATCH(); 684 } 685 BYTECODE(AND_CHECK_CHAR) { 686 uint32_t c = LoadPacked24Unsigned(insn); 687 if (c == (current_char & Load32Aligned(pc + 4))) { 688 SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); 689 } else { 690 ADVANCE(AND_CHECK_CHAR); 691 } 692 DISPATCH(); 693 } 694 BYTECODE(AND_CHECK_NOT_4_CHARS) { 695 uint32_t c = Load32Aligned(pc + 4); 696 if (c != (current_char & Load32Aligned(pc + 8))) { 697 SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); 698 } else { 699 ADVANCE(AND_CHECK_NOT_4_CHARS); 700 } 701 DISPATCH(); 702 } 703 BYTECODE(AND_CHECK_NOT_CHAR) { 704 uint32_t c = LoadPacked24Unsigned(insn); 705 if (c != (current_char & Load32Aligned(pc + 4))) { 706 SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); 707 } else { 708 ADVANCE(AND_CHECK_NOT_CHAR); 709 } 710 DISPATCH(); 711 } 712 BYTECODE(MINUS_AND_CHECK_NOT_CHAR) { 713 uint32_t c = LoadPacked24Unsigned(insn); 714 uint32_t minus = Load16AlignedUnsigned(pc + 4); 715 uint32_t mask = Load16AlignedUnsigned(pc + 6); 716 if (c != ((current_char - minus) & mask)) { 717 SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); 718 } else { 719 ADVANCE(MINUS_AND_CHECK_NOT_CHAR); 720 } 721 DISPATCH(); 722 } 723 BYTECODE(CHECK_CHAR_IN_RANGE) { 724 uint32_t from = Load16AlignedUnsigned(pc + 4); 725 uint32_t to = Load16AlignedUnsigned(pc + 6); 726 if (from <= current_char && current_char <= to) { 727 SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); 728 } else { 729 ADVANCE(CHECK_CHAR_IN_RANGE); 730 } 731 DISPATCH(); 732 } 733 BYTECODE(CHECK_CHAR_NOT_IN_RANGE) { 734 uint32_t from = Load16AlignedUnsigned(pc + 4); 735 uint32_t to = 
Load16AlignedUnsigned(pc + 6); 736 if (from > current_char || current_char > to) { 737 SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); 738 } else { 739 ADVANCE(CHECK_CHAR_NOT_IN_RANGE); 740 } 741 DISPATCH(); 742 } 743 BYTECODE(CHECK_BIT_IN_TABLE) { 744 if (CheckBitInTable(current_char, pc + 8)) { 745 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 746 } else { 747 ADVANCE(CHECK_BIT_IN_TABLE); 748 } 749 DISPATCH(); 750 } 751 BYTECODE(CHECK_LT) { 752 uint32_t limit = LoadPacked24Unsigned(insn); 753 if (current_char < limit) { 754 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 755 } else { 756 ADVANCE(CHECK_LT); 757 } 758 DISPATCH(); 759 } 760 BYTECODE(CHECK_GT) { 761 uint32_t limit = LoadPacked24Unsigned(insn); 762 if (current_char > limit) { 763 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 764 } else { 765 ADVANCE(CHECK_GT); 766 } 767 DISPATCH(); 768 } 769 BYTECODE(CHECK_REGISTER_LT) { 770 if (registers[LoadPacked24Unsigned(insn)] < Load32Aligned(pc + 4)) { 771 SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); 772 } else { 773 ADVANCE(CHECK_REGISTER_LT); 774 } 775 DISPATCH(); 776 } 777 BYTECODE(CHECK_REGISTER_GE) { 778 if (registers[LoadPacked24Unsigned(insn)] >= Load32Aligned(pc + 4)) { 779 SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); 780 } else { 781 ADVANCE(CHECK_REGISTER_GE); 782 } 783 DISPATCH(); 784 } 785 BYTECODE(CHECK_REGISTER_EQ_POS) { 786 if (registers[LoadPacked24Unsigned(insn)] == current) { 787 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 788 } else { 789 ADVANCE(CHECK_REGISTER_EQ_POS); 790 } 791 DISPATCH(); 792 } 793 BYTECODE(CHECK_NOT_REGS_EQUAL) { 794 if (registers[LoadPacked24Unsigned(insn)] == 795 registers[Load32Aligned(pc + 4)]) { 796 ADVANCE(CHECK_NOT_REGS_EQUAL); 797 } else { 798 SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); 799 } 800 DISPATCH(); 801 } 802 BYTECODE(CHECK_NOT_BACK_REF) { 803 int from = registers[LoadPacked24Unsigned(insn)]; 804 int len = registers[LoadPacked24Unsigned(insn) + 1] - from; 805 if (from >= 0 && len > 0) { 806 if (current + len > subject.length() 
|| 807 !CompareCharsEqual(&subject[from], &subject[current], len)) { 808 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 809 DISPATCH(); 810 } 811 ADVANCE_CURRENT_POSITION(len); 812 } 813 ADVANCE(CHECK_NOT_BACK_REF); 814 DISPATCH(); 815 } 816 BYTECODE(CHECK_NOT_BACK_REF_BACKWARD) { 817 int from = registers[LoadPacked24Unsigned(insn)]; 818 int len = registers[LoadPacked24Unsigned(insn) + 1] - from; 819 if (from >= 0 && len > 0) { 820 if (current - len < 0 || 821 !CompareCharsEqual(&subject[from], &subject[current - len], len)) { 822 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 823 DISPATCH(); 824 } 825 SET_CURRENT_POSITION(current - len); 826 } 827 ADVANCE(CHECK_NOT_BACK_REF_BACKWARD); 828 DISPATCH(); 829 } 830 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE) { 831 int from = registers[LoadPacked24Unsigned(insn)]; 832 int len = registers[LoadPacked24Unsigned(insn) + 1] - from; 833 if (from >= 0 && len > 0) { 834 if (current + len > subject.length() || 835 !BackRefMatchesNoCase(isolate, from, current, len, subject, true)) { 836 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 837 DISPATCH(); 838 } 839 ADVANCE_CURRENT_POSITION(len); 840 } 841 ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE); 842 DISPATCH(); 843 } 844 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) { 845 int from = registers[LoadPacked24Unsigned(insn)]; 846 int len = registers[LoadPacked24Unsigned(insn) + 1] - from; 847 if (from >= 0 && len > 0) { 848 if (current + len > subject.length() || 849 !BackRefMatchesNoCase(isolate, from, current, len, subject, 850 false)) { 851 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 852 DISPATCH(); 853 } 854 ADVANCE_CURRENT_POSITION(len); 855 } 856 ADVANCE(CHECK_NOT_BACK_REF_NO_CASE); 857 DISPATCH(); 858 } 859 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD) { 860 int from = registers[LoadPacked24Unsigned(insn)]; 861 int len = registers[LoadPacked24Unsigned(insn) + 1] - from; 862 if (from >= 0 && len > 0) { 863 if (current - len < 0 || 864 !BackRefMatchesNoCase(isolate, from, current - len, len, 
subject, 865 true)) { 866 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 867 DISPATCH(); 868 } 869 SET_CURRENT_POSITION(current - len); 870 } 871 ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD); 872 DISPATCH(); 873 } 874 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) { 875 int from = registers[LoadPacked24Unsigned(insn)]; 876 int len = registers[LoadPacked24Unsigned(insn) + 1] - from; 877 if (from >= 0 && len > 0) { 878 if (current - len < 0 || 879 !BackRefMatchesNoCase(isolate, from, current - len, len, subject, 880 false)) { 881 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 882 DISPATCH(); 883 } 884 SET_CURRENT_POSITION(current - len); 885 } 886 ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD); 887 DISPATCH(); 888 } 889 BYTECODE(CHECK_AT_START) { 890 if (current + LoadPacked24Signed(insn) == 0) { 891 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 892 } else { 893 ADVANCE(CHECK_AT_START); 894 } 895 DISPATCH(); 896 } 897 BYTECODE(CHECK_NOT_AT_START) { 898 if (current + LoadPacked24Signed(insn) == 0) { 899 ADVANCE(CHECK_NOT_AT_START); 900 } else { 901 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 902 } 903 DISPATCH(); 904 } 905 BYTECODE(SET_CURRENT_POSITION_FROM_END) { 906 ADVANCE(SET_CURRENT_POSITION_FROM_END); 907 int by = LoadPacked24Unsigned(insn); 908 if (subject.length() - current > by) { 909 SET_CURRENT_POSITION(subject.length() - by); 910 current_char = subject[current - 1]; 911 } 912 DISPATCH(); 913 } 914 BYTECODE(CHECK_CURRENT_POSITION) { 915 int pos = current + LoadPacked24Signed(insn); 916 if (pos > subject.length() || pos < 0) { 917 SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); 918 } else { 919 ADVANCE(CHECK_CURRENT_POSITION); 920 } 921 DISPATCH(); 922 } 923 BYTECODE(SKIP_UNTIL_CHAR) { 924 int32_t load_offset = LoadPacked24Signed(insn); 925 int32_t advance = Load16AlignedSigned(pc + 4); 926 uint32_t c = Load16AlignedUnsigned(pc + 6); 927 while (IndexIsInBounds(current + load_offset, subject.length())) { 928 current_char = subject[current + load_offset]; 929 if (c == 
current_char) {
          SET_PC_FROM_OFFSET(Load32Aligned(pc + 8));
          DISPATCH();
        }
        ADVANCE_CURRENT_POSITION(advance);
      }
      SET_PC_FROM_OFFSET(Load32Aligned(pc + 12));
      DISPATCH();
    }
    // Skip ahead in steps of `advance` characters until the character loaded
    // from (current + load_offset), ANDed with `mask`, equals `c`. On a match
    // continue at the jump offset stored at pc + 16; when the position check
    // (current + maximum_offset must not pass the subject end) fails first,
    // continue at the offset stored at pc + 20 instead.
    BYTECODE(SKIP_UNTIL_CHAR_AND) {
      int32_t load_offset = LoadPacked24Signed(insn);
      int32_t advance = Load16AlignedSigned(pc + 4);
      uint16_t c = Load16AlignedUnsigned(pc + 6);
      uint32_t mask = Load32Aligned(pc + 8);
      int32_t maximum_offset = Load32Aligned(pc + 12);
      // The uintptr_t casts make a negative (out-of-range) position fail the
      // comparison instead of reading out of bounds.
      while (static_cast<uintptr_t>(current + maximum_offset) <=
             static_cast<uintptr_t>(subject.length())) {
        current_char = subject[current + load_offset];
        if (c == (current_char & mask)) {
          SET_PC_FROM_OFFSET(Load32Aligned(pc + 16));
          DISPATCH();
        }
        ADVANCE_CURRENT_POSITION(advance);
      }
      SET_PC_FROM_OFFSET(Load32Aligned(pc + 20));
      DISPATCH();
    }
    // Like SKIP_UNTIL_CHAR_AND but without the mask: skip until the loaded
    // character equals `c` exactly. Match target is the offset at pc + 12,
    // bailout target the offset at pc + 16.
    BYTECODE(SKIP_UNTIL_CHAR_POS_CHECKED) {
      int32_t load_offset = LoadPacked24Signed(insn);
      int32_t advance = Load16AlignedSigned(pc + 4);
      uint16_t c = Load16AlignedUnsigned(pc + 6);
      int32_t maximum_offset = Load32Aligned(pc + 8);
      while (static_cast<uintptr_t>(current + maximum_offset) <=
             static_cast<uintptr_t>(subject.length())) {
        current_char = subject[current + load_offset];
        if (c == current_char) {
          SET_PC_FROM_OFFSET(Load32Aligned(pc + 12));
          DISPATCH();
        }
        ADVANCE_CURRENT_POSITION(advance);
      }
      SET_PC_FROM_OFFSET(Load32Aligned(pc + 16));
      DISPATCH();
    }
    // Skip until the loaded character has its bit set in the table stored
    // inline at pc + 8 (queried via CheckBitInTable); bounds are re-checked
    // every iteration through IndexIsInBounds. Match target is the offset at
    // pc + 24, bailout target the offset at pc + 28.
    BYTECODE(SKIP_UNTIL_BIT_IN_TABLE) {
      int32_t load_offset = LoadPacked24Signed(insn);
      int32_t advance = Load32Aligned(pc + 4);
      const uint8_t* table = pc + 8;
      while (IndexIsInBounds(current + load_offset, subject.length())) {
        current_char = subject[current + load_offset];
        if (CheckBitInTable(current_char, table)) {
          SET_PC_FROM_OFFSET(Load32Aligned(pc + 24));
          DISPATCH();
        }
        ADVANCE_CURRENT_POSITION(advance);
      }
      SET_PC_FROM_OFFSET(Load32Aligned(pc + 28));
      DISPATCH();
    }
    // Skip until the loaded character is either greater than `limit` or has
    // its bit cleared in the table at pc + 8; both conditions jump to the
    // offset stored at pc + 24. Bailout target is the offset at pc + 28.
    BYTECODE(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE) {
      int32_t load_offset = LoadPacked24Signed(insn);
      int32_t advance = Load16AlignedSigned(pc + 4);
      uint16_t limit = Load16AlignedUnsigned(pc + 6);
      const uint8_t* table = pc + 8;
      while (IndexIsInBounds(current + load_offset, subject.length())) {
        current_char = subject[current + load_offset];
        if (current_char > limit) {
          SET_PC_FROM_OFFSET(Load32Aligned(pc + 24));
          DISPATCH();
        }
        if (!CheckBitInTable(current_char, table)) {
          SET_PC_FROM_OFFSET(Load32Aligned(pc + 24));
          DISPATCH();
        }
        ADVANCE_CURRENT_POSITION(advance);
      }
      SET_PC_FROM_OFFSET(Load32Aligned(pc + 28));
      DISPATCH();
    }
    // Skip until the loaded character equals either `c` or `c2`; both matches
    // jump to the offset stored at pc + 12. Bailout target is the offset at
    // pc + 16.
    BYTECODE(SKIP_UNTIL_CHAR_OR_CHAR) {
      int32_t load_offset = LoadPacked24Signed(insn);
      int32_t advance = Load32Aligned(pc + 4);
      uint16_t c = Load16AlignedUnsigned(pc + 8);
      uint16_t c2 = Load16AlignedUnsigned(pc + 10);
      while (IndexIsInBounds(current + load_offset, subject.length())) {
        current_char = subject[current + load_offset];
        // The two if-statements below are split up intentionally, as combining
        // them seems to result in register allocation behaving quite
        // differently and slowing down the resulting code.
        if (c == current_char) {
          SET_PC_FROM_OFFSET(Load32Aligned(pc + 12));
          DISPATCH();
        }
        if (c2 == current_char) {
          SET_PC_FROM_OFFSET(Load32Aligned(pc + 12));
          DISPATCH();
        }
        ADVANCE_CURRENT_POSITION(advance);
      }
      SET_PC_FROM_OFFSET(Load32Aligned(pc + 16));
      DISPATCH();
    }
#if V8_USE_COMPUTED_GOTO
// Lint gets confused a lot if we just use !V8_USE_COMPUTED_GOTO or ifndef
// V8_USE_COMPUTED_GOTO here.
#else
    default:
      UNREACHABLE();
  }
  // Label we jump to in DISPATCH(). There must be no instructions between the
  // end of the switch, this label and the end of the loop.
switch_dispatch_continuation : {}
#endif  // V8_USE_COMPUTED_GOTO
  }
}

#undef BYTECODE
#undef ADVANCE_CURRENT_POSITION
#undef SET_CURRENT_POSITION
#undef DISPATCH
#undef DECODE
#undef SET_PC_FROM_OFFSET
#undef ADVANCE
#undef BC_LABEL
#undef V8_USE_COMPUTED_GOTO

}  // namespace

// Top-level interpreter entry: runs the regexp bytecode over
// `subject_string` starting at `start_position`. Since MatchInternal
// produces at most one match per call, this loops while `output_registers`
// still has room for another full set of capture registers, advancing the
// start position past each match. Returns the number of matches written
// into `output_registers`, or — when MatchInternal reports EXCEPTION,
// FALLBACK_TO_EXPERIMENTAL or RETRY — that Result code directly.
// static
int IrregexpInterpreter::Match(Isolate* isolate,
                               Tagged<IrRegExpData> regexp_data,
                               Tagged<String> subject_string,
                               int* output_registers, int output_register_count,
                               int start_position,
                               RegExp::CallOrigin call_origin) {
  // Each interpreted execution counts towards eventual tier-up to compiled
  // code when the tier-up flag is enabled.
  if (v8_flags.regexp_tier_up) regexp_data->TierUpTick();

  bool is_any_unicode =
      IsEitherUnicode(JSRegExp::AsRegExpFlags(regexp_data->flags()));
  bool is_one_byte = String::IsOneByteRepresentationUnderneath(subject_string);
  // One-byte and two-byte subjects use separately generated bytecode.
  Tagged<TrustedByteArray> code_array = regexp_data->bytecode(is_one_byte);
  int total_register_count = regexp_data->max_register_count();

  // MatchInternal only supports returning a single match per call. In global
  // mode, i.e. when output_registers has space for more than one match, we
  // need to keep running until all matches are filled in.
  int registers_per_match =
      JSRegExp::RegistersForCaptureCount(regexp_data->capture_count());
  DCHECK_LE(registers_per_match, output_register_count);
  int number_of_matches_in_output_registers =
      output_register_count / registers_per_match;

  int backtrack_limit = regexp_data->backtrack_limit();

  int num_matches = 0;
  int* current_output_registers = output_registers;
  for (int i = 0; i < number_of_matches_in_output_registers; i++) {
    auto current_result = MatchInternal(
        isolate, &code_array, &subject_string, current_output_registers,
        registers_per_match, total_register_count, start_position, call_origin,
        backtrack_limit);

    if (current_result == SUCCESS) {
      // Fall through.
    } else if (current_result == FAILURE) {
      // No further match; report what we have so far.
      break;
    } else {
      // Abnormal termination is propagated unchanged to the caller.
      DCHECK(current_result == EXCEPTION ||
             current_result == FALLBACK_TO_EXPERIMENTAL ||
             current_result == RETRY);
      return current_result;
    }

    // Found a match. Advance the index.

    num_matches++;

    // Registers [0]/[1] hold the start/end positions of the match just
    // found; the next search starts at the match end.
    int next_start_position = current_output_registers[1];
    if (next_start_position == current_output_registers[0]) {
      // Zero-length matches.
      // Step forward by one index (a full code point in unicode mode) so we
      // do not loop forever on the same empty match.
      // TODO(jgruber): Use AdvanceStringIndex based on flat contents instead.
      next_start_position = static_cast<int>(RegExpUtils::AdvanceStringIndex(
          subject_string, next_start_position, is_any_unicode));
      if (next_start_position > static_cast<int>(subject_string->length())) {
        break;
      }
    }

    start_position = next_start_position;
    current_output_registers += registers_per_match;
  }

  return num_matches;
}

// Runs a single interpreter pass over the flat `subject_string`, writing at
// most one match into `output_registers`. Dispatches to the one-byte or
// two-byte RawMatch specialization depending on the subject's flat
// representation.
IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
    Isolate* isolate, Tagged<TrustedByteArray>* code_array,
    Tagged<String>* subject_string, int* output_registers,
    int output_register_count, int total_register_count, int start_position,
    RegExp::CallOrigin call_origin, uint32_t backtrack_limit) {
  DCHECK((*subject_string)->IsFlat());

  // Note: Heap allocation *is* allowed in two situations if calling from
  // Runtime:
  // 1. When creating & throwing a stack overflow exception. The interpreter
  //    aborts afterwards, and thus possible-moved objects are never used.
  // 2. When handling interrupts. We manually relocate unhandlified references
  //    after interrupts have run.
  DisallowGarbageCollection no_gc;

  // Default "previous character" when matching from position 0 —
  // NOTE(review): presumably so position 0 behaves like the start of a line
  // for lookbehind-style bytecodes; confirm against RawMatch.
  base::uc16 previous_char = '\n';
  String::FlatContent subject_content =
      (*subject_string)->GetFlatContent(no_gc);
  // Because interrupts can result in GC and string content relocation, the
  // checksum verification in FlatContent may fail even though this code is
  // safe. See (2) above.
  subject_content.UnsafeDisableChecksumVerification();
  if (subject_content.IsOneByte()) {
    base::Vector<const uint8_t> subject_vector =
        subject_content.ToOneByteVector();
    if (start_position != 0) previous_char = subject_vector[start_position - 1];
    return RawMatch(isolate, code_array, subject_string, subject_vector,
                    output_registers, output_register_count,
                    total_register_count, start_position, previous_char,
                    call_origin, backtrack_limit);
  } else {
    DCHECK(subject_content.IsTwoByte());
    base::Vector<const base::uc16> subject_vector =
        subject_content.ToUC16Vector();
    if (start_position != 0) previous_char = subject_vector[start_position - 1];
    return RawMatch(isolate, code_array, subject_string, subject_vector,
                    output_registers, output_register_count,
                    total_register_count, start_position, previous_char,
                    call_origin, backtrack_limit);
  }
}

#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER

// This method is called through an external reference from RegExpExecInternal
// builtin. The two unnamed Address parameters are accepted but unused here.
// Raw Tagged<> pointers are used throughout, so GC, JS execution and handle
// use stay disabled for the whole call (see the Disallow* scopes below).
#ifdef V8_ENABLE_SANDBOX_HARDWARE_SUPPORT
// Hardware sandboxing is incompatible with ASAN, see crbug.com/432168626.
DISABLE_ASAN
#endif  // V8_ENABLE_SANDBOX_HARDWARE_SUPPORT
int IrregexpInterpreter::MatchForCallFromJs(
    Address subject, int32_t start_position, Address, Address,
    int* output_registers, int32_t output_register_count,
    RegExp::CallOrigin call_origin, Isolate* isolate, Address regexp_data) {
  // TODO(422992937): investigate running the interpreter in sandboxed mode.
  ExitSandboxScope unsandboxed;

  DCHECK_NOT_NULL(isolate);
  DCHECK_NOT_NULL(output_registers);
  DCHECK(call_origin == RegExp::CallOrigin::kFromJs);

  DisallowGarbageCollection no_gc;
  DisallowJavascriptExecution no_js(isolate);
  DisallowHandleAllocation no_handles;
  DisallowHandleDereference no_deref;

  // Reconstruct tagged objects from the raw addresses passed by the builtin.
  Tagged<String> subject_string = Cast<String>(Tagged<Object>(subject));
  Tagged<IrRegExpData> regexp_data_obj =
      SbxCast<IrRegExpData>(Tagged<Object>(regexp_data));

  if (regexp_data_obj->MarkedForTierUp()) {
    // Returning RETRY will re-enter through runtime, where actual recompilation
    // for tier-up takes place.
    return IrregexpInterpreter::RETRY;
  }

  return Match(isolate, regexp_data_obj, subject_string, output_registers,
               output_register_count, start_position, call_origin);
}

#endif  // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER

// Runtime entry point: dereferences the handles and forwards to Match() with
// kFromRuntime as the call origin.
int IrregexpInterpreter::MatchForCallFromRuntime(
    Isolate* isolate, DirectHandle<IrRegExpData> regexp_data,
    DirectHandle<String> subject_string, int* output_registers,
    int output_register_count, int start_position) {
  return Match(isolate, *regexp_data, *subject_string, output_registers,
               output_register_count, start_position,
               RegExp::CallOrigin::kFromRuntime);
}

}  // namespace internal
}  // namespace v8