RegExpAPI.cpp (36998B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 * vim: set ts=8 sts=2 et sw=2 tw=80: 3 * This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 // Copyright 2020 the V8 project authors. All rights reserved. 8 // Use of this source code is governed by a BSD-style license that can be 9 // found in the LICENSE file. 10 11 #include "irregexp/RegExpAPI.h" 12 13 #include "mozilla/ArrayUtils.h" 14 #include "mozilla/Casting.h" 15 16 #include "frontend/FrontendContext.h" // AutoReportFrontendContext 17 #include "frontend/TokenStream.h" 18 #include "gc/GC.h" 19 #include "gc/Zone.h" 20 #include "irregexp/imported/regexp-ast.h" 21 #include "irregexp/imported/regexp-bytecode-generator.h" 22 #include "irregexp/imported/regexp-compiler.h" 23 #include "irregexp/imported/regexp-interpreter.h" 24 #include "irregexp/imported/regexp-macro-assembler-arch.h" 25 #include "irregexp/imported/regexp-macro-assembler-tracer.h" 26 #include "irregexp/imported/regexp-parser.h" 27 #include "irregexp/imported/regexp-stack.h" 28 #include "irregexp/imported/regexp.h" 29 #include "irregexp/RegExpNativeMacroAssembler.h" 30 #include "irregexp/RegExpShim.h" 31 #include "jit/JitCommon.h" 32 #include "js/ColumnNumber.h" // JS::ColumnNumberOneOrigin, JS::ColumnNumberOffset 33 #include "js/friend/ErrorMessages.h" // JSMSG_* 34 #include "js/friend/StackLimits.h" // js::ReportOverRecursed 35 #include "util/StringBuilder.h" 36 #include "vm/MatchPairs.h" 37 #include "vm/PlainObject.h" 38 #include "vm/RegExpShared.h" 39 40 namespace js { 41 namespace irregexp { 42 43 using mozilla::AssertedCast; 44 using mozilla::Maybe; 45 using mozilla::Nothing; 46 using mozilla::PointerRangeSize; 47 using mozilla::Some; 48 49 using frontend::DummyTokenStream; 50 using frontend::TokenStreamAnyChars; 51 52 using v8::internal::DisallowGarbageCollection; 53 using v8::internal::HandleScope; 54 using v8::internal::InputOutputData; 55 using v8::internal::IrregexpInterpreter; 56 using v8::internal::NativeRegExpMacroAssembler; 57 using v8::internal::RegExpBytecodeGenerator; 58 using v8::internal::RegExpCapture; 59 using v8::internal::RegExpCompileData; 60 using v8::internal::RegExpCompiler; 61 using v8::internal::RegExpError; 62 using v8::internal::RegExpMacroAssembler; 63 using v8::internal::RegExpMacroAssemblerTracer; 64 using v8::internal::RegExpNode; 65 using v8::internal::RegExpParser; 66 using v8::internal::SMRegExpMacroAssembler; 67 using v8::internal::Zone; 68 using v8::internal::ZoneVector; 69 70 using V8HandleString = v8::internal::Handle<v8::internal::String>; 71 using V8HandleRegExp = v8::internal::Handle<v8::internal::IrRegExpData>; 72 73 using namespace v8::internal::regexp_compiler_constants; 74 75 static uint32_t ErrorNumber(RegExpError err) { 76 switch (err) { 77 case RegExpError::kNone: 78 return JSMSG_NOT_AN_ERROR; 79 case RegExpError::kStackOverflow: 80 return JSMSG_OVER_RECURSED; 81 case RegExpError::kAnalysisStackOverflow: 82 return JSMSG_OVER_RECURSED; 83 case RegExpError::kTooLarge: 84 return JSMSG_TOO_MANY_PARENS; 85 case RegExpError::kUnterminatedGroup: 86 return JSMSG_MISSING_PAREN; 87 case RegExpError::kUnmatchedParen: 88 return JSMSG_UNMATCHED_RIGHT_PAREN; 89 case RegExpError::kEscapeAtEndOfPattern: 90 return JSMSG_ESCAPE_AT_END_OF_REGEXP; 91 case RegExpError::kInvalidPropertyName: 92 return JSMSG_INVALID_PROPERTY_NAME; 93 case RegExpError::kInvalidEscape: 94 return JSMSG_INVALID_IDENTITY_ESCAPE; 95 case RegExpError::kInvalidDecimalEscape: 96 return JSMSG_INVALID_DECIMAL_ESCAPE; 97 case RegExpError::kInvalidUnicodeEscape: 98 return JSMSG_INVALID_UNICODE_ESCAPE; 99 case RegExpError::kNothingToRepeat: 100 return JSMSG_NOTHING_TO_REPEAT; 101 case RegExpError::kLoneQuantifierBrackets: 102 // Note: V8 reports the same error for both ']' and '}'. 103 return JSMSG_RAW_BRACKET_IN_REGEXP; 104 case RegExpError::kRangeOutOfOrder: 105 return JSMSG_NUMBERS_OUT_OF_ORDER; 106 case RegExpError::kIncompleteQuantifier: 107 return JSMSG_INCOMPLETE_QUANTIFIER; 108 case RegExpError::kInvalidQuantifier: 109 return JSMSG_INVALID_QUANTIFIER; 110 case RegExpError::kInvalidGroup: 111 return JSMSG_INVALID_GROUP; 112 case RegExpError::kRepeatedFlag: 113 return JSMSG_REPEATED_FLAG; 114 case RegExpError::kInvalidFlagGroup: 115 return JSMSG_INVALID_FLAG_GROUP; 116 case RegExpError::kMultipleFlagDashes: 117 return JSMSG_MULTIPLE_FLAG_DASHES; 118 case RegExpError::kNotLinear: 119 // V8 has an experimental non-backtracking engine. We do not 120 // support it yet. 121 MOZ_CRASH("Non-backtracking execution not supported"); 122 case RegExpError::kTooManyCaptures: 123 return JSMSG_TOO_MANY_PARENS; 124 case RegExpError::kInvalidCaptureGroupName: 125 return JSMSG_INVALID_CAPTURE_NAME; 126 case RegExpError::kDuplicateCaptureGroupName: 127 return JSMSG_DUPLICATE_CAPTURE_NAME; 128 case RegExpError::kInvalidNamedReference: 129 return JSMSG_INVALID_NAMED_REF; 130 case RegExpError::kInvalidNamedCaptureReference: 131 return JSMSG_INVALID_NAMED_CAPTURE_REF; 132 case RegExpError::kInvalidClassPropertyName: 133 return JSMSG_INVALID_CLASS_PROPERTY_NAME; 134 case RegExpError::kInvalidCharacterClass: 135 return JSMSG_RANGE_WITH_CLASS_ESCAPE; 136 case RegExpError::kUnterminatedCharacterClass: 137 return JSMSG_UNTERM_CLASS; 138 case RegExpError::kOutOfOrderCharacterClass: 139 return JSMSG_BAD_CLASS_RANGE; 140 141 case RegExpError::kInvalidClassSetOperation: 142 return JSMSG_INVALID_CLASS_SET_OP; 143 case RegExpError::kInvalidCharacterInClass: 144 return JSMSG_INVALID_CHAR_IN_CLASS; 145 case RegExpError::kNegatedCharacterClassWithStrings: 146 return JSMSG_NEGATED_CLASS_WITH_STR; 147 148 case RegExpError::NumErrors: 149 MOZ_CRASH("Unreachable"); 150 } 151 MOZ_CRASH("Unreachable"); 152 } 153 154 Isolate* CreateIsolate(JSContext* cx) { 155 auto isolate = MakeUnique<Isolate>(cx); 156 if (!isolate || !isolate->init()) { 157 return nullptr; 158 } 159 return isolate.release(); 160 } 161 162 void TraceIsolate(JSTracer* trc, Isolate* isolate) { isolate->trace(trc); } 163 164 void DestroyIsolate(Isolate* isolate) { 165 MOZ_ASSERT(isolate->liveHandles() == 0); 166 MOZ_ASSERT(isolate->livePseudoHandles() == 0); 167 js_delete(isolate); 168 } 169 170 size_t IsolateSizeOfIncludingThis(Isolate* isolate, 171 mozilla::MallocSizeOf mallocSizeOf) { 172 return isolate->sizeOfIncludingThis(mallocSizeOf); 173 } 174 175 static JS::ColumnNumberOffset ComputeColumnOffset(const Latin1Char* begin, 176 const Latin1Char* end) { 177 return JS::ColumnNumberOffset( 178 AssertedCast<uint32_t>(PointerRangeSize(begin, end))); 179 } 180 181 static JS::ColumnNumberOffset ComputeColumnOffset(const char16_t* begin, 182 const char16_t* end) { 183 return JS::ColumnNumberOffset( 184 AssertedCast<uint32_t>(unicode::CountUTF16CodeUnits(begin, end))); 185 } 186 187 // This function is varargs purely so it can call ReportCompileErrorLatin1. 188 // We never call it with additional arguments. 189 template <typename CharT> 190 static void ReportSyntaxError(TokenStreamAnyChars& ts, 191 mozilla::Maybe<uint32_t> line, 192 mozilla::Maybe<JS::ColumnNumberOneOrigin> column, 193 RegExpCompileData& result, CharT* start, 194 size_t length, ...) { 195 MOZ_ASSERT(line.isSome() == column.isSome()); 196 197 Maybe<gc::AutoSuppressGC> suppressGC; 198 if (JSContext* maybeCx = ts.context()->maybeCurrentJSContext()) { 199 suppressGC.emplace(maybeCx); 200 } 201 uint32_t errorNumber = ErrorNumber(result.error); 202 203 if (errorNumber == JSMSG_OVER_RECURSED) { 204 ReportOverRecursed(ts.context()); 205 return; 206 } 207 208 uint32_t offset = std::max(result.error_pos, 0); 209 MOZ_ASSERT(offset <= length); 210 211 ErrorMetadata err; 212 213 // Ordinarily this indicates whether line-of-context information can be 214 // added, but we entirely ignore that here because we create a 215 // a line of context based on the expression source. 216 uint32_t location = ts.currentToken().pos.begin; 217 if (ts.fillExceptingContext(&err, location)) { 218 JS::ColumnNumberOffset columnOffset = 219 ComputeColumnOffset(start, start + offset); 220 if (line.isSome()) { 221 // If this pattern is being checked by the frontend Parser instead 222 // of other API entry points like |new RegExp|, then the parser will 223 // have provided both a line and column pointing at the *beginning* 224 // of the RegExp literal inside the source text. 225 // We adjust the columnNumber to point to the actual syntax error 226 // inside the literal. 227 err.lineNumber = *line; 228 err.columnNumber = *column + columnOffset; 229 } else { 230 // Line breaks are not significant in pattern text in the same way as 231 // in source text, so act as though pattern text is a single line, then 232 // compute a column based on "code point" count (treating a lone 233 // surrogate as a "code point" in UTF-16). Gak. 234 err.lineNumber = 1; 235 err.columnNumber = JS::ColumnNumberOneOrigin() + columnOffset; 236 } 237 } 238 239 // For most error reporting, the line of context derives from the token 240 // stream. So when location information doesn't come from the token 241 // stream, we can't give a line of context. But here the "line of context" 242 // can be (and is) derived from the pattern text, so we can provide it no 243 // matter if the location is derived from the caller. 244 245 const CharT* windowStart = 246 (offset > ErrorMetadata::lineOfContextRadius) 247 ? start + (offset - ErrorMetadata::lineOfContextRadius) 248 : start; 249 250 const CharT* windowEnd = 251 (length - offset > ErrorMetadata::lineOfContextRadius) 252 ? start + offset + ErrorMetadata::lineOfContextRadius 253 : start + length; 254 255 size_t windowLength = PointerRangeSize(windowStart, windowEnd); 256 MOZ_ASSERT(windowLength <= ErrorMetadata::lineOfContextRadius * 2); 257 258 // Create the windowed string, not including the potential line 259 // terminator. 260 StringBuilder windowBuf(ts.context()); 261 if (!windowBuf.append(windowStart, windowEnd)) { 262 return; 263 } 264 265 // The line of context must be null-terminated, and StringBuilder doesn't 266 // make that happen unless we force it to. 267 if (!windowBuf.append('\0')) { 268 return; 269 } 270 271 err.lineOfContext.reset(windowBuf.stealChars()); 272 if (!err.lineOfContext) { 273 return; 274 } 275 276 err.lineLength = windowLength; 277 err.tokenOffset = offset - (windowStart - start); 278 279 va_list args; 280 va_start(args, length); 281 ReportCompileErrorLatin1VA(ts.context(), std::move(err), nullptr, errorNumber, 282 &args); 283 va_end(args); 284 } 285 286 static void ReportSyntaxError(TokenStreamAnyChars& ts, 287 RegExpCompileData& result, 288 Handle<JSAtom*> pattern) { 289 JS::AutoCheckCannotGC nogc_; 290 if (pattern->hasLatin1Chars()) { 291 ReportSyntaxError(ts, Nothing(), Nothing(), result, 292 pattern->latin1Chars(nogc_), pattern->length()); 293 } else { 294 ReportSyntaxError(ts, Nothing(), Nothing(), result, 295 pattern->twoByteChars(nogc_), pattern->length()); 296 } 297 } 298 299 template <typename CharT> 300 static bool CheckPatternSyntaxImpl(js::LifoAlloc& alloc, 301 JS::NativeStackLimit stackLimit, 302 const CharT* input, uint32_t inputLength, 303 JS::RegExpFlags flags, 304 RegExpCompileData* result, 305 JS::AutoAssertNoGC& nogc) { 306 LifoAllocScope allocScope(&alloc); 307 Zone zone(allocScope.alloc()); 308 309 return RegExpParser::VerifyRegExpSyntax(&zone, stackLimit, input, inputLength, 310 flags, result, nogc); 311 } 312 313 bool CheckPatternSyntax(js::LifoAlloc& alloc, JS::NativeStackLimit stackLimit, 314 TokenStreamAnyChars& ts, 315 const mozilla::Range<const char16_t> chars, 316 JS::RegExpFlags flags, mozilla::Maybe<uint32_t> line, 317 mozilla::Maybe<JS::ColumnNumberOneOrigin> column) { 318 RegExpCompileData result; 319 JS::AutoAssertNoGC nogc; 320 if (!CheckPatternSyntaxImpl(alloc, stackLimit, chars.begin().get(), 321 chars.length(), flags, &result, nogc)) { 322 ReportSyntaxError(ts, line, column, result, chars.begin().get(), 323 chars.length()); 324 return false; 325 } 326 return true; 327 } 328 329 bool CheckPatternSyntax(JSContext* cx, JS::NativeStackLimit stackLimit, 330 TokenStreamAnyChars& ts, Handle<JSAtom*> pattern, 331 JS::RegExpFlags flags) { 332 RegExpCompileData result; 333 JS::AutoAssertNoGC nogc(cx); 334 if (pattern->hasLatin1Chars()) { 335 if (!CheckPatternSyntaxImpl(cx->tempLifoAlloc(), stackLimit, 336 pattern->latin1Chars(nogc), pattern->length(), 337 flags, &result, nogc)) { 338 ReportSyntaxError(ts, result, pattern); 339 return false; 340 } 341 return true; 342 } 343 if (!CheckPatternSyntaxImpl(cx->tempLifoAlloc(), stackLimit, 344 pattern->twoByteChars(nogc), pattern->length(), 345 flags, &result, nogc)) { 346 ReportSyntaxError(ts, result, pattern); 347 return false; 348 } 349 return true; 350 } 351 352 // A regexp is a good candidate for Boyer-Moore if it has at least 3 353 // times as many characters as it has unique characters. Note that 354 // table lookups in irregexp are done modulo tableSize (128). 355 template <typename CharT> 356 static bool HasFewDifferentCharacters(const CharT* chars, size_t length) { 357 const uint32_t tableSize = 358 v8::internal::NativeRegExpMacroAssembler::kTableSize; 359 bool character_found[tableSize] = {}; 360 uint32_t different = 0; 361 for (uint32_t i = 0; i < length; i++) { 362 uint32_t ch = chars[i] % tableSize; 363 if (!character_found[ch]) { 364 character_found[ch] = true; 365 different++; 366 // We declare a regexp low-alphabet if it has at least 3 times as many 367 // characters as it has different characters. 368 if (different * 3 > length) { 369 return false; 370 } 371 } 372 } 373 return true; 374 } 375 376 // Identifies the sort of pattern where Boyer-Moore is faster than string search 377 static bool UseBoyerMoore(Handle<JSAtom*> pattern, JS::AutoAssertNoGC& nogc) { 378 size_t length = 379 std::min(size_t(kMaxLookaheadForBoyerMoore), pattern->length()); 380 if (length <= kPatternTooShortForBoyerMoore) { 381 return false; 382 } 383 384 if (pattern->hasLatin1Chars()) { 385 return HasFewDifferentCharacters(pattern->latin1Chars(nogc), length); 386 } 387 MOZ_ASSERT(pattern->hasTwoByteChars()); 388 return HasFewDifferentCharacters(pattern->twoByteChars(nogc), length); 389 } 390 391 // Sample character frequency information for use in Boyer-Moore. 392 static void SampleCharacters(Handle<JSLinearString*> sample_subject, 393 RegExpCompiler& compiler) { 394 static const int kSampleSize = 128; 395 int chars_sampled = 0; 396 397 int length = sample_subject->length(); 398 399 int half_way = (length - kSampleSize) / 2; 400 for (int i = std::max(0, half_way); i < length && chars_sampled < kSampleSize; 401 i++, chars_sampled++) { 402 compiler.frequency_collator()->CountCharacter( 403 sample_subject->latin1OrTwoByteChar(i)); 404 } 405 } 406 407 // Recursively walking the AST for a deeply nested regexp (like 408 // `/(a(a(a(a(a(a(a(...(a)...))))))))/`) may overflow the stack while 409 // compiling. To avoid this, we use V8's implementation of the Visitor 410 // pattern to walk the AST first with an overly large stack frame. 411 class RegExpDepthCheck final : public v8::internal::RegExpVisitor { 412 public: 413 explicit RegExpDepthCheck(JSContext* cx) : cx_(cx) {} 414 415 bool check(v8::internal::RegExpTree* root) { 416 return !!root->Accept(this, nullptr); 417 } 418 419 // Leaf nodes with no children 420 #define LEAF_DEPTH(Kind) \ 421 void* Visit##Kind(v8::internal::RegExp##Kind* node, void*) override { \ 422 AutoCheckRecursionLimit recursion(cx_); \ 423 return (void*)recursion.checkWithExtraDontReport(cx_, FRAME_PADDING); \ 424 } 425 426 LEAF_DEPTH(Assertion) 427 LEAF_DEPTH(Atom) 428 LEAF_DEPTH(BackReference) 429 LEAF_DEPTH(ClassSetOperand) 430 LEAF_DEPTH(ClassRanges) 431 LEAF_DEPTH(Empty) 432 LEAF_DEPTH(Text) 433 #undef LEAF_DEPTH 434 435 // Wrapper nodes with one child 436 #define WRAPPER_DEPTH(Kind) \ 437 void* Visit##Kind(v8::internal::RegExp##Kind* node, void*) override { \ 438 AutoCheckRecursionLimit recursion(cx_); \ 439 if (!recursion.checkWithExtraDontReport(cx_, FRAME_PADDING)) { \ 440 return nullptr; \ 441 } \ 442 return node->body()->Accept(this, nullptr); \ 443 } 444 445 WRAPPER_DEPTH(Capture) 446 WRAPPER_DEPTH(Group) 447 WRAPPER_DEPTH(Lookaround) 448 WRAPPER_DEPTH(Quantifier) 449 #undef WRAPPER_DEPTH 450 451 void* VisitAlternative(v8::internal::RegExpAlternative* node, 452 void*) override { 453 AutoCheckRecursionLimit recursion(cx_); 454 if (!recursion.checkWithExtraDontReport(cx_, FRAME_PADDING)) { 455 return nullptr; 456 } 457 for (auto* child : *node->nodes()) { 458 if (!child->Accept(this, nullptr)) { 459 return nullptr; 460 } 461 } 462 return (void*)true; 463 } 464 void* VisitDisjunction(v8::internal::RegExpDisjunction* node, 465 void*) override { 466 AutoCheckRecursionLimit recursion(cx_); 467 if (!recursion.checkWithExtraDontReport(cx_, FRAME_PADDING)) { 468 return nullptr; 469 } 470 for (auto* child : *node->alternatives()) { 471 if (!child->Accept(this, nullptr)) { 472 return nullptr; 473 } 474 } 475 return (void*)true; 476 } 477 void* VisitClassSetExpression(v8::internal::RegExpClassSetExpression* node, 478 void*) override { 479 AutoCheckRecursionLimit recursion(cx_); 480 if (!recursion.checkWithExtraDontReport(cx_, FRAME_PADDING)) { 481 return nullptr; 482 } 483 for (auto* child : *node->operands()) { 484 if (!child->Accept(this, nullptr)) { 485 return nullptr; 486 } 487 } 488 return (void*)true; 489 } 490 491 private: 492 JSContext* cx_; 493 494 // This size is picked to be comfortably larger than any 495 // RegExp*::ToNode stack frame. 496 #if !defined(DEBUG) && !defined(MOZ_CODE_COVERAGE) 497 static const size_t FRAME_PADDING = 256; 498 #else 499 // Use a slightly larger padding for debug and code coverage builds. 500 static const size_t FRAME_PADDING = 256 * 2; 501 #endif 502 }; 503 504 enum class AssembleResult { 505 Success, 506 TooLarge, 507 OutOfMemory, 508 }; 509 510 [[nodiscard]] static AssembleResult Assemble( 511 JSContext* cx, RegExpCompiler* compiler, RegExpCompileData* data, 512 MutableHandleRegExpShared re, Handle<JSAtom*> pattern, Zone* zone, 513 bool useNativeCode, bool isLatin1) { 514 // Because we create a StackMacroAssembler, this function is not allowed 515 // to GC. If needed, we allocate and throw errors in the caller. 516 jit::TempAllocator temp(&cx->tempLifoAlloc()); 517 Maybe<jit::JitContext> jctx; 518 Maybe<js::jit::StackMacroAssembler> stack_masm; 519 UniquePtr<RegExpMacroAssembler> masm; 520 if (useNativeCode) { 521 NativeRegExpMacroAssembler::Mode mode = 522 isLatin1 ? NativeRegExpMacroAssembler::LATIN1 523 : NativeRegExpMacroAssembler::UC16; 524 // If we are compiling native code, we need a macroassembler, 525 // which needs a jit context. 526 jctx.emplace(cx); 527 stack_masm.emplace(cx, temp); 528 #ifdef DEBUG 529 // It would be much preferable to use `class AutoCreatedBy` here, but we 530 // may be operating without an assembler at all if `useNativeCode` is 531 // `false`, so there's no place to put such a call. 532 stack_masm.ref().pushCreator("Assemble() in RegExpAPI.cpp"); 533 #endif 534 uint32_t num_capture_registers = re->pairCount() * 2; 535 masm = MakeUnique<SMRegExpMacroAssembler>(cx, stack_masm.ref(), zone, mode, 536 num_capture_registers); 537 } else { 538 masm = MakeUnique<RegExpBytecodeGenerator>(cx->isolate, zone); 539 } 540 if (!masm) { 541 ReportOutOfMemory(cx); 542 return AssembleResult::OutOfMemory; 543 } 544 545 bool isLargePattern = 546 pattern->length() > v8::internal::RegExp::kRegExpTooLargeToOptimize; 547 masm->set_slow_safe(isLargePattern); 548 if (compiler->optimize()) { 549 compiler->set_optimize(!isLargePattern); 550 } 551 552 // When matching a regexp with known maximum length that is anchored 553 // at the end, we may be able to skip the beginning of long input 554 // strings. This decision is made here because it depends on 555 // information in the AST that isn't replicated in the Node 556 // structure used inside the compiler. 557 bool is_start_anchored = data->tree->IsAnchoredAtStart(); 558 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); 559 int max_length = data->tree->max_match(); 560 static const int kMaxBacksearchLimit = 1024; 561 if (is_end_anchored && !is_start_anchored && !re->sticky() && 562 max_length < kMaxBacksearchLimit) { 563 masm->SetCurrentPositionFromEnd(max_length); 564 } 565 566 if (re->global()) { 567 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL; 568 if (data->tree->min_match() > 0) { 569 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK; 570 } else if (re->unicode()) { 571 mode = RegExpMacroAssembler::GLOBAL_UNICODE; 572 } 573 masm->set_global_mode(mode); 574 } 575 576 // The masm tracer works as a thin wrapper around another macroassembler. 577 RegExpMacroAssembler* masm_ptr = masm.get(); 578 #ifdef DEBUG 579 UniquePtr<RegExpMacroAssembler> tracer_masm; 580 if (jit::JitOptions.trace_regexp_assembler) { 581 tracer_masm = MakeUnique<RegExpMacroAssemblerTracer>(masm_ptr); 582 masm_ptr = tracer_masm.get(); 583 } 584 #endif 585 586 // Compile the regexp. 587 V8HandleString wrappedPattern(v8::internal::String(pattern), cx->isolate); 588 RegExpCompiler::CompilationResult result = compiler->Assemble( 589 cx->isolate, masm_ptr, data->node, data->capture_count, wrappedPattern); 590 591 if (useNativeCode) { 592 #ifdef DEBUG 593 // See comment referencing `pushCreator` above. 594 stack_masm.ref().popCreator(); 595 #endif 596 } 597 598 if (!result.Succeeded()) { 599 MOZ_ASSERT(result.error == RegExpError::kTooLarge); 600 return AssembleResult::TooLarge; 601 } 602 if (result.code->value().isUndefined()) { 603 // SMRegExpMacroAssembler::GetCode returns undefined on OOM. 604 MOZ_ASSERT(useNativeCode); 605 return AssembleResult::OutOfMemory; 606 } 607 608 re->updateMaxRegisters(result.num_registers); 609 if (useNativeCode) { 610 // Transfer ownership of the tables from the macroassembler to the 611 // RegExpShared. 612 SMRegExpMacroAssembler::TableVector& tables = 613 static_cast<SMRegExpMacroAssembler*>(masm.get())->tables(); 614 for (uint32_t i = 0; i < tables.length(); i++) { 615 if (!re->addTable(std::move(tables[i]))) { 616 ReportOutOfMemory(cx); 617 return AssembleResult::OutOfMemory; 618 } 619 } 620 re->setJitCode(v8::internal::Code::cast(*result.code).inner(), isLatin1); 621 } else { 622 // Transfer ownership of the bytecode from the HandleScope to the 623 // RegExpShared. 624 ByteArray bytecode = 625 v8::internal::ByteArray::cast(*result.code).takeOwnership(cx->isolate); 626 uint32_t length = bytecode->length(); 627 re->setByteCode(bytecode.release(), isLatin1); 628 js::AddCellMemory(re, length, MemoryUse::RegExpSharedBytecode); 629 } 630 631 return AssembleResult::Success; 632 } 633 634 struct RegExpNamedCapture { 635 const ZoneVector<char16_t>* name; 636 js::Vector<uint32_t> indices; 637 638 RegExpNamedCapture(JSContext* cx, const ZoneVector<char16_t>* name) 639 : name(name), indices(cx) {} 640 }; 641 642 struct RegExpNamedCaptureIndexLess { 643 bool operator()(const RegExpNamedCapture& lhs, 644 const RegExpNamedCapture& rhs) const { 645 // Every name must have at least one corresponding capture index, and all 646 // the capture indices must be distinct. This allows us to sort on the 647 // lowest capture index. 648 MOZ_ASSERT(!lhs.indices.empty()); 649 MOZ_ASSERT(!rhs.indices.empty()); 650 MOZ_ASSERT_IF(&lhs != &rhs, lhs.indices[0] != rhs.indices[0]); 651 return lhs.indices[0] < rhs.indices[0]; 652 } 653 }; 654 655 bool InitializeNamedCaptures(JSContext* cx, HandleRegExpShared re, 656 ZoneVector<RegExpCapture*>* namedCaptures) { 657 // The irregexp parser returns named capture information in the form 658 // of a ZoneVector of RegExpCaptures nodes, each of which stores the 659 // capture name and the corresponding capture index. We create a 660 // template object with a property for each capture name, and store 661 // the capture indices as a heap-allocated array. 662 uint32_t numNamedCaptures = namedCaptures->size(); 663 664 // The input vector of named captures is already sorted by name, and then by 665 // capture index if there are duplicates. We iterate through the captures, 666 // creating groups for each set of indices corresponding to a name. Usually, 667 // there will be a 1:1 mapping. 668 js::Vector<RegExpNamedCapture> groups(cx); 669 if (!groups.reserve(numNamedCaptures)) { 670 js::ReportOutOfMemory(cx); 671 return false; 672 } 673 const ZoneVector<char16_t>* prevName = nullptr; 674 uint32_t numDistinctNamedCaptures = 0; 675 for (uint32_t i = 0; i < numNamedCaptures; i++) { 676 RegExpCapture* capture = (*namedCaptures)[i]; 677 const ZoneVector<char16_t>* name = capture->name(); 678 if (!prevName || *name != *prevName) { 679 if (!groups.emplaceBack(RegExpNamedCapture(cx, name))) { 680 js::ReportOutOfMemory(cx); 681 return false; 682 } 683 numDistinctNamedCaptures++; 684 prevName = name; 685 } 686 // Make sure we're getting the indices in the order we expect 687 MOZ_ASSERT_IF(!groups.back().indices.empty(), 688 groups.back().indices.back() < (uint32_t)capture->index()); 689 if (!groups.back().indices.emplaceBack(capture->index())) { 690 js::ReportOutOfMemory(cx); 691 return false; 692 } 693 } 694 695 // The capture name map must be sorted by index. 696 std::sort(groups.begin(), groups.end(), RegExpNamedCaptureIndexLess{}); 697 698 // Create a plain template object. 699 Rooted<js::PlainObject*> templateObject( 700 cx, js::NewPlainObjectWithProto(cx, nullptr, TenuredObject)); 701 if (!templateObject) { 702 return false; 703 } 704 705 // Allocate the capture index array. 706 uint32_t arraySize = numNamedCaptures * sizeof(uint32_t); 707 UniquePtr<uint32_t[], JS::FreePolicy> captureIndices( 708 static_cast<uint32_t*>(js_malloc(arraySize))); 709 if (!captureIndices) { 710 js::ReportOutOfMemory(cx); 711 return false; 712 } 713 714 // Allocate the capture slice index array, if necessary. We only use this 715 // if we have duplicate named capture groups. 716 bool hasDuplicateNames = numNamedCaptures != numDistinctNamedCaptures; 717 UniquePtr<uint32_t[], JS::FreePolicy> sliceIndices; 718 if (hasDuplicateNames) { 719 arraySize = numDistinctNamedCaptures * sizeof(uint32_t); 720 sliceIndices.reset(static_cast<uint32_t*>(js_malloc(arraySize))); 721 if (!sliceIndices) { 722 js::ReportOutOfMemory(cx); 723 return false; 724 } 725 } 726 727 // Initialize the properties of the template and store capture indices. 728 RootedId id(cx); 729 RootedValue dummyString(cx, StringValue(cx->runtime()->emptyString)); 730 size_t insertIndex = 0; 731 732 for (size_t i = 0; i < numDistinctNamedCaptures; ++i) { 733 RegExpNamedCapture& group = groups[i]; 734 // We store the names as properties on the template object, in the order of 735 // their lowest capture index. 736 JSAtom* name = js::AtomizeChars(cx, group.name->data(), group.name->size()); 737 if (!name) { 738 return false; 739 } 740 id = NameToId(name->asPropertyName()); 741 if (!NativeDefineDataProperty(cx, templateObject, id, dummyString, 742 JSPROP_ENUMERATE)) { 743 return false; 744 } 745 // The slice index keeps track of the captureIndex where indices 746 // corresponding to a name start. The difference between the current slice 747 // index and the next slice index is used to calculate how many values to 748 // return in the slice. This is only needed when we have duplicate capture 749 // names, otherwise, there's a 1:1 mapping, and we don't need the extra 750 // data. 751 if (hasDuplicateNames) { 752 sliceIndices[i] = insertIndex; 753 } 754 755 for (uint32_t captureIndex : groups[i].indices) { 756 captureIndices[insertIndex++] = captureIndex; 757 } 758 } 759 760 RegExpShared::InitializeNamedCaptures( 761 cx, re, numNamedCaptures, numDistinctNamedCaptures, templateObject, 762 captureIndices.release(), sliceIndices.release()); 763 return true; 764 } 765 766 bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re, 767 Handle<JSLinearString*> input, 768 RegExpShared::CodeKind codeKind) { 769 Rooted<JSAtom*> pattern(cx, re->getSource()); 770 JS::RegExpFlags flags = re->getFlags(); 771 LifoAllocScope allocScope(&cx->tempLifoAlloc()); 772 HandleScope handleScope(cx->isolate); 773 Zone zone(allocScope.alloc()); 774 775 RegExpCompileData data; 776 { 777 V8HandleString wrappedPattern(v8::internal::String(pattern), cx->isolate); 778 if (!RegExpParser::ParseRegExpFromHeapString( 779 cx->isolate, &zone, wrappedPattern, flags, &data)) { 780 AutoReportFrontendContext fc(cx); 781 JS::CompileOptions options(cx); 782 DummyTokenStream dummyTokenStream(&fc, options); 783 ReportSyntaxError(dummyTokenStream, data, pattern); 784 return false; 785 } 786 } 787 788 // Avoid stack overflow while recursively walking the AST. 789 RegExpDepthCheck depthCheck(cx); 790 if (!depthCheck.check(data.tree)) { 791 ReportOverRecursed(cx); 792 return false; 793 } 794 795 if (re->kind() == RegExpShared::Kind::Unparsed) { 796 // This is the first time we have compiled this regexp. 797 // First, check to see if we should use simple string search 798 // with an atom. 799 if (!flags.ignoreCase() && !flags.sticky()) { 800 Rooted<JSAtom*> searchAtom(cx); 801 if (data.simple) { 802 // The parse-tree is a single atom that is equal to the pattern. 803 searchAtom = re->getSource(); 804 } else if (data.tree->IsAtom() && data.capture_count == 0) { 805 // The parse-tree is a single atom that is not equal to the pattern. 806 v8::internal::RegExpAtom* atom = data.tree->AsAtom(); 807 const char16_t* twoByteChars = atom->data().begin(); 808 searchAtom = AtomizeChars(cx, twoByteChars, atom->length()); 809 if (!searchAtom) { 810 return false; 811 } 812 } 813 JS::AutoAssertNoGC nogc(cx); 814 if (searchAtom && !UseBoyerMoore(searchAtom, nogc)) { 815 re->useAtomMatch(searchAtom); 816 return true; 817 } 818 } 819 if (data.named_captures) { 820 if (!InitializeNamedCaptures(cx, re, data.named_captures)) { 821 return false; 822 } 823 } 824 // All fallible initialization has succeeded, so we can change state. 825 // Add one to capture_count to account for the whole-match capture. 826 uint32_t pairCount = data.capture_count + 1; 827 re->useRegExpMatch(pairCount); 828 } 829 830 MOZ_ASSERT(re->kind() == RegExpShared::Kind::RegExp); 831 832 RegExpCompiler compiler(cx->isolate, &zone, data.capture_count, flags, 833 input->hasLatin1Chars()); 834 835 bool isLatin1 = input->hasLatin1Chars(); 836 837 SampleCharacters(input, compiler); 838 data.node = compiler.PreprocessRegExp(&data, isLatin1); 839 if (data.error != RegExpError::kNone) { 840 MOZ_ASSERT(data.error == RegExpError::kTooLarge); 841 JS_ReportErrorASCII(cx, "regexp too big"); 842 cx->reportResourceExhaustion(); 843 return false; 844 } 845 data.error = AnalyzeRegExp(cx->isolate, isLatin1, flags, data.node); 846 if (data.error != RegExpError::kNone) { 847 MOZ_ASSERT(data.error == RegExpError::kAnalysisStackOverflow); 848 ReportOverRecursed(cx); 849 return false; 850 } 851 852 bool useNativeCode = codeKind == RegExpShared::CodeKind::Jitcode; 853 MOZ_ASSERT_IF(useNativeCode, IsNativeRegExpEnabled()); 854 855 switch (Assemble(cx, &compiler, &data, re, pattern, &zone, useNativeCode, 856 isLatin1)) { 857 case AssembleResult::TooLarge: 858 JS_ReportErrorASCII(cx, "regexp too big"); 859 cx->reportResourceExhaustion(); 860 return false; 861 case AssembleResult::OutOfMemory: 862 MOZ_ASSERT(cx->isThrowingOutOfMemory()); 863 return false; 864 case AssembleResult::Success: 865 break; 866 } 867 return true; 868 } 869 870 template <typename CharT> 871 RegExpRunStatus ExecuteRaw(jit::JitCode* code, const CharT* chars, 872 size_t length, size_t startIndex, 873 VectorMatchPairs* matches) { 874 InputOutputData data(chars, chars + length, startIndex, matches); 875 876 static_assert(static_cast<int32_t>(RegExpRunStatus::Error) == 877 v8::internal::RegExp::kInternalRegExpException); 878 static_assert(static_cast<int32_t>(RegExpRunStatus::Success) == 879 v8::internal::RegExp::kInternalRegExpSuccess); 880 static_assert(static_cast<int32_t>(RegExpRunStatus::Success_NotFound) == 881 v8::internal::RegExp::kInternalRegExpFailure); 882 883 using RegExpCodeSignature = int (*)(InputOutputData*); 884 auto function = reinterpret_cast<RegExpCodeSignature>(code->raw()); 885 { 886 JS::AutoSuppressGCAnalysis nogc; 887 return (RegExpRunStatus)CALL_GENERATED_1(function, &data); 888 } 889 } 890 891 RegExpRunStatus Interpret(JSContext* cx, MutableHandleRegExpShared re, 892 Handle<JSLinearString*> input, size_t startIndex, 893 VectorMatchPairs* matches) { 894 MOZ_ASSERT(re->getByteCode(input->hasLatin1Chars())); 895 896 HandleScope handleScope(cx->isolate); 897 V8HandleRegExp wrappedRegExp(v8::internal::IrRegExpData(re), cx->isolate); 898 V8HandleString wrappedInput(v8::internal::String(input), cx->isolate); 899 900 static_assert(static_cast<int32_t>(RegExpRunStatus::Error) == 901 v8::internal::RegExp::kInternalRegExpException); 902 static_assert(static_cast<int32_t>(RegExpRunStatus::Success) == 903 v8::internal::RegExp::kInternalRegExpSuccess); 904 static_assert(static_cast<int32_t>(RegExpRunStatus::Success_NotFound) == 905 v8::internal::RegExp::kInternalRegExpFailure); 906 907 RegExpRunStatus status = 908 (RegExpRunStatus)IrregexpInterpreter::MatchForCallFromRuntime( 909 cx->isolate, wrappedRegExp, wrappedInput, matches->pairsRaw(), 910 uint32_t(matches->pairCount() * 2), uint32_t(startIndex)); 911 912 MOZ_ASSERT(status == RegExpRunStatus::Error || 913 status == RegExpRunStatus::Success || 914 status == RegExpRunStatus::Success_NotFound); 915 916 return status; 917 } 918 919 RegExpRunStatus Execute(JSContext* cx, MutableHandleRegExpShared re, 920 Handle<JSLinearString*> input, size_t startIndex, 921 VectorMatchPairs* matches) { 922 bool latin1 = input->hasLatin1Chars(); 923 jit::JitCode* jitCode = re->getJitCode(latin1); 924 bool isCompiled = !!jitCode; 925 926 // Reset the Irregexp backtrack stack if it grows during execution. 927 irregexp::RegExpStackScope stackScope(cx->isolate); 928 929 if (isCompiled) { 930 JS::AutoCheckCannotGC nogc; 931 if (latin1) { 932 return ExecuteRaw(jitCode, input->latin1Chars(nogc), input->length(), 933 startIndex, matches); 934 } 935 return ExecuteRaw(jitCode, input->twoByteChars(nogc), input->length(), 936 startIndex, matches); 937 } 938 939 return Interpret(cx, re, input, startIndex, matches); 940 } 941 942 RegExpRunStatus ExecuteForFuzzing(JSContext* cx, Handle<JSAtom*> pattern, 943 Handle<JSLinearString*> input, 944 JS::RegExpFlags flags, size_t startIndex, 945 VectorMatchPairs* matches, 946 RegExpShared::CodeKind codeKind) { 947 RootedRegExpShared re(cx, cx->zone()->regExps().get(cx, pattern, flags)); 948 if (!RegExpShared::compileIfNecessary(cx, &re, input, codeKind)) { 949 return RegExpRunStatus::Error; 950 } 951 return RegExpShared::execute(cx, &re, input, startIndex, matches); 952 } 953 954 bool GrowBacktrackStack(RegExpStack* regexp_stack) { 955 return SMRegExpMacroAssembler::GrowBacktrackStack(regexp_stack); 956 } 957 958 uint32_t CaseInsensitiveCompareNonUnicode(const char16_t* substring1, 959 const char16_t* substring2, 960 size_t byteLength) { 961 return SMRegExpMacroAssembler::CaseInsensitiveCompareNonUnicode( 962 substring1, substring2, byteLength); 963 } 964 965 uint32_t CaseInsensitiveCompareUnicode(const char16_t* substring1, 966 const char16_t* substring2, 967 size_t byteLength) { 968 return SMRegExpMacroAssembler::CaseInsensitiveCompareUnicode( 969 substring1, substring2, byteLength); 970 } 971 972 bool IsCharacterInRangeArray(uint32_t c, ByteArrayData* ranges) { 973 return SMRegExpMacroAssembler::IsCharacterInRangeArray(c, ranges); 974 } 975 976 #ifdef DEBUG 977 bool IsolateShouldSimulateInterrupt(Isolate* isolate) { 978 return isolate->shouldSimulateInterrupt_ != 0; 979 } 980 981 void IsolateSetShouldSimulateInterrupt(Isolate* isolate) { 982 isolate->shouldSimulateInterrupt_ = 1; 983 } 984 void IsolateClearShouldSimulateInterrupt(Isolate* isolate) { 985 isolate->shouldSimulateInterrupt_ = 0; 986 } 987 #endif 988 989 } // namespace irregexp 990 } // namespace js