[ tor-browser ].git.dasho

RegExpAPI.cpp (36998B)
      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
      2 * vim: set ts=8 sts=2 et sw=2 tw=80:
      3 * This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 // Copyright 2020 the V8 project authors. All rights reserved.
      8 // Use of this source code is governed by a BSD-style license that can be
      9 // found in the LICENSE file.
     10 
     11 #include "irregexp/RegExpAPI.h"
     12 
     13 #include "mozilla/ArrayUtils.h"
     14 #include "mozilla/Casting.h"
     15 
     16 #include "frontend/FrontendContext.h"  // AutoReportFrontendContext
     17 #include "frontend/TokenStream.h"
     18 #include "gc/GC.h"
     19 #include "gc/Zone.h"
     20 #include "irregexp/imported/regexp-ast.h"
     21 #include "irregexp/imported/regexp-bytecode-generator.h"
     22 #include "irregexp/imported/regexp-compiler.h"
     23 #include "irregexp/imported/regexp-interpreter.h"
     24 #include "irregexp/imported/regexp-macro-assembler-arch.h"
     25 #include "irregexp/imported/regexp-macro-assembler-tracer.h"
     26 #include "irregexp/imported/regexp-parser.h"
     27 #include "irregexp/imported/regexp-stack.h"
     28 #include "irregexp/imported/regexp.h"
     29 #include "irregexp/RegExpNativeMacroAssembler.h"
     30 #include "irregexp/RegExpShim.h"
     31 #include "jit/JitCommon.h"
     32 #include "js/ColumnNumber.h"  // JS::ColumnNumberOneOrigin, JS::ColumnNumberOffset
     33 #include "js/friend/ErrorMessages.h"  // JSMSG_*
     34 #include "js/friend/StackLimits.h"    // js::ReportOverRecursed
     35 #include "util/StringBuilder.h"
     36 #include "vm/MatchPairs.h"
     37 #include "vm/PlainObject.h"
     38 #include "vm/RegExpShared.h"
     39 
     40 namespace js {
     41 namespace irregexp {
     42 
     43 using mozilla::AssertedCast;
     44 using mozilla::Maybe;
     45 using mozilla::Nothing;
     46 using mozilla::PointerRangeSize;
     47 using mozilla::Some;
     48 
     49 using frontend::DummyTokenStream;
     50 using frontend::TokenStreamAnyChars;
     51 
     52 using v8::internal::DisallowGarbageCollection;
     53 using v8::internal::HandleScope;
     54 using v8::internal::InputOutputData;
     55 using v8::internal::IrregexpInterpreter;
     56 using v8::internal::NativeRegExpMacroAssembler;
     57 using v8::internal::RegExpBytecodeGenerator;
     58 using v8::internal::RegExpCapture;
     59 using v8::internal::RegExpCompileData;
     60 using v8::internal::RegExpCompiler;
     61 using v8::internal::RegExpError;
     62 using v8::internal::RegExpMacroAssembler;
     63 using v8::internal::RegExpMacroAssemblerTracer;
     64 using v8::internal::RegExpNode;
     65 using v8::internal::RegExpParser;
     66 using v8::internal::SMRegExpMacroAssembler;
     67 using v8::internal::Zone;
     68 using v8::internal::ZoneVector;
     69 
     70 using V8HandleString = v8::internal::Handle<v8::internal::String>;
     71 using V8HandleRegExp = v8::internal::Handle<v8::internal::IrRegExpData>;
     72 
     73 using namespace v8::internal::regexp_compiler_constants;
     74 
     75 static uint32_t ErrorNumber(RegExpError err) {
     76  switch (err) {
     77    case RegExpError::kNone:
     78      return JSMSG_NOT_AN_ERROR;
     79    case RegExpError::kStackOverflow:
     80      return JSMSG_OVER_RECURSED;
     81    case RegExpError::kAnalysisStackOverflow:
     82      return JSMSG_OVER_RECURSED;
     83    case RegExpError::kTooLarge:
     84      return JSMSG_TOO_MANY_PARENS;
     85    case RegExpError::kUnterminatedGroup:
     86      return JSMSG_MISSING_PAREN;
     87    case RegExpError::kUnmatchedParen:
     88      return JSMSG_UNMATCHED_RIGHT_PAREN;
     89    case RegExpError::kEscapeAtEndOfPattern:
     90      return JSMSG_ESCAPE_AT_END_OF_REGEXP;
     91    case RegExpError::kInvalidPropertyName:
     92      return JSMSG_INVALID_PROPERTY_NAME;
     93    case RegExpError::kInvalidEscape:
     94      return JSMSG_INVALID_IDENTITY_ESCAPE;
     95    case RegExpError::kInvalidDecimalEscape:
     96      return JSMSG_INVALID_DECIMAL_ESCAPE;
     97    case RegExpError::kInvalidUnicodeEscape:
     98      return JSMSG_INVALID_UNICODE_ESCAPE;
     99    case RegExpError::kNothingToRepeat:
    100      return JSMSG_NOTHING_TO_REPEAT;
    101    case RegExpError::kLoneQuantifierBrackets:
    102      // Note: V8 reports the same error for both ']' and '}'.
    103      return JSMSG_RAW_BRACKET_IN_REGEXP;
    104    case RegExpError::kRangeOutOfOrder:
    105      return JSMSG_NUMBERS_OUT_OF_ORDER;
    106    case RegExpError::kIncompleteQuantifier:
    107      return JSMSG_INCOMPLETE_QUANTIFIER;
    108    case RegExpError::kInvalidQuantifier:
    109      return JSMSG_INVALID_QUANTIFIER;
    110    case RegExpError::kInvalidGroup:
    111      return JSMSG_INVALID_GROUP;
    112    case RegExpError::kRepeatedFlag:
    113      return JSMSG_REPEATED_FLAG;
    114    case RegExpError::kInvalidFlagGroup:
    115      return JSMSG_INVALID_FLAG_GROUP;
    116    case RegExpError::kMultipleFlagDashes:
    117      return JSMSG_MULTIPLE_FLAG_DASHES;
    118    case RegExpError::kNotLinear:
    119      // V8 has an experimental non-backtracking engine. We do not
    120      // support it yet.
    121      MOZ_CRASH("Non-backtracking execution not supported");
    122    case RegExpError::kTooManyCaptures:
    123      return JSMSG_TOO_MANY_PARENS;
    124    case RegExpError::kInvalidCaptureGroupName:
    125      return JSMSG_INVALID_CAPTURE_NAME;
    126    case RegExpError::kDuplicateCaptureGroupName:
    127      return JSMSG_DUPLICATE_CAPTURE_NAME;
    128    case RegExpError::kInvalidNamedReference:
    129      return JSMSG_INVALID_NAMED_REF;
    130    case RegExpError::kInvalidNamedCaptureReference:
    131      return JSMSG_INVALID_NAMED_CAPTURE_REF;
    132    case RegExpError::kInvalidClassPropertyName:
    133      return JSMSG_INVALID_CLASS_PROPERTY_NAME;
    134    case RegExpError::kInvalidCharacterClass:
    135      return JSMSG_RANGE_WITH_CLASS_ESCAPE;
    136    case RegExpError::kUnterminatedCharacterClass:
    137      return JSMSG_UNTERM_CLASS;
    138    case RegExpError::kOutOfOrderCharacterClass:
    139      return JSMSG_BAD_CLASS_RANGE;
    140 
    141    case RegExpError::kInvalidClassSetOperation:
    142      return JSMSG_INVALID_CLASS_SET_OP;
    143    case RegExpError::kInvalidCharacterInClass:
    144      return JSMSG_INVALID_CHAR_IN_CLASS;
    145    case RegExpError::kNegatedCharacterClassWithStrings:
    146      return JSMSG_NEGATED_CLASS_WITH_STR;
    147 
    148    case RegExpError::NumErrors:
    149      MOZ_CRASH("Unreachable");
    150  }
    151  MOZ_CRASH("Unreachable");
    152 }
    153 
    154 Isolate* CreateIsolate(JSContext* cx) {
    155  auto isolate = MakeUnique<Isolate>(cx);
    156  if (!isolate || !isolate->init()) {
    157    return nullptr;
    158  }
    159  return isolate.release();
    160 }
    161 
    162 void TraceIsolate(JSTracer* trc, Isolate* isolate) { isolate->trace(trc); }
    163 
    164 void DestroyIsolate(Isolate* isolate) {
    165  MOZ_ASSERT(isolate->liveHandles() == 0);
    166  MOZ_ASSERT(isolate->livePseudoHandles() == 0);
    167  js_delete(isolate);
    168 }
    169 
    170 size_t IsolateSizeOfIncludingThis(Isolate* isolate,
    171                                  mozilla::MallocSizeOf mallocSizeOf) {
    172  return isolate->sizeOfIncludingThis(mallocSizeOf);
    173 }
    174 
    175 static JS::ColumnNumberOffset ComputeColumnOffset(const Latin1Char* begin,
    176                                                  const Latin1Char* end) {
    177  return JS::ColumnNumberOffset(
    178      AssertedCast<uint32_t>(PointerRangeSize(begin, end)));
    179 }
    180 
    181 static JS::ColumnNumberOffset ComputeColumnOffset(const char16_t* begin,
    182                                                  const char16_t* end) {
    183  return JS::ColumnNumberOffset(
    184      AssertedCast<uint32_t>(unicode::CountUTF16CodeUnits(begin, end)));
    185 }
    186 
    187 // This function is varargs purely so it can call ReportCompileErrorLatin1.
    188 // We never call it with additional arguments.
    189 template <typename CharT>
    190 static void ReportSyntaxError(TokenStreamAnyChars& ts,
    191                              mozilla::Maybe<uint32_t> line,
    192                              mozilla::Maybe<JS::ColumnNumberOneOrigin> column,
    193                              RegExpCompileData& result, CharT* start,
    194                              size_t length, ...) {
    195  MOZ_ASSERT(line.isSome() == column.isSome());
    196 
    197  Maybe<gc::AutoSuppressGC> suppressGC;
    198  if (JSContext* maybeCx = ts.context()->maybeCurrentJSContext()) {
    199    suppressGC.emplace(maybeCx);
    200  }
    201  uint32_t errorNumber = ErrorNumber(result.error);
    202 
    203  if (errorNumber == JSMSG_OVER_RECURSED) {
    204    ReportOverRecursed(ts.context());
    205    return;
    206  }
    207 
    208  uint32_t offset = std::max(result.error_pos, 0);
    209  MOZ_ASSERT(offset <= length);
    210 
    211  ErrorMetadata err;
    212 
    213  // Ordinarily this indicates whether line-of-context information can be
    214  // added, but we entirely ignore that here because we create a
    215  // a line of context based on the expression source.
    216  uint32_t location = ts.currentToken().pos.begin;
    217  if (ts.fillExceptingContext(&err, location)) {
    218    JS::ColumnNumberOffset columnOffset =
    219        ComputeColumnOffset(start, start + offset);
    220    if (line.isSome()) {
    221      // If this pattern is being checked by the frontend Parser instead
    222      // of other API entry points like |new RegExp|, then the parser will
    223      // have provided both a line and column pointing at the *beginning*
    224      // of the RegExp literal inside the source text.
    225      // We adjust the columnNumber to point to the actual syntax error
    226      // inside the literal.
    227      err.lineNumber = *line;
    228      err.columnNumber = *column + columnOffset;
    229    } else {
    230      // Line breaks are not significant in pattern text in the same way as
    231      // in source text, so act as though pattern text is a single line, then
    232      // compute a column based on "code point" count (treating a lone
    233      // surrogate as a "code point" in UTF-16).  Gak.
    234      err.lineNumber = 1;
    235      err.columnNumber = JS::ColumnNumberOneOrigin() + columnOffset;
    236    }
    237  }
    238 
    239  // For most error reporting, the line of context derives from the token
    240  // stream.  So when location information doesn't come from the token
    241  // stream, we can't give a line of context.  But here the "line of context"
    242  // can be (and is) derived from the pattern text, so we can provide it no
    243  // matter if the location is derived from the caller.
    244 
    245  const CharT* windowStart =
    246      (offset > ErrorMetadata::lineOfContextRadius)
    247          ? start + (offset - ErrorMetadata::lineOfContextRadius)
    248          : start;
    249 
    250  const CharT* windowEnd =
    251      (length - offset > ErrorMetadata::lineOfContextRadius)
    252          ? start + offset + ErrorMetadata::lineOfContextRadius
    253          : start + length;
    254 
    255  size_t windowLength = PointerRangeSize(windowStart, windowEnd);
    256  MOZ_ASSERT(windowLength <= ErrorMetadata::lineOfContextRadius * 2);
    257 
    258  // Create the windowed string, not including the potential line
    259  // terminator.
    260  StringBuilder windowBuf(ts.context());
    261  if (!windowBuf.append(windowStart, windowEnd)) {
    262    return;
    263  }
    264 
    265  // The line of context must be null-terminated, and StringBuilder doesn't
    266  // make that happen unless we force it to.
    267  if (!windowBuf.append('\0')) {
    268    return;
    269  }
    270 
    271  err.lineOfContext.reset(windowBuf.stealChars());
    272  if (!err.lineOfContext) {
    273    return;
    274  }
    275 
    276  err.lineLength = windowLength;
    277  err.tokenOffset = offset - (windowStart - start);
    278 
    279  va_list args;
    280  va_start(args, length);
    281  ReportCompileErrorLatin1VA(ts.context(), std::move(err), nullptr, errorNumber,
    282                             &args);
    283  va_end(args);
    284 }
    285 
    286 static void ReportSyntaxError(TokenStreamAnyChars& ts,
    287                              RegExpCompileData& result,
    288                              Handle<JSAtom*> pattern) {
    289  JS::AutoCheckCannotGC nogc_;
    290  if (pattern->hasLatin1Chars()) {
    291    ReportSyntaxError(ts, Nothing(), Nothing(), result,
    292                      pattern->latin1Chars(nogc_), pattern->length());
    293  } else {
    294    ReportSyntaxError(ts, Nothing(), Nothing(), result,
    295                      pattern->twoByteChars(nogc_), pattern->length());
    296  }
    297 }
    298 
    299 template <typename CharT>
    300 static bool CheckPatternSyntaxImpl(js::LifoAlloc& alloc,
    301                                   JS::NativeStackLimit stackLimit,
    302                                   const CharT* input, uint32_t inputLength,
    303                                   JS::RegExpFlags flags,
    304                                   RegExpCompileData* result,
    305                                   JS::AutoAssertNoGC& nogc) {
    306  LifoAllocScope allocScope(&alloc);
    307  Zone zone(allocScope.alloc());
    308 
    309  return RegExpParser::VerifyRegExpSyntax(&zone, stackLimit, input, inputLength,
    310                                          flags, result, nogc);
    311 }
    312 
    313 bool CheckPatternSyntax(js::LifoAlloc& alloc, JS::NativeStackLimit stackLimit,
    314                        TokenStreamAnyChars& ts,
    315                        const mozilla::Range<const char16_t> chars,
    316                        JS::RegExpFlags flags, mozilla::Maybe<uint32_t> line,
    317                        mozilla::Maybe<JS::ColumnNumberOneOrigin> column) {
    318  RegExpCompileData result;
    319  JS::AutoAssertNoGC nogc;
    320  if (!CheckPatternSyntaxImpl(alloc, stackLimit, chars.begin().get(),
    321                              chars.length(), flags, &result, nogc)) {
    322    ReportSyntaxError(ts, line, column, result, chars.begin().get(),
    323                      chars.length());
    324    return false;
    325  }
    326  return true;
    327 }
    328 
    329 bool CheckPatternSyntax(JSContext* cx, JS::NativeStackLimit stackLimit,
    330                        TokenStreamAnyChars& ts, Handle<JSAtom*> pattern,
    331                        JS::RegExpFlags flags) {
    332  RegExpCompileData result;
    333  JS::AutoAssertNoGC nogc(cx);
    334  if (pattern->hasLatin1Chars()) {
    335    if (!CheckPatternSyntaxImpl(cx->tempLifoAlloc(), stackLimit,
    336                                pattern->latin1Chars(nogc), pattern->length(),
    337                                flags, &result, nogc)) {
    338      ReportSyntaxError(ts, result, pattern);
    339      return false;
    340    }
    341    return true;
    342  }
    343  if (!CheckPatternSyntaxImpl(cx->tempLifoAlloc(), stackLimit,
    344                              pattern->twoByteChars(nogc), pattern->length(),
    345                              flags, &result, nogc)) {
    346    ReportSyntaxError(ts, result, pattern);
    347    return false;
    348  }
    349  return true;
    350 }
    351 
    352 // A regexp is a good candidate for Boyer-Moore if it has at least 3
    353 // times as many characters as it has unique characters. Note that
    354 // table lookups in irregexp are done modulo tableSize (128).
    355 template <typename CharT>
    356 static bool HasFewDifferentCharacters(const CharT* chars, size_t length) {
    357  const uint32_t tableSize =
    358      v8::internal::NativeRegExpMacroAssembler::kTableSize;
    359  bool character_found[tableSize] = {};
    360  uint32_t different = 0;
    361  for (uint32_t i = 0; i < length; i++) {
    362    uint32_t ch = chars[i] % tableSize;
    363    if (!character_found[ch]) {
    364      character_found[ch] = true;
    365      different++;
    366      // We declare a regexp low-alphabet if it has at least 3 times as many
    367      // characters as it has different characters.
    368      if (different * 3 > length) {
    369        return false;
    370      }
    371    }
    372  }
    373  return true;
    374 }
    375 
    376 // Identifies the sort of pattern where Boyer-Moore is faster than string search
    377 static bool UseBoyerMoore(Handle<JSAtom*> pattern, JS::AutoAssertNoGC& nogc) {
    378  size_t length =
    379      std::min(size_t(kMaxLookaheadForBoyerMoore), pattern->length());
    380  if (length <= kPatternTooShortForBoyerMoore) {
    381    return false;
    382  }
    383 
    384  if (pattern->hasLatin1Chars()) {
    385    return HasFewDifferentCharacters(pattern->latin1Chars(nogc), length);
    386  }
    387  MOZ_ASSERT(pattern->hasTwoByteChars());
    388  return HasFewDifferentCharacters(pattern->twoByteChars(nogc), length);
    389 }
    390 
    391 // Sample character frequency information for use in Boyer-Moore.
    392 static void SampleCharacters(Handle<JSLinearString*> sample_subject,
    393                             RegExpCompiler& compiler) {
    394  static const int kSampleSize = 128;
    395  int chars_sampled = 0;
    396 
    397  int length = sample_subject->length();
    398 
    399  int half_way = (length - kSampleSize) / 2;
    400  for (int i = std::max(0, half_way); i < length && chars_sampled < kSampleSize;
    401       i++, chars_sampled++) {
    402    compiler.frequency_collator()->CountCharacter(
    403        sample_subject->latin1OrTwoByteChar(i));
    404  }
    405 }
    406 
    407 // Recursively walking the AST for a deeply nested regexp (like
    408 // `/(a(a(a(a(a(a(a(...(a)...))))))))/`) may overflow the stack while
    409 // compiling. To avoid this, we use V8's implementation of the Visitor
    410 // pattern to walk the AST first with an overly large stack frame.
    411 class RegExpDepthCheck final : public v8::internal::RegExpVisitor {
    412 public:
    413  explicit RegExpDepthCheck(JSContext* cx) : cx_(cx) {}
    414 
    415  bool check(v8::internal::RegExpTree* root) {
    416    return !!root->Accept(this, nullptr);
    417  }
    418 
    419  // Leaf nodes with no children
    420 #define LEAF_DEPTH(Kind)                                                  \
    421  void* Visit##Kind(v8::internal::RegExp##Kind* node, void*) override {   \
    422    AutoCheckRecursionLimit recursion(cx_);                               \
    423    return (void*)recursion.checkWithExtraDontReport(cx_, FRAME_PADDING); \
    424  }
    425 
    426  LEAF_DEPTH(Assertion)
    427  LEAF_DEPTH(Atom)
    428  LEAF_DEPTH(BackReference)
    429  LEAF_DEPTH(ClassSetOperand)
    430  LEAF_DEPTH(ClassRanges)
    431  LEAF_DEPTH(Empty)
    432  LEAF_DEPTH(Text)
    433 #undef LEAF_DEPTH
    434 
    435  // Wrapper nodes with one child
    436 #define WRAPPER_DEPTH(Kind)                                             \
    437  void* Visit##Kind(v8::internal::RegExp##Kind* node, void*) override { \
    438    AutoCheckRecursionLimit recursion(cx_);                             \
    439    if (!recursion.checkWithExtraDontReport(cx_, FRAME_PADDING)) {      \
    440      return nullptr;                                                   \
    441    }                                                                   \
    442    return node->body()->Accept(this, nullptr);                         \
    443  }
    444 
    445  WRAPPER_DEPTH(Capture)
    446  WRAPPER_DEPTH(Group)
    447  WRAPPER_DEPTH(Lookaround)
    448  WRAPPER_DEPTH(Quantifier)
    449 #undef WRAPPER_DEPTH
    450 
    451  void* VisitAlternative(v8::internal::RegExpAlternative* node,
    452                         void*) override {
    453    AutoCheckRecursionLimit recursion(cx_);
    454    if (!recursion.checkWithExtraDontReport(cx_, FRAME_PADDING)) {
    455      return nullptr;
    456    }
    457    for (auto* child : *node->nodes()) {
    458      if (!child->Accept(this, nullptr)) {
    459        return nullptr;
    460      }
    461    }
    462    return (void*)true;
    463  }
    464  void* VisitDisjunction(v8::internal::RegExpDisjunction* node,
    465                         void*) override {
    466    AutoCheckRecursionLimit recursion(cx_);
    467    if (!recursion.checkWithExtraDontReport(cx_, FRAME_PADDING)) {
    468      return nullptr;
    469    }
    470    for (auto* child : *node->alternatives()) {
    471      if (!child->Accept(this, nullptr)) {
    472        return nullptr;
    473      }
    474    }
    475    return (void*)true;
    476  }
    477  void* VisitClassSetExpression(v8::internal::RegExpClassSetExpression* node,
    478                                void*) override {
    479    AutoCheckRecursionLimit recursion(cx_);
    480    if (!recursion.checkWithExtraDontReport(cx_, FRAME_PADDING)) {
    481      return nullptr;
    482    }
    483    for (auto* child : *node->operands()) {
    484      if (!child->Accept(this, nullptr)) {
    485        return nullptr;
    486      }
    487    }
    488    return (void*)true;
    489  }
    490 
    491 private:
    492  JSContext* cx_;
    493 
    494  // This size is picked to be comfortably larger than any
    495  // RegExp*::ToNode stack frame.
    496 #if !defined(DEBUG) && !defined(MOZ_CODE_COVERAGE)
    497  static const size_t FRAME_PADDING = 256;
    498 #else
    499  // Use a slightly larger padding for debug and code coverage builds.
    500  static const size_t FRAME_PADDING = 256 * 2;
    501 #endif
    502 };
    503 
    504 enum class AssembleResult {
    505  Success,
    506  TooLarge,
    507  OutOfMemory,
    508 };
    509 
    510 [[nodiscard]] static AssembleResult Assemble(
    511    JSContext* cx, RegExpCompiler* compiler, RegExpCompileData* data,
    512    MutableHandleRegExpShared re, Handle<JSAtom*> pattern, Zone* zone,
    513    bool useNativeCode, bool isLatin1) {
    514  // Because we create a StackMacroAssembler, this function is not allowed
    515  // to GC. If needed, we allocate and throw errors in the caller.
    516  jit::TempAllocator temp(&cx->tempLifoAlloc());
    517  Maybe<jit::JitContext> jctx;
    518  Maybe<js::jit::StackMacroAssembler> stack_masm;
    519  UniquePtr<RegExpMacroAssembler> masm;
    520  if (useNativeCode) {
    521    NativeRegExpMacroAssembler::Mode mode =
    522        isLatin1 ? NativeRegExpMacroAssembler::LATIN1
    523                 : NativeRegExpMacroAssembler::UC16;
    524    // If we are compiling native code, we need a macroassembler,
    525    // which needs a jit context.
    526    jctx.emplace(cx);
    527    stack_masm.emplace(cx, temp);
    528 #ifdef DEBUG
    529    // It would be much preferable to use `class AutoCreatedBy` here, but we
    530    // may be operating without an assembler at all if `useNativeCode` is
    531    // `false`, so there's no place to put such a call.
    532    stack_masm.ref().pushCreator("Assemble() in RegExpAPI.cpp");
    533 #endif
    534    uint32_t num_capture_registers = re->pairCount() * 2;
    535    masm = MakeUnique<SMRegExpMacroAssembler>(cx, stack_masm.ref(), zone, mode,
    536                                              num_capture_registers);
    537  } else {
    538    masm = MakeUnique<RegExpBytecodeGenerator>(cx->isolate, zone);
    539  }
    540  if (!masm) {
    541    ReportOutOfMemory(cx);
    542    return AssembleResult::OutOfMemory;
    543  }
    544 
    545  bool isLargePattern =
    546      pattern->length() > v8::internal::RegExp::kRegExpTooLargeToOptimize;
    547  masm->set_slow_safe(isLargePattern);
    548  if (compiler->optimize()) {
    549    compiler->set_optimize(!isLargePattern);
    550  }
    551 
    552  // When matching a regexp with known maximum length that is anchored
    553  // at the end, we may be able to skip the beginning of long input
    554  // strings. This decision is made here because it depends on
    555  // information in the AST that isn't replicated in the Node
    556  // structure used inside the compiler.
    557  bool is_start_anchored = data->tree->IsAnchoredAtStart();
    558  bool is_end_anchored = data->tree->IsAnchoredAtEnd();
    559  int max_length = data->tree->max_match();
    560  static const int kMaxBacksearchLimit = 1024;
    561  if (is_end_anchored && !is_start_anchored && !re->sticky() &&
    562      max_length < kMaxBacksearchLimit) {
    563    masm->SetCurrentPositionFromEnd(max_length);
    564  }
    565 
    566  if (re->global()) {
    567    RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
    568    if (data->tree->min_match() > 0) {
    569      mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
    570    } else if (re->unicode()) {
    571      mode = RegExpMacroAssembler::GLOBAL_UNICODE;
    572    }
    573    masm->set_global_mode(mode);
    574  }
    575 
    576  // The masm tracer works as a thin wrapper around another macroassembler.
    577  RegExpMacroAssembler* masm_ptr = masm.get();
    578 #ifdef DEBUG
    579  UniquePtr<RegExpMacroAssembler> tracer_masm;
    580  if (jit::JitOptions.trace_regexp_assembler) {
    581    tracer_masm = MakeUnique<RegExpMacroAssemblerTracer>(masm_ptr);
    582    masm_ptr = tracer_masm.get();
    583  }
    584 #endif
    585 
    586  // Compile the regexp.
    587  V8HandleString wrappedPattern(v8::internal::String(pattern), cx->isolate);
    588  RegExpCompiler::CompilationResult result = compiler->Assemble(
    589      cx->isolate, masm_ptr, data->node, data->capture_count, wrappedPattern);
    590 
    591  if (useNativeCode) {
    592 #ifdef DEBUG
    593    // See comment referencing `pushCreator` above.
    594    stack_masm.ref().popCreator();
    595 #endif
    596  }
    597 
    598  if (!result.Succeeded()) {
    599    MOZ_ASSERT(result.error == RegExpError::kTooLarge);
    600    return AssembleResult::TooLarge;
    601  }
    602  if (result.code->value().isUndefined()) {
    603    // SMRegExpMacroAssembler::GetCode returns undefined on OOM.
    604    MOZ_ASSERT(useNativeCode);
    605    return AssembleResult::OutOfMemory;
    606  }
    607 
    608  re->updateMaxRegisters(result.num_registers);
    609  if (useNativeCode) {
    610    // Transfer ownership of the tables from the macroassembler to the
    611    // RegExpShared.
    612    SMRegExpMacroAssembler::TableVector& tables =
    613        static_cast<SMRegExpMacroAssembler*>(masm.get())->tables();
    614    for (uint32_t i = 0; i < tables.length(); i++) {
    615      if (!re->addTable(std::move(tables[i]))) {
    616        ReportOutOfMemory(cx);
    617        return AssembleResult::OutOfMemory;
    618      }
    619    }
    620    re->setJitCode(v8::internal::Code::cast(*result.code).inner(), isLatin1);
    621  } else {
    622    // Transfer ownership of the bytecode from the HandleScope to the
    623    // RegExpShared.
    624    ByteArray bytecode =
    625        v8::internal::ByteArray::cast(*result.code).takeOwnership(cx->isolate);
    626    uint32_t length = bytecode->length();
    627    re->setByteCode(bytecode.release(), isLatin1);
    628    js::AddCellMemory(re, length, MemoryUse::RegExpSharedBytecode);
    629  }
    630 
    631  return AssembleResult::Success;
    632 }
    633 
    634 struct RegExpNamedCapture {
    635  const ZoneVector<char16_t>* name;
    636  js::Vector<uint32_t> indices;
    637 
    638  RegExpNamedCapture(JSContext* cx, const ZoneVector<char16_t>* name)
    639      : name(name), indices(cx) {}
    640 };
    641 
    642 struct RegExpNamedCaptureIndexLess {
    643  bool operator()(const RegExpNamedCapture& lhs,
    644                  const RegExpNamedCapture& rhs) const {
    645    // Every name must have at least one corresponding capture index, and all
    646    // the capture indices must be distinct. This allows us to sort on the
    647    // lowest capture index.
    648    MOZ_ASSERT(!lhs.indices.empty());
    649    MOZ_ASSERT(!rhs.indices.empty());
    650    MOZ_ASSERT_IF(&lhs != &rhs, lhs.indices[0] != rhs.indices[0]);
    651    return lhs.indices[0] < rhs.indices[0];
    652  }
    653 };
    654 
    655 bool InitializeNamedCaptures(JSContext* cx, HandleRegExpShared re,
    656                             ZoneVector<RegExpCapture*>* namedCaptures) {
    657  // The irregexp parser returns named capture information in the form
    658  // of a ZoneVector of RegExpCaptures nodes, each of which stores the
    659  // capture name and the corresponding capture index. We create a
    660  // template object with a property for each capture name, and store
    661  // the capture indices as a heap-allocated array.
    662  uint32_t numNamedCaptures = namedCaptures->size();
    663 
    664  // The input vector of named captures is already sorted by name, and then by
    665  // capture index if there are duplicates. We iterate through the captures,
    666  // creating groups for each set of indices corresponding to a name. Usually,
    667  // there will be a 1:1 mapping.
    668  js::Vector<RegExpNamedCapture> groups(cx);
    669  if (!groups.reserve(numNamedCaptures)) {
    670    js::ReportOutOfMemory(cx);
    671    return false;
    672  }
    673  const ZoneVector<char16_t>* prevName = nullptr;
    674  uint32_t numDistinctNamedCaptures = 0;
    675  for (uint32_t i = 0; i < numNamedCaptures; i++) {
    676    RegExpCapture* capture = (*namedCaptures)[i];
    677    const ZoneVector<char16_t>* name = capture->name();
    678    if (!prevName || *name != *prevName) {
    679      if (!groups.emplaceBack(RegExpNamedCapture(cx, name))) {
    680        js::ReportOutOfMemory(cx);
    681        return false;
    682      }
    683      numDistinctNamedCaptures++;
    684      prevName = name;
    685    }
    686    // Make sure we're getting the indices in the order we expect
    687    MOZ_ASSERT_IF(!groups.back().indices.empty(),
    688                  groups.back().indices.back() < (uint32_t)capture->index());
    689    if (!groups.back().indices.emplaceBack(capture->index())) {
    690      js::ReportOutOfMemory(cx);
    691      return false;
    692    }
    693  }
    694 
    695  // The capture name map must be sorted by index.
    696  std::sort(groups.begin(), groups.end(), RegExpNamedCaptureIndexLess{});
    697 
    698  // Create a plain template object.
    699  Rooted<js::PlainObject*> templateObject(
    700      cx, js::NewPlainObjectWithProto(cx, nullptr, TenuredObject));
    701  if (!templateObject) {
    702    return false;
    703  }
    704 
    705  // Allocate the capture index array.
    706  uint32_t arraySize = numNamedCaptures * sizeof(uint32_t);
    707  UniquePtr<uint32_t[], JS::FreePolicy> captureIndices(
    708      static_cast<uint32_t*>(js_malloc(arraySize)));
    709  if (!captureIndices) {
    710    js::ReportOutOfMemory(cx);
    711    return false;
    712  }
    713 
    714  // Allocate the capture slice index array, if necessary. We only use this
    715  // if we have duplicate named capture groups.
    716  bool hasDuplicateNames = numNamedCaptures != numDistinctNamedCaptures;
    717  UniquePtr<uint32_t[], JS::FreePolicy> sliceIndices;
    718  if (hasDuplicateNames) {
    719    arraySize = numDistinctNamedCaptures * sizeof(uint32_t);
    720    sliceIndices.reset(static_cast<uint32_t*>(js_malloc(arraySize)));
    721    if (!sliceIndices) {
    722      js::ReportOutOfMemory(cx);
    723      return false;
    724    }
    725  }
    726 
    727  // Initialize the properties of the template and store capture indices.
    728  RootedId id(cx);
    729  RootedValue dummyString(cx, StringValue(cx->runtime()->emptyString));
    730  size_t insertIndex = 0;
    731 
    732  for (size_t i = 0; i < numDistinctNamedCaptures; ++i) {
    733    RegExpNamedCapture& group = groups[i];
    734    // We store the names as properties on the template object, in the order of
    735    // their lowest capture index.
    736    JSAtom* name = js::AtomizeChars(cx, group.name->data(), group.name->size());
    737    if (!name) {
    738      return false;
    739    }
    740    id = NameToId(name->asPropertyName());
    741    if (!NativeDefineDataProperty(cx, templateObject, id, dummyString,
    742                                  JSPROP_ENUMERATE)) {
    743      return false;
    744    }
    745    // The slice index keeps track of the captureIndex where indices
    746    // corresponding to a name start. The difference between the current slice
    747    // index and the next slice index is used to calculate how many values to
    748    // return in the slice. This is only needed when we have duplicate capture
    749    // names, otherwise, there's a 1:1 mapping, and we don't need the extra
    750    // data.
    751    if (hasDuplicateNames) {
    752      sliceIndices[i] = insertIndex;
    753    }
    754 
    755    for (uint32_t captureIndex : groups[i].indices) {
    756      captureIndices[insertIndex++] = captureIndex;
    757    }
    758  }
    759 
    760  RegExpShared::InitializeNamedCaptures(
    761      cx, re, numNamedCaptures, numDistinctNamedCaptures, templateObject,
    762      captureIndices.release(), sliceIndices.release());
    763  return true;
    764 }
    765 
    766 bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re,
    767                    Handle<JSLinearString*> input,
    768                    RegExpShared::CodeKind codeKind) {
    769  Rooted<JSAtom*> pattern(cx, re->getSource());
    770  JS::RegExpFlags flags = re->getFlags();
    771  LifoAllocScope allocScope(&cx->tempLifoAlloc());
    772  HandleScope handleScope(cx->isolate);
    773  Zone zone(allocScope.alloc());
    774 
    775  RegExpCompileData data;
    776  {
    777    V8HandleString wrappedPattern(v8::internal::String(pattern), cx->isolate);
    778    if (!RegExpParser::ParseRegExpFromHeapString(
    779            cx->isolate, &zone, wrappedPattern, flags, &data)) {
    780      AutoReportFrontendContext fc(cx);
    781      JS::CompileOptions options(cx);
    782      DummyTokenStream dummyTokenStream(&fc, options);
    783      ReportSyntaxError(dummyTokenStream, data, pattern);
    784      return false;
    785    }
    786  }
    787 
    788  // Avoid stack overflow while recursively walking the AST.
    789  RegExpDepthCheck depthCheck(cx);
    790  if (!depthCheck.check(data.tree)) {
    791    ReportOverRecursed(cx);
    792    return false;
    793  }
    794 
    795  if (re->kind() == RegExpShared::Kind::Unparsed) {
    796    // This is the first time we have compiled this regexp.
    797    // First, check to see if we should use simple string search
    798    // with an atom.
    799    if (!flags.ignoreCase() && !flags.sticky()) {
    800      Rooted<JSAtom*> searchAtom(cx);
    801      if (data.simple) {
    802        // The parse-tree is a single atom that is equal to the pattern.
    803        searchAtom = re->getSource();
    804      } else if (data.tree->IsAtom() && data.capture_count == 0) {
    805        // The parse-tree is a single atom that is not equal to the pattern.
    806        v8::internal::RegExpAtom* atom = data.tree->AsAtom();
    807        const char16_t* twoByteChars = atom->data().begin();
    808        searchAtom = AtomizeChars(cx, twoByteChars, atom->length());
    809        if (!searchAtom) {
    810          return false;
    811        }
    812      }
    813      JS::AutoAssertNoGC nogc(cx);
    814      if (searchAtom && !UseBoyerMoore(searchAtom, nogc)) {
    815        re->useAtomMatch(searchAtom);
    816        return true;
    817      }
    818    }
    819    if (data.named_captures) {
    820      if (!InitializeNamedCaptures(cx, re, data.named_captures)) {
    821        return false;
    822      }
    823    }
    824    // All fallible initialization has succeeded, so we can change state.
    825    // Add one to capture_count to account for the whole-match capture.
    826    uint32_t pairCount = data.capture_count + 1;
    827    re->useRegExpMatch(pairCount);
    828  }
    829 
    830  MOZ_ASSERT(re->kind() == RegExpShared::Kind::RegExp);
    831 
    832  RegExpCompiler compiler(cx->isolate, &zone, data.capture_count, flags,
    833                          input->hasLatin1Chars());
    834 
    835  bool isLatin1 = input->hasLatin1Chars();
    836 
    837  SampleCharacters(input, compiler);
    838  data.node = compiler.PreprocessRegExp(&data, isLatin1);
    839  if (data.error != RegExpError::kNone) {
    840    MOZ_ASSERT(data.error == RegExpError::kTooLarge);
    841    JS_ReportErrorASCII(cx, "regexp too big");
    842    cx->reportResourceExhaustion();
    843    return false;
    844  }
    845  data.error = AnalyzeRegExp(cx->isolate, isLatin1, flags, data.node);
    846  if (data.error != RegExpError::kNone) {
    847    MOZ_ASSERT(data.error == RegExpError::kAnalysisStackOverflow);
    848    ReportOverRecursed(cx);
    849    return false;
    850  }
    851 
    852  bool useNativeCode = codeKind == RegExpShared::CodeKind::Jitcode;
    853  MOZ_ASSERT_IF(useNativeCode, IsNativeRegExpEnabled());
    854 
    855  switch (Assemble(cx, &compiler, &data, re, pattern, &zone, useNativeCode,
    856                   isLatin1)) {
    857    case AssembleResult::TooLarge:
    858      JS_ReportErrorASCII(cx, "regexp too big");
    859      cx->reportResourceExhaustion();
    860      return false;
    861    case AssembleResult::OutOfMemory:
    862      MOZ_ASSERT(cx->isThrowingOutOfMemory());
    863      return false;
    864    case AssembleResult::Success:
    865      break;
    866  }
    867  return true;
    868 }
    869 
    870 template <typename CharT>
    871 RegExpRunStatus ExecuteRaw(jit::JitCode* code, const CharT* chars,
    872                           size_t length, size_t startIndex,
    873                           VectorMatchPairs* matches) {
    874  InputOutputData data(chars, chars + length, startIndex, matches);
    875 
    876  static_assert(static_cast<int32_t>(RegExpRunStatus::Error) ==
    877                v8::internal::RegExp::kInternalRegExpException);
    878  static_assert(static_cast<int32_t>(RegExpRunStatus::Success) ==
    879                v8::internal::RegExp::kInternalRegExpSuccess);
    880  static_assert(static_cast<int32_t>(RegExpRunStatus::Success_NotFound) ==
    881                v8::internal::RegExp::kInternalRegExpFailure);
    882 
    883  using RegExpCodeSignature = int (*)(InputOutputData*);
    884  auto function = reinterpret_cast<RegExpCodeSignature>(code->raw());
    885  {
    886    JS::AutoSuppressGCAnalysis nogc;
    887    return (RegExpRunStatus)CALL_GENERATED_1(function, &data);
    888  }
    889 }
    890 
    891 RegExpRunStatus Interpret(JSContext* cx, MutableHandleRegExpShared re,
    892                          Handle<JSLinearString*> input, size_t startIndex,
    893                          VectorMatchPairs* matches) {
    894  MOZ_ASSERT(re->getByteCode(input->hasLatin1Chars()));
    895 
    896  HandleScope handleScope(cx->isolate);
    897  V8HandleRegExp wrappedRegExp(v8::internal::IrRegExpData(re), cx->isolate);
    898  V8HandleString wrappedInput(v8::internal::String(input), cx->isolate);
    899 
    900  static_assert(static_cast<int32_t>(RegExpRunStatus::Error) ==
    901                v8::internal::RegExp::kInternalRegExpException);
    902  static_assert(static_cast<int32_t>(RegExpRunStatus::Success) ==
    903                v8::internal::RegExp::kInternalRegExpSuccess);
    904  static_assert(static_cast<int32_t>(RegExpRunStatus::Success_NotFound) ==
    905                v8::internal::RegExp::kInternalRegExpFailure);
    906 
    907  RegExpRunStatus status =
    908      (RegExpRunStatus)IrregexpInterpreter::MatchForCallFromRuntime(
    909          cx->isolate, wrappedRegExp, wrappedInput, matches->pairsRaw(),
    910          uint32_t(matches->pairCount() * 2), uint32_t(startIndex));
    911 
    912  MOZ_ASSERT(status == RegExpRunStatus::Error ||
    913             status == RegExpRunStatus::Success ||
    914             status == RegExpRunStatus::Success_NotFound);
    915 
    916  return status;
    917 }
    918 
    919 RegExpRunStatus Execute(JSContext* cx, MutableHandleRegExpShared re,
    920                        Handle<JSLinearString*> input, size_t startIndex,
    921                        VectorMatchPairs* matches) {
    922  bool latin1 = input->hasLatin1Chars();
    923  jit::JitCode* jitCode = re->getJitCode(latin1);
    924  bool isCompiled = !!jitCode;
    925 
    926  // Reset the Irregexp backtrack stack if it grows during execution.
    927  irregexp::RegExpStackScope stackScope(cx->isolate);
    928 
    929  if (isCompiled) {
    930    JS::AutoCheckCannotGC nogc;
    931    if (latin1) {
    932      return ExecuteRaw(jitCode, input->latin1Chars(nogc), input->length(),
    933                        startIndex, matches);
    934    }
    935    return ExecuteRaw(jitCode, input->twoByteChars(nogc), input->length(),
    936                      startIndex, matches);
    937  }
    938 
    939  return Interpret(cx, re, input, startIndex, matches);
    940 }
    941 
    942 RegExpRunStatus ExecuteForFuzzing(JSContext* cx, Handle<JSAtom*> pattern,
    943                                  Handle<JSLinearString*> input,
    944                                  JS::RegExpFlags flags, size_t startIndex,
    945                                  VectorMatchPairs* matches,
    946                                  RegExpShared::CodeKind codeKind) {
    947  RootedRegExpShared re(cx, cx->zone()->regExps().get(cx, pattern, flags));
    948  if (!RegExpShared::compileIfNecessary(cx, &re, input, codeKind)) {
    949    return RegExpRunStatus::Error;
    950  }
    951  return RegExpShared::execute(cx, &re, input, startIndex, matches);
    952 }
    953 
    954 bool GrowBacktrackStack(RegExpStack* regexp_stack) {
    955  return SMRegExpMacroAssembler::GrowBacktrackStack(regexp_stack);
    956 }
    957 
    958 uint32_t CaseInsensitiveCompareNonUnicode(const char16_t* substring1,
    959                                          const char16_t* substring2,
    960                                          size_t byteLength) {
    961  return SMRegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(
    962      substring1, substring2, byteLength);
    963 }
    964 
    965 uint32_t CaseInsensitiveCompareUnicode(const char16_t* substring1,
    966                                       const char16_t* substring2,
    967                                       size_t byteLength) {
    968  return SMRegExpMacroAssembler::CaseInsensitiveCompareUnicode(
    969      substring1, substring2, byteLength);
    970 }
    971 
    972 bool IsCharacterInRangeArray(uint32_t c, ByteArrayData* ranges) {
    973  return SMRegExpMacroAssembler::IsCharacterInRangeArray(c, ranges);
    974 }
    975 
    976 #ifdef DEBUG
    977 bool IsolateShouldSimulateInterrupt(Isolate* isolate) {
    978  return isolate->shouldSimulateInterrupt_ != 0;
    979 }
    980 
    981 void IsolateSetShouldSimulateInterrupt(Isolate* isolate) {
    982  isolate->shouldSimulateInterrupt_ = 1;
    983 }
    984 void IsolateClearShouldSimulateInterrupt(Isolate* isolate) {
    985  isolate->shouldSimulateInterrupt_ = 0;
    986 }
    987 #endif
    988 
    989 }  // namespace irregexp
    990 }  // namespace js
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE