tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

RegExpShared.h (15789B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
      2 * vim: set ts=8 sts=2 et sw=2 tw=80:
      3 * This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 /**
      8 * The compiled representation of a RegExp, potentially shared among RegExp
      9 * instances created during separate evaluations of a single RegExp literal in
     10 * source code.
     11 */
     12 
     13 #ifndef vm_RegExpShared_h
     14 #define vm_RegExpShared_h
     15 
     16 #include "mozilla/Assertions.h"
     17 #include "mozilla/MemoryReporting.h"
     18 
     19 #include "gc/Barrier.h"
     20 #include "gc/Policy.h"
     21 #include "gc/ZoneAllocator.h"
     22 #include "irregexp/RegExpTypes.h"
     23 #include "jit/JitCode.h"
     24 #include "jit/JitOptions.h"
     25 #include "js/AllocPolicy.h"
     26 #include "js/RegExpFlags.h"  // JS::RegExpFlag, JS::RegExpFlags
     27 #include "js/UbiNode.h"
     28 #include "js/Vector.h"
     29 #include "vm/ArrayObject.h"
     30 
     31 namespace js {
     32 
     33 class ArrayObject;
     34 class PlainObject;
     35 class RegExpRealm;
     36 class RegExpShared;
     37 class RegExpStatics;
     38 class VectorMatchPairs;
     39 
     40 using RootedRegExpShared = JS::Rooted<RegExpShared*>;
     41 using HandleRegExpShared = JS::Handle<RegExpShared*>;
     42 using MutableHandleRegExpShared = JS::MutableHandle<RegExpShared*>;
     43 
     44 enum class RegExpRunStatus : int32_t {
     45  Error = -1,
     46  Success = 1,
     47  Success_NotFound = 0,
     48 };
     49 
     50 inline bool IsNativeRegExpEnabled() {
     51  return jit::HasJitBackend() && jit::JitOptions.nativeRegExp;
     52 }
     53 
     54 /*
     55 * A RegExpShared is the compiled representation of a regexp. A RegExpShared is
     56 * potentially pointed to by multiple RegExpObjects. Additionally, C++ code may
     57 * have pointers to RegExpShareds on the stack. The RegExpShareds are kept in a
     58 * table so that they can be reused when compiling the same regex string.
     59 *
     60 * To save memory, a RegExpShared is not created for a RegExpObject until it is
     61 * needed for execution. When a RegExpShared needs to be created, it is looked
     62 * up in a per-compartment table to allow reuse between objects.
     63 *
     64 * During a GC, RegExpShared instances are marked and swept like GC things.
     65 * Usually, RegExpObjects clear their pointers to their RegExpShareds rather
     66 * than explicitly tracing them, so that the RegExpShared and any jitcode can
     67 * be reclaimed quicker. However, the RegExpShareds are traced through by
     68 * objects when we are preserving jitcode in their zone, to avoid the same
     69 * recompilation inefficiencies as normal Ion and baseline compilation.
     70 */
     71 class RegExpShared
     72    : public gc::CellWithTenuredGCPointer<gc::TenuredCell, JSAtom> {
     73  friend class js::gc::CellAllocator;
     74 
     75 public:
     76  enum class Kind : uint32_t { Unparsed, Atom, RegExp };
     77  enum class CodeKind { Bytecode, Jitcode, Any };
     78 
     79  using ByteCode = js::irregexp::ByteArrayData;
     80  using JitCodeTable = js::irregexp::ByteArray;
     81  using JitCodeTables = Vector<JitCodeTable, 0, SystemAllocPolicy>;
     82 
     83 private:
     84  friend class RegExpStatics;
     85  friend class RegExpZone;
     86 
     87  struct RegExpCompilation {
     88    GCPtr<jit::JitCode*> jitCode;
     89    ByteCode* byteCode = nullptr;
     90 
     91    bool compiled(CodeKind kind = CodeKind::Any) const {
     92      switch (kind) {
     93        case CodeKind::Bytecode:
     94          return !!byteCode;
     95        case CodeKind::Jitcode:
     96          return !!jitCode;
     97        case CodeKind::Any:
     98          return !!byteCode || !!jitCode;
     99      }
    100      MOZ_CRASH("Unreachable");
    101    }
    102 
    103    size_t byteCodeLength() const {
    104      MOZ_ASSERT(byteCode);
    105      return byteCode->length();
    106    }
    107  };
    108 
    109 public:
    110  /* Source to the RegExp, for lazy compilation. Stored in the cell header. */
    111  JSAtom* getSource() const { return headerPtr(); }
    112 
    113 private:
    114  RegExpCompilation compilationArray[2];
    115 
    116  uint32_t pairCount_;
    117  JS::RegExpFlags flags;
    118 
    119  RegExpShared::Kind kind_ = Kind::Unparsed;
    120  GCPtr<JSAtom*> patternAtom_;
    121  uint32_t maxRegisters_ = 0;
    122  uint32_t ticks_ = 0;
    123 
    124  // With duplicate named capture groups, it's possible that the number of
    125  // distinct named groups is less than the total number of named captures.
    126  // If they are equal, we used the namedCaptureIndices_ array directly to
    127  // return the capture index corresponding to a name. If they are not equal,
    128  // we use the namedCapturedSliceIndices_ array to keep track of the offsets
    129  // into the namedCaptureIndices_ array where indices corresponding to a new
    130  // name start.
    131  // E.g. if we have a regex like /((<?a>a)|(<?a>A))(<?b>b)/, the
    132  // namedCapturedIndices_ array might look like [0, 1, 2], and the
    133  // namedCaptureSliceIndices_ array like [0, 2]. Then we can construct the
    134  // slice corresponding to `a` as offset 0, length 2, and the slice
    135  // corresponding to `b` as offset 2, length 1.
    136  uint32_t numNamedCaptures_ = {};
    137  uint32_t numDistinctNamedCaptures_ = {};
    138  uint32_t* namedCaptureIndices_ = {};
    139  uint32_t* namedCaptureSliceIndices_ = {};
    140  GCPtr<PlainObject*> groupsTemplate_ = {};
    141 
    142  static int CompilationIndex(bool latin1) { return latin1 ? 0 : 1; }
    143 
    144  // Tables referenced by JIT code.
    145  JitCodeTables tables;
    146 
    147  /* Internal functions. */
    148  RegExpShared(JSAtom* source, JS::RegExpFlags flags);
    149 
    150  const RegExpCompilation& compilation(bool latin1) const {
    151    return compilationArray[CompilationIndex(latin1)];
    152  }
    153 
    154  RegExpCompilation& compilation(bool latin1) {
    155    return compilationArray[CompilationIndex(latin1)];
    156  }
    157 
    158 public:
    159  ~RegExpShared() = delete;
    160 
    161  static bool compileIfNecessary(JSContext* cx, MutableHandleRegExpShared res,
    162                                 Handle<JSLinearString*> input, CodeKind code);
    163 
    164  static RegExpRunStatus executeAtom(MutableHandleRegExpShared re,
    165                                     Handle<JSLinearString*> input,
    166                                     size_t start, VectorMatchPairs* matches);
    167 
    168  // Execute this RegExp on input starting from searchIndex, filling in matches.
    169  static RegExpRunStatus execute(JSContext* cx, MutableHandleRegExpShared res,
    170                                 Handle<JSLinearString*> input,
    171                                 size_t searchIndex, VectorMatchPairs* matches);
    172 
    173  // Register a table with this RegExpShared, and take ownership.
    174  bool addTable(JitCodeTable table) { return tables.append(std::move(table)); }
    175 
    176  /* Accessors */
    177 
    178  size_t pairCount() const {
    179    MOZ_ASSERT(kind() != Kind::Unparsed);
    180    return pairCount_;
    181  }
    182 
    183  RegExpShared::Kind kind() const { return kind_; }
    184 
    185  // Use simple string matching for this regexp.
    186  void useAtomMatch(Handle<JSAtom*> pattern);
    187 
    188  // Use the regular expression engine for this regexp.
    189  void useRegExpMatch(size_t parenCount);
    190 
    191  static void InitializeNamedCaptures(JSContext* cx, HandleRegExpShared re,
    192                                      uint32_t numNamedCaptures,
    193                                      uint32_t numDistinctNamedCaptures,
    194                                      Handle<PlainObject*> templateObject,
    195                                      uint32_t* captureIndices,
    196                                      uint32_t* captureSliceIndices);
    197  PlainObject* getGroupsTemplate() { return groupsTemplate_; }
    198 
    199  void tierUpTick();
    200  bool markedForTierUp() const;
    201 
    202  void setByteCode(ByteCode* code, bool latin1) {
    203    compilation(latin1).byteCode = code;
    204  }
    205  ByteCode* getByteCode(bool latin1) const {
    206    return compilation(latin1).byteCode;
    207  }
    208  void setJitCode(jit::JitCode* code, bool latin1) {
    209    compilation(latin1).jitCode = code;
    210  }
    211  jit::JitCode* getJitCode(bool latin1) const {
    212    return compilation(latin1).jitCode;
    213  }
    214  uint32_t getMaxRegisters() const { return maxRegisters_; }
    215  void updateMaxRegisters(uint32_t numRegisters) {
    216    maxRegisters_ = std::max(maxRegisters_, numRegisters);
    217  }
    218 
    219  uint32_t numNamedCaptures() const { return numNamedCaptures_; }
    220  uint32_t numDistinctNamedCaptures() const {
    221    return numDistinctNamedCaptures_;
    222  }
    223  int32_t getNamedCaptureIndex(uint32_t idx) const {
    224    MOZ_ASSERT(idx < numNamedCaptures());
    225    MOZ_ASSERT(namedCaptureIndices_);
    226    MOZ_ASSERT(!namedCaptureSliceIndices_);
    227    return namedCaptureIndices_[idx];
    228  }
    229 
    230  mozilla::Span<uint32_t> getNamedCaptureIndices(uint32_t idx) const {
    231    MOZ_ASSERT(idx < numDistinctNamedCaptures());
    232    MOZ_ASSERT(namedCaptureIndices_);
    233    MOZ_ASSERT(namedCaptureSliceIndices_);
    234    // The start of our slice is the value stored in the slice indices.
    235    uint32_t* start = &namedCaptureIndices_[namedCaptureSliceIndices_[idx]];
    236    size_t length = 0;
    237    if (idx + 1 < numDistinctNamedCaptures()) {
    238      // If we're not the last slice index, the number of items is the
    239      // difference between us and the next index.
    240      length =
    241          namedCaptureSliceIndices_[idx + 1] - namedCaptureSliceIndices_[idx];
    242    } else {
    243      // Otherwise, it's the number of remaining capture indices.
    244      length = numNamedCaptures() - namedCaptureSliceIndices_[idx];
    245    }
    246    return mozilla::Span<uint32_t>(start, length);
    247  }
    248 
    249  JSAtom* patternAtom() const { return patternAtom_; }
    250 
    251  JS::RegExpFlags getFlags() const { return flags; }
    252 
    253  bool hasIndices() const { return flags.hasIndices(); }
    254  bool global() const { return flags.global(); }
    255  bool ignoreCase() const { return flags.ignoreCase(); }
    256  bool multiline() const { return flags.multiline(); }
    257  bool dotAll() const { return flags.dotAll(); }
    258  bool unicode() const { return flags.unicode(); }
    259  bool unicodeSets() const { return flags.unicodeSets(); }
    260  bool sticky() const { return flags.sticky(); }
    261 
    262  bool isCompiled(bool latin1, CodeKind codeKind = CodeKind::Any) const {
    263    return compilation(latin1).compiled(codeKind);
    264  }
    265  bool isCompiled() const { return isCompiled(true) || isCompiled(false); }
    266 
    267  void traceChildren(JSTracer* trc);
    268  void discardJitCode();
    269  void finalize(JS::GCContext* gcx);
    270 
    271  static size_t offsetOfSource() { return offsetOfHeaderPtr(); }
    272 
    273  static size_t offsetOfPatternAtom() {
    274    return offsetof(RegExpShared, patternAtom_);
    275  }
    276 
    277  static size_t offsetOfFlags() { return offsetof(RegExpShared, flags); }
    278 
    279  static size_t offsetOfPairCount() {
    280    return offsetof(RegExpShared, pairCount_);
    281  }
    282 
    283  static size_t offsetOfKind() { return offsetof(RegExpShared, kind_); }
    284 
    285  static size_t offsetOfJitCode(bool latin1) {
    286    return offsetof(RegExpShared, compilationArray) +
    287           (CompilationIndex(latin1) * sizeof(RegExpCompilation)) +
    288           offsetof(RegExpCompilation, jitCode);
    289  }
    290 
    291  static size_t offsetOfGroupsTemplate() {
    292    return offsetof(RegExpShared, groupsTemplate_);
    293  }
    294 
    295  size_t sizeOfExcludingThis(mozilla::MallocSizeOf mallocSizeOf);
    296 
    297 public:
    298  static const JS::TraceKind TraceKind = JS::TraceKind::RegExpShared;
    299 };
    300 
    301 class RegExpZone {
    302  struct Key {
    303    JSAtom* atom = nullptr;
    304    JS::RegExpFlags flags = JS::RegExpFlag::NoFlags;
    305 
    306    Key() = default;
    307    Key(JSAtom* atom, JS::RegExpFlags flags) : atom(atom), flags(flags) {}
    308    MOZ_IMPLICIT Key(const WeakHeapPtr<RegExpShared*>& shared)
    309        : atom(shared.unbarrieredGet()->getSource()),
    310          flags(shared.unbarrieredGet()->getFlags()) {}
    311 
    312    using Lookup = Key;
    313    static HashNumber hash(const Lookup& l) {
    314      HashNumber hash = DefaultHasher<JSAtom*>::hash(l.atom);
    315      return mozilla::AddToHash(hash, l.flags.value());
    316    }
    317    static bool match(Key l, Key r) {
    318      return l.atom == r.atom && l.flags == r.flags;
    319    }
    320  };
    321 
    322  /*
    323   * The set of all RegExpShareds in the zone. On every GC, every RegExpShared
    324   * that was not marked is deleted and removed from the set.
    325   */
    326  using Set = JS::WeakCache<
    327      JS::GCHashSet<WeakHeapPtr<RegExpShared*>, Key, ZoneAllocPolicy>>;
    328  Set set_;
    329 
    330 public:
    331  explicit RegExpZone(Zone* zone);
    332 
    333  ~RegExpZone() { MOZ_ASSERT(set_.empty()); }
    334 
    335  bool empty() const { return set_.empty(); }
    336 
    337  RegExpShared* maybeGet(JSAtom* source, JS::RegExpFlags flags) const {
    338    Set::Ptr p = set_.lookup(Key(source, flags));
    339    return p ? *p : nullptr;
    340  }
    341 
    342  RegExpShared* get(JSContext* cx, Handle<JSAtom*> source,
    343                    JS::RegExpFlags flags);
    344 
    345 #ifdef DEBUG
    346  void clear() { set_.clear(); }
    347 #endif
    348 
    349  size_t sizeOfIncludingThis(mozilla::MallocSizeOf mallocSizeOf) const;
    350 };
    351 
    352 class RegExpRealm {
    353 public:
    354  enum ResultShapeKind { Normal, WithIndices, Indices, NumKinds };
    355 
    356  // Information about the last regular expression match. This is used by the
    357  // static RegExp properties such as RegExp.lastParen.
    358  UniquePtr<RegExpStatics> regExpStatics;
    359 
    360 private:
    361  /*
    362   * The shapes used for the result object of re.exec(), if there is a result.
    363   * These are used in CreateRegExpMatchResult. There are three shapes, each of
    364   * which is an ArrayObject shape with some additional properties. We decide
    365   * which to use based on the |hasIndices| (/d) flag.
    366   *
    367   *  Normal: Has |index|, |input|, and |groups| properties.
    368   *          Used for the result object if |hasIndices| is not set.
    369   *
    370   *  WithIndices: Has |index|, |input|, |groups|, and |indices| properties.
    371   *               Used for the result object if |hasIndices| is set.
    372   *
    373   *  Indices: Has a |groups| property. If |hasIndices| is set, used
    374   *           for the |.indices| property of the result object.
    375   */
    376  GCPtr<SharedShape*> matchResultShapes_[ResultShapeKind::NumKinds];
    377 
    378  SharedShape* createMatchResultShape(JSContext* cx, ResultShapeKind kind);
    379 
    380 public:
    381  explicit RegExpRealm();
    382 
    383  void trace(JSTracer* trc);
    384 
    385  static const size_t MatchResultObjectIndexSlot = 0;
    386  static const size_t MatchResultObjectInputSlot = 1;
    387  static const size_t MatchResultObjectGroupsSlot = 2;
    388  static const size_t MatchResultObjectIndicesSlot = 3;
    389 
    390  // Number of used and allocated dynamic slots for a Normal match result
    391  // object. These values are checked in createMatchResultShape.
    392  static const size_t MatchResultObjectSlotSpan = 3;
    393  static const size_t MatchResultObjectNumDynamicSlots = 6;
    394 
    395  static const size_t IndicesGroupsSlot = 0;
    396 
    397  static size_t offsetOfMatchResultObjectIndexSlot() {
    398    return sizeof(Value) * MatchResultObjectIndexSlot;
    399  }
    400  static size_t offsetOfMatchResultObjectInputSlot() {
    401    return sizeof(Value) * MatchResultObjectInputSlot;
    402  }
    403  static size_t offsetOfMatchResultObjectGroupsSlot() {
    404    return sizeof(Value) * MatchResultObjectGroupsSlot;
    405  }
    406  static size_t offsetOfMatchResultObjectIndicesSlot() {
    407    return sizeof(Value) * MatchResultObjectIndicesSlot;
    408  }
    409 
    410  /* Get or create the shape used for the result of .exec(). */
    411  SharedShape* getOrCreateMatchResultShape(
    412      JSContext* cx, ResultShapeKind kind = ResultShapeKind::Normal) {
    413    if (matchResultShapes_[kind]) {
    414      return matchResultShapes_[kind];
    415    }
    416    return createMatchResultShape(cx, kind);
    417  }
    418 
    419  static constexpr size_t offsetOfRegExpStatics() {
    420    return offsetof(RegExpRealm, regExpStatics);
    421  }
    422  static constexpr size_t offsetOfNormalMatchResultShape() {
    423    static_assert(sizeof(GCPtr<SharedShape*>) == sizeof(uintptr_t));
    424    return offsetof(RegExpRealm, matchResultShapes_) +
    425           ResultShapeKind::Normal * sizeof(uintptr_t);
    426  }
    427 };
    428 
    429 RegExpRunStatus ExecuteRegExpAtomRaw(RegExpShared* re,
    430                                     const JSLinearString* input, size_t start,
    431                                     MatchPairs* matchPairs);
    432 
    433 } /* namespace js */
    434 
    435 namespace JS {
    436 namespace ubi {
    437 
    438 template <>
    439 class Concrete<js::RegExpShared> : TracerConcrete<js::RegExpShared> {
    440 protected:
    441  explicit Concrete(js::RegExpShared* ptr)
    442      : TracerConcrete<js::RegExpShared>(ptr) {}
    443 
    444 public:
    445  static void construct(void* storage, js::RegExpShared* ptr) {
    446    new (storage) Concrete(ptr);
    447  }
    448 
    449  CoarseType coarseType() const final { return CoarseType::Other; }
    450 
    451  Size size(mozilla::MallocSizeOf mallocSizeOf) const override;
    452 
    453  const char16_t* typeName() const override { return concreteTypeName; }
    454  static const char16_t concreteTypeName[];
    455 };
    456 
    457 }  // namespace ubi
    458 }  // namespace JS
    459 
    460 #endif /* vm_RegExpShared_h */