tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

regexp.h (11342B)


      1 // Copyright 2012 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef V8_REGEXP_REGEXP_H_
      6 #define V8_REGEXP_REGEXP_H_
      7 
      8 #include "irregexp/imported/regexp-error.h"
      9 #include "irregexp/RegExpShim.h"
     10 
     11 namespace v8 {
     12 namespace internal {
     13 
     14 class JSRegExp;
     15 class RegExpCapture;
     16 class RegExpData;
     17 class IrRegExpData;
     18 class AtomRegExpData;
     19 class RegExpMatchInfo;
     20 class RegExpNode;
     21 class RegExpTree;
     22 
     23 enum class RegExpCompilationTarget : int { kBytecode, kNative };
     24 
     25 // TODO(jgruber): Do not expose in regexp.h.
     26 // TODO(jgruber): Consider splitting between ParseData and CompileData.
     27 struct RegExpCompileData {
     28  // The parsed AST as produced by the RegExpParser.
     29  RegExpTree* tree = nullptr;
     30 
     31  // The compiled Node graph as produced by RegExpTree::ToNode methods.
     32  RegExpNode* node = nullptr;
     33 
     34  // Either the generated code as produced by the compiler or a trampoline
     35  // to the interpreter.
     36  DirectHandle<Object> code;
     37 
     38  // True, iff the pattern is a 'simple' atom with zero captures. In other
     39  // words, the pattern consists of a string with no metacharacters and special
     40  // regexp features, and can be implemented as a standard string search.
     41  bool simple = true;
     42 
     43  // True, iff the pattern is anchored at the start of the string with '^'.
     44  bool contains_anchor = false;
     45 
     46  // Only set if the pattern contains named captures.
     47  // Note: the lifetime equals that of the parse/compile zone.
     48  ZoneVector<RegExpCapture*>* named_captures = nullptr;
     49 
     50  // The error message. Only used if an error occurred during parsing or
     51  // compilation.
     52  RegExpError error = RegExpError::kNone;
     53 
     54  // The position at which the error was detected. Only used if an
     55  // error occurred.
     56  int error_pos = 0;
     57 
     58  // The number of capture groups, without the global capture \0.
     59  int capture_count = 0;
     60 
     61  // The number of registers used by the generated code.
     62  int register_count = 0;
     63 
     64  // The compilation target (bytecode or native code).
     65  RegExpCompilationTarget compilation_target;
     66 };
     67 
     68 class RegExp final : public AllStatic {
     69 public:
     70  // Whether the irregexp engine generates interpreter bytecode.
     71  static bool CanGenerateBytecode();
     72 
     73  // Verify that the given flags combination is valid.
     74  V8_EXPORT_PRIVATE static bool VerifyFlags(RegExpFlags flags);
     75 
     76  // Verify the given pattern, i.e. check that parsing succeeds. If
     77  // verification fails, `regexp_error_out` is set.
     78  template <class CharT>
     79  static bool VerifySyntax(Zone* zone, uintptr_t stack_limit,
     80                           const CharT* input, int input_length,
     81                           RegExpFlags flags, RegExpError* regexp_error_out,
     82                           const DisallowGarbageCollection& no_gc);
     83 
     84  // Parses the RegExp pattern and prepares the JSRegExp object with
     85  // generic data and choice of implementation - as well as what
     86  // the implementation wants to store in the data field.
     87  // Returns false if compilation fails.
     88  V8_WARN_UNUSED_RESULT static MaybeDirectHandle<Object> Compile(
     89      Isolate* isolate, DirectHandle<JSRegExp> re, DirectHandle<String> pattern,
     90      RegExpFlags flags, uint32_t backtrack_limit);
     91 
     92  // Ensures that a regexp is fully compiled and ready to be executed on a
     93  // subject string.  Returns true on success. Throw and return false on
     94  // failure.
     95  V8_WARN_UNUSED_RESULT static bool EnsureFullyCompiled(
     96      Isolate* isolate, DirectHandle<RegExpData> re_data,
     97      DirectHandle<String> subject);
     98 
     99  enum CallOrigin : int {
    100    kFromRuntime = 0,
    101    kFromJs = 1,
    102  };
    103 
    104  // See ECMA-262 section 15.10.6.2.
    105  // This function calls the garbage collector if necessary.
    106  V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static std::optional<int> Exec(
    107      Isolate* isolate, DirectHandle<JSRegExp> regexp,
    108      DirectHandle<String> subject, int index, int32_t* result_offsets_vector,
    109      uint32_t result_offsets_vector_length);
    110  // As above, but passes the result through the old-style RegExpMatchInfo|Null
    111  // interface. At most one match is returned.
    112  V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeDirectHandle<Object>
    113  Exec_Single(Isolate* isolate, DirectHandle<JSRegExp> regexp,
    114              DirectHandle<String> subject, int index,
    115              DirectHandle<RegExpMatchInfo> last_match_info);
    116 
    117  V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static std::optional<int>
    118  ExperimentalOneshotExec(Isolate* isolate, DirectHandle<JSRegExp> regexp,
    119                          DirectHandle<String> subject, int index,
    120                          int32_t* result_offsets_vector,
    121                          uint32_t result_offsets_vector_length);
    122 
    123  // Called directly from generated code through ExternalReference.
    124  V8_EXPORT_PRIVATE static intptr_t AtomExecRaw(
    125      Isolate* isolate, Address /* AtomRegExpData */ data_address,
    126      Address /* String */ subject_address, int32_t index,
    127      int32_t* result_offsets_vector, int32_t result_offsets_vector_length);
    128 
    129  // Integral return values used throughout regexp code layers.
    130  static constexpr int kInternalRegExpFailure = 0;
    131  static constexpr int kInternalRegExpSuccess = 1;
    132  static constexpr int kInternalRegExpException = -1;
    133  static constexpr int kInternalRegExpRetry = -2;
    134  static constexpr int kInternalRegExpFallbackToExperimental = -3;
    135  static constexpr int kInternalRegExpSmallestResult = -3;
    136 
    137  enum IrregexpResult : int32_t {
    138    RE_FAILURE = kInternalRegExpFailure,
    139    RE_SUCCESS = kInternalRegExpSuccess,
    140    RE_EXCEPTION = kInternalRegExpException,
    141    RE_RETRY = kInternalRegExpRetry,
    142    RE_FALLBACK_TO_EXPERIMENTAL = kInternalRegExpFallbackToExperimental,
    143  };
    144 
    145  // Set last match info.  If match is nullptr, then setting captures is
    146  // omitted.
    147  static DirectHandle<RegExpMatchInfo> SetLastMatchInfo(
    148      Isolate* isolate, DirectHandle<RegExpMatchInfo> last_match_info,
    149      DirectHandle<String> subject, int capture_count, int32_t* match);
    150 
    151  V8_EXPORT_PRIVATE static bool CompileForTesting(
    152      Isolate* isolate, Zone* zone, RegExpCompileData* input, RegExpFlags flags,
    153      DirectHandle<String> pattern, DirectHandle<String> sample_subject,
    154      bool is_one_byte);
    155 
    156  V8_EXPORT_PRIVATE static void DotPrintForTesting(const char* label,
    157                                                   RegExpNode* node);
    158 
    159  static const int kRegExpTooLargeToOptimize = 20 * KB;
    160 
    161  V8_WARN_UNUSED_RESULT
    162  static MaybeDirectHandle<Object> ThrowRegExpException(
    163      Isolate* isolate, RegExpFlags flags, DirectHandle<String> pattern,
    164      RegExpError error);
    165  static void ThrowRegExpException(Isolate* isolate,
    166                                   DirectHandle<RegExpData> re_data,
    167                                   RegExpError error_text);
    168 
    169  static bool IsUnmodifiedRegExp(Isolate* isolate,
    170                                 DirectHandle<JSRegExp> regexp);
    171 
    172  static DirectHandle<FixedArray> CreateCaptureNameMap(
    173      Isolate* isolate, ZoneVector<RegExpCapture*>* named_captures);
    174 };
    175 
    176 // Uses a special global mode of irregexp-generated code to perform a global
    177 // search and return multiple results at once. As such, this is essentially an
    178 // iterator over multiple results (retrieved batch-wise in advance).
    179 class RegExpGlobalExecRunner final {
    180 public:
    181  RegExpGlobalExecRunner(DirectHandle<RegExpData> regexp_data,
    182                         DirectHandle<String> subject, Isolate* isolate);
    183 
    184  // Fetch the next entry in the cache for global regexp match results.
    185  // This does not set the last match info.  Upon failure, nullptr is
    186  // returned. The cause can be checked with Result().  The previous result is
    187  // still in available in memory when a failure happens.
    188  int32_t* FetchNext();
    189 
    190  int32_t* LastSuccessfulMatch() const;
    191 
    192  bool HasException() const { return num_matches_ < 0; }
    193 
    194 private:
    195  int AdvanceZeroLength(int last_index) const;
    196 
    197  int max_matches() const {
    198    DCHECK_NE(register_array_size_, 0);
    199    return register_array_size_ / registers_per_match_;
    200  }
    201 
    202  RegExpResultVectorScope result_vector_scope_;
    203  int num_matches_ = 0;
    204  int current_match_index_ = 0;
    205  int registers_per_match_ = 0;
    206  // Pointer to the last set of captures.
    207  int32_t* register_array_ = nullptr;
    208  int register_array_size_ = 0;
    209  DirectHandle<RegExpData> regexp_data_;
    210  DirectHandle<String> subject_;
    211  Isolate* const isolate_;
    212 };
    213 
    214 // Caches results for specific regexp queries on the isolate. At the time of
    215 // writing, this is used during global calls to RegExp.prototype.exec and
    216 // @@split.
    217 class RegExpResultsCache final : public AllStatic {
    218 public:
    219  enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
    220 
    221  // Attempt to retrieve a cached result.  On failure, 0 is returned as a Smi.
    222  // On success, the returned result is guaranteed to be a COW-array.
    223  static Tagged<Object> Lookup(Heap* heap, Tagged<String> key_string,
    224                               Tagged<Object> key_pattern,
    225                               Tagged<FixedArray>* last_match_out,
    226                               ResultsCacheType type);
    227  // Attempt to add value_array to the cache specified by type.  On success,
    228  // value_array is turned into a COW-array.
    229  static void Enter(Isolate* isolate, DirectHandle<String> key_string,
    230                    DirectHandle<Object> key_pattern,
    231                    DirectHandle<FixedArray> value_array,
    232                    DirectHandle<FixedArray> last_match_cache,
    233                    ResultsCacheType type);
    234  static void Clear(Tagged<FixedArray> cache);
    235 
    236  static constexpr int kRegExpResultsCacheSize = 0x100;
    237 
    238 private:
    239  static constexpr int kStringOffset = 0;
    240  static constexpr int kPatternOffset = 1;
    241  static constexpr int kArrayOffset = 2;
    242  static constexpr int kLastMatchOffset = 3;
    243  static constexpr int kArrayEntriesPerCacheEntry = 4;
    244 };
    245 
    246 // Caches results of RegExpPrototypeMatch when:
    247 // - the subject is a SlicedString
    248 // - the pattern is an ATOM type regexp.
    249 //
    250 // This is intended for usage patterns where we search ever-growing slices of
    251 // some large string. After a cache hit, RegExpMatchGlobalAtom only needs to
    252 // process the trailing part of the subject string that was *not* part of the
    253 // cached SlicedString.
    254 //
    255 // For example:
    256 //
    257 // long_string.substring(0, 100).match(pattern);
    258 // long_string.substring(0, 200).match(pattern);
    259 //
    260 // The second call hits the cache for the slice [0, 100[ and only has to search
    261 // the slice [100, 200].
    262 class RegExpResultsCache_MatchGlobalAtom final : public AllStatic {
    263 public:
    264  static void TryInsert(Isolate* isolate, Tagged<String> subject,
    265                        Tagged<String> pattern, int number_of_matches,
    266                        int last_match_index);
    267  static bool TryGet(Isolate* isolate, Tagged<String> subject,
    268                     Tagged<String> pattern, int* number_of_matches_out,
    269                     int* last_match_index_out);
    270  static void Clear(Heap* heap);
    271 
    272 private:
    273  static constexpr int kSubjectIndex = 0;          // SlicedString.
    274  static constexpr int kPatternIndex = 1;          // String.
    275  static constexpr int kNumberOfMatchesIndex = 2;  // Smi.
    276  static constexpr int kLastMatchIndexIndex = 3;   // Smi.
    277  static constexpr int kEntrySize = 4;
    278 
    279 public:
    280  static constexpr int kSize = kEntrySize;  // Single-entry cache.
    281 };
    282 
    283 }  // namespace internal
    284 }  // namespace v8
    285 
    286 #endif  // V8_REGEXP_REGEXP_H_