regexp.h (11342B)
1 // Copyright 2012 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_REGEXP_H_ 6 #define V8_REGEXP_REGEXP_H_ 7 8 #include "irregexp/imported/regexp-error.h" 9 #include "irregexp/RegExpShim.h" 10 11 namespace v8 { 12 namespace internal { 13 14 class JSRegExp; 15 class RegExpCapture; 16 class RegExpData; 17 class IrRegExpData; 18 class AtomRegExpData; 19 class RegExpMatchInfo; 20 class RegExpNode; 21 class RegExpTree; 22 23 enum class RegExpCompilationTarget : int { kBytecode, kNative }; 24 25 // TODO(jgruber): Do not expose in regexp.h. 26 // TODO(jgruber): Consider splitting between ParseData and CompileData. 27 struct RegExpCompileData { 28 // The parsed AST as produced by the RegExpParser. 29 RegExpTree* tree = nullptr; 30 31 // The compiled Node graph as produced by RegExpTree::ToNode methods. 32 RegExpNode* node = nullptr; 33 34 // Either the generated code as produced by the compiler or a trampoline 35 // to the interpreter. 36 DirectHandle<Object> code; 37 38 // True, iff the pattern is a 'simple' atom with zero captures. In other 39 // words, the pattern consists of a string with no metacharacters and special 40 // regexp features, and can be implemented as a standard string search. 41 bool simple = true; 42 43 // True, iff the pattern is anchored at the start of the string with '^'. 44 bool contains_anchor = false; 45 46 // Only set if the pattern contains named captures. 47 // Note: the lifetime equals that of the parse/compile zone. 48 ZoneVector<RegExpCapture*>* named_captures = nullptr; 49 50 // The error message. Only used if an error occurred during parsing or 51 // compilation. 52 RegExpError error = RegExpError::kNone; 53 54 // The position at which the error was detected. Only used if an 55 // error occurred. 56 int error_pos = 0; 57 58 // The number of capture groups, without the global capture \0. 59 int capture_count = 0; 60 61 // The number of registers used by the generated code. 62 int register_count = 0; 63 64 // The compilation target (bytecode or native code). 65 RegExpCompilationTarget compilation_target; 66 }; 67 68 class RegExp final : public AllStatic { 69 public: 70 // Whether the irregexp engine generates interpreter bytecode. 71 static bool CanGenerateBytecode(); 72 73 // Verify that the given flags combination is valid. 74 V8_EXPORT_PRIVATE static bool VerifyFlags(RegExpFlags flags); 75 76 // Verify the given pattern, i.e. check that parsing succeeds. If 77 // verification fails, `regexp_error_out` is set. 78 template <class CharT> 79 static bool VerifySyntax(Zone* zone, uintptr_t stack_limit, 80 const CharT* input, int input_length, 81 RegExpFlags flags, RegExpError* regexp_error_out, 82 const DisallowGarbageCollection& no_gc); 83 84 // Parses the RegExp pattern and prepares the JSRegExp object with 85 // generic data and choice of implementation - as well as what 86 // the implementation wants to store in the data field. 87 // Returns false if compilation fails. 88 V8_WARN_UNUSED_RESULT static MaybeDirectHandle<Object> Compile( 89 Isolate* isolate, DirectHandle<JSRegExp> re, DirectHandle<String> pattern, 90 RegExpFlags flags, uint32_t backtrack_limit); 91 92 // Ensures that a regexp is fully compiled and ready to be executed on a 93 // subject string. Returns true on success. Throw and return false on 94 // failure. 95 V8_WARN_UNUSED_RESULT static bool EnsureFullyCompiled( 96 Isolate* isolate, DirectHandle<RegExpData> re_data, 97 DirectHandle<String> subject); 98 99 enum CallOrigin : int { 100 kFromRuntime = 0, 101 kFromJs = 1, 102 }; 103 104 // See ECMA-262 section 15.10.6.2. 105 // This function calls the garbage collector if necessary. 106 V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static std::optional<int> Exec( 107 Isolate* isolate, DirectHandle<JSRegExp> regexp, 108 DirectHandle<String> subject, int index, int32_t* result_offsets_vector, 109 uint32_t result_offsets_vector_length); 110 // As above, but passes the result through the old-style RegExpMatchInfo|Null 111 // interface. At most one match is returned. 112 V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeDirectHandle<Object> 113 Exec_Single(Isolate* isolate, DirectHandle<JSRegExp> regexp, 114 DirectHandle<String> subject, int index, 115 DirectHandle<RegExpMatchInfo> last_match_info); 116 117 V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static std::optional<int> 118 ExperimentalOneshotExec(Isolate* isolate, DirectHandle<JSRegExp> regexp, 119 DirectHandle<String> subject, int index, 120 int32_t* result_offsets_vector, 121 uint32_t result_offsets_vector_length); 122 123 // Called directly from generated code through ExternalReference. 124 V8_EXPORT_PRIVATE static intptr_t AtomExecRaw( 125 Isolate* isolate, Address /* AtomRegExpData */ data_address, 126 Address /* String */ subject_address, int32_t index, 127 int32_t* result_offsets_vector, int32_t result_offsets_vector_length); 128 129 // Integral return values used throughout regexp code layers. 130 static constexpr int kInternalRegExpFailure = 0; 131 static constexpr int kInternalRegExpSuccess = 1; 132 static constexpr int kInternalRegExpException = -1; 133 static constexpr int kInternalRegExpRetry = -2; 134 static constexpr int kInternalRegExpFallbackToExperimental = -3; 135 static constexpr int kInternalRegExpSmallestResult = -3; 136 137 enum IrregexpResult : int32_t { 138 RE_FAILURE = kInternalRegExpFailure, 139 RE_SUCCESS = kInternalRegExpSuccess, 140 RE_EXCEPTION = kInternalRegExpException, 141 RE_RETRY = kInternalRegExpRetry, 142 RE_FALLBACK_TO_EXPERIMENTAL = kInternalRegExpFallbackToExperimental, 143 }; 144 145 // Set last match info. If match is nullptr, then setting captures is 146 // omitted. 147 static DirectHandle<RegExpMatchInfo> SetLastMatchInfo( 148 Isolate* isolate, DirectHandle<RegExpMatchInfo> last_match_info, 149 DirectHandle<String> subject, int capture_count, int32_t* match); 150 151 V8_EXPORT_PRIVATE static bool CompileForTesting( 152 Isolate* isolate, Zone* zone, RegExpCompileData* input, RegExpFlags flags, 153 DirectHandle<String> pattern, DirectHandle<String> sample_subject, 154 bool is_one_byte); 155 156 V8_EXPORT_PRIVATE static void DotPrintForTesting(const char* label, 157 RegExpNode* node); 158 159 static const int kRegExpTooLargeToOptimize = 20 * KB; 160 161 V8_WARN_UNUSED_RESULT 162 static MaybeDirectHandle<Object> ThrowRegExpException( 163 Isolate* isolate, RegExpFlags flags, DirectHandle<String> pattern, 164 RegExpError error); 165 static void ThrowRegExpException(Isolate* isolate, 166 DirectHandle<RegExpData> re_data, 167 RegExpError error_text); 168 169 static bool IsUnmodifiedRegExp(Isolate* isolate, 170 DirectHandle<JSRegExp> regexp); 171 172 static DirectHandle<FixedArray> CreateCaptureNameMap( 173 Isolate* isolate, ZoneVector<RegExpCapture*>* named_captures); 174 }; 175 176 // Uses a special global mode of irregexp-generated code to perform a global 177 // search and return multiple results at once. As such, this is essentially an 178 // iterator over multiple results (retrieved batch-wise in advance). 179 class RegExpGlobalExecRunner final { 180 public: 181 RegExpGlobalExecRunner(DirectHandle<RegExpData> regexp_data, 182 DirectHandle<String> subject, Isolate* isolate); 183 184 // Fetch the next entry in the cache for global regexp match results. 185 // This does not set the last match info. Upon failure, nullptr is 186 // returned. The cause can be checked with Result(). The previous result is 187 // still in available in memory when a failure happens. 188 int32_t* FetchNext(); 189 190 int32_t* LastSuccessfulMatch() const; 191 192 bool HasException() const { return num_matches_ < 0; } 193 194 private: 195 int AdvanceZeroLength(int last_index) const; 196 197 int max_matches() const { 198 DCHECK_NE(register_array_size_, 0); 199 return register_array_size_ / registers_per_match_; 200 } 201 202 RegExpResultVectorScope result_vector_scope_; 203 int num_matches_ = 0; 204 int current_match_index_ = 0; 205 int registers_per_match_ = 0; 206 // Pointer to the last set of captures. 207 int32_t* register_array_ = nullptr; 208 int register_array_size_ = 0; 209 DirectHandle<RegExpData> regexp_data_; 210 DirectHandle<String> subject_; 211 Isolate* const isolate_; 212 }; 213 214 // Caches results for specific regexp queries on the isolate. At the time of 215 // writing, this is used during global calls to RegExp.prototype.exec and 216 // @@split. 217 class RegExpResultsCache final : public AllStatic { 218 public: 219 enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS }; 220 221 // Attempt to retrieve a cached result. On failure, 0 is returned as a Smi. 222 // On success, the returned result is guaranteed to be a COW-array. 223 static Tagged<Object> Lookup(Heap* heap, Tagged<String> key_string, 224 Tagged<Object> key_pattern, 225 Tagged<FixedArray>* last_match_out, 226 ResultsCacheType type); 227 // Attempt to add value_array to the cache specified by type. On success, 228 // value_array is turned into a COW-array. 229 static void Enter(Isolate* isolate, DirectHandle<String> key_string, 230 DirectHandle<Object> key_pattern, 231 DirectHandle<FixedArray> value_array, 232 DirectHandle<FixedArray> last_match_cache, 233 ResultsCacheType type); 234 static void Clear(Tagged<FixedArray> cache); 235 236 static constexpr int kRegExpResultsCacheSize = 0x100; 237 238 private: 239 static constexpr int kStringOffset = 0; 240 static constexpr int kPatternOffset = 1; 241 static constexpr int kArrayOffset = 2; 242 static constexpr int kLastMatchOffset = 3; 243 static constexpr int kArrayEntriesPerCacheEntry = 4; 244 }; 245 246 // Caches results of RegExpPrototypeMatch when: 247 // - the subject is a SlicedString 248 // - the pattern is an ATOM type regexp. 249 // 250 // This is intended for usage patterns where we search ever-growing slices of 251 // some large string. After a cache hit, RegExpMatchGlobalAtom only needs to 252 // process the trailing part of the subject string that was *not* part of the 253 // cached SlicedString. 254 // 255 // For example: 256 // 257 // long_string.substring(0, 100).match(pattern); 258 // long_string.substring(0, 200).match(pattern); 259 // 260 // The second call hits the cache for the slice [0, 100[ and only has to search 261 // the slice [100, 200]. 262 class RegExpResultsCache_MatchGlobalAtom final : public AllStatic { 263 public: 264 static void TryInsert(Isolate* isolate, Tagged<String> subject, 265 Tagged<String> pattern, int number_of_matches, 266 int last_match_index); 267 static bool TryGet(Isolate* isolate, Tagged<String> subject, 268 Tagged<String> pattern, int* number_of_matches_out, 269 int* last_match_index_out); 270 static void Clear(Heap* heap); 271 272 private: 273 static constexpr int kSubjectIndex = 0; // SlicedString. 274 static constexpr int kPatternIndex = 1; // String. 275 static constexpr int kNumberOfMatchesIndex = 2; // Smi. 276 static constexpr int kLastMatchIndexIndex = 3; // Smi. 277 static constexpr int kEntrySize = 4; 278 279 public: 280 static constexpr int kSize = kEntrySize; // Single-entry cache. 281 }; 282 283 } // namespace internal 284 } // namespace v8 285 286 #endif // V8_REGEXP_REGEXP_H_