tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

regexp-macro-assembler.h (16700B)


      1 // Copyright 2012 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_
      6 #define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_
      7 
      8 #include "irregexp/imported/regexp-ast.h"
      9 #include "irregexp/imported/regexp.h"
     10 #include "irregexp/RegExpShim.h"
     11 
     12 namespace v8 {
     13 namespace internal {
     14 
     15 class ByteArray;
     16 class JSRegExp;
     17 class Label;
     18 class String;
     19 
     20 static const base::uc32 kLeadSurrogateStart = 0xd800;
     21 static const base::uc32 kLeadSurrogateEnd = 0xdbff;
     22 static const base::uc32 kTrailSurrogateStart = 0xdc00;
     23 static const base::uc32 kTrailSurrogateEnd = 0xdfff;
     24 static const base::uc32 kNonBmpStart = 0x10000;
     25 static const base::uc32 kNonBmpEnd = 0x10ffff;
     26 
     27 class RegExpMacroAssembler {
     28 public:
     29  // The implementation must be able to handle at least:
     30  static constexpr int kMaxRegisterCount = (1 << 16);
     31  static constexpr int kMaxRegister = kMaxRegisterCount - 1;
     32  static constexpr int kMaxCaptures = (kMaxRegister - 1) / 2;
     33  // Note the minimum value is chosen s.t. a negated valid offset is also a
     34  // valid offset.
     35  static constexpr int kMaxCPOffset = (1 << 15) - 1;
     36  static constexpr int kMinCPOffset = -kMaxCPOffset;
     37 
     38  static constexpr int kTableSizeBits = 7;
     39  static constexpr int kTableSize = 1 << kTableSizeBits;
     40  static constexpr int kTableMask = kTableSize - 1;
     41 
     42  static constexpr int kUseCharactersValue = -1;
     43 
     44  RegExpMacroAssembler(Isolate* isolate, Zone* zone);
     45  RegExpMacroAssembler(const RegExpMacroAssembler& other) V8_NOEXCEPT = default;
     46  virtual ~RegExpMacroAssembler() = default;
     47 
     48  virtual DirectHandle<HeapObject> GetCode(DirectHandle<String> source,
     49                                           RegExpFlags flags) = 0;
     50 
     51  // This function is called when code generation is aborted, so that
     52  // the assembler could clean up internal data structures.
     53  virtual void AbortedCodeGeneration() {}
     54  // The maximal number of pushes between stack checks. Users must supply
     55  // kCheckStackLimit flag to push operations (instead of kNoStackLimitCheck)
     56  // at least once for every stack_limit() pushes that are executed.
     57  virtual int stack_limit_slack_slot_count() = 0;
     58  virtual bool CanReadUnaligned() const = 0;
     59 
     60  virtual void AdvanceCurrentPosition(int by) = 0;  // Signed cp change.
     61  virtual void AdvanceRegister(int reg, int by) = 0;  // r[reg] += by.
     62  // Continues execution from the position pushed on the top of the backtrack
     63  // stack by an earlier PushBacktrack(Label*).
     64  virtual void Backtrack() = 0;
     65  virtual void Bind(Label* label) = 0;
     66  // Dispatch after looking the current character up in a 2-bits-per-entry
     67  // map.  The destinations vector has up to 4 labels.
     68  virtual void CheckCharacter(unsigned c, Label* on_equal) = 0;
     69  // Bitwise and the current character with the given constant and then
     70  // check for a match with c.
     71  virtual void CheckCharacterAfterAnd(unsigned c,
     72                                      unsigned and_with,
     73                                      Label* on_equal) = 0;
     74  virtual void CheckCharacterGT(base::uc16 limit, Label* on_greater) = 0;
     75  virtual void CheckCharacterLT(base::uc16 limit, Label* on_less) = 0;
     76  virtual void CheckFixedLengthLoop(Label* on_tos_equals_current_position) = 0;
     77  virtual void CheckAtStart(int cp_offset, Label* on_at_start) = 0;
     78  virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start) = 0;
     79  virtual void CheckNotBackReference(int start_reg, bool read_backward,
     80                                     Label* on_no_match) = 0;
     81  virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
     82                                               bool read_backward, bool unicode,
     83                                               Label* on_no_match) = 0;
     84  // Check the current character for a match with a literal character.  If we
     85  // fail to match then goto the on_failure label.  End of input always
     86  // matches.  If the label is nullptr then we should pop a backtrack address
     87  // off the stack and go to that.
     88  virtual void CheckNotCharacter(unsigned c, Label* on_not_equal) = 0;
     89  virtual void CheckNotCharacterAfterAnd(unsigned c,
     90                                         unsigned and_with,
     91                                         Label* on_not_equal) = 0;
     92  // Subtract a constant from the current character, then and with the given
     93  // constant and then check for a match with c.
     94  virtual void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus,
     95                                              base::uc16 and_with,
     96                                              Label* on_not_equal) = 0;
     97  virtual void CheckCharacterInRange(base::uc16 from,
     98                                     base::uc16 to,  // Both inclusive.
     99                                     Label* on_in_range) = 0;
    100  virtual void CheckCharacterNotInRange(base::uc16 from,
    101                                        base::uc16 to,  // Both inclusive.
    102                                        Label* on_not_in_range) = 0;
    103  // Returns true if the check was emitted, false otherwise.
    104  virtual bool CheckCharacterInRangeArray(
    105      const ZoneList<CharacterRange>* ranges, Label* on_in_range) = 0;
    106  virtual bool CheckCharacterNotInRangeArray(
    107      const ZoneList<CharacterRange>* ranges, Label* on_not_in_range) = 0;
    108 
    109  // The current character (modulus the kTableSize) is looked up in the byte
    110  // array, and if the found byte is non-zero, we jump to the on_bit_set label.
    111  virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) = 0;
    112 
    113  virtual void SkipUntilBitInTable(int cp_offset, Handle<ByteArray> table,
    114                                   Handle<ByteArray> nibble_table,
    115                                   int advance_by) = 0;
    116  virtual bool SkipUntilBitInTableUseSimd(int advance_by) { return false; }
    117 
    118  // Checks whether the given offset from the current position is before
    119  // the end of the string.  May overwrite the current character.
    120  virtual void CheckPosition(int cp_offset, Label* on_outside_input);
    121  // Check whether a standard/default character class matches the current
    122  // character. Returns false if the type of special character class does
    123  // not have custom support.
    124  // May clobber the current loaded character.
    125  virtual bool CheckSpecialClassRanges(StandardCharacterSet type,
    126                                       Label* on_no_match) {
    127    return false;
    128  }
    129 
    130  // Control-flow integrity:
    131  // Define a jump target and bind a label.
    132  virtual void BindJumpTarget(Label* label) { Bind(label); }
    133 
    134  virtual void Fail() = 0;
    135  virtual void GoTo(Label* label) = 0;
    136  // Check whether a register is >= a given constant and go to a label if it
    137  // is.  Backtracks instead if the label is nullptr.
    138  virtual void IfRegisterGE(int reg, int comparand, Label* if_ge) = 0;
    139  // Check whether a register is < a given constant and go to a label if it is.
    140  // Backtracks instead if the label is nullptr.
    141  virtual void IfRegisterLT(int reg, int comparand, Label* if_lt) = 0;
    142  // Check whether a register is == to the current position and go to a
    143  // label if it is.
    144  virtual void IfRegisterEqPos(int reg, Label* if_eq) = 0;
    145  V8_EXPORT_PRIVATE void LoadCurrentCharacter(
    146      int cp_offset, Label* on_end_of_input, bool check_bounds = true,
    147      int characters = 1, int eats_at_least = kUseCharactersValue);
    148  virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
    149                                        bool check_bounds, int characters,
    150                                        int eats_at_least) = 0;
    151  virtual void PopCurrentPosition() = 0;
    152  virtual void PopRegister(int register_index) = 0;
    153  // Pushes the label on the backtrack stack, so that a following Backtrack
    154  // will go to this label. Always checks the backtrack stack limit.
    155  virtual void PushBacktrack(Label* label) = 0;
    156  virtual void PushCurrentPosition() = 0;
    157  enum StackCheckFlag { kNoStackLimitCheck = false, kCheckStackLimit = true };
    158  virtual void PushRegister(int register_index,
    159                            StackCheckFlag check_stack_limit) = 0;
    160  virtual void ReadCurrentPositionFromRegister(int reg) = 0;
    161  virtual void ReadStackPointerFromRegister(int reg) = 0;
    162  virtual void SetCurrentPositionFromEnd(int by) = 0;
    163  virtual void SetRegister(int register_index, int to) = 0;
    164  // Return whether the matching (with a global regexp) will be restarted.
    165  virtual bool Succeed() = 0;
    166  virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0;
    167  virtual void ClearRegisters(int reg_from, int reg_to) = 0;
    168  virtual void WriteStackPointerToRegister(int reg) = 0;
    169 
    170  // Check that we are not in the middle of a surrogate pair.
    171  void CheckNotInSurrogatePair(int cp_offset, Label* on_failure);
    172 
    173 #define IMPLEMENTATIONS_LIST(V) \
    174  V(IA32)                       \
    175  V(ARM)                        \
    176  V(ARM64)                      \
    177  V(MIPS)                       \
    178  V(LOONG64)                    \
    179  V(RISCV)                      \
    180  V(RISCV32)                    \
    181  V(S390)                       \
    182  V(PPC)                        \
    183  V(X64)                        \
    184  V(Bytecode)
    185 
    186  enum IrregexpImplementation {
    187 #define V(Name) k##Name##Implementation,
    188    IMPLEMENTATIONS_LIST(V)
    189 #undef V
    190  };
    191 
    192  inline const char* ImplementationToString(IrregexpImplementation impl) {
    193    static const char* const kNames[] = {
    194 #define V(Name) #Name,
    195        IMPLEMENTATIONS_LIST(V)
    196 #undef V
    197    };
    198    return kNames[impl];
    199  }
    200 #undef IMPLEMENTATIONS_LIST
    201  virtual IrregexpImplementation Implementation() = 0;
    202 
    203  // Compare two-byte strings case insensitively.
    204  //
    205  // Called from generated code.
    206  static int CaseInsensitiveCompareNonUnicode(Address byte_offset1,
    207                                              Address byte_offset2,
    208                                              size_t byte_length,
    209                                              Isolate* isolate);
    210  static int CaseInsensitiveCompareUnicode(Address byte_offset1,
    211                                           Address byte_offset2,
    212                                           size_t byte_length,
    213                                           Isolate* isolate);
    214 
    215  // `raw_byte_array` is a ByteArray containing a set of character ranges,
    216  // where ranges are encoded as uint16_t elements:
    217  //
    218  //  [from0, to0, from1, to1, ..., fromN, toN], or
    219  //  [from0, to0, from1, to1, ..., fromN]  (open-ended last interval).
    220  //
    221  // fromN is inclusive, toN is exclusive. Returns zero if not in a range,
    222  // non-zero otherwise.
    223  //
    224  // Called from generated code.
    225  static uint32_t IsCharacterInRangeArray(uint32_t current_char,
    226                                          Address raw_byte_array);
    227 
    228  // Controls the generation of large inlined constants in the code.
    229  void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; }
    230  bool slow_safe() const { return slow_safe_compiler_; }
    231 
    232  // Controls after how many backtracks irregexp should abort execution.  If it
    233  // can fall back to the experimental engine (see `set_can_fallback`), it will
    234  // return the appropriate error code, otherwise it will return the number of
    235  // matches found so far (perhaps none).
    236  void set_backtrack_limit(uint32_t backtrack_limit) {
    237    backtrack_limit_ = backtrack_limit;
    238  }
    239 
    240  // Set whether or not irregexp can fall back to the experimental engine on
    241  // excessive backtracking.  The number of backtracks considered excessive can
    242  // be controlled with set_backtrack_limit.
    243  void set_can_fallback(bool val) { can_fallback_ = val; }
    244 
    245  enum GlobalMode {
    246    NOT_GLOBAL,
    247    GLOBAL_NO_ZERO_LENGTH_CHECK,
    248    GLOBAL,
    249    GLOBAL_UNICODE
    250  };
    251  // Set whether the regular expression has the global flag.  Exiting due to
    252  // a failure in a global regexp may still mean success overall.
    253  inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; }
    254  inline bool global() const { return global_mode_ != NOT_GLOBAL; }
    255  inline bool global_with_zero_length_check() const {
    256    return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE;
    257  }
    258  inline bool global_unicode() const { return global_mode_ == GLOBAL_UNICODE; }
    259 
    260  Isolate* isolate() const { return isolate_; }
    261  Zone* zone() const { return zone_; }
    262 
    263 protected:
    264  bool has_backtrack_limit() const;
    265  uint32_t backtrack_limit() const { return backtrack_limit_; }
    266 
    267  bool can_fallback() const { return can_fallback_; }
    268 
    269 private:
    270  bool slow_safe_compiler_;
    271  uint32_t backtrack_limit_;
    272  bool can_fallback_ = false;
    273  GlobalMode global_mode_;
    274  Isolate* const isolate_;
    275  Zone* const zone_;
    276 };
    277 
    278 class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
    279 public:
    280  // Type of input string to generate code for.
    281  enum Mode { LATIN1 = 1, UC16 = 2 };
    282 
    283  // Result of calling generated native RegExp code.
    284  // RETRY: Something significant changed during execution, and the matching
    285  //        should be retried from scratch.
    286  // EXCEPTION: Something failed during execution. If no exception has been
    287  //            thrown, it's an internal out-of-memory, and the caller should
    288  //            throw the exception.
    289  // FAILURE: Matching failed.
    290  // SUCCESS: Matching succeeded, and the output array has been filled with
    291  //          capture positions.
    292  // FALLBACK_TO_EXPERIMENTAL: Execute the regexp on this subject using the
    293  //                           experimental engine instead.
    294  enum Result {
    295    FAILURE = RegExp::kInternalRegExpFailure,
    296    SUCCESS = RegExp::kInternalRegExpSuccess,
    297    EXCEPTION = RegExp::kInternalRegExpException,
    298    RETRY = RegExp::kInternalRegExpRetry,
    299    FALLBACK_TO_EXPERIMENTAL = RegExp::kInternalRegExpFallbackToExperimental,
    300    SMALLEST_REGEXP_RESULT = RegExp::kInternalRegExpSmallestResult,
    301  };
    302 
    303  NativeRegExpMacroAssembler(Isolate* isolate, Zone* zone)
    304      : RegExpMacroAssembler(isolate, zone), range_array_cache_(zone) {}
    305  ~NativeRegExpMacroAssembler() override = default;
    306 
    307  // Returns a {Result} sentinel, or the number of successful matches.
    308  static int Match(DirectHandle<IrRegExpData> regexp_data,
    309                   DirectHandle<String> subject, int* offsets_vector,
    310                   int offsets_vector_length, int previous_index,
    311                   Isolate* isolate);
    312 
    313  V8_EXPORT_PRIVATE static int ExecuteForTesting(
    314      Tagged<String> input, int start_offset, const uint8_t* input_start,
    315      const uint8_t* input_end, int* output, int output_size, Isolate* isolate,
    316      Tagged<JSRegExp> regexp);
    317 
    318  bool CanReadUnaligned() const override;
    319 
    320  void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
    321                                bool check_bounds, int characters,
    322                                int eats_at_least) override;
    323  // Load a number of characters at the given offset from the
    324  // current position, into the current-character register.
    325  virtual void LoadCurrentCharacterUnchecked(int cp_offset,
    326                                             int character_count) = 0;
    327 
    328  // Called from RegExp if the backtrack stack limit is hit. Tries to expand
    329  // the stack. Returns the new stack-pointer if successful, or returns 0 if
    330  // unable to grow the stack.
    331  // This function must not trigger a garbage collection.
    332  //
    333  // Called from generated code.
    334  static Address GrowStack(Isolate* isolate);
    335 
    336  // Called from generated code.
    337  static int CheckStackGuardState(Isolate* isolate, int start_index,
    338                                  RegExp::CallOrigin call_origin,
    339                                  Address* return_address,
    340                                  Tagged<InstructionStream> re_code,
    341                                  Address* subject, const uint8_t** input_start,
    342                                  const uint8_t** input_end, uintptr_t gap);
    343 
    344  static Address word_character_map_address() {
    345    return reinterpret_cast<Address>(&word_character_map[0]);
    346  }
    347 
    348 protected:
    349  // Byte map of one byte characters with a 0xff if the character is a word
    350  // character (digit, letter or underscore) and 0x00 otherwise.
    351  // Used by generated RegExp code.
    352  static const uint8_t word_character_map[256];
    353 
    354  Handle<ByteArray> GetOrAddRangeArray(const ZoneList<CharacterRange>* ranges);
    355 
    356 private:
    357  // Returns a {Result} sentinel, or the number of successful matches.
    358  static int Execute(Tagged<String> input, int start_offset,
    359                     const uint8_t* input_start, const uint8_t* input_end,
    360                     int* output, int output_size, Isolate* isolate,
    361                     Tagged<IrRegExpData> regexp_data);
    362 
    363  ZoneUnorderedMap<uint32_t, IndirectHandle<FixedUInt16Array>>
    364      range_array_cache_;
    365 };
    366 
    367 }  // namespace internal
    368 }  // namespace v8
    369 
    370 #endif  // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_