tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

SourceText.h (13555B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* This Source Code Form is subject to the terms of the Mozilla Public
      3 * License, v. 2.0. If a copy of the MPL was not distributed with this
      4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      5 
      6 /*
      7 * SourceText encapsulates a count of char16_t (UTF-16) or Utf8Unit (UTF-8)
      8 * code units (note: code *units*, not bytes or code points) and those code
      9 * units ("source units").  (Latin-1 is not supported: all places where Latin-1
     10 * must be compiled first convert to a supported encoding.)
     11 *
     12 * A SourceText either observes without owning, or takes ownership of, source
     13 * units passed to |SourceText::init|.  Thus SourceText can be used to
     14 * efficiently avoid copying.
     15 *
     16 * Rules for use:
     17 *
     18 *  1) The passed-in source units must be allocated with js_malloc(),
     19 *     js_calloc(), or js_realloc() if |SourceText::init| is instructed to take
     20 *     ownership of the source units.
     21 *  2) If |SourceText::init| merely borrows the source units, the user must
     22 *     keep them alive until associated JS compilation is complete.
     23 *  3) Code that calls |SourceText::take{Chars,Units}()| must keep the source
     24 *     units alive until JS compilation completes.  Normally only the JS engine
     25 *     should call |SourceText::take{Chars,Units}()|.
     26 *  4) Use the appropriate SourceText parameterization depending on the source
     27 *     units encoding.
     28 *
     29 * Example use:
     30 *
     31 *    size_t length = 512;
     32 *    char16_t* chars = js_pod_malloc<char16_t>(length);
     33 *    if (!chars) {
     34 *        JS_ReportOutOfMemory(cx);
     35 *        return false;
     36 *    }
     37 *    JS::SourceText<char16_t> srcBuf;
     38 *    if (!srcBuf.init(cx, chars, length, JS::SourceOwnership::TakeOwnership)) {
     39 *        return false;
     40 *    }
     41 *    JS::Rooted<JSScript*> script(cx);
     42 *    if (!JS::Compile(cx, options, srcBuf, &script)) {
     43 *        return false;
     44 *    }
     45 */
     46 
     47 #ifndef js_SourceText_h
     48 #define js_SourceText_h
     49 
     50 #include "mozilla/Assertions.h"  // MOZ_ASSERT
     51 #include "mozilla/Attributes.h"  // MOZ_COLD, MOZ_IS_CLASS_INIT
     52 #include "mozilla/Likely.h"      // MOZ_UNLIKELY
     53 
     54 #include <stddef.h>     // size_t
     55 #include <stdint.h>     // UINT32_MAX
     56 #include <type_traits>  // std::conditional_t, std::is_same_v
     57 
     58 #include "js/UniquePtr.h"  // js::UniquePtr
     59 #include "js/Utility.h"    // JS::FreePolicy
     60 
     61 namespace mozilla {
     62 union Utf8Unit;
     63 }
     64 
     65 namespace js {
     66 class FrontendContext;
     67 }  // namespace js
     68 
     69 namespace JS {
     70 
     71 class JS_PUBLIC_API AutoStableStringChars;
     72 using FrontendContext = js::FrontendContext;
     73 
     74 namespace detail {
     75 
     76 MOZ_COLD extern JS_PUBLIC_API void ReportSourceTooLong(JSContext* cx);
     77 MOZ_COLD extern JS_PUBLIC_API void ReportSourceTooLong(JS::FrontendContext* fc);
     78 
     79 }  // namespace detail
     80 
     81 enum class SourceOwnership {
     82  Borrowed,
     83  TakeOwnership,
     84 };
     85 
     86 template <typename Unit>
     87 class SourceText final {
     88 private:
     89  static_assert(std::is_same_v<Unit, mozilla::Utf8Unit> ||
     90                    std::is_same_v<Unit, char16_t>,
     91                "Unit must be either char16_t or Utf8Unit for "
     92                "SourceText<Unit>");
     93 
     94  /** |char16_t| or |Utf8Unit| source units of uncertain validity. */
     95  const Unit* units_ = nullptr;
     96 
     97  /** The length in code units of |units_|. */
     98  uint32_t length_ = 0;
     99 
    100  /**
    101   * Whether this owns |units_| or merely observes source units owned by some
    102   * other object.
    103   */
    104  bool ownsUnits_ = false;
    105 
    106 public:
    107  // A C++ character type that can represent the source units -- suitable for
    108  // passing to C++ string functions.
    109  using CharT =
    110      std::conditional_t<std::is_same_v<Unit, char16_t>, char16_t, char>;
    111 
    112 public:
    113  /**
    114   * Construct a SourceText.  It must be initialized using |init()| before it
    115   * can be used as compilation source text.
    116   */
    117  SourceText() = default;
    118 
    119  /**
    120   * Construct a SourceText from contents extracted from |other|.  This
    121   * SourceText will then act exactly as |other| would have acted, had it
    122   * not been passed to this function.  |other| will return to its default-
    123   * constructed state and must have |init()| called on it to use it.
    124   */
    125  SourceText(SourceText&& other)
    126      : units_(other.units_),
    127        length_(other.length_),
    128        ownsUnits_(other.ownsUnits_) {
    129    other.units_ = nullptr;
    130    other.length_ = 0;
    131    other.ownsUnits_ = false;
    132  }
    133 
    134  ~SourceText() {
    135    if (ownsUnits_) {
    136      js_free(const_cast<Unit*>(units_));
    137    }
    138  }
    139 
    140 private:
    141  template <typename ContextT>
    142  [[nodiscard]] MOZ_IS_CLASS_INIT bool initImpl(ContextT* context,
    143                                                const Unit* units,
    144                                                size_t unitsLength,
    145                                                SourceOwnership ownership) {
    146    MOZ_ASSERT_IF(units == nullptr, unitsLength == 0);
    147 
    148    // Ideally we'd use |Unit| and not cast below, but the risk of a static
    149    // initializer is too great.
    150    static const CharT emptyString[] = {'\0'};
    151 
    152    // Initialize all fields *before* checking length.  This ensures that
    153    // if |ownership == SourceOwnership::TakeOwnership|, |units| will be
    154    // freed when |this|'s destructor is called.
    155    if (units) {
    156      units_ = units;
    157      length_ = static_cast<uint32_t>(unitsLength);
    158      ownsUnits_ = ownership == SourceOwnership::TakeOwnership;
    159    } else {
    160      units_ = reinterpret_cast<const Unit*>(emptyString);
    161      length_ = 0;
    162      ownsUnits_ = false;
    163    }
    164 
    165    // IMPLEMENTATION DETAIL, DO NOT RELY ON: This limit is used so we can
    166    // store offsets in |JSScript|s as |uint32_t|.  It could be lifted
    167    // fairly easily if desired, as the compiler uses |size_t| internally.
    168    if (MOZ_UNLIKELY(unitsLength > UINT32_MAX)) {
    169      detail::ReportSourceTooLong(context);
    170      return false;
    171    }
    172 
    173    return true;
    174  }
    175 
    176 public:
    177  /**
    178   * Initialize this with source unit data: |char16_t| for UTF-16 source
    179   * units, or |Utf8Unit| for UTF-8 source units.
    180   *
    181   * If |ownership == TakeOwnership|, *this function* takes ownership of
    182   * |units|, *even if* this function fails, and you MUST NOT free |units|
    183   * yourself.  This single-owner-friendly approach reduces risk of leaks on
    184   * failure.
    185   *
    186   * |units| may be null if |unitsLength == 0|; if so, this will silently be
    187   * initialized using non-null, unowned units.
    188   */
    189  [[nodiscard]] MOZ_IS_CLASS_INIT bool init(JSContext* cx, const Unit* units,
    190                                            size_t unitsLength,
    191                                            SourceOwnership ownership) {
    192    return initImpl(cx, units, unitsLength, ownership);
    193  }
    194  [[nodiscard]] MOZ_IS_CLASS_INIT bool init(JS::FrontendContext* fc,
    195                                            const Unit* units,
    196                                            size_t unitsLength,
    197                                            SourceOwnership ownership) {
    198    return initImpl(fc, units, unitsLength, ownership);
    199  }
    200 
    201  /**
    202   * Exactly identical to the |init()| overload above that accepts
    203   * |const Unit*|, but instead takes character data: |const CharT*|.
    204   *
    205   * (We can't just write this to accept |const CharT*|, because then in the
    206   * UTF-16 case this overload and the one above would be identical.  So we
    207   * use SFINAE to expose the |CharT| overload only if it's different.)
    208   */
    209  template <typename Char,
    210            typename = std::enable_if_t<std::is_same_v<Char, CharT> &&
    211                                        !std::is_same_v<Char, Unit>>>
    212  [[nodiscard]] MOZ_IS_CLASS_INIT bool init(JSContext* cx, const Char* chars,
    213                                            size_t charsLength,
    214                                            SourceOwnership ownership) {
    215    return initImpl(cx, reinterpret_cast<const Unit*>(chars), charsLength,
    216                    ownership);
    217  }
    218  template <typename Char,
    219            typename = std::enable_if_t<std::is_same_v<Char, CharT> &&
    220                                        !std::is_same_v<Char, Unit>>>
    221  [[nodiscard]] MOZ_IS_CLASS_INIT bool init(JS::FrontendContext* fc,
    222                                            const Char* chars,
    223                                            size_t charsLength,
    224                                            SourceOwnership ownership) {
    225    return initImpl(fc, reinterpret_cast<const Unit*>(chars), charsLength,
    226                    ownership);
    227  }
    228 
    229  /**
    230   * Initialize this using source units transferred out of |data|.
    231   */
    232  [[nodiscard]] bool init(JSContext* cx,
    233                          js::UniquePtr<Unit[], JS::FreePolicy> data,
    234                          size_t dataLength) {
    235    return initImpl(cx, data.release(), dataLength,
    236                    SourceOwnership::TakeOwnership);
    237  }
    238  [[nodiscard]] bool init(JS::FrontendContext* fc,
    239                          js::UniquePtr<Unit[], JS::FreePolicy> data,
    240                          size_t dataLength) {
    241    return initImpl(fc, data.release(), dataLength,
    242                    SourceOwnership::TakeOwnership);
    243  }
    244 
    245  /**
    246   * Exactly identical to the |init()| overload above that accepts
    247   * |UniquePtr<Unit[], JS::FreePolicy>|, but instead takes character data:
    248   * |UniquePtr<CharT[], JS::FreePolicy>|.
    249   *
    250   * (We can't just duplicate the signature above with s/Unit/CharT/, because
    251   * then in the UTF-16 case this overload and the one above would be identical.
    252   * So we use SFINAE to expose the |CharT| overload only if it's different.)
    253   */
    254  template <typename Char,
    255            typename = std::enable_if_t<std::is_same_v<Char, CharT> &&
    256                                        !std::is_same_v<Char, Unit>>>
    257  [[nodiscard]] bool init(JSContext* cx,
    258                          js::UniquePtr<Char[], JS::FreePolicy> data,
    259                          size_t dataLength) {
    260    return init(cx, data.release(), dataLength, SourceOwnership::TakeOwnership);
    261  }
    262  template <typename Char,
    263            typename = std::enable_if_t<std::is_same_v<Char, CharT> &&
    264                                        !std::is_same_v<Char, Unit>>>
    265  [[nodiscard]] bool init(JS::FrontendContext* fc,
    266                          js::UniquePtr<Char[], JS::FreePolicy> data,
    267                          size_t dataLength) {
    268    return init(fc, data.release(), dataLength, SourceOwnership::TakeOwnership);
    269  }
    270 
    271  /**
    272   * Initialize this using an AutoStableStringChars. Transfers the code units if
    273   * they are owned by the AutoStableStringChars, otherwise borrow directly from
    274   * the underlying JSString. The AutoStableStringChars must outlive this
    275   * SourceText and must be explicitly configured to the same unit type as this
    276   * SourceText.
    277   */
    278  [[nodiscard]] bool initMaybeBorrowed(JSContext* cx,
    279                                       AutoStableStringChars& linearChars);
    280  [[nodiscard]] bool initMaybeBorrowed(JS::FrontendContext* fc,
    281                                       AutoStableStringChars& linearChars);
    282 
    283  /**
    284   * Access the encapsulated data using a code unit type.
    285   *
    286   * This function is useful for code that wants to interact with source text
    287   * as *code units*, not as string data.  This doesn't matter for UTF-16,
    288   * but it's a crucial distinction for UTF-8.  When UTF-8 source text is
    289   * encapsulated, |Unit| being |mozilla::Utf8Unit| unambiguously indicates
    290   * that the code units are UTF-8.  In contrast |const char*| returned by
    291   * |get()| below could hold UTF-8 (or its ASCII subset) or Latin-1 or (in
    292   * particularly cursed embeddings) EBCDIC or some other legacy character
    293   * set.  Prefer this function to |get()| wherever possible.
    294   */
    295  const Unit* units() const { return units_; }
    296 
    297  /**
    298   * Access the encapsulated data using a character type.
    299   *
    300   * This function is useful for interactions with character-centric actions
    301   * like interacting with UniqueChars/UniqueTwoByteChars or printing out
    302   * text in a debugger, that only work with |CharT|.  But as |CharT| loses
    303   * encoding specificity when UTF-8 source text is encapsulated, prefer
    304   * |units()| to this function.
    305   */
    306  const CharT* get() const { return reinterpret_cast<const CharT*>(units_); }
    307 
    308  /**
    309   * Returns true if this owns the source units and will free them on
    310   * destruction.  If true, it is legal to call |take{Chars,Units}()|.
    311   */
    312  bool ownsUnits() const { return ownsUnits_; }
    313 
    314  /**
    315   * Count of the underlying source units -- code units, not bytes or code
    316   * points -- in this.
    317   */
    318  uint32_t length() const { return length_; }
    319 
    320  /**
    321   * Retrieve and take ownership of the underlying source units.  The caller
    322   * is now responsible for calling js_free() on the returned value, *but
    323   * only after JS script compilation has completed*.
    324   *
    325   * After underlying source units have been taken, this will continue to
    326   * refer to the same data -- it just won't own the data.  get() and
    327   * length() will return the same values, but ownsUnits() will be false.
    328   * The taken source units must be kept alive until after JS script
    329   * compilation completes, as noted above, for this to be safe.
    330   *
    331   * The caller must check ownsUnits() before calling takeUnits().  Taking
    332   * and then free'ing an unowned buffer will have dire consequences.
    333   */
    334  Unit* takeUnits() {
    335    MOZ_ASSERT(ownsUnits_);
    336    ownsUnits_ = false;
    337    return const_cast<Unit*>(units_);
    338  }
    339 
    340  /**
    341   * Akin to |takeUnits()| in all respects, but returns characters rather
    342   * than units.
    343   */
    344  CharT* takeChars() { return reinterpret_cast<CharT*>(takeUnits()); }
    345 
    346 private:
    347  SourceText(const SourceText&) = delete;
    348  void operator=(const SourceText&) = delete;
    349 };
    350 
    351 }  // namespace JS
    352 
    353 #endif /* js_SourceText_h */