tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

WasmHeuristics.h (14558B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
      2 * vim: set ts=8 sts=2 et sw=2 tw=80:
      3 * This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #ifndef wasm_WasmHeuristics_h
      8 #define wasm_WasmHeuristics_h
      9 
     10 #include <math.h>
     11 
     12 #include "js/Prefs.h"
     13 #include "threading/ExclusiveData.h"
     14 #include "vm/MutexIDs.h"
     15 #include "wasm/WasmConstants.h"
     16 
     17 namespace js {
     18 namespace wasm {
     19 
     20 // Classes LazyTieringHeuristics and InliningHeuristics allow answering of
     21 // simple questions relating to lazy tiering and inlining, eg, "is this
     22 // function small enough to inline?"  They do not answer questions that involve
     23 // carrying state (eg, remaining inlining budget) across multiple queries.
     24 //
     25 // Note also, they may be queried in parallel without locking, by multiple
     26 // instantiating / compilation threads, and so must be immutable once created.
     27 
     28 // For both LazyTieringHeuristics and InliningHeuristics, the default `level_`
     29 // is set to 5 in modules/libpref/init/StaticPrefList.yaml.  The scaling
     30 // factors and tables defined in this file have been set so as to give
     31 // near-optimal performance on Barista-3 and another benchmark; they are
     32 // generally within 2% of the best value that can be found by changing the
     33 // `level_` numbers.  Further performance gains may depend on improving the
     34 // accuracy of estimateIonCompilationCost().
     35 //
     36 // Performance was measured on a mid/high-end Intel CPU (Core i5-1135G7 --
     37 // Tiger Lake) and a low end Intel (Celeron N3050 -- Goldmont).
     38 
     39 class LazyTieringHeuristics {
     40  static constexpr uint32_t MIN_LEVEL = 1;
     41  static constexpr uint32_t MAX_LEVEL = 9;
     42  static constexpr uint32_t SMALL_MODULE_THRESH = 150000;
     43 
     44  // A scaling table for levels 2 .. 8.  Levels 1 and 9 are special-cased.  In
     45  // this table, each value differs from its neighbour by a factor of 3, giving
     46  // a dynamic range in the table of 3 ^ 6 == 729, hence a wide selection of
     47  // tier-up aggressiveness.
     48  static constexpr float scale_[7] = {27.0,  9.0,   3.0,
     49                                      1.0,  // default
     50                                      0.333, 0.111, 0.037};
     51 
     52 public:
     53  // 1 = min (almost never, set tiering threshold to max possible, == 2^31-1)
     54  // 5 = default
     55  // 9 = max (request tier up at first call, set tiering threshold to zero)
     56  //
     57  // Don't use this directly, except for logging etc.
     58  static uint32_t rawLevel() {
     59    uint32_t level = JS::Prefs::wasm_lazy_tiering_level();
     60    return std::clamp(level, MIN_LEVEL, MAX_LEVEL);
     61  }
     62 
     63  // Estimate the cost of compiling a function of bytecode size `bodyLength`
     64  // using Ion, in terms of arbitrary work-units.  The baseline code for the
     65  // function counts down from the returned value as it runs.  When the value
     66  // goes negative it requests tier-up.  See "[SMDOC] WebAssembly baseline
     67  // compiler -- Lazy Tier-Up mechanism" in WasmBaselineCompile.cpp.
     68 
     69  static int32_t estimateIonCompilationCost(uint32_t bodyLength,
     70                                            size_t codeSectionSize) {
     71    uint32_t level = rawLevel();
     72 
     73    // Increase the aggressiveness of tiering for small modules, since they
     74    // don't generate much optimised-tier compilation work, so we might as well
     75    // try to get them into optimized code sooner.  But don't overdo it, since
     76    // we don't want to lose indirect-target resolution as a result.  See bug
     77    // 1965195.
     78    MOZ_ASSERT(codeSectionSize > 0);
     79    if (codeSectionSize <= SMALL_MODULE_THRESH && level < MAX_LEVEL) {
     80      level += 1;
     81    }
     82 
     83    if (MOZ_LIKELY(MIN_LEVEL < level && level < MAX_LEVEL)) {
     84      // The estimated cost, in X86_64 insns, for Ion compilation:
     85      // 30k up-front cost + 4k per bytecode byte.
     86      //
     87      // This is derived from measurements of an optimized build of Ion
     88      // compiling about 99000 functions.  Each estimate is pretty bad, but
     89      // averaged over a number of functions it's often within 20% of correct.
     90      // However, this is with no inlining; that causes a much wider variance
     91      // of costs.  This will need to be revisited at some point.
     92      float thresholdF = 30000.0 + 4000.0 * float(bodyLength);
     93 
     94      // Rescale to step-down work units, so that the default `level` setting
     95      // (5) gives pretty good results.
     96      thresholdF *= 0.25;
     97 
     98      // Rescale again to take into account `level`.
     99      thresholdF *= scale_[level - (MIN_LEVEL + 1)];
    100 
    101      // Clamp and convert.
    102      constexpr float thresholdHigh = 2.0e9f;  // at most 2 billion;
    103      int32_t thresholdI = int32_t(std::clamp(thresholdF, 10.f, thresholdHigh));
    104      MOZ_RELEASE_ASSERT(thresholdI >= 0);
    105      return thresholdI;
    106    }
    107    if (level == MIN_LEVEL) {
    108      // "almost never tier up"; produce our closest approximation to infinity
    109      return INT32_MAX;
    110    }
    111    if (level == MAX_LEVEL) {
    112      // request tier up at the first call; return the lowest possible value
    113      return 0;
    114    }
    115    MOZ_CRASH();
    116  }
    117 };
    118 
    119 // [SMDOC] Per-function and per-module inlining limits
    120 
    121 // `class InliningHeuristics` makes inlining decisions on a per-call-site
    122 // basis.  Even with that in place, it is still possible to create a small
    123 // input function for which inlining produces a huge (1000 x) expansion.  Hence
    124 // we also need a backstop mechanism to limit growth of functions and of
    125 // modules as a whole.
    126 //
    127 // The following scheme is therefore implemented:
    128 //
    129 // * no function can have an inlining-based expansion of more than a constant
    130 //   factor (here, 99 x).
    131 //
    132 // * for a module as a whole there is also a max expansion factor, and this is
    133 //   much lower, perhaps 1 x.
    134 //
    135 // This means that
    136 //
    137 // * no individual function can cause too much trouble (due to the 99 x limit),
    138 //   yet any function that needs a lot of inlining can still get it. In
    139 //   practice most functions have an inlining expansion, at default settings,
    140 //   of much less than 5 x.
    141 //
    142 // * the module as a whole cannot chew up excessive resources.
    143 //
    144 // Once a limit is exhausted, Ion compilation is still possible, but no
    145 // inlining will be done.
    146 //
    147 // The per-module limit needs to be interpreted in the light of lazy tiering.
    148 // Many modules only tier up a small subset of their functions.  Hence the
    149 // relatively low per-module limit still allows a high level of expansion of
    150 // the functions that do get tiered up.
    151 //
    152 // In effect, the tiering mechanism gives hot functions (early tierer-uppers)
    153 // preferential access to the module-level inlining budget.  Colder functions
    154 // that tier up later may find the budget to be exhausted, in which case they
    155 // get no inlining.  It would be feasible to gradually reduce inlining
    156 // aggressiveness as the budget is used up, rather than have cliff-edge
    157 // behaviour, but it hardly seems worth the hassle.
    158 //
    159 // To implement this, we have
    160 //
    161 // * `int64_t WasmCodeMetadata::ProtectedOptimizationStats::inliningBudget`:
    162 //   this is initially set as the maximum copied-in bytecode length allowable
    163 //   for the module.  Inlining of individual call sites decreases the value and
    164 //   may drive it negative.  Once the value is negative, no more inlining is
    165 //   allowed.
    166 //
    167 // * `int64_t FunctionCompiler::inliningBudget_` does the same at a
    168 //   per-function level.  Its initial value takes into account the current
    169 //   value of the module-level budget; hence if the latter is exhausted, the
    170 //   function-level budget will be zero and so no inlining occurs.
    171 //
    172 // If either limit is exceeded, a message is printed on the
    173 // `MOZ_LOG=wasmCodeMetaStats:3` channel.
    174 
    175 // Allowing budgets to be driven negative means we slightly overshoot them.  An
    176 // alternative to be to ensure they can never be driven negative, in which case
    177 // we will slightly undershoot them instead, given that the sum of inlined
    178 // function sizes is unlikely to exactly match the budget.  We use the
    179 // overshoot scheme only because it makes it simple to decide when to log a
    180 // budget-overshoot message and not emit any duplicates.
    181 
    182 // There is a (logical, not-TSan-detectable) race condition in that the
    183 // inlining budget for a function is set in part from the module-level budget
    184 // at the time that compilation of the function begins, and the module-level
    185 // budget is updated when compilation of a function ends -- see
    186 // FunctionCompiler::initToplevel and ::finish.  If there are multiple
    187 // compilation threads, it can happen that multiple threads individually
    188 // overrun the module-level budget, and so collectively overshoot the budget
    189 // multiple times.
    190 //
    191 // The worst-case total overshoot is equal to the worst-case per-function
    192 // overshoot multiplied by the max number of functions that can be concurrently
    193 // compiled:
    194 //
    195 //   <max per-function overshoot, which
    196 //      == the largest body length that can be accepted
    197 //             by InliningHeuristics::isSmallEnoughToInline>
    198 //   * MaxPartialTier2CompileTasks
    199 //
    200 // which with current settings is 320 * 1 == 320.
    201 //
    202 // We never expect to hit either limit in normal operation -- they exist only
    203 // to protect against the worst case.  So the imprecision doesn't matter.
    204 
    205 // Setting the multiplier here to 1 means that inlining can copy in at maximum
    206 // the same amount of bytecode as is in the module; 2 means twice as much, etc,
    207 // and setting it to 0 would completely disable inlining.
    208 static constexpr int64_t PerModuleMaxInliningRatio = 1;
    209 
    210 // Same meaning as above, except at a per-function level.
    211 static constexpr int64_t PerFunctionMaxInliningRatio = 99;
    212 
    213 class InliningHeuristics {
    214  static constexpr uint32_t MIN_LEVEL = 1;
    215  static constexpr uint32_t MAX_LEVEL = 9;
    216 
    217  static constexpr uint32_t LARGE_FUNCTION_THRESH_1 = 400000;
    218  static constexpr uint32_t LARGE_FUNCTION_THRESH_2 = 800000;
    219  static constexpr uint32_t LARGE_FUNCTION_THRESH_3 = 1200000;
    220 
    221 public:
    222  // 1 = no inlining allowed
    223  // 2 = min (minimal inlining)
    224  // 5 = default
    225  // 9 = max (very aggressive inlining)
    226  //
    227  // Don't use these directly, except for logging etc.
    228  static uint32_t rawLevel() {
    229    uint32_t level = JS::Prefs::wasm_inlining_level();
    230    return std::clamp(level, MIN_LEVEL, MAX_LEVEL);
    231  }
    232  static bool rawDirectAllowed() { return JS::Prefs::wasm_direct_inlining(); }
    233  static bool rawCallRefAllowed() {
    234    return JS::Prefs::wasm_call_ref_inlining();
    235  }
    236  // For a call_ref site, returns the percentage of total calls made by that
    237  // site, that any single target has to make in order to be considered as a
    238  // candidate for speculative inlining.
    239  static uint32_t rawCallRefPercent() {
    240    uint32_t percent = JS::Prefs::wasm_call_ref_inlining_percent();
    241    // Clamp to range 10 .. 100 (%).
    242    return std::clamp(percent, 10u, 100u);
    243  }
    244 
    245  // Calculate the total inlining budget for a module, based on the size of the
    246  // code section.
    247  static int64_t moduleInliningBudget(size_t codeSectionSize) {
    248    int64_t budget = int64_t(codeSectionSize) * PerModuleMaxInliningRatio;
    249 
    250    // Don't be overly stingy for tiny modules.  Function-level inlining
    251    // limits will still protect us from excessive inlining.
    252    return std::max<int64_t>(budget, 1000);
    253  }
    254 
    255  // Given a call of kind `callKind` to a function of bytecode size
    256  // `bodyLength` at `inliningDepth`, decide whether the it is allowable to
    257  // inline the call.  Note that `inliningDepth` starts at zero, not one.  In
    258  // other words, a value of zero means the query relates to a function which
    259  // (if approved) would be inlined into the top-level function currently being
    260  // compiled.
    261  //
    262  // `rootFunctionBodyLength` is the bytecode size of the function at the root
    263  // of this inlining stack.  If that is (very) large, we back off somewhat on
    264  // inlining.  `*largeFunctionBackoff` indicates whether or not that happened.
    265  enum class CallKind { Direct, CallRef };
    266  static bool isSmallEnoughToInline(CallKind callKind, uint32_t inliningDepth,
    267                                    uint32_t bodyLength,
    268                                    uint32_t rootFunctionBodyLength,
    269                                    bool* largeFunctionBackoff) {
    270    *largeFunctionBackoff = false;
    271 
    272    // If this fails, something's seriously wrong; bail out.
    273    MOZ_RELEASE_ASSERT(inliningDepth <= 10);  // because 10 > (320 / 40)
    274    MOZ_ASSERT(rootFunctionBodyLength > 0 &&
    275               rootFunctionBodyLength <= wasm::MaxFunctionBytes);
    276 
    277    // Check whether calls of this kind are currently allowed
    278    if ((callKind == CallKind::Direct && !rawDirectAllowed()) ||
    279        (callKind == CallKind::CallRef && !rawCallRefAllowed())) {
    280      return false;
    281    }
    282    // Check the size is allowable.  This depends on how deep we are in the
    283    // stack and on the setting of level_.  We allow inlining of functions of
    284    // size up to the `baseSize[]` value at depth zero, but reduce the
    285    // allowable size by 40 for each further level of inlining, so that only
    286    // smaller and smaller functions are allowed as we inline deeper.
    287    //
    288    // At some point `allowedSize` goes negative and thereby disallows all
    289    // further inlining.  Note that the `baseSize` entry for
    290    // `level_ == MIN_LEVEL (== 1)` is set so as to disallow inlining even at
    291    // depth zero.  Hence `level_ == MIN_LEVEL` disallows all inlining.
    292    static constexpr int32_t baseSize[9] = {0,   40,  80,  120,
    293                                            160,  // default
    294                                            200, 240, 280, 320};
    295    uint32_t level = rawLevel();
    296 
    297    // If the root function is large, back off somewhat on inlining, so as to
    298    // limit its further growth.  The limits are set so high that almost all
    299    // functions will be unaffected by this.  See bug 1967644.
    300    if (rootFunctionBodyLength > LARGE_FUNCTION_THRESH_1 && level > MIN_LEVEL) {
    301      level--;
    302      *largeFunctionBackoff = true;
    303    }
    304    if (rootFunctionBodyLength > LARGE_FUNCTION_THRESH_2 && level > MIN_LEVEL) {
    305      level--;
    306      *largeFunctionBackoff = true;
    307    }
    308    if (rootFunctionBodyLength > LARGE_FUNCTION_THRESH_3 && level > MIN_LEVEL) {
    309      level--;
    310      *largeFunctionBackoff = true;
    311    }
    312 
    313    // Having established `level`, check whether the callee is small enough.
    314    MOZ_RELEASE_ASSERT(level >= MIN_LEVEL && level <= MAX_LEVEL);
    315    int32_t allowedSize = baseSize[level - MIN_LEVEL];
    316    allowedSize -= int32_t(40 * inliningDepth);
    317    return allowedSize > 0 && bodyLength <= uint32_t(allowedSize);
    318  }
    319 };
    320 
    321 }  // namespace wasm
    322 }  // namespace js
    323 
    324 #endif /* wasm_WasmHeuristics_h */