WasmHeuristics.h (14558B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 * vim: set ts=8 sts=2 et sw=2 tw=80: 3 * This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #ifndef wasm_WasmHeuristics_h 8 #define wasm_WasmHeuristics_h 9 10 #include <math.h> 11 12 #include "js/Prefs.h" 13 #include "threading/ExclusiveData.h" 14 #include "vm/MutexIDs.h" 15 #include "wasm/WasmConstants.h" 16 17 namespace js { 18 namespace wasm { 19 20 // Classes LazyTieringHeuristics and InliningHeuristics allow answering of 21 // simple questions relating to lazy tiering and inlining, eg, "is this 22 // function small enough to inline?" They do not answer questions that involve 23 // carrying state (eg, remaining inlining budget) across multiple queries. 24 // 25 // Note also, they may be queried in parallel without locking, by multiple 26 // instantiating / compilation threads, and so must be immutable once created. 27 28 // For both LazyTieringHeuristics and InliningHeuristics, the default `level_` 29 // is set to 5 in modules/libpref/init/StaticPrefList.yaml. The scaling 30 // factors and tables defined in this file have been set so as to give 31 // near-optimal performance on Barista-3 and another benchmark; they are 32 // generally within 2% of the best value that can be found by changing the 33 // `level_` numbers. Further performance gains may depend on improving the 34 // accuracy of estimateIonCompilationCost(). 35 // 36 // Performance was measured on a mid/high-end Intel CPU (Core i5-1135G7 -- 37 // Tiger Lake) and a low end Intel (Celeron N3050 -- Goldmont). 38 39 class LazyTieringHeuristics { 40 static constexpr uint32_t MIN_LEVEL = 1; 41 static constexpr uint32_t MAX_LEVEL = 9; 42 static constexpr uint32_t SMALL_MODULE_THRESH = 150000; 43 44 // A scaling table for levels 2 .. 8. Levels 1 and 9 are special-cased. In 45 // this table, each value differs from its neighbour by a factor of 3, giving 46 // a dynamic range in the table of 3 ^ 6 == 729, hence a wide selection of 47 // tier-up aggressiveness. 48 static constexpr float scale_[7] = {27.0, 9.0, 3.0, 49 1.0, // default 50 0.333, 0.111, 0.037}; 51 52 public: 53 // 1 = min (almost never, set tiering threshold to max possible, == 2^31-1) 54 // 5 = default 55 // 9 = max (request tier up at first call, set tiering threshold to zero) 56 // 57 // Don't use this directly, except for logging etc. 58 static uint32_t rawLevel() { 59 uint32_t level = JS::Prefs::wasm_lazy_tiering_level(); 60 return std::clamp(level, MIN_LEVEL, MAX_LEVEL); 61 } 62 63 // Estimate the cost of compiling a function of bytecode size `bodyLength` 64 // using Ion, in terms of arbitrary work-units. The baseline code for the 65 // function counts down from the returned value as it runs. When the value 66 // goes negative it requests tier-up. See "[SMDOC] WebAssembly baseline 67 // compiler -- Lazy Tier-Up mechanism" in WasmBaselineCompile.cpp. 68 69 static int32_t estimateIonCompilationCost(uint32_t bodyLength, 70 size_t codeSectionSize) { 71 uint32_t level = rawLevel(); 72 73 // Increase the aggressiveness of tiering for small modules, since they 74 // don't generate much optimised-tier compilation work, so we might as well 75 // try to get them into optimized code sooner. But don't overdo it, since 76 // we don't want to lose indirect-target resolution as a result. See bug 77 // 1965195. 78 MOZ_ASSERT(codeSectionSize > 0); 79 if (codeSectionSize <= SMALL_MODULE_THRESH && level < MAX_LEVEL) { 80 level += 1; 81 } 82 83 if (MOZ_LIKELY(MIN_LEVEL < level && level < MAX_LEVEL)) { 84 // The estimated cost, in X86_64 insns, for Ion compilation: 85 // 30k up-front cost + 4k per bytecode byte. 86 // 87 // This is derived from measurements of an optimized build of Ion 88 // compiling about 99000 functions. Each estimate is pretty bad, but 89 // averaged over a number of functions it's often within 20% of correct. 90 // However, this is with no inlining; that causes a much wider variance 91 // of costs. This will need to be revisited at some point. 92 float thresholdF = 30000.0 + 4000.0 * float(bodyLength); 93 94 // Rescale to step-down work units, so that the default `level` setting 95 // (5) gives pretty good results. 96 thresholdF *= 0.25; 97 98 // Rescale again to take into account `level`. 99 thresholdF *= scale_[level - (MIN_LEVEL + 1)]; 100 101 // Clamp and convert. 102 constexpr float thresholdHigh = 2.0e9f; // at most 2 billion; 103 int32_t thresholdI = int32_t(std::clamp(thresholdF, 10.f, thresholdHigh)); 104 MOZ_RELEASE_ASSERT(thresholdI >= 0); 105 return thresholdI; 106 } 107 if (level == MIN_LEVEL) { 108 // "almost never tier up"; produce our closest approximation to infinity 109 return INT32_MAX; 110 } 111 if (level == MAX_LEVEL) { 112 // request tier up at the first call; return the lowest possible value 113 return 0; 114 } 115 MOZ_CRASH(); 116 } 117 }; 118 119 // [SMDOC] Per-function and per-module inlining limits 120 121 // `class InliningHeuristics` makes inlining decisions on a per-call-site 122 // basis. Even with that in place, it is still possible to create a small 123 // input function for which inlining produces a huge (1000 x) expansion. Hence 124 // we also need a backstop mechanism to limit growth of functions and of 125 // modules as a whole. 126 // 127 // The following scheme is therefore implemented: 128 // 129 // * no function can have an inlining-based expansion of more than a constant 130 // factor (here, 99 x). 131 // 132 // * for a module as a whole there is also a max expansion factor, and this is 133 // much lower, perhaps 1 x. 134 // 135 // This means that 136 // 137 // * no individual function can cause too much trouble (due to the 99 x limit), 138 // yet any function that needs a lot of inlining can still get it. In 139 // practice most functions have an inlining expansion, at default settings, 140 // of much less than 5 x. 141 // 142 // * the module as a whole cannot chew up excessive resources. 143 // 144 // Once a limit is exhausted, Ion compilation is still possible, but no 145 // inlining will be done. 146 // 147 // The per-module limit needs to be interpreted in the light of lazy tiering. 148 // Many modules only tier up a small subset of their functions. Hence the 149 // relatively low per-module limit still allows a high level of expansion of 150 // the functions that do get tiered up. 151 // 152 // In effect, the tiering mechanism gives hot functions (early tierer-uppers) 153 // preferential access to the module-level inlining budget. Colder functions 154 // that tier up later may find the budget to be exhausted, in which case they 155 // get no inlining. It would be feasible to gradually reduce inlining 156 // aggressiveness as the budget is used up, rather than have cliff-edge 157 // behaviour, but it hardly seems worth the hassle. 158 // 159 // To implement this, we have 160 // 161 // * `int64_t WasmCodeMetadata::ProtectedOptimizationStats::inliningBudget`: 162 // this is initially set as the maximum copied-in bytecode length allowable 163 // for the module. Inlining of individual call sites decreases the value and 164 // may drive it negative. Once the value is negative, no more inlining is 165 // allowed. 166 // 167 // * `int64_t FunctionCompiler::inliningBudget_` does the same at a 168 // per-function level. Its initial value takes into account the current 169 // value of the module-level budget; hence if the latter is exhausted, the 170 // function-level budget will be zero and so no inlining occurs. 171 // 172 // If either limit is exceeded, a message is printed on the 173 // `MOZ_LOG=wasmCodeMetaStats:3` channel. 174 175 // Allowing budgets to be driven negative means we slightly overshoot them. An 176 // alternative to be to ensure they can never be driven negative, in which case 177 // we will slightly undershoot them instead, given that the sum of inlined 178 // function sizes is unlikely to exactly match the budget. We use the 179 // overshoot scheme only because it makes it simple to decide when to log a 180 // budget-overshoot message and not emit any duplicates. 181 182 // There is a (logical, not-TSan-detectable) race condition in that the 183 // inlining budget for a function is set in part from the module-level budget 184 // at the time that compilation of the function begins, and the module-level 185 // budget is updated when compilation of a function ends -- see 186 // FunctionCompiler::initToplevel and ::finish. If there are multiple 187 // compilation threads, it can happen that multiple threads individually 188 // overrun the module-level budget, and so collectively overshoot the budget 189 // multiple times. 190 // 191 // The worst-case total overshoot is equal to the worst-case per-function 192 // overshoot multiplied by the max number of functions that can be concurrently 193 // compiled: 194 // 195 // <max per-function overshoot, which 196 // == the largest body length that can be accepted 197 // by InliningHeuristics::isSmallEnoughToInline> 198 // * MaxPartialTier2CompileTasks 199 // 200 // which with current settings is 320 * 1 == 320. 201 // 202 // We never expect to hit either limit in normal operation -- they exist only 203 // to protect against the worst case. So the imprecision doesn't matter. 204 205 // Setting the multiplier here to 1 means that inlining can copy in at maximum 206 // the same amount of bytecode as is in the module; 2 means twice as much, etc, 207 // and setting it to 0 would completely disable inlining. 208 static constexpr int64_t PerModuleMaxInliningRatio = 1; 209 210 // Same meaning as above, except at a per-function level. 211 static constexpr int64_t PerFunctionMaxInliningRatio = 99; 212 213 class InliningHeuristics { 214 static constexpr uint32_t MIN_LEVEL = 1; 215 static constexpr uint32_t MAX_LEVEL = 9; 216 217 static constexpr uint32_t LARGE_FUNCTION_THRESH_1 = 400000; 218 static constexpr uint32_t LARGE_FUNCTION_THRESH_2 = 800000; 219 static constexpr uint32_t LARGE_FUNCTION_THRESH_3 = 1200000; 220 221 public: 222 // 1 = no inlining allowed 223 // 2 = min (minimal inlining) 224 // 5 = default 225 // 9 = max (very aggressive inlining) 226 // 227 // Don't use these directly, except for logging etc. 228 static uint32_t rawLevel() { 229 uint32_t level = JS::Prefs::wasm_inlining_level(); 230 return std::clamp(level, MIN_LEVEL, MAX_LEVEL); 231 } 232 static bool rawDirectAllowed() { return JS::Prefs::wasm_direct_inlining(); } 233 static bool rawCallRefAllowed() { 234 return JS::Prefs::wasm_call_ref_inlining(); 235 } 236 // For a call_ref site, returns the percentage of total calls made by that 237 // site, that any single target has to make in order to be considered as a 238 // candidate for speculative inlining. 239 static uint32_t rawCallRefPercent() { 240 uint32_t percent = JS::Prefs::wasm_call_ref_inlining_percent(); 241 // Clamp to range 10 .. 100 (%). 242 return std::clamp(percent, 10u, 100u); 243 } 244 245 // Calculate the total inlining budget for a module, based on the size of the 246 // code section. 247 static int64_t moduleInliningBudget(size_t codeSectionSize) { 248 int64_t budget = int64_t(codeSectionSize) * PerModuleMaxInliningRatio; 249 250 // Don't be overly stingy for tiny modules. Function-level inlining 251 // limits will still protect us from excessive inlining. 252 return std::max<int64_t>(budget, 1000); 253 } 254 255 // Given a call of kind `callKind` to a function of bytecode size 256 // `bodyLength` at `inliningDepth`, decide whether the it is allowable to 257 // inline the call. Note that `inliningDepth` starts at zero, not one. In 258 // other words, a value of zero means the query relates to a function which 259 // (if approved) would be inlined into the top-level function currently being 260 // compiled. 261 // 262 // `rootFunctionBodyLength` is the bytecode size of the function at the root 263 // of this inlining stack. If that is (very) large, we back off somewhat on 264 // inlining. `*largeFunctionBackoff` indicates whether or not that happened. 265 enum class CallKind { Direct, CallRef }; 266 static bool isSmallEnoughToInline(CallKind callKind, uint32_t inliningDepth, 267 uint32_t bodyLength, 268 uint32_t rootFunctionBodyLength, 269 bool* largeFunctionBackoff) { 270 *largeFunctionBackoff = false; 271 272 // If this fails, something's seriously wrong; bail out. 273 MOZ_RELEASE_ASSERT(inliningDepth <= 10); // because 10 > (320 / 40) 274 MOZ_ASSERT(rootFunctionBodyLength > 0 && 275 rootFunctionBodyLength <= wasm::MaxFunctionBytes); 276 277 // Check whether calls of this kind are currently allowed 278 if ((callKind == CallKind::Direct && !rawDirectAllowed()) || 279 (callKind == CallKind::CallRef && !rawCallRefAllowed())) { 280 return false; 281 } 282 // Check the size is allowable. This depends on how deep we are in the 283 // stack and on the setting of level_. We allow inlining of functions of 284 // size up to the `baseSize[]` value at depth zero, but reduce the 285 // allowable size by 40 for each further level of inlining, so that only 286 // smaller and smaller functions are allowed as we inline deeper. 287 // 288 // At some point `allowedSize` goes negative and thereby disallows all 289 // further inlining. Note that the `baseSize` entry for 290 // `level_ == MIN_LEVEL (== 1)` is set so as to disallow inlining even at 291 // depth zero. Hence `level_ == MIN_LEVEL` disallows all inlining. 292 static constexpr int32_t baseSize[9] = {0, 40, 80, 120, 293 160, // default 294 200, 240, 280, 320}; 295 uint32_t level = rawLevel(); 296 297 // If the root function is large, back off somewhat on inlining, so as to 298 // limit its further growth. The limits are set so high that almost all 299 // functions will be unaffected by this. See bug 1967644. 300 if (rootFunctionBodyLength > LARGE_FUNCTION_THRESH_1 && level > MIN_LEVEL) { 301 level--; 302 *largeFunctionBackoff = true; 303 } 304 if (rootFunctionBodyLength > LARGE_FUNCTION_THRESH_2 && level > MIN_LEVEL) { 305 level--; 306 *largeFunctionBackoff = true; 307 } 308 if (rootFunctionBodyLength > LARGE_FUNCTION_THRESH_3 && level > MIN_LEVEL) { 309 level--; 310 *largeFunctionBackoff = true; 311 } 312 313 // Having established `level`, check whether the callee is small enough. 314 MOZ_RELEASE_ASSERT(level >= MIN_LEVEL && level <= MAX_LEVEL); 315 int32_t allowedSize = baseSize[level - MIN_LEVEL]; 316 allowedSize -= int32_t(40 * inliningDepth); 317 return allowedSize > 0 && bodyLength <= uint32_t(allowedSize); 318 } 319 }; 320 321 } // namespace wasm 322 } // namespace js 323 324 #endif /* wasm_WasmHeuristics_h */