BaseProfilingStack.h (20214B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 * vim: set ts=8 sts=2 et sw=2 tw=80: 3 * This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #ifndef BaseProfilingStack_h 8 #define BaseProfilingStack_h 9 10 #ifndef MOZ_GECKO_PROFILER 11 # error Do not #include this header when MOZ_GECKO_PROFILER is not #defined. 12 #endif 13 14 #include "mozilla/Assertions.h" 15 #include "mozilla/Atomics.h" 16 #include "mozilla/BaseProfilingCategory.h" 17 18 #include <stdint.h> 19 20 // This file defines the classes ProfilingStack and ProfilingStackFrame. 21 // The ProfilingStack manages an array of ProfilingStackFrames. 22 // It keeps track of the "label stack" and the JS interpreter stack. 23 // The two stack types are interleaved. 24 // 25 // Usage: 26 // 27 // ProfilingStack* profilingStack = ...; 28 // 29 // // For label frames: 30 // profilingStack->pushLabelFrame(...); 31 // // Execute some code. When finished, pop the frame: 32 // profilingStack->pop(); 33 // 34 // // For JS stack frames: 35 // profilingStack->pushJSFrame(...); 36 // // Execute some code. When finished, pop the frame: 37 // profilingStack->pop(); 38 // 39 // 40 // Concurrency considerations 41 // 42 // A thread's profiling stack (and the frames inside it) is only modified by 43 // that thread. However, the profiling stack can be *read* by a different 44 // thread, the sampler thread: Whenever the profiler wants to sample a given 45 // thread A, the following happens: 46 // (1) Thread A is suspended. 47 // (2) The sampler thread (thread S) reads the ProfilingStack of thread A, 48 // including all ProfilingStackFrames that are currently in that stack 49 // (profilingStack->frames[0..profilingStack->stackSize()]). 50 // (3) Thread A is resumed. 
//
// Thread suspension is achieved using platform-specific APIs; refer to each
// platform's Sampler::SuspendAndSampleAndResumeThread implementation in
// platform-*.cpp for details.
//
// When the thread is suspended, the values in profilingStack->stackPointer and
// in the stack frame range
// profilingStack->frames[0..profilingStack->stackPointer] need to be in a
// consistent state, so that thread S does not read partially-constructed stack
// frames. More specifically, we have two requirements:
// (1) When adding a new frame at the top of the stack, its ProfilingStackFrame
//     data needs to be put in place *before* the stackPointer is incremented,
//     and the compiler + CPU need to know that this order matters.
// (2) When popping a frame from the stack and then preparing the
//     ProfilingStackFrame data for the next frame that is about to be pushed,
//     the decrement of the stackPointer in pop() needs to happen *before* the
//     ProfilingStackFrame for the new frame is being populated, and the
//     compiler + CPU need to know that this order matters.
//
// We can express the relevance of these orderings in multiple ways.
// Option A is to make stackPointer an atomic with SequentiallyConsistent
// memory ordering. This would ensure that no writes in thread A would be
// reordered across any writes to stackPointer, which satisfies requirements
// (1) and (2) at the same time. Option A is the simplest.
// Option B is to use ReleaseAcquire memory ordering both for writes to
// stackPointer *and* for writes to ProfilingStackFrame fields. Release-stores
// ensure that all writes that happened *before this write in program order* are
// not reordered to happen after this write. ReleaseAcquire ordering places no
// requirements on the ordering of writes that happen *after* this write in
// program order.
81 // Using release-stores for writes to stackPointer expresses requirement (1), 82 // and using release-stores for writes to the ProfilingStackFrame fields 83 // expresses requirement (2). 84 // 85 // Option B is more complicated than option A, but has much better performance 86 // on x86/64: In a microbenchmark run on a Macbook Pro from 2017, switching 87 // from option A to option B reduced the overhead of pushing+popping a 88 // ProfilingStackFrame by 10 nanoseconds. 89 // On x86/64, release-stores require no explicit hardware barriers or lock 90 // instructions. 91 // On ARM/64, option B may be slower than option A, because the compiler will 92 // generate hardware barriers for every single release-store instead of just 93 // for the writes to stackPointer. However, the actual performance impact of 94 // this has not yet been measured on ARM, so we're currently using option B 95 // everywhere. This is something that we may want to change in the future once 96 // we've done measurements. 97 98 namespace mozilla { 99 namespace baseprofiler { 100 101 // A call stack can be specified to the JS engine such that all JS entry/exits 102 // to functions push/pop a stack frame to/from the specified stack. 103 // 104 // For more detailed information, see vm/GeckoProfiler.h. 105 // 106 class ProfilingStackFrame { 107 // A ProfilingStackFrame represents either a label frame or a JS frame. 108 109 // WARNING WARNING WARNING 110 // 111 // All the fields below are Atomic<...,ReleaseAcquire>. This is needed so 112 // that writes to these fields are release-writes, which ensures that 113 // earlier writes in this thread don't get reordered after the writes to 114 // these fields. In particular, the decrement of the stack pointer in 115 // ProfilingStack::pop() is a write that *must* happen before the values in 116 // this ProfilingStackFrame are changed. 
Otherwise, the sampler thread might 117 // see an inconsistent state where the stack pointer still points to a 118 // ProfilingStackFrame which has already been popped off the stack and whose 119 // fields have now been partially repopulated with new values. 120 // See the "Concurrency considerations" paragraph at the top of this file 121 // for more details. 122 123 // Descriptive label for this stack frame. Must be a static string! Can be 124 // an empty string, but not a null pointer. 125 Atomic<const char*, ReleaseAcquire> label_; 126 127 // An additional descriptive string of this frame which is combined with 128 // |label_| in profiler output. Need not be (and usually isn't) static. Can 129 // be null. 130 Atomic<const char*, ReleaseAcquire> dynamicString_; 131 132 // Stack pointer for non-JS stack frames, the script pointer otherwise. 133 Atomic<void*, ReleaseAcquire> spOrScript; 134 135 // ID of the JS Realm for JS stack frames. 136 // Must not be used on non-JS frames; it'll contain either the default 0, 137 // or a leftover value from a previous JS stack frame that was using this 138 // ProfilingStackFrame object. 139 mozilla::Atomic<uint64_t, mozilla::ReleaseAcquire> realmID_; 140 141 // The bytecode offset for JS stack frames. 142 // Must not be used on non-JS frames; it'll contain either the default 0, 143 // or a leftover value from a previous JS stack frame that was using this 144 // ProfilingStackFrame object. 145 Atomic<int32_t, ReleaseAcquire> pcOffsetIfJS_; 146 147 // Bits 0...8 hold the Flags. Bits 9...31 hold the category pair. 
148 Atomic<uint32_t, ReleaseAcquire> flagsAndCategoryPair_; 149 150 public: 151 ProfilingStackFrame() = default; 152 ProfilingStackFrame& operator=(const ProfilingStackFrame& other) { 153 label_ = other.label(); 154 dynamicString_ = other.dynamicString(); 155 void* spScript = other.spOrScript; 156 spOrScript = spScript; 157 int32_t offsetIfJS = other.pcOffsetIfJS_; 158 pcOffsetIfJS_ = offsetIfJS; 159 int64_t realmID = other.realmID_; 160 realmID_ = realmID; 161 uint32_t flagsAndCategory = other.flagsAndCategoryPair_; 162 flagsAndCategoryPair_ = flagsAndCategory; 163 return *this; 164 } 165 166 // Reserve up to 16 bits for flags, and 16 for category pair. 167 enum class Flags : uint32_t { 168 // The first three flags describe the kind of the frame and are 169 // mutually exclusive. (We still give them individual bits for 170 // simplicity.) 171 172 // A regular label frame. These usually come from AutoProfilerLabel. 173 IS_LABEL_FRAME = 1 << 0, 174 175 // A special frame indicating the start of a run of JS profiling stack 176 // frames. IS_SP_MARKER_FRAME frames are ignored, except for the sp 177 // field. These frames are needed to get correct ordering between JS 178 // and LABEL frames because JS frames don't carry sp information. 179 // SP is short for "stack pointer". 180 IS_SP_MARKER_FRAME = 1 << 1, 181 182 // A JS frame. 183 IS_JS_FRAME = 1 << 2, 184 185 // An interpreter JS frame that has OSR-ed into baseline. IS_JS_FRAME 186 // frames can have this flag set and unset during their lifetime. 187 // JS_OSR frames are ignored. 188 JS_OSR = 1 << 3, 189 190 // The next three are mutually exclusive. 191 // By default, for profiling stack frames that have both a label and a 192 // dynamic string, the two strings are combined into one string of the 193 // form "<label> <dynamicString>" during JSON serialization. The 194 // following flags can be used to change this preset. 
195 STRING_TEMPLATE_METHOD = 1 << 4, // "<label>.<dynamicString>" 196 STRING_TEMPLATE_GETTER = 1 << 5, // "get <label>.<dynamicString>" 197 STRING_TEMPLATE_SETTER = 1 << 6, // "set <label>.<dynamicString>" 198 199 // If set, causes this stack frame to be marked as "relevantForJS" in 200 // the profile JSON, which will make it show up in the "JS only" call 201 // tree view. 202 RELEVANT_FOR_JS = 1 << 7, 203 204 // If set, causes the label on this ProfilingStackFrame to be ignored 205 // and to be replaced by the subcategory's label. 206 LABEL_DETERMINED_BY_CATEGORY_PAIR = 1 << 8, 207 208 // Frame dynamic string does not contain user data. 209 NONSENSITIVE = 1 << 9, 210 211 // A JS Baseline Interpreter frame. 212 IS_BLINTERP_FRAME = 1 << 10, 213 214 FLAGS_BITCOUNT = 16, 215 FLAGS_MASK = (1 << FLAGS_BITCOUNT) - 1 216 }; 217 218 static_assert( 219 uint32_t(ProfilingCategoryPair::LAST) <= 220 (UINT32_MAX >> uint32_t(Flags::FLAGS_BITCOUNT)), 221 "Too many category pairs to fit into u32 with together with the " 222 "reserved bits for the flags"); 223 224 bool isLabelFrame() const { 225 return uint32_t(flagsAndCategoryPair_) & uint32_t(Flags::IS_LABEL_FRAME); 226 } 227 228 bool isSpMarkerFrame() const { 229 return uint32_t(flagsAndCategoryPair_) & 230 uint32_t(Flags::IS_SP_MARKER_FRAME); 231 } 232 233 bool isJsFrame() const { 234 return uint32_t(flagsAndCategoryPair_) & uint32_t(Flags::IS_JS_FRAME); 235 } 236 237 bool isOSRFrame() const { 238 return uint32_t(flagsAndCategoryPair_) & uint32_t(Flags::JS_OSR); 239 } 240 241 void setIsOSRFrame(bool isOSR) { 242 if (isOSR) { 243 flagsAndCategoryPair_ = 244 uint32_t(flagsAndCategoryPair_) | uint32_t(Flags::JS_OSR); 245 } else { 246 flagsAndCategoryPair_ = 247 uint32_t(flagsAndCategoryPair_) & ~uint32_t(Flags::JS_OSR); 248 } 249 } 250 251 const char* label() const { 252 uint32_t flagsAndCategoryPair = flagsAndCategoryPair_; 253 if (flagsAndCategoryPair & 254 uint32_t(Flags::LABEL_DETERMINED_BY_CATEGORY_PAIR)) { 255 auto 
categoryPair = ProfilingCategoryPair( 256 flagsAndCategoryPair >> uint32_t(Flags::FLAGS_BITCOUNT)); 257 return GetProfilingCategoryPairInfo(categoryPair).mLabel; 258 } 259 return label_; 260 } 261 262 const char* dynamicString() const { return dynamicString_; } 263 264 void initLabelFrame(const char* aLabel, const char* aDynamicString, void* sp, 265 ProfilingCategoryPair aCategoryPair, uint32_t aFlags) { 266 label_ = aLabel; 267 dynamicString_ = aDynamicString; 268 spOrScript = sp; 269 // pcOffsetIfJS_ is not set and must not be used on label frames. 270 flagsAndCategoryPair_ = 271 uint32_t(Flags::IS_LABEL_FRAME) | 272 (uint32_t(aCategoryPair) << uint32_t(Flags::FLAGS_BITCOUNT)) | aFlags; 273 MOZ_ASSERT(isLabelFrame()); 274 } 275 276 void initSpMarkerFrame(void* sp) { 277 label_ = ""; 278 dynamicString_ = nullptr; 279 spOrScript = sp; 280 // pcOffsetIfJS_ is not set and must not be used on sp marker frames. 281 flagsAndCategoryPair_ = uint32_t(Flags::IS_SP_MARKER_FRAME) | 282 (uint32_t(ProfilingCategoryPair::OTHER) 283 << uint32_t(Flags::FLAGS_BITCOUNT)); 284 MOZ_ASSERT(isSpMarkerFrame()); 285 } 286 287 void initJsFrame(const char* aLabel, const char* aDynamicString, 288 void* /* JSScript* */ aScript, int32_t aOffset, 289 uint64_t aRealmID) { 290 label_ = aLabel; 291 dynamicString_ = aDynamicString; 292 spOrScript = aScript; 293 pcOffsetIfJS_ = aOffset; 294 realmID_ = aRealmID; 295 flagsAndCategoryPair_ = 296 uint32_t(Flags::IS_JS_FRAME) | (uint32_t(ProfilingCategoryPair::JS) 297 << uint32_t(Flags::FLAGS_BITCOUNT)); 298 MOZ_ASSERT(isJsFrame()); 299 } 300 301 uint32_t flags() const { 302 return uint32_t(flagsAndCategoryPair_) & uint32_t(Flags::FLAGS_MASK); 303 } 304 305 ProfilingCategoryPair categoryPair() const { 306 return ProfilingCategoryPair(flagsAndCategoryPair_ >> 307 uint32_t(Flags::FLAGS_BITCOUNT)); 308 } 309 310 uint64_t realmID() const { return realmID_; } 311 312 void* stackAddress() const { 313 MOZ_ASSERT(!isJsFrame()); 314 return spOrScript; 315 } 316 
317 // Note that the pointer returned might be invalid. 318 void* rawScript() const { 319 MOZ_ASSERT(isJsFrame()); 320 return spOrScript; 321 } 322 void setRawScript(void* aScript) { 323 MOZ_ASSERT(isJsFrame()); 324 spOrScript = aScript; 325 } 326 327 int32_t pcOffset() const { 328 MOZ_ASSERT(isJsFrame()); 329 return pcOffsetIfJS_; 330 } 331 332 void setPCOffset(int32_t aOffset) { 333 MOZ_ASSERT(isJsFrame()); 334 pcOffsetIfJS_ = aOffset; 335 } 336 337 // The offset of a pc into a script's code can actually be 0, so to 338 // signify a nullptr pc, use a -1 index. This is checked against in 339 // pc() and setPC() to set/get the right pc. 340 static const int32_t NullPCOffset = -1; 341 }; 342 343 // Each thread has its own ProfilingStack. That thread modifies the 344 // ProfilingStack, pushing and popping elements as necessary. 345 // 346 // The ProfilingStack is also read periodically by the profiler's sampler 347 // thread. This happens only when the thread that owns the ProfilingStack is 348 // suspended. So there are no genuine parallel accesses. 349 // 350 // However, it is possible for pushing/popping to be interrupted by a periodic 351 // sample. Because of this, we need pushing/popping to be effectively atomic. 352 // 353 // - When pushing a new frame, we increment the stack pointer -- making the new 354 // frame visible to the sampler thread -- only after the new frame has been 355 // fully written. The stack pointer is Atomic<uint32_t,ReleaseAcquire>, so 356 // the increment is a release-store, which ensures that this store is not 357 // reordered before the writes of the frame. 358 // 359 // - When popping an old frame, the only operation is the decrementing of the 360 // stack pointer, which is obviously atomic. 
//
class ProfilingStack final {
 public:
  ProfilingStack() = default;

  MFBT_API ~ProfilingStack();

  // Pushes a label frame at the top of the stack, growing the buffer first if
  // needed. The stackPointer increment is deliberately the last store; see
  // the comment below.
  void pushLabelFrame(const char* label, const char* dynamicString, void* sp,
                      ProfilingCategoryPair categoryPair, uint32_t flags = 0) {
    // This thread is the only one that ever changes the value of
    // stackPointer.
    // Store the value of the atomic in a non-atomic local variable so that
    // the compiler won't generate two separate loads from the atomic for
    // the size check and the frames[] array indexing operation.
    uint32_t stackPointerVal = stackPointer;

    if (MOZ_UNLIKELY(stackPointerVal >= capacity)) {
      ensureCapacitySlow();
    }
    frames[stackPointerVal].initLabelFrame(label, dynamicString, sp,
                                           categoryPair, flags);

    // This must happen at the end! The compiler will not reorder this
    // update because stackPointer is Atomic<..., ReleaseAcquire>, so
    // the writes above will not be reordered below the stackPointer store.
    // Do the read and the write as two separate statements, in order to
    // make it clear that we don't need an atomic increment, which would be
    // more expensive on x86 than the separate operations done here.
    // However, don't use stackPointerVal here; instead, allow the compiler
    // to turn this store into a non-atomic increment instruction which
    // takes up less code size.
    stackPointer = stackPointer + 1;
  }

  // Pushes a stack-pointer marker frame; see Flags::IS_SP_MARKER_FRAME.
  void pushSpMarkerFrame(void* sp) {
    uint32_t oldStackPointer = stackPointer;

    if (MOZ_UNLIKELY(oldStackPointer >= capacity)) {
      ensureCapacitySlow();
    }
    frames[oldStackPointer].initSpMarkerFrame(sp);

    // This must happen at the end, see the comment in pushLabelFrame.
    stackPointer = oldStackPointer + 1;
  }

  // Pushes a JS frame carrying a script pointer, bytecode offset, and realm.
  void pushJsOffsetFrame(const char* label, const char* dynamicString,
                         void* script, int32_t offset, uint64_t aRealmID) {
    // This thread is the only one that ever changes the value of
    // stackPointer. Only load the atomic once.
    uint32_t oldStackPointer = stackPointer;

    if (MOZ_UNLIKELY(oldStackPointer >= capacity)) {
      ensureCapacitySlow();
    }
    frames[oldStackPointer].initJsFrame(label, dynamicString, script, offset,
                                        aRealmID);

    // This must happen at the end, see the comment in pushLabelFrame.
    stackPointer = stackPointer + 1;
  }

  // Pops the top frame. The decrement is a release-store, so it cannot be
  // reordered after subsequent writes that repopulate the popped frame.
  void pop() {
    MOZ_ASSERT(stackPointer > 0);
    // Do the read and the write as two separate statements, in order to
    // make it clear that we don't need an atomic decrement, which would be
    // more expensive on x86 than the separate operations done here.
    // This thread is the only one that ever changes the value of
    // stackPointer.
    uint32_t oldStackPointer = stackPointer;
    stackPointer = oldStackPointer - 1;
  }

  // Number of valid frames currently on the stack.
  uint32_t stackSize() const { return stackPointer; }
  uint32_t stackCapacity() const { return capacity; }

 private:
  // Out of line path for expanding the buffer, since otherwise this would get
  // inlined in every DOM WebIDL call.
  MFBT_API MOZ_COLD void ensureCapacitySlow();

  // No copying.
  ProfilingStack(const ProfilingStack&) = delete;
  void operator=(const ProfilingStack&) = delete;

  // No moving either.
  ProfilingStack(ProfilingStack&&) = delete;
  void operator=(ProfilingStack&&) = delete;

  uint32_t capacity = 0;

 public:
  // The pointer to the stack frames, this is read from the profiler thread and
  // written from the current thread.
  //
  // This is effectively a unique pointer.
  Atomic<ProfilingStackFrame*, SequentiallyConsistent> frames{nullptr};

  // This may exceed the capacity, so instead use the stackSize() method to
  // determine the number of valid frames in stackFrames. When this is less
  // than stackCapacity(), it refers to the first free stackframe past the top
  // of the in-use stack (i.e. frames[stackPointer - 1] is the top stack
  // frame).
  //
  // WARNING WARNING WARNING
  //
  // This is an atomic variable that uses ReleaseAcquire memory ordering.
  // See the "Concurrency considerations" paragraph at the top of this file
  // for more details.
  Atomic<uint32_t, ReleaseAcquire> stackPointer{0};
};

class AutoGeckoProfilerEntry;
class GeckoProfilerEntryMarker;
class GeckoProfilerBaselineOSRMarker;

// Per-thread holder of the ProfilingStack, plus an "enabled" alias pointer
// that is non-null only while the profiler is active.
class GeckoProfilerThread {
  friend class AutoGeckoProfilerEntry;
  friend class GeckoProfilerEntryMarker;
  friend class GeckoProfilerBaselineOSRMarker;

  ProfilingStack* profilingStack_;

  // Same as profilingStack_ if the profiler is currently active, otherwise
  // null.
  ProfilingStack* profilingStackIfEnabled_;

 public:
  MFBT_API GeckoProfilerThread();

  uint32_t stackPointer() {
    MOZ_ASSERT(infraInstalled());
    return profilingStack_->stackPointer;
  }
  ProfilingStackFrame* stack() { return profilingStack_->frames; }
  ProfilingStack* getProfilingStack() { return profilingStack_; }
  ProfilingStack* getProfilingStackIfEnabled() {
    return profilingStackIfEnabled_;
  }

  /*
   * True if the profiler infrastructure is setup. Should be true in builds
   * that include profiler support except during early startup or late
   * shutdown. Unrelated to the presence of the Gecko Profiler addon.
   */
  bool infraInstalled() { return profilingStack_ != nullptr; }

  MFBT_API void setProfilingStack(ProfilingStack* profilingStack, bool enabled);
  void enable(bool enable) {
    profilingStackIfEnabled_ = enable ? profilingStack_ : nullptr;
  }
};

}  // namespace baseprofiler
}  // namespace mozilla

#endif /* BaseProfilingStack_h */