profiler.cc (5459B)
1 // Copyright 2025 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #include "hwy/profiler.h" 17 18 #include "hwy/highway_export.h" // HWY_DLLEXPORT 19 20 #if PROFILER_ENABLED 21 22 #include <stddef.h> 23 #include <stdint.h> 24 #include <stdio.h> 25 26 #include "hwy/base.h" 27 #include "hwy/robust_statistics.h" 28 #include "hwy/timer.h" 29 30 #endif // PROFILER_ENABLED 31 32 namespace hwy { 33 34 #if PROFILER_ENABLED 35 36 static constexpr bool kPrintOverhead = true; 37 38 // Must zero-init because `ThreadFunc` calls `SetGlobalIdx()` potentially after 39 // this is first used in the `pool::Worker` ctor. 40 /*static*/ thread_local size_t Profiler::s_global_idx = 0; 41 42 // Detects duration of a zero-length zone: timer plus packet overhead. 43 static uint64_t DetectSelfOverhead(Profiler& profiler, size_t global_idx) { 44 static const profiler::ZoneHandle zone = profiler.AddZone("DetectSelf"); 45 profiler::Results results; 46 const size_t kNumSamples = 25; 47 uint32_t samples[kNumSamples]; 48 for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { 49 // Enough for stable measurements, but only about 50 ms startup cost. 50 const size_t kNumDurations = 700; 51 uint32_t durations[kNumDurations]; 52 for (size_t idx_duration = 0; idx_duration < kNumDurations; 53 ++idx_duration) { 54 { 55 PROFILER_ZONE3(profiler, global_idx, zone); 56 } 57 durations[idx_duration] = 58 static_cast<uint32_t>(profiler.GetFirstDurationAndReset(global_idx)); 59 } 60 samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations); 61 } 62 return robust_statistics::Mode(samples, kNumSamples); 63 } 64 65 // Detects average duration of a zero-length zone, after deducting self 66 // overhead. This accounts for the delay before/after capturing start/end 67 // timestamps, for example due to fence instructions in timer::Start/Stop. 68 static uint64_t DetectChildOverhead(Profiler& profiler, size_t global_idx, 69 uint64_t self_overhead) { 70 static const profiler::ZoneHandle zone = profiler.AddZone("DetectChild"); 71 // Enough for stable measurements, but only about 50 ms startup cost. 72 const size_t kMaxSamples = 30; 73 uint32_t samples[kMaxSamples]; 74 size_t num_samples = 0; 75 // Upper bound because timer resolution might be too coarse to get nonzero. 76 for (size_t s = 0; s < 2 * kMaxSamples && num_samples < kMaxSamples; ++s) { 77 const size_t kNumDurations = 50; 78 uint32_t durations[kNumDurations]; 79 for (size_t d = 0; d < kNumDurations; ++d) { 80 constexpr size_t kReps = 500; 81 HWY_FENCE; 82 const uint64_t t0 = timer::Start(); 83 for (size_t r = 0; r < kReps; ++r) { 84 PROFILER_ZONE3(profiler, global_idx, zone); 85 } 86 const uint64_t t1 = timer::Stop(); 87 HWY_FENCE; 88 // We are measuring the total, not individual zone durations, to include 89 // cross-zone overhead. 90 (void)profiler.GetFirstDurationAndReset(global_idx); 91 92 const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps; 93 durations[d] = static_cast<uint32_t>( 94 profiler::PerWorker::ClampedSubtract(avg_duration, self_overhead)); 95 } 96 samples[num_samples] = robust_statistics::Mode(durations, kNumDurations); 97 // Overhead is nonzero, but we often measure zero; skip them to prevent 98 // getting a zero result. 99 num_samples += (samples[num_samples] != 0); 100 } 101 return num_samples == 0 ? 0 : robust_statistics::Mode(samples, num_samples); 102 } 103 104 Profiler::Profiler() { 105 const uint64_t t0 = timer::Start(); 106 107 char cpu[100]; 108 if (HWY_UNLIKELY(!platform::HaveTimerStop(cpu))) { 109 HWY_ABORT("CPU %s is too old for PROFILER_ENABLED=1, exiting", cpu); 110 } 111 112 // `ThreadPool` calls `Profiler::Get()` before it creates threads, hence this 113 // is guaranteed to be running on the main thread. 114 constexpr size_t kMain = 0; 115 // Must be called before any use of `PROFILER_ZONE*/PROFILER_FUNC*`. This runs 116 // only once because `Profiler` is a singleton. 117 ReserveWorker(kMain); 118 SetGlobalIdx(kMain); 119 120 profiler::Overheads overheads; 121 // WARNING: must pass in `*this` and use `PROFILER_ZONE3` to avoid calling 122 // `Profiler::Get()`, because that would re-enter the magic static init. 123 overheads.self = DetectSelfOverhead(*this, kMain); 124 overheads.child = DetectChildOverhead(*this, kMain, overheads.self); 125 for (size_t worker = 0; worker < profiler::kMaxWorkers; ++worker) { 126 workers_[worker].SetOverheads(overheads); 127 } 128 129 HWY_IF_CONSTEXPR(kPrintOverhead) { 130 printf("Self overhead: %.0f; child: %.0f; elapsed %.1f ms\n", 131 static_cast<double>(overheads.self), 132 static_cast<double>(overheads.child), 133 static_cast<double>(timer::Stop() - t0) / 134 platform::InvariantTicksPerSecond() * 1E3); 135 } 136 } 137 138 #endif // PROFILER_ENABLED 139 140 // Even if disabled, we want to export the symbol. 141 HWY_DLLEXPORT Profiler& Profiler::Get() { 142 static Profiler* profiler = new Profiler(); 143 return *profiler; 144 } 145 146 } // namespace hwy