perf_counters_test.cc (5746B)
1 // Copyright 2024 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #include "hwy/perf_counters.h" 17 18 #include <stddef.h> 19 #include <stdint.h> 20 #include <stdio.h> 21 22 #include <vector> 23 24 #include "hwy/contrib/thread_pool/futex.h" // NanoSleep 25 #include "hwy/nanobenchmark.h" // Unpredictable1 26 #include "hwy/tests/hwy_gtest.h" 27 #include "hwy/tests/test_util-inl.h" 28 #include "hwy/timer.h" 29 30 namespace hwy { 31 namespace { 32 33 using ::hwy::platform::PerfCounters; 34 35 void ReadAndPrint(uint64_t r, double* values) { 36 char cpu100[100]; 37 const bool have_stop = hwy::platform::HaveTimerStop(cpu100); 38 const uint64_t t0 = timer::Start(); 39 40 PerfCounters counters; 41 const uint64_t t1 = have_stop ? timer::Stop() : timer::Start(); 42 const double elapsed_ns = 43 static_cast<double>(t1 - t0) * 1E9 / platform::InvariantTicksPerSecond(); 44 fprintf(stderr, "r: %d, any valid %d extrapolate %f, overhead %.1f ns\n", 45 static_cast<int>(r), counters.AnyValid(), counters.MaxExtrapolate(), 46 elapsed_ns); 47 48 if (counters.AnyValid()) { 49 HWY_ASSERT(counters.MaxExtrapolate() >= 1.0); 50 } 51 52 counters.Foreach([&counters, values](double val, PerfCounters::Counter c) { 53 HWY_ASSERT(counters.IsValid(c)); 54 fprintf(stderr, "%-20s: %.3E\n", PerfCounters::Name(c), val); 55 values[static_cast<size_t>(c)] = val; 56 }); 57 PerfCounters::StopAllAndReset(); 58 } 59 60 // Ensures a memory-intensive workload has high memory-related counters. 61 TEST(PerfCountersTest, TestMem) { 62 RandomState rng; 63 if (!PerfCounters::Init() || !PerfCounters::StartAll()) { 64 HWY_WARN("Perf counters unavailable, skipping test\n"); 65 return; 66 } 67 // Force L3 cache misses (loads). 68 std::vector<uint64_t> big_array(128 * 1024 * 1024); 69 for (uint64_t& x : big_array) { 70 x = rng() & static_cast<uint64_t>(hwy::Unpredictable1()); 71 } 72 const uint64_t r = big_array[rng() & 0xFFFF]; 73 74 double values[64] = {0.0}; 75 ReadAndPrint(r, values); 76 77 // Note that counters might not be available, and values differ considerably 78 // for debug/sanitizer builds. 79 HWY_ASSERT(values[PerfCounters::kRefCycles] == 0.0 || 80 values[PerfCounters::kRefCycles] > 1E8); // 470M..9B 81 HWY_ASSERT(values[PerfCounters::kInstructions] == 0.0 || 82 values[PerfCounters::kInstructions] > 1E5); // 1.5M..10B 83 HWY_ASSERT(values[PerfCounters::kPageFaults] == 0.0 || 84 values[PerfCounters::kPageFaults] > 1); // 4..500K 85 HWY_ASSERT(values[PerfCounters::kBranches] == 0.0 || 86 values[PerfCounters::kBranches] > 1E5); // > 900K 87 HWY_ASSERT(values[PerfCounters::kBranchMispredicts] < 1E9); // 273K..400M 88 89 HWY_ASSERT(values[PerfCounters::kL3Loads] == 0.0 || 90 values[PerfCounters::kL3Loads] > 10.0); // ~90K, 50 with L4 91 HWY_ASSERT(values[PerfCounters::kL3Stores] == 0.0 || 92 values[PerfCounters::kL3Stores] > 10.0); // 9K..5M 93 94 HWY_ASSERT(values[PerfCounters::kCacheRefs] == 0.0 || 95 values[PerfCounters::kCacheRefs] > 1E4); // 75K..66M 96 HWY_ASSERT(values[PerfCounters::kCacheMisses] == 0.0 || 97 values[PerfCounters::kCacheMisses] > 1.0); // 10..51M 98 HWY_ASSERT(values[PerfCounters::kBusCycles] == 0.0 || 99 values[PerfCounters::kBusCycles] > 1E6); // 8M 100 } 101 102 // Ensures a branch-heavy workload has high branch-related counters and not 103 // too high memory-related counters. 104 TEST(PerfCountersTest, RunBranches) { 105 RandomState rng; 106 if (!PerfCounters::Init() || !PerfCounters::StartAll()) { 107 HWY_WARN("Perf counters unavailable, skipping test\n"); 108 return; 109 } 110 111 // Branch-heavy, non-constexpr calculation so we see changes to counters. 112 const size_t iters = 113 static_cast<size_t>(hwy::Unpredictable1()) * 100000 + (rng() & 1); 114 uint64_t r = rng(); 115 for (size_t i = 0; i < iters; ++i) { 116 if (PopCount(rng()) < 36) { 117 r += rng() & 0xFF; 118 } else { 119 // Entirely different operation to ensure there is a branch. 120 r >>= 1; 121 } 122 // Ensure test runs long enough for counter multiplexing to happen. 123 NanoSleep(100 * 1000); 124 } 125 126 double values[64] = {0.0}; 127 ReadAndPrint(r, values); 128 129 // Note that counters might not be available, and values differ considerably 130 // for debug/sanitizer builds. 131 HWY_ASSERT(values[PerfCounters::kRefCycles] == 0.0 || 132 values[PerfCounters::kRefCycles] > 1E3); // 13K..18M 133 HWY_ASSERT(values[PerfCounters::kInstructions] == 0.0 || 134 values[PerfCounters::kInstructions] > 100.0); // 900..2M 135 HWY_ASSERT(values[PerfCounters::kBranches] == 0.0 || 136 values[PerfCounters::kBranches] > 100.0); // 1K..273K 137 HWY_ASSERT(values[PerfCounters::kBranchMispredicts] == 0 || 138 values[PerfCounters::kBranchMispredicts] > 10.0); // 65..5K 139 140 HWY_ASSERT(values[PerfCounters::kL3Loads] < 1E8); // 174K..12M 141 HWY_ASSERT(values[PerfCounters::kL3Stores] < 1E7); // 44K..1.8M 142 HWY_ASSERT(values[PerfCounters::kCacheRefs] < 1E9); // 5M..104M 143 HWY_ASSERT(values[PerfCounters::kCacheMisses] < 1E8); // 500K..10M 144 HWY_ASSERT(values[PerfCounters::kBusCycles] < 1E11); // 1M..10B 145 HWY_ASSERT(values[PerfCounters::kPageFaults] < 1E4); // 0..1.1K (in SDE) 146 } 147 148 } // namespace 149 } // namespace hwy 150 151 HWY_TEST_MAIN();