stats.h (5488B)
1 // Copyright 2024 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // https://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #ifndef HIGHWAY_HWY_STATS_H_ 17 #define HIGHWAY_HWY_STATS_H_ 18 19 #include <stdint.h> 20 #include <stdio.h> 21 22 #include <cmath> 23 #include <string> 24 25 #include "hwy/base.h" // HWY_ASSERT 26 27 namespace hwy { 28 29 // Thread-compatible. 30 template <size_t N> 31 class Bins { 32 public: 33 Bins() { Reset(); } 34 35 template <typename T> 36 void Notify(T bin) { 37 HWY_ASSERT(T{0} <= bin && bin < static_cast<T>(N)); 38 counts_[static_cast<int32_t>(bin)]++; 39 } 40 41 void Assimilate(const Bins<N>& other) { 42 for (size_t i = 0; i < N; ++i) { 43 counts_[i] += other.counts_[i]; 44 } 45 } 46 47 void Print(const char* caption) const { 48 fprintf(stderr, "\n%s [%zu]\n", caption, N); 49 size_t last_nonzero = 0; 50 for (size_t i = N - 1; i < N; --i) { 51 if (counts_[i] != 0) { 52 last_nonzero = i; 53 break; 54 } 55 } 56 for (size_t i = 0; i <= last_nonzero; ++i) { 57 fprintf(stderr, " %zu\n", counts_[i]); 58 } 59 } 60 61 void Reset() { 62 for (size_t i = 0; i < N; ++i) { 63 counts_[i] = 0; 64 } 65 } 66 67 private: 68 size_t counts_[N]; 69 }; 70 71 // Descriptive statistics of a variable (4 moments). Thread-compatible. 72 class Stats { 73 public: 74 Stats() { Reset(); } 75 76 void Notify(const float x) { 77 ++n_; 78 79 min_ = HWY_MIN(min_, x); 80 max_ = HWY_MAX(max_, x); 81 82 // Logarithmic transform avoids/delays underflow and overflow. 83 sum_log_ += std::log(static_cast<double>(x)); 84 85 // Online moments. 86 // Reference: https://www.thinkbrg.com/media/publication/720_McCrary_ImplementingAlgorithms_Whitepaper_20151119_WEB.pdf 87 const double d = x - m1_; 88 const double d_div_n = d / static_cast<double>(n_); 89 const double d2n1_div_n = d * (static_cast<double>(n_) - 1) * d_div_n; 90 const int64_t n_poly = n_ * n_ - 3 * n_ + 3; 91 m1_ += d_div_n; 92 m4_ += d_div_n * 93 (d_div_n * (d2n1_div_n * static_cast<double>(n_poly) + 6.0 * m2_) - 94 4.0 * m3_); 95 m3_ += d_div_n * (d2n1_div_n * (static_cast<double>(n_) - 2) - 3.0 * m2_); 96 m2_ += d2n1_div_n; 97 } 98 99 void Assimilate(const Stats& other); 100 101 int64_t Count() const { return n_; } 102 103 float Min() const { return min_; } 104 float Max() const { return max_; } 105 106 double GeometricMean() const { 107 return n_ == 0 ? 0.0 : std::exp(sum_log_ / static_cast<double>(n_)); 108 } 109 110 double Mean() const { return m1_; } 111 // Same as Mu2. Assumes n_ is large. 112 double SampleVariance() const { 113 return n_ == 0 ? 0.0 : m2_ / static_cast<int>(n_); 114 } 115 // Unbiased estimator for population variance even for smaller n_. 116 double Variance() const { 117 if (n_ == 0) return 0.0; 118 if (n_ == 1) return m2_; 119 return m2_ / static_cast<int>(n_ - 1); 120 } 121 double StandardDeviation() const { return std::sqrt(Variance()); } 122 // Near zero for normal distributions; if positive on a unimodal distribution, 123 // the right tail is fatter. Assumes n_ is large. 124 double SampleSkewness() const { 125 if (ScalarAbs(m2_) < 1E-7) return 0.0; 126 return m3_ * std::sqrt(static_cast<double>(n_)) / std::pow(m2_, 1.5); 127 } 128 // Corrected for bias (same as Wikipedia and Minitab but not Excel). 129 double Skewness() const { 130 if (n_ == 0) return 0.0; 131 const double biased = SampleSkewness(); 132 const double r = (static_cast<double>(n_) - 1.0) / static_cast<double>(n_); 133 return biased * std::pow(r, 1.5); 134 } 135 // Near zero for normal distributions; smaller values indicate fewer/smaller 136 // outliers and larger indicates more/larger outliers. Assumes n_ is large. 137 double SampleKurtosis() const { 138 if (ScalarAbs(m2_) < 1E-7) return 0.0; 139 return m4_ * static_cast<double>(n_) / (m2_ * m2_); 140 } 141 // Corrected for bias (same as Wikipedia and Minitab but not Excel). 142 double Kurtosis() const { 143 if (n_ == 0) return 0.0; 144 const double biased = SampleKurtosis(); 145 const double r = (static_cast<double>(n_) - 1.0) / static_cast<double>(n_); 146 return biased * r * r; 147 } 148 149 // Central moments, useful for "method of moments"-based parameter estimation 150 // of a mixture of two Gaussians. Assumes Count() != 0. 151 double Mu1() const { return m1_; } 152 double Mu2() const { return m2_ / static_cast<int>(n_); } 153 double Mu3() const { return m3_ / static_cast<int>(n_); } 154 double Mu4() const { return m4_ / static_cast<int>(n_); } 155 156 // Which statistics to EXCLUDE in ToString 157 enum { 158 kNoCount = 1, 159 kNoMeanSD = 2, 160 kNoMinMax = 4, 161 kNoSkewKurt = 8, 162 kNoGeomean = 16 163 }; 164 std::string ToString(int exclude = 0) const; 165 166 void Reset() { 167 n_ = 0; 168 169 min_ = hwy::HighestValue<float>(); 170 max_ = hwy::LowestValue<float>(); 171 172 sum_log_ = 0.0; 173 174 m1_ = 0.0; 175 m2_ = 0.0; 176 m3_ = 0.0; 177 m4_ = 0.0; 178 } 179 180 private: 181 int64_t n_; // signed for faster conversion + safe subtraction 182 183 float min_; 184 float max_; 185 186 double sum_log_; // for geomean 187 188 // Moments 189 double m1_; 190 double m2_; 191 double m3_; 192 double m4_; 193 }; 194 195 } // namespace hwy 196 197 #endif // HIGHWAY_HWY_STATS_H_