zipf_distribution_test.cc (14336B)
1 // Copyright 2017 The Abseil Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include "absl/random/zipf_distribution.h" 16 17 #include <algorithm> 18 #include <cstddef> 19 #include <cstdint> 20 #include <iterator> 21 #include <random> 22 #include <string> 23 #include <utility> 24 #include <vector> 25 26 #include "gmock/gmock.h" 27 #include "gtest/gtest.h" 28 #include "absl/log/log.h" 29 #include "absl/random/internal/chi_square.h" 30 #include "absl/random/internal/pcg_engine.h" 31 #include "absl/random/internal/sequence_urbg.h" 32 #include "absl/random/random.h" 33 #include "absl/strings/str_cat.h" 34 #include "absl/strings/str_replace.h" 35 #include "absl/strings/strip.h" 36 37 namespace { 38 39 using ::absl::random_internal::kChiSquared; 40 using ::testing::ElementsAre; 41 42 template <typename IntType> 43 class ZipfDistributionTypedTest : public ::testing::Test {}; 44 45 using IntTypes = ::testing::Types<int, int8_t, int16_t, int32_t, int64_t, 46 uint8_t, uint16_t, uint32_t, uint64_t>; 47 TYPED_TEST_SUITE(ZipfDistributionTypedTest, IntTypes); 48 49 TYPED_TEST(ZipfDistributionTypedTest, SerializeTest) { 50 using param_type = typename absl::zipf_distribution<TypeParam>::param_type; 51 52 constexpr int kCount = 1000; 53 absl::InsecureBitGen gen; 54 for (const auto& param : { 55 param_type(), 56 param_type(32), 57 param_type(100, 3, 2), 58 param_type(std::numeric_limits<TypeParam>::max(), 4, 3), 59 param_type(std::numeric_limits<TypeParam>::max() / 2), 60 }) { 61 // Validate parameters. 62 const auto k = param.k(); 63 const auto q = param.q(); 64 const auto v = param.v(); 65 66 absl::zipf_distribution<TypeParam> before(k, q, v); 67 EXPECT_EQ(before.k(), param.k()); 68 EXPECT_EQ(before.q(), param.q()); 69 EXPECT_EQ(before.v(), param.v()); 70 71 { 72 absl::zipf_distribution<TypeParam> via_param(param); 73 EXPECT_EQ(via_param, before); 74 } 75 76 // Validate stream serialization. 77 std::stringstream ss; 78 ss << before; 79 absl::zipf_distribution<TypeParam> after(4, 5.5, 4.4); 80 81 EXPECT_NE(before.k(), after.k()); 82 EXPECT_NE(before.q(), after.q()); 83 EXPECT_NE(before.v(), after.v()); 84 EXPECT_NE(before.param(), after.param()); 85 EXPECT_NE(before, after); 86 87 ss >> after; 88 89 EXPECT_EQ(before.k(), after.k()); 90 EXPECT_EQ(before.q(), after.q()); 91 EXPECT_EQ(before.v(), after.v()); 92 EXPECT_EQ(before.param(), after.param()); 93 EXPECT_EQ(before, after); 94 95 // Smoke test. 96 auto sample_min = after.max(); 97 auto sample_max = after.min(); 98 for (int i = 0; i < kCount; i++) { 99 auto sample = after(gen); 100 EXPECT_GE(sample, after.min()); 101 EXPECT_LE(sample, after.max()); 102 if (sample > sample_max) sample_max = sample; 103 if (sample < sample_min) sample_min = sample; 104 } 105 LOG(INFO) << "Range: " << sample_min << ", " << sample_max; 106 } 107 } 108 109 class ZipfModel { 110 public: 111 ZipfModel(size_t k, double q, double v) : k_(k), q_(q), v_(v) {} 112 113 double mean() const { return mean_; } 114 115 // For the other moments of the Zipf distribution, see, for example, 116 // http://mathworld.wolfram.com/ZipfDistribution.html 117 118 // PMF(k) = (1 / k^s) / H(N,s) 119 // Returns the probability that any single invocation returns k. 120 double PMF(size_t i) { return i >= hnq_.size() ? 0.0 : hnq_[i] / sum_hnq_; } 121 122 // CDF = H(k, s) / H(N,s) 123 double CDF(size_t i) { 124 if (i >= hnq_.size()) { 125 return 1.0; 126 } 127 auto it = std::begin(hnq_); 128 double h = 0.0; 129 for (const auto end = it; it != end; it++) { 130 h += *it; 131 } 132 return h / sum_hnq_; 133 } 134 135 // The InverseCDF returns the k values which bound p on the upper and lower 136 // bound. Since there is no closed-form solution, this is implemented as a 137 // bisction of the cdf. 138 std::pair<size_t, size_t> InverseCDF(double p) { 139 size_t min = 0; 140 size_t max = hnq_.size(); 141 while (max > min + 1) { 142 size_t target = (max + min) >> 1; 143 double x = CDF(target); 144 if (x > p) { 145 max = target; 146 } else { 147 min = target; 148 } 149 } 150 return {min, max}; 151 } 152 153 // Compute the probability totals, which are based on the generalized harmonic 154 // number, H(N,s). 155 // H(N,s) == SUM(k=1..N, 1 / k^s) 156 // 157 // In the limit, H(N,s) == zetac(s) + 1. 158 // 159 // NOTE: The mean of a zipf distribution could be computed here as well. 160 // Mean := H(N, s-1) / H(N,s). 161 // Given the parameter v = 1, this gives the following function: 162 // (Hn(100, 1) - Hn(1,1)) / (Hn(100,2) - Hn(1,2)) = 6.5944 163 // 164 void Init() { 165 if (!hnq_.empty()) { 166 return; 167 } 168 hnq_.clear(); 169 hnq_.reserve(std::min(k_, size_t{1000})); 170 171 sum_hnq_ = 0; 172 double qm1 = q_ - 1.0; 173 double sum_hnq_m1 = 0; 174 for (size_t i = 0; i < k_; i++) { 175 // Partial n-th generalized harmonic number 176 const double x = v_ + i; 177 178 // H(n, q-1) 179 const double hnqm1 = (q_ == 2.0) ? (1.0 / x) 180 : (q_ == 3.0) ? (1.0 / (x * x)) 181 : std::pow(x, -qm1); 182 sum_hnq_m1 += hnqm1; 183 184 // H(n, q) 185 const double hnq = (q_ == 2.0) ? (1.0 / (x * x)) 186 : (q_ == 3.0) ? (1.0 / (x * x * x)) 187 : std::pow(x, -q_); 188 sum_hnq_ += hnq; 189 hnq_.push_back(hnq); 190 if (i > 1000 && hnq <= 1e-10) { 191 // The harmonic number is too small. 192 break; 193 } 194 } 195 assert(sum_hnq_ > 0); 196 mean_ = sum_hnq_m1 / sum_hnq_; 197 } 198 199 private: 200 const size_t k_; 201 const double q_; 202 const double v_; 203 204 double mean_; 205 std::vector<double> hnq_; 206 double sum_hnq_; 207 }; 208 209 using zipf_u64 = absl::zipf_distribution<uint64_t>; 210 211 class ZipfTest : public testing::TestWithParam<zipf_u64::param_type>, 212 public ZipfModel { 213 public: 214 ZipfTest() : ZipfModel(GetParam().k(), GetParam().q(), GetParam().v()) {} 215 216 // We use a fixed bit generator for distribution accuracy tests. This allows 217 // these tests to be deterministic, while still testing the qualify of the 218 // implementation. 219 absl::random_internal::pcg64_2018_engine rng_{0x2B7E151628AED2A6}; 220 }; 221 222 TEST_P(ZipfTest, ChiSquaredTest) { 223 const auto& param = GetParam(); 224 Init(); 225 226 size_t trials = 10000; 227 228 // Find the split-points for the buckets. 229 std::vector<size_t> points; 230 std::vector<double> expected; 231 { 232 double last_cdf = 0.0; 233 double min_p = 1.0; 234 for (double p = 0.01; p < 1.0; p += 0.01) { 235 auto x = InverseCDF(p); 236 if (points.empty() || points.back() < x.second) { 237 const double p = CDF(x.second); 238 points.push_back(x.second); 239 double q = p - last_cdf; 240 expected.push_back(q); 241 last_cdf = p; 242 if (q < min_p) { 243 min_p = q; 244 } 245 } 246 } 247 if (last_cdf < 0.999) { 248 points.push_back(std::numeric_limits<size_t>::max()); 249 double q = 1.0 - last_cdf; 250 expected.push_back(q); 251 if (q < min_p) { 252 min_p = q; 253 } 254 } else { 255 points.back() = std::numeric_limits<size_t>::max(); 256 expected.back() += (1.0 - last_cdf); 257 } 258 // The Chi-Squared score is not completely scale-invariant; it works best 259 // when the small values are in the small digits. 260 trials = static_cast<size_t>(8.0 / min_p); 261 } 262 ASSERT_GT(points.size(), 0); 263 264 // Generate n variates and fill the counts vector with the count of their 265 // occurrences. 266 std::vector<int64_t> buckets(points.size(), 0); 267 double avg = 0; 268 { 269 zipf_u64 dis(param); 270 for (size_t i = 0; i < trials; i++) { 271 uint64_t x = dis(rng_); 272 ASSERT_LE(x, dis.max()); 273 ASSERT_GE(x, dis.min()); 274 avg += static_cast<double>(x); 275 auto it = std::upper_bound(std::begin(points), std::end(points), 276 static_cast<size_t>(x)); 277 buckets[std::distance(std::begin(points), it)]++; 278 } 279 avg = avg / static_cast<double>(trials); 280 } 281 282 // Validate the output using the Chi-Squared test. 283 for (auto& e : expected) { 284 e *= trials; 285 } 286 287 // The null-hypothesis is that the distribution is a poisson distribution with 288 // the provided mean (not estimated from the data). 289 const int dof = static_cast<int>(expected.size()) - 1; 290 291 // NOTE: This test runs about 15x per invocation, so a value of 0.9995 is 292 // approximately correct for a test suite failure rate of 1 in 100. In 293 // practice we see failures slightly higher than that. 294 const double threshold = absl::random_internal::ChiSquareValue(dof, 0.9999); 295 296 const double chi_square = absl::random_internal::ChiSquare( 297 std::begin(buckets), std::end(buckets), std::begin(expected), 298 std::end(expected)); 299 300 const double p_actual = 301 absl::random_internal::ChiSquarePValue(chi_square, dof); 302 303 // Log if the chi_squared value is above the threshold. 304 if (chi_square > threshold) { 305 LOG(INFO) << "values"; 306 for (size_t i = 0; i < expected.size(); i++) { 307 LOG(INFO) << points[i] << ": " << buckets[i] << " vs. E=" << expected[i]; 308 } 309 LOG(INFO) << "trials " << trials; 310 LOG(INFO) << "mean " << avg << " vs. expected " << mean(); 311 LOG(INFO) << kChiSquared << "(data, " << dof << ") = " << chi_square << " (" 312 << p_actual << ")"; 313 LOG(INFO) << kChiSquared << " @ 0.9995 = " << threshold; 314 FAIL() << kChiSquared << " value of " << chi_square 315 << " is above the threshold."; 316 } 317 } 318 319 std::vector<zipf_u64::param_type> GenParams() { 320 using param = zipf_u64::param_type; 321 const auto k = param().k(); 322 const auto q = param().q(); 323 const auto v = param().v(); 324 const uint64_t k2 = 1 << 10; 325 return std::vector<zipf_u64::param_type>{ 326 // Default 327 param(k, q, v), 328 // vary K 329 param(4, q, v), param(1 << 4, q, v), param(k2, q, v), 330 // vary V 331 param(k2, q, 0.5), param(k2, q, 1.5), param(k2, q, 2.5), param(k2, q, 10), 332 // vary Q 333 param(k2, 1.5, v), param(k2, 3, v), param(k2, 5, v), param(k2, 10, v), 334 // Vary V & Q 335 param(k2, 1.5, 0.5), param(k2, 3, 1.5), param(k, 10, 10)}; 336 } 337 338 std::string ParamName( 339 const ::testing::TestParamInfo<zipf_u64::param_type>& info) { 340 const auto& p = info.param; 341 std::string name = absl::StrCat("k_", p.k(), "__q_", absl::SixDigits(p.q()), 342 "__v_", absl::SixDigits(p.v())); 343 return absl::StrReplaceAll(name, {{"+", "_"}, {"-", "_"}, {".", "_"}}); 344 } 345 346 INSTANTIATE_TEST_SUITE_P(All, ZipfTest, ::testing::ValuesIn(GenParams()), 347 ParamName); 348 349 // NOTE: absl::zipf_distribution is not guaranteed to be stable. 350 TEST(ZipfDistributionTest, StabilityTest) { 351 // absl::zipf_distribution stability relies on 352 // absl::uniform_real_distribution, std::log, std::exp, std::log1p 353 absl::random_internal::sequence_urbg urbg( 354 {0x0003eb76f6f7f755ull, 0xFFCEA50FDB2F953Bull, 0xC332DDEFBE6C5AA5ull, 355 0x6558218568AB9702ull, 0x2AEF7DAD5B6E2F84ull, 0x1521B62829076170ull, 356 0xECDD4775619F1510ull, 0x13CCA830EB61BD96ull, 0x0334FE1EAA0363CFull, 357 0xB5735C904C70A239ull, 0xD59E9E0BCBAADE14ull, 0xEECC86BC60622CA7ull}); 358 359 std::vector<int> output(10); 360 361 { 362 absl::zipf_distribution<int32_t> dist; 363 std::generate(std::begin(output), std::end(output), 364 [&] { return dist(urbg); }); 365 EXPECT_THAT(output, ElementsAre(10031, 0, 0, 3, 6, 0, 7, 47, 0, 0)); 366 } 367 urbg.reset(); 368 { 369 absl::zipf_distribution<int32_t> dist(std::numeric_limits<int32_t>::max(), 370 3.3); 371 std::generate(std::begin(output), std::end(output), 372 [&] { return dist(urbg); }); 373 EXPECT_THAT(output, ElementsAre(44, 0, 0, 0, 0, 1, 0, 1, 3, 0)); 374 } 375 } 376 377 TEST(ZipfDistributionTest, AlgorithmBounds) { 378 absl::zipf_distribution<int32_t> dist; 379 380 // Small values from absl::uniform_real_distribution map to larger Zipf 381 // distribution values. 382 const std::pair<uint64_t, int32_t> kInputs[] = { 383 {0xffffffffffffffff, 0x0}, {0x7fffffffffffffff, 0x0}, 384 {0x3ffffffffffffffb, 0x1}, {0x1ffffffffffffffd, 0x4}, 385 {0xffffffffffffffe, 0x9}, {0x7ffffffffffffff, 0x12}, 386 {0x3ffffffffffffff, 0x25}, {0x1ffffffffffffff, 0x4c}, 387 {0xffffffffffffff, 0x99}, {0x7fffffffffffff, 0x132}, 388 {0x3fffffffffffff, 0x265}, {0x1fffffffffffff, 0x4cc}, 389 {0xfffffffffffff, 0x999}, {0x7ffffffffffff, 0x1332}, 390 {0x3ffffffffffff, 0x2665}, {0x1ffffffffffff, 0x4ccc}, 391 {0xffffffffffff, 0x9998}, {0x7fffffffffff, 0x1332f}, 392 {0x3fffffffffff, 0x2665a}, {0x1fffffffffff, 0x4cc9e}, 393 {0xfffffffffff, 0x998e0}, {0x7ffffffffff, 0x133051}, 394 {0x3ffffffffff, 0x265ae4}, {0x1ffffffffff, 0x4c9ed3}, 395 {0xffffffffff, 0x98e223}, {0x7fffffffff, 0x13058c4}, 396 {0x3fffffffff, 0x25b178e}, {0x1fffffffff, 0x4a062b2}, 397 {0xfffffffff, 0x8ee23b8}, {0x7ffffffff, 0x10b21642}, 398 {0x3ffffffff, 0x1d89d89d}, {0x1ffffffff, 0x2fffffff}, 399 {0xffffffff, 0x45d1745d}, {0x7fffffff, 0x5a5a5a5a}, 400 {0x3fffffff, 0x69ee5846}, {0x1fffffff, 0x73ecade3}, 401 {0xfffffff, 0x79a9d260}, {0x7ffffff, 0x7cc0532b}, 402 {0x3ffffff, 0x7e5ad146}, {0x1ffffff, 0x7f2c0bec}, 403 {0xffffff, 0x7f95adef}, {0x7fffff, 0x7fcac0da}, 404 {0x3fffff, 0x7fe55ae2}, {0x1fffff, 0x7ff2ac0e}, 405 {0xfffff, 0x7ff955ae}, {0x7ffff, 0x7ffcaac1}, 406 {0x3ffff, 0x7ffe555b}, {0x1ffff, 0x7fff2aac}, 407 {0xffff, 0x7fff9556}, {0x7fff, 0x7fffcaab}, 408 {0x3fff, 0x7fffe555}, {0x1fff, 0x7ffff2ab}, 409 {0xfff, 0x7ffff955}, {0x7ff, 0x7ffffcab}, 410 {0x3ff, 0x7ffffe55}, {0x1ff, 0x7fffff2b}, 411 {0xff, 0x7fffff95}, {0x7f, 0x7fffffcb}, 412 {0x3f, 0x7fffffe5}, {0x1f, 0x7ffffff3}, 413 {0xf, 0x7ffffff9}, {0x7, 0x7ffffffd}, 414 {0x3, 0x7ffffffe}, {0x1, 0x7fffffff}, 415 }; 416 417 for (const auto& instance : kInputs) { 418 absl::random_internal::sequence_urbg urbg({instance.first}); 419 EXPECT_EQ(instance.second, dist(urbg)); 420 } 421 } 422 423 } // namespace