poisson_distribution_test.cc (20469B)
1 // Copyright 2017 The Abseil Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include "absl/random/poisson_distribution.h" 16 17 #include <algorithm> 18 #include <cstddef> 19 #include <cstdint> 20 #include <iterator> 21 #include <random> 22 #include <sstream> 23 #include <string> 24 #include <vector> 25 26 #include "gmock/gmock.h" 27 #include "gtest/gtest.h" 28 #include "absl/base/macros.h" 29 #include "absl/container/flat_hash_map.h" 30 #include "absl/log/log.h" 31 #include "absl/random/internal/chi_square.h" 32 #include "absl/random/internal/distribution_test_util.h" 33 #include "absl/random/internal/pcg_engine.h" 34 #include "absl/random/internal/sequence_urbg.h" 35 #include "absl/random/random.h" 36 #include "absl/strings/str_cat.h" 37 #include "absl/strings/str_format.h" 38 #include "absl/strings/str_replace.h" 39 #include "absl/strings/strip.h" 40 41 // Notes about generating poisson variates: 42 // 43 // It is unlikely that any implementation of std::poisson_distribution 44 // will be stable over time and across library implementations. For instance 45 // the three different poisson variate generators listed below all differ: 46 // 47 // https://github.com/ampl/gsl/tree/master/randist/poisson.c 48 // * GSL uses a gamma + binomial + knuth method to compute poisson variates. 49 // 50 // https://github.com/gcc-mirror/gcc/blob/master/libstdc%2B%2B-v3/include/bits/random.tcc 51 // * GCC uses the Devroye rejection algorithm, based on 52 // Devroye, L. Non-Uniform Random Variates Generation. Springer-Verlag, 53 // New York, 1986, Ch. X, Sects. 3.3 & 3.4 (+ Errata!), ~p.511 54 // http://www.nrbook.com/devroye/ 55 // 56 // https://github.com/llvm-mirror/libcxx/blob/master/include/random 57 // * CLANG uses a different rejection method, which appears to include a 58 // normal-distribution approximation and an exponential distribution to 59 // compute the threshold, including a similar factorial approximation to this 60 // one, but it is unclear where the algorithm comes from, exactly. 61 // 62 63 namespace { 64 65 using absl::random_internal::kChiSquared; 66 67 // The PoissonDistributionInterfaceTest provides a basic test that 68 // absl::poisson_distribution conforms to the interface and serialization 69 // requirements imposed by [rand.req.dist] for the common integer types. 70 71 template <typename IntType> 72 class PoissonDistributionInterfaceTest : public ::testing::Test {}; 73 74 using IntTypes = ::testing::Types<int, int8_t, int16_t, int32_t, int64_t, 75 uint8_t, uint16_t, uint32_t, uint64_t>; 76 TYPED_TEST_SUITE(PoissonDistributionInterfaceTest, IntTypes); 77 78 TYPED_TEST(PoissonDistributionInterfaceTest, SerializeTest) { 79 using param_type = typename absl::poisson_distribution<TypeParam>::param_type; 80 const double kMax = 81 std::min(1e10 /* assertion limit */, 82 static_cast<double>(std::numeric_limits<TypeParam>::max())); 83 84 const double kParams[] = { 85 // Cases around 1. 86 1, // 87 std::nextafter(1.0, 0.0), // 1 - epsilon 88 std::nextafter(1.0, 2.0), // 1 + epsilon 89 // Arbitrary values. 90 1e-8, 1e-4, 91 0.0000005, // ~7.2e-7 92 0.2, // ~0.2x 93 0.5, // 0.72 94 2, // ~2.8 95 20, // 3x ~9.6 96 100, 1e4, 1e8, 1.5e9, 1e20, 97 // Boundary cases. 98 std::numeric_limits<double>::max(), 99 std::numeric_limits<double>::epsilon(), 100 std::nextafter(std::numeric_limits<double>::min(), 101 1.0), // min + epsilon 102 std::numeric_limits<double>::min(), // smallest normal 103 std::numeric_limits<double>::denorm_min(), // smallest denorm 104 std::numeric_limits<double>::min() / 2, // denorm 105 std::nextafter(std::numeric_limits<double>::min(), 106 0.0), // denorm_max 107 }; 108 109 constexpr int kCount = 1000; 110 absl::InsecureBitGen gen; 111 for (const double m : kParams) { 112 const double mean = std::min(kMax, m); 113 const param_type param(mean); 114 115 // Validate parameters. 116 absl::poisson_distribution<TypeParam> before(mean); 117 EXPECT_EQ(before.mean(), param.mean()); 118 119 { 120 absl::poisson_distribution<TypeParam> via_param(param); 121 EXPECT_EQ(via_param, before); 122 EXPECT_EQ(via_param.param(), before.param()); 123 } 124 125 // Smoke test. 126 auto sample_min = before.max(); 127 auto sample_max = before.min(); 128 for (int i = 0; i < kCount; i++) { 129 auto sample = before(gen); 130 EXPECT_GE(sample, before.min()); 131 EXPECT_LE(sample, before.max()); 132 if (sample > sample_max) sample_max = sample; 133 if (sample < sample_min) sample_min = sample; 134 } 135 136 LOG(INFO) << "Range {" << param.mean() << "}: " << sample_min << ", " 137 << sample_max; 138 139 // Validate stream serialization. 140 std::stringstream ss; 141 ss << before; 142 143 absl::poisson_distribution<TypeParam> after(3.8); 144 145 EXPECT_NE(before.mean(), after.mean()); 146 EXPECT_NE(before.param(), after.param()); 147 EXPECT_NE(before, after); 148 149 ss >> after; 150 151 EXPECT_EQ(before.mean(), after.mean()) // 152 << ss.str() << " " // 153 << (ss.good() ? "good " : "") // 154 << (ss.bad() ? "bad " : "") // 155 << (ss.eof() ? "eof " : "") // 156 << (ss.fail() ? "fail " : ""); 157 } 158 } 159 160 // See http://www.itl.nist.gov/div898/handbook/eda/section3/eda366j.htm 161 162 class PoissonModel { 163 public: 164 explicit PoissonModel(double mean) : mean_(mean) {} 165 166 double mean() const { return mean_; } 167 double variance() const { return mean_; } 168 double stddev() const { return std::sqrt(variance()); } 169 double skew() const { return 1.0 / mean_; } 170 double kurtosis() const { return 3.0 + 1.0 / mean_; } 171 172 // InitCDF() initializes the CDF for the distribution parameters. 173 void InitCDF(); 174 175 // The InverseCDF, or the Percent-point function returns x, P(x) < v. 176 struct CDF { 177 size_t index; 178 double pmf; 179 double cdf; 180 }; 181 CDF InverseCDF(double p) { 182 CDF target{0, 0, p}; 183 auto it = std::upper_bound( 184 std::begin(cdf_), std::end(cdf_), target, 185 [](const CDF& a, const CDF& b) { return a.cdf < b.cdf; }); 186 return *it; 187 } 188 189 void LogCDF() { 190 LOG(INFO) << "CDF (mean = " << mean_ << ")"; 191 for (const auto c : cdf_) { 192 LOG(INFO) << c.index << ": pmf=" << c.pmf << " cdf=" << c.cdf; 193 } 194 } 195 196 private: 197 const double mean_; 198 199 std::vector<CDF> cdf_; 200 }; 201 202 // The goal is to compute an InverseCDF function, or percent point function for 203 // the poisson distribution, and use that to partition our output into equal 204 // range buckets. However there is no closed form solution for the inverse cdf 205 // for poisson distributions (the closest is the incomplete gamma function). 206 // Instead, `InitCDF` iteratively computes the PMF and the CDF. This enables 207 // searching for the bucket points. 208 void PoissonModel::InitCDF() { 209 if (!cdf_.empty()) { 210 // State already initialized. 211 return; 212 } 213 ABSL_ASSERT(mean_ < 201.0); 214 215 const size_t max_i = 50 * stddev() + mean(); 216 const double e_neg_mean = std::exp(-mean()); 217 ABSL_ASSERT(e_neg_mean > 0); 218 219 double d = 1; 220 double last_result = e_neg_mean; 221 double cumulative = e_neg_mean; 222 if (e_neg_mean > 1e-10) { 223 cdf_.push_back({0, e_neg_mean, cumulative}); 224 } 225 for (size_t i = 1; i < max_i; i++) { 226 d *= (mean() / i); 227 double result = e_neg_mean * d; 228 cumulative += result; 229 if (result < 1e-10 && result < last_result && cumulative > 0.999999) { 230 break; 231 } 232 if (result > 1e-7) { 233 cdf_.push_back({i, result, cumulative}); 234 } 235 last_result = result; 236 } 237 ABSL_ASSERT(!cdf_.empty()); 238 } 239 240 // PoissonDistributionZTest implements a z-test for the poisson distribution. 241 242 struct ZParam { 243 double mean; 244 double p_fail; // Z-Test probability of failure. 245 int trials; // Z-Test trials. 246 size_t samples; // Z-Test samples. 247 }; 248 249 class PoissonDistributionZTest : public testing::TestWithParam<ZParam>, 250 public PoissonModel { 251 public: 252 PoissonDistributionZTest() : PoissonModel(GetParam().mean) {} 253 254 // ZTestImpl provides a basic z-squared test of the mean vs. expected 255 // mean for data generated by the poisson distribution. 256 template <typename D> 257 bool SingleZTest(const double p, const size_t samples); 258 259 // We use a fixed bit generator for distribution accuracy tests. This allows 260 // these tests to be deterministic, while still testing the qualify of the 261 // implementation. 262 absl::random_internal::pcg64_2018_engine rng_{0x2B7E151628AED2A6}; 263 }; 264 265 template <typename D> 266 bool PoissonDistributionZTest::SingleZTest(const double p, 267 const size_t samples) { 268 D dis(mean()); 269 270 absl::flat_hash_map<int32_t, int> buckets; 271 std::vector<double> data; 272 data.reserve(samples); 273 for (int j = 0; j < samples; j++) { 274 const auto x = dis(rng_); 275 buckets[x]++; 276 data.push_back(x); 277 } 278 279 // The null-hypothesis is that the distribution is a poisson distribution with 280 // the provided mean (not estimated from the data). 281 const auto m = absl::random_internal::ComputeDistributionMoments(data); 282 const double max_err = absl::random_internal::MaxErrorTolerance(p); 283 const double z = absl::random_internal::ZScore(mean(), m); 284 const bool pass = absl::random_internal::Near("z", z, 0.0, max_err); 285 286 if (!pass) { 287 // clang-format off 288 LOG(INFO) 289 << "p=" << p << " max_err=" << max_err << "\n" 290 " mean=" << m.mean << " vs. " << mean() << "\n" 291 " stddev=" << std::sqrt(m.variance) << " vs. " << stddev() << "\n" 292 " skewness=" << m.skewness << " vs. " << skew() << "\n" 293 " kurtosis=" << m.kurtosis << " vs. " << kurtosis() << "\n" 294 " z=" << z; 295 // clang-format on 296 } 297 return pass; 298 } 299 300 TEST_P(PoissonDistributionZTest, AbslPoissonDistribution) { 301 const auto& param = GetParam(); 302 const int expected_failures = 303 std::max(1, static_cast<int>(std::ceil(param.trials * param.p_fail))); 304 const double p = absl::random_internal::RequiredSuccessProbability( 305 param.p_fail, param.trials); 306 307 int failures = 0; 308 for (int i = 0; i < param.trials; i++) { 309 failures += 310 SingleZTest<absl::poisson_distribution<int32_t>>(p, param.samples) ? 0 311 : 1; 312 } 313 EXPECT_LE(failures, expected_failures); 314 } 315 316 std::vector<ZParam> GetZParams() { 317 // These values have been adjusted from the "exact" computed values to reduce 318 // failure rates. 319 // 320 // It turns out that the actual values are not as close to the expected values 321 // as would be ideal. 322 return std::vector<ZParam>({ 323 // Knuth method. 324 ZParam{0.5, 0.01, 100, 1000}, 325 ZParam{1.0, 0.01, 100, 1000}, 326 ZParam{10.0, 0.01, 100, 5000}, 327 // Split-knuth method. 328 ZParam{20.0, 0.01, 100, 10000}, 329 ZParam{50.0, 0.01, 100, 10000}, 330 // Ratio of gaussians method. 331 ZParam{51.0, 0.01, 100, 10000}, 332 ZParam{200.0, 0.05, 10, 100000}, 333 ZParam{100000.0, 0.05, 10, 1000000}, 334 }); 335 } 336 337 std::string ZParamName(const ::testing::TestParamInfo<ZParam>& info) { 338 const auto& p = info.param; 339 std::string name = absl::StrCat("mean_", absl::SixDigits(p.mean)); 340 return absl::StrReplaceAll(name, {{"+", "_"}, {"-", "_"}, {".", "_"}}); 341 } 342 343 INSTANTIATE_TEST_SUITE_P(All, PoissonDistributionZTest, 344 ::testing::ValuesIn(GetZParams()), ZParamName); 345 346 // The PoissonDistributionChiSquaredTest class provides a basic test framework 347 // for variates generated by a conforming poisson_distribution. 348 class PoissonDistributionChiSquaredTest : public testing::TestWithParam<double>, 349 public PoissonModel { 350 public: 351 PoissonDistributionChiSquaredTest() : PoissonModel(GetParam()) {} 352 353 // The ChiSquaredTestImpl provides a chi-squared goodness of fit test for data 354 // generated by the poisson distribution. 355 template <typename D> 356 double ChiSquaredTestImpl(); 357 358 private: 359 void InitChiSquaredTest(const double buckets); 360 361 std::vector<size_t> cutoffs_; 362 std::vector<double> expected_; 363 364 // We use a fixed bit generator for distribution accuracy tests. This allows 365 // these tests to be deterministic, while still testing the qualify of the 366 // implementation. 367 absl::random_internal::pcg64_2018_engine rng_{0x2B7E151628AED2A6}; 368 }; 369 370 void PoissonDistributionChiSquaredTest::InitChiSquaredTest( 371 const double buckets) { 372 if (!cutoffs_.empty() && !expected_.empty()) { 373 return; 374 } 375 InitCDF(); 376 377 // The code below finds cuttoffs that yield approximately equally-sized 378 // buckets to the extent that it is possible. However for poisson 379 // distributions this is particularly challenging for small mean parameters. 380 // Track the expected proportion of items in each bucket. 381 double last_cdf = 0; 382 const double inc = 1.0 / buckets; 383 for (double p = inc; p <= 1.0; p += inc) { 384 auto result = InverseCDF(p); 385 if (!cutoffs_.empty() && cutoffs_.back() == result.index) { 386 continue; 387 } 388 double d = result.cdf - last_cdf; 389 cutoffs_.push_back(result.index); 390 expected_.push_back(d); 391 last_cdf = result.cdf; 392 } 393 cutoffs_.push_back(std::numeric_limits<size_t>::max()); 394 expected_.push_back(std::max(0.0, 1.0 - last_cdf)); 395 } 396 397 template <typename D> 398 double PoissonDistributionChiSquaredTest::ChiSquaredTestImpl() { 399 const int kSamples = 2000; 400 const int kBuckets = 50; 401 402 // The poisson CDF fails for large mean values, since e^-mean exceeds the 403 // machine precision. For these cases, using a normal approximation would be 404 // appropriate. 405 ABSL_ASSERT(mean() <= 200); 406 InitChiSquaredTest(kBuckets); 407 408 D dis(mean()); 409 410 std::vector<int32_t> counts(cutoffs_.size(), 0); 411 for (int j = 0; j < kSamples; j++) { 412 const size_t x = dis(rng_); 413 auto it = std::lower_bound(std::begin(cutoffs_), std::end(cutoffs_), x); 414 counts[std::distance(cutoffs_.begin(), it)]++; 415 } 416 417 // Normalize the counts. 418 std::vector<int32_t> e(expected_.size(), 0); 419 for (int i = 0; i < e.size(); i++) { 420 e[i] = kSamples * expected_[i]; 421 } 422 423 // The null-hypothesis is that the distribution is a poisson distribution with 424 // the provided mean (not estimated from the data). 425 const int dof = static_cast<int>(counts.size()) - 1; 426 427 // The threshold for logging is 1-in-50. 428 const double threshold = absl::random_internal::ChiSquareValue(dof, 0.98); 429 430 const double chi_square = absl::random_internal::ChiSquare( 431 std::begin(counts), std::end(counts), std::begin(e), std::end(e)); 432 433 const double p = absl::random_internal::ChiSquarePValue(chi_square, dof); 434 435 // Log if the chi_squared value is above the threshold. 436 if (chi_square > threshold) { 437 LogCDF(); 438 439 LOG(INFO) << "VALUES buckets=" << counts.size() 440 << " samples=" << kSamples; 441 for (size_t i = 0; i < counts.size(); i++) { 442 LOG(INFO) << cutoffs_[i] << ": " << counts[i] << " vs. E=" << e[i]; 443 } 444 445 LOG(INFO) << kChiSquared << "(data, dof=" << dof << ") = " << chi_square 446 << " (" << p << ")\n" 447 << " vs.\n" 448 << kChiSquared << " @ 0.98 = " << threshold; 449 } 450 return p; 451 } 452 453 TEST_P(PoissonDistributionChiSquaredTest, AbslPoissonDistribution) { 454 const int kTrials = 20; 455 456 // Large values are not yet supported -- this requires estimating the cdf 457 // using the normal distribution instead of the poisson in this case. 458 ASSERT_LE(mean(), 200.0); 459 if (mean() > 200.0) { 460 return; 461 } 462 463 int failures = 0; 464 for (int i = 0; i < kTrials; i++) { 465 double p_value = ChiSquaredTestImpl<absl::poisson_distribution<int32_t>>(); 466 if (p_value < 0.005) { 467 failures++; 468 } 469 } 470 // There is a 0.10% chance of producing at least one failure, so raise the 471 // failure threshold high enough to allow for a flake rate < 10,000. 472 EXPECT_LE(failures, 4); 473 } 474 475 INSTANTIATE_TEST_SUITE_P(All, PoissonDistributionChiSquaredTest, 476 ::testing::Values(0.5, 1.0, 2.0, 10.0, 50.0, 51.0, 477 200.0)); 478 479 // NOTE: absl::poisson_distribution is not guaranteed to be stable. 480 TEST(PoissonDistributionTest, StabilityTest) { 481 using testing::ElementsAre; 482 // absl::poisson_distribution stability relies on stability of 483 // std::exp, std::log, std::sqrt, std::ceil, std::floor, and 484 // absl::FastUniformBits, absl::StirlingLogFactorial, absl::RandU64ToDouble. 485 absl::random_internal::sequence_urbg urbg({ 486 0x035b0dc7e0a18acfull, 0x06cebe0d2653682eull, 0x0061e9b23861596bull, 487 0x0003eb76f6f7f755ull, 0xFFCEA50FDB2F953Bull, 0xC332DDEFBE6C5AA5ull, 488 0x6558218568AB9702ull, 0x2AEF7DAD5B6E2F84ull, 0x1521B62829076170ull, 489 0xECDD4775619F1510ull, 0x13CCA830EB61BD96ull, 0x0334FE1EAA0363CFull, 490 0xB5735C904C70A239ull, 0xD59E9E0BCBAADE14ull, 0xEECC86BC60622CA7ull, 491 0x4864f22c059bf29eull, 0x247856d8b862665cull, 0xe46e86e9a1337e10ull, 492 0xd8c8541f3519b133ull, 0xe75b5162c567b9e4ull, 0xf732e5ded7009c5bull, 493 0xb170b98353121eacull, 0x1ec2e8986d2362caull, 0x814c8e35fe9a961aull, 494 0x0c3cd59c9b638a02ull, 0xcb3bb6478a07715cull, 0x1224e62c978bbc7full, 495 0x671ef2cb04e81f6eull, 0x3c1cbd811eaf1808ull, 0x1bbc23cfa8fac721ull, 496 0xa4c2cda65e596a51ull, 0xb77216fad37adf91ull, 0x836d794457c08849ull, 497 0xe083df03475f49d7ull, 0xbc9feb512e6b0d6cull, 0xb12d74fdd718c8c5ull, 498 0x12ff09653bfbe4caull, 0x8dd03a105bc4ee7eull, 0x5738341045ba0d85ull, 499 0xf3fd722dc65ad09eull, 0xfa14fd21ea2a5705ull, 0xffe6ea4d6edb0c73ull, 500 0xD07E9EFE2BF11FB4ull, 0x95DBDA4DAE909198ull, 0xEAAD8E716B93D5A0ull, 501 0xD08ED1D0AFC725E0ull, 0x8E3C5B2F8E7594B7ull, 0x8FF6E2FBF2122B64ull, 502 0x8888B812900DF01Cull, 0x4FAD5EA0688FC31Cull, 0xD1CFF191B3A8C1ADull, 503 0x2F2F2218BE0E1777ull, 0xEA752DFE8B021FA1ull, 0xE5A0CC0FB56F74E8ull, 504 0x18ACF3D6CE89E299ull, 0xB4A84FE0FD13E0B7ull, 0x7CC43B81D2ADA8D9ull, 505 0x165FA26680957705ull, 0x93CC7314211A1477ull, 0xE6AD206577B5FA86ull, 506 0xC75442F5FB9D35CFull, 0xEBCDAF0C7B3E89A0ull, 0xD6411BD3AE1E7E49ull, 507 0x00250E2D2071B35Eull, 0x226800BB57B8E0AFull, 0x2464369BF009B91Eull, 508 0x5563911D59DFA6AAull, 0x78C14389D95A537Full, 0x207D5BA202E5B9C5ull, 509 0x832603766295CFA9ull, 0x11C819684E734A41ull, 0xB3472DCA7B14A94Aull, 510 }); 511 512 std::vector<int> output(10); 513 514 // Method 1. 515 { 516 absl::poisson_distribution<int> dist(5); 517 std::generate(std::begin(output), std::end(output), 518 [&] { return dist(urbg); }); 519 } 520 EXPECT_THAT(output, // mean = 4.2 521 ElementsAre(1, 0, 0, 4, 2, 10, 3, 3, 7, 12)); 522 523 // Method 2. 524 { 525 urbg.reset(); 526 absl::poisson_distribution<int> dist(25); 527 std::generate(std::begin(output), std::end(output), 528 [&] { return dist(urbg); }); 529 } 530 EXPECT_THAT(output, // mean = 19.8 531 ElementsAre(9, 35, 18, 10, 35, 18, 10, 35, 18, 10)); 532 533 // Method 3. 534 { 535 urbg.reset(); 536 absl::poisson_distribution<int> dist(121); 537 std::generate(std::begin(output), std::end(output), 538 [&] { return dist(urbg); }); 539 } 540 EXPECT_THAT(output, // mean = 124.1 541 ElementsAre(161, 122, 129, 124, 112, 112, 117, 120, 130, 114)); 542 } 543 544 TEST(PoissonDistributionTest, AlgorithmExpectedValue_1) { 545 // This tests small values of the Knuth method. 546 // The underlying uniform distribution will generate exactly 0.5. 547 absl::random_internal::sequence_urbg urbg({0x8000000000000001ull}); 548 absl::poisson_distribution<int> dist(5); 549 EXPECT_EQ(7, dist(urbg)); 550 } 551 552 TEST(PoissonDistributionTest, AlgorithmExpectedValue_2) { 553 // This tests larger values of the Knuth method. 554 // The underlying uniform distribution will generate exactly 0.5. 555 absl::random_internal::sequence_urbg urbg({0x8000000000000001ull}); 556 absl::poisson_distribution<int> dist(25); 557 EXPECT_EQ(36, dist(urbg)); 558 } 559 560 TEST(PoissonDistributionTest, AlgorithmExpectedValue_3) { 561 // This variant uses the ratio of uniforms method. 562 absl::random_internal::sequence_urbg urbg( 563 {0x7fffffffffffffffull, 0x8000000000000000ull}); 564 565 absl::poisson_distribution<int> dist(121); 566 EXPECT_EQ(121, dist(urbg)); 567 } 568 569 } // namespace