tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

zipf_distribution_test.cc (14336B)


      1 // Copyright 2017 The Abseil Authors.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //      https://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include "absl/random/zipf_distribution.h"
     16 
     17 #include <algorithm>
     18 #include <cstddef>
     19 #include <cstdint>
     20 #include <iterator>
     21 #include <random>
     22 #include <string>
     23 #include <utility>
     24 #include <vector>
     25 
     26 #include "gmock/gmock.h"
     27 #include "gtest/gtest.h"
     28 #include "absl/log/log.h"
     29 #include "absl/random/internal/chi_square.h"
     30 #include "absl/random/internal/pcg_engine.h"
     31 #include "absl/random/internal/sequence_urbg.h"
     32 #include "absl/random/random.h"
     33 #include "absl/strings/str_cat.h"
     34 #include "absl/strings/str_replace.h"
     35 #include "absl/strings/strip.h"
     36 
     37 namespace {
     38 
     39 using ::absl::random_internal::kChiSquared;
     40 using ::testing::ElementsAre;
     41 
     42 template <typename IntType>
     43 class ZipfDistributionTypedTest : public ::testing::Test {};
     44 
     45 using IntTypes = ::testing::Types<int, int8_t, int16_t, int32_t, int64_t,
     46                                  uint8_t, uint16_t, uint32_t, uint64_t>;
     47 TYPED_TEST_SUITE(ZipfDistributionTypedTest, IntTypes);
     48 
     49 TYPED_TEST(ZipfDistributionTypedTest, SerializeTest) {
     50  using param_type = typename absl::zipf_distribution<TypeParam>::param_type;
     51 
     52  constexpr int kCount = 1000;
     53  absl::InsecureBitGen gen;
     54  for (const auto& param : {
     55           param_type(),
     56           param_type(32),
     57           param_type(100, 3, 2),
     58           param_type(std::numeric_limits<TypeParam>::max(), 4, 3),
     59           param_type(std::numeric_limits<TypeParam>::max() / 2),
     60       }) {
     61    // Validate parameters.
     62    const auto k = param.k();
     63    const auto q = param.q();
     64    const auto v = param.v();
     65 
     66    absl::zipf_distribution<TypeParam> before(k, q, v);
     67    EXPECT_EQ(before.k(), param.k());
     68    EXPECT_EQ(before.q(), param.q());
     69    EXPECT_EQ(before.v(), param.v());
     70 
     71    {
     72      absl::zipf_distribution<TypeParam> via_param(param);
     73      EXPECT_EQ(via_param, before);
     74    }
     75 
     76    // Validate stream serialization.
     77    std::stringstream ss;
     78    ss << before;
     79    absl::zipf_distribution<TypeParam> after(4, 5.5, 4.4);
     80 
     81    EXPECT_NE(before.k(), after.k());
     82    EXPECT_NE(before.q(), after.q());
     83    EXPECT_NE(before.v(), after.v());
     84    EXPECT_NE(before.param(), after.param());
     85    EXPECT_NE(before, after);
     86 
     87    ss >> after;
     88 
     89    EXPECT_EQ(before.k(), after.k());
     90    EXPECT_EQ(before.q(), after.q());
     91    EXPECT_EQ(before.v(), after.v());
     92    EXPECT_EQ(before.param(), after.param());
     93    EXPECT_EQ(before, after);
     94 
     95    // Smoke test.
     96    auto sample_min = after.max();
     97    auto sample_max = after.min();
     98    for (int i = 0; i < kCount; i++) {
     99      auto sample = after(gen);
    100      EXPECT_GE(sample, after.min());
    101      EXPECT_LE(sample, after.max());
    102      if (sample > sample_max) sample_max = sample;
    103      if (sample < sample_min) sample_min = sample;
    104    }
    105    LOG(INFO) << "Range: " << sample_min << ", " << sample_max;
    106  }
    107 }
    108 
    109 class ZipfModel {
    110 public:
    111  ZipfModel(size_t k, double q, double v) : k_(k), q_(q), v_(v) {}
    112 
    113  double mean() const { return mean_; }
    114 
    115  // For the other moments of the Zipf distribution, see, for example,
    116  // http://mathworld.wolfram.com/ZipfDistribution.html
    117 
    118  // PMF(k) = (1 / k^s) / H(N,s)
    119  // Returns the probability that any single invocation returns k.
    120  double PMF(size_t i) { return i >= hnq_.size() ? 0.0 : hnq_[i] / sum_hnq_; }
    121 
    122  // CDF = H(k, s) / H(N,s)
    123  double CDF(size_t i) {
    124    if (i >= hnq_.size()) {
    125      return 1.0;
    126    }
    127    auto it = std::begin(hnq_);
    128    double h = 0.0;
    129    for (const auto end = it; it != end; it++) {
    130      h += *it;
    131    }
    132    return h / sum_hnq_;
    133  }
    134 
    135  // The InverseCDF returns the k values which bound p on the upper and lower
    136  // bound. Since there is no closed-form solution, this is implemented as a
    137  // bisction of the cdf.
    138  std::pair<size_t, size_t> InverseCDF(double p) {
    139    size_t min = 0;
    140    size_t max = hnq_.size();
    141    while (max > min + 1) {
    142      size_t target = (max + min) >> 1;
    143      double x = CDF(target);
    144      if (x > p) {
    145        max = target;
    146      } else {
    147        min = target;
    148      }
    149    }
    150    return {min, max};
    151  }
    152 
    153  // Compute the probability totals, which are based on the generalized harmonic
    154  // number, H(N,s).
    155  //   H(N,s) == SUM(k=1..N, 1 / k^s)
    156  //
    157  // In the limit, H(N,s) == zetac(s) + 1.
    158  //
    159  // NOTE: The mean of a zipf distribution could be computed here as well.
    160  // Mean :=  H(N, s-1) / H(N,s).
    161  // Given the parameter v = 1, this gives the following function:
    162  // (Hn(100, 1) - Hn(1,1)) / (Hn(100,2) - Hn(1,2)) = 6.5944
    163  //
    164  void Init() {
    165    if (!hnq_.empty()) {
    166      return;
    167    }
    168    hnq_.clear();
    169    hnq_.reserve(std::min(k_, size_t{1000}));
    170 
    171    sum_hnq_ = 0;
    172    double qm1 = q_ - 1.0;
    173    double sum_hnq_m1 = 0;
    174    for (size_t i = 0; i < k_; i++) {
    175      // Partial n-th generalized harmonic number
    176      const double x = v_ + i;
    177 
    178      // H(n, q-1)
    179      const double hnqm1 = (q_ == 2.0)   ? (1.0 / x)
    180                           : (q_ == 3.0) ? (1.0 / (x * x))
    181                                         : std::pow(x, -qm1);
    182      sum_hnq_m1 += hnqm1;
    183 
    184      // H(n, q)
    185      const double hnq = (q_ == 2.0)   ? (1.0 / (x * x))
    186                         : (q_ == 3.0) ? (1.0 / (x * x * x))
    187                                       : std::pow(x, -q_);
    188      sum_hnq_ += hnq;
    189      hnq_.push_back(hnq);
    190      if (i > 1000 && hnq <= 1e-10) {
    191        // The harmonic number is too small.
    192        break;
    193      }
    194    }
    195    assert(sum_hnq_ > 0);
    196    mean_ = sum_hnq_m1 / sum_hnq_;
    197  }
    198 
    199 private:
    200  const size_t k_;
    201  const double q_;
    202  const double v_;
    203 
    204  double mean_;
    205  std::vector<double> hnq_;
    206  double sum_hnq_;
    207 };
    208 
    209 using zipf_u64 = absl::zipf_distribution<uint64_t>;
    210 
    211 class ZipfTest : public testing::TestWithParam<zipf_u64::param_type>,
    212                 public ZipfModel {
    213 public:
    214  ZipfTest() : ZipfModel(GetParam().k(), GetParam().q(), GetParam().v()) {}
    215 
    216  // We use a fixed bit generator for distribution accuracy tests.  This allows
    217  // these tests to be deterministic, while still testing the qualify of the
    218  // implementation.
    219  absl::random_internal::pcg64_2018_engine rng_{0x2B7E151628AED2A6};
    220 };
    221 
    222 TEST_P(ZipfTest, ChiSquaredTest) {
    223  const auto& param = GetParam();
    224  Init();
    225 
    226  size_t trials = 10000;
    227 
    228  // Find the split-points for the buckets.
    229  std::vector<size_t> points;
    230  std::vector<double> expected;
    231  {
    232    double last_cdf = 0.0;
    233    double min_p = 1.0;
    234    for (double p = 0.01; p < 1.0; p += 0.01) {
    235      auto x = InverseCDF(p);
    236      if (points.empty() || points.back() < x.second) {
    237        const double p = CDF(x.second);
    238        points.push_back(x.second);
    239        double q = p - last_cdf;
    240        expected.push_back(q);
    241        last_cdf = p;
    242        if (q < min_p) {
    243          min_p = q;
    244        }
    245      }
    246    }
    247    if (last_cdf < 0.999) {
    248      points.push_back(std::numeric_limits<size_t>::max());
    249      double q = 1.0 - last_cdf;
    250      expected.push_back(q);
    251      if (q < min_p) {
    252        min_p = q;
    253      }
    254    } else {
    255      points.back() = std::numeric_limits<size_t>::max();
    256      expected.back() += (1.0 - last_cdf);
    257    }
    258    // The Chi-Squared score is not completely scale-invariant; it works best
    259    // when the small values are in the small digits.
    260    trials = static_cast<size_t>(8.0 / min_p);
    261  }
    262  ASSERT_GT(points.size(), 0);
    263 
    264  // Generate n variates and fill the counts vector with the count of their
    265  // occurrences.
    266  std::vector<int64_t> buckets(points.size(), 0);
    267  double avg = 0;
    268  {
    269    zipf_u64 dis(param);
    270    for (size_t i = 0; i < trials; i++) {
    271      uint64_t x = dis(rng_);
    272      ASSERT_LE(x, dis.max());
    273      ASSERT_GE(x, dis.min());
    274      avg += static_cast<double>(x);
    275      auto it = std::upper_bound(std::begin(points), std::end(points),
    276                                 static_cast<size_t>(x));
    277      buckets[std::distance(std::begin(points), it)]++;
    278    }
    279    avg = avg / static_cast<double>(trials);
    280  }
    281 
    282  // Validate the output using the Chi-Squared test.
    283  for (auto& e : expected) {
    284    e *= trials;
    285  }
    286 
    287  // The null-hypothesis is that the distribution is a poisson distribution with
    288  // the provided mean (not estimated from the data).
    289  const int dof = static_cast<int>(expected.size()) - 1;
    290 
    291  // NOTE: This test runs about 15x per invocation, so a value of 0.9995 is
    292  // approximately correct for a test suite failure rate of 1 in 100.  In
    293  // practice we see failures slightly higher than that.
    294  const double threshold = absl::random_internal::ChiSquareValue(dof, 0.9999);
    295 
    296  const double chi_square = absl::random_internal::ChiSquare(
    297      std::begin(buckets), std::end(buckets), std::begin(expected),
    298      std::end(expected));
    299 
    300  const double p_actual =
    301      absl::random_internal::ChiSquarePValue(chi_square, dof);
    302 
    303  // Log if the chi_squared value is above the threshold.
    304  if (chi_square > threshold) {
    305    LOG(INFO) << "values";
    306    for (size_t i = 0; i < expected.size(); i++) {
    307      LOG(INFO) << points[i] << ": " << buckets[i] << " vs. E=" << expected[i];
    308    }
    309    LOG(INFO) << "trials " << trials;
    310    LOG(INFO) << "mean " << avg << " vs. expected " << mean();
    311    LOG(INFO) << kChiSquared << "(data, " << dof << ") = " << chi_square << " ("
    312              << p_actual << ")";
    313    LOG(INFO) << kChiSquared << " @ 0.9995 = " << threshold;
    314    FAIL() << kChiSquared << " value of " << chi_square
    315           << " is above the threshold.";
    316  }
    317 }
    318 
    319 std::vector<zipf_u64::param_type> GenParams() {
    320  using param = zipf_u64::param_type;
    321  const auto k = param().k();
    322  const auto q = param().q();
    323  const auto v = param().v();
    324  const uint64_t k2 = 1 << 10;
    325  return std::vector<zipf_u64::param_type>{
    326      // Default
    327      param(k, q, v),
    328      // vary K
    329      param(4, q, v), param(1 << 4, q, v), param(k2, q, v),
    330      // vary V
    331      param(k2, q, 0.5), param(k2, q, 1.5), param(k2, q, 2.5), param(k2, q, 10),
    332      // vary Q
    333      param(k2, 1.5, v), param(k2, 3, v), param(k2, 5, v), param(k2, 10, v),
    334      // Vary V & Q
    335      param(k2, 1.5, 0.5), param(k2, 3, 1.5), param(k, 10, 10)};
    336 }
    337 
    338 std::string ParamName(
    339    const ::testing::TestParamInfo<zipf_u64::param_type>& info) {
    340  const auto& p = info.param;
    341  std::string name = absl::StrCat("k_", p.k(), "__q_", absl::SixDigits(p.q()),
    342                                  "__v_", absl::SixDigits(p.v()));
    343  return absl::StrReplaceAll(name, {{"+", "_"}, {"-", "_"}, {".", "_"}});
    344 }
    345 
    346 INSTANTIATE_TEST_SUITE_P(All, ZipfTest, ::testing::ValuesIn(GenParams()),
    347                         ParamName);
    348 
    349 // NOTE: absl::zipf_distribution is not guaranteed to be stable.
    350 TEST(ZipfDistributionTest, StabilityTest) {
    351  // absl::zipf_distribution stability relies on
    352  // absl::uniform_real_distribution, std::log, std::exp, std::log1p
    353  absl::random_internal::sequence_urbg urbg(
    354      {0x0003eb76f6f7f755ull, 0xFFCEA50FDB2F953Bull, 0xC332DDEFBE6C5AA5ull,
    355       0x6558218568AB9702ull, 0x2AEF7DAD5B6E2F84ull, 0x1521B62829076170ull,
    356       0xECDD4775619F1510ull, 0x13CCA830EB61BD96ull, 0x0334FE1EAA0363CFull,
    357       0xB5735C904C70A239ull, 0xD59E9E0BCBAADE14ull, 0xEECC86BC60622CA7ull});
    358 
    359  std::vector<int> output(10);
    360 
    361  {
    362    absl::zipf_distribution<int32_t> dist;
    363    std::generate(std::begin(output), std::end(output),
    364                  [&] { return dist(urbg); });
    365    EXPECT_THAT(output, ElementsAre(10031, 0, 0, 3, 6, 0, 7, 47, 0, 0));
    366  }
    367  urbg.reset();
    368  {
    369    absl::zipf_distribution<int32_t> dist(std::numeric_limits<int32_t>::max(),
    370                                          3.3);
    371    std::generate(std::begin(output), std::end(output),
    372                  [&] { return dist(urbg); });
    373    EXPECT_THAT(output, ElementsAre(44, 0, 0, 0, 0, 1, 0, 1, 3, 0));
    374  }
    375 }
    376 
    377 TEST(ZipfDistributionTest, AlgorithmBounds) {
    378  absl::zipf_distribution<int32_t> dist;
    379 
    380  // Small values from absl::uniform_real_distribution map to larger Zipf
    381  // distribution values.
    382  const std::pair<uint64_t, int32_t> kInputs[] = {
    383      {0xffffffffffffffff, 0x0}, {0x7fffffffffffffff, 0x0},
    384      {0x3ffffffffffffffb, 0x1}, {0x1ffffffffffffffd, 0x4},
    385      {0xffffffffffffffe, 0x9},  {0x7ffffffffffffff, 0x12},
    386      {0x3ffffffffffffff, 0x25}, {0x1ffffffffffffff, 0x4c},
    387      {0xffffffffffffff, 0x99},  {0x7fffffffffffff, 0x132},
    388      {0x3fffffffffffff, 0x265}, {0x1fffffffffffff, 0x4cc},
    389      {0xfffffffffffff, 0x999},  {0x7ffffffffffff, 0x1332},
    390      {0x3ffffffffffff, 0x2665}, {0x1ffffffffffff, 0x4ccc},
    391      {0xffffffffffff, 0x9998},  {0x7fffffffffff, 0x1332f},
    392      {0x3fffffffffff, 0x2665a}, {0x1fffffffffff, 0x4cc9e},
    393      {0xfffffffffff, 0x998e0},  {0x7ffffffffff, 0x133051},
    394      {0x3ffffffffff, 0x265ae4}, {0x1ffffffffff, 0x4c9ed3},
    395      {0xffffffffff, 0x98e223},  {0x7fffffffff, 0x13058c4},
    396      {0x3fffffffff, 0x25b178e}, {0x1fffffffff, 0x4a062b2},
    397      {0xfffffffff, 0x8ee23b8},  {0x7ffffffff, 0x10b21642},
    398      {0x3ffffffff, 0x1d89d89d}, {0x1ffffffff, 0x2fffffff},
    399      {0xffffffff, 0x45d1745d},  {0x7fffffff, 0x5a5a5a5a},
    400      {0x3fffffff, 0x69ee5846},  {0x1fffffff, 0x73ecade3},
    401      {0xfffffff, 0x79a9d260},   {0x7ffffff, 0x7cc0532b},
    402      {0x3ffffff, 0x7e5ad146},   {0x1ffffff, 0x7f2c0bec},
    403      {0xffffff, 0x7f95adef},    {0x7fffff, 0x7fcac0da},
    404      {0x3fffff, 0x7fe55ae2},    {0x1fffff, 0x7ff2ac0e},
    405      {0xfffff, 0x7ff955ae},     {0x7ffff, 0x7ffcaac1},
    406      {0x3ffff, 0x7ffe555b},     {0x1ffff, 0x7fff2aac},
    407      {0xffff, 0x7fff9556},      {0x7fff, 0x7fffcaab},
    408      {0x3fff, 0x7fffe555},      {0x1fff, 0x7ffff2ab},
    409      {0xfff, 0x7ffff955},       {0x7ff, 0x7ffffcab},
    410      {0x3ff, 0x7ffffe55},       {0x1ff, 0x7fffff2b},
    411      {0xff, 0x7fffff95},        {0x7f, 0x7fffffcb},
    412      {0x3f, 0x7fffffe5},        {0x1f, 0x7ffffff3},
    413      {0xf, 0x7ffffff9},         {0x7, 0x7ffffffd},
    414      {0x3, 0x7ffffffe},         {0x1, 0x7fffffff},
    415  };
    416 
    417  for (const auto& instance : kInputs) {
    418    absl::random_internal::sequence_urbg urbg({instance.first});
    419    EXPECT_EQ(instance.second, dist(urbg));
    420  }
    421 }
    422 
    423 }  // namespace