test_prob_distr.c (44484B)
1 /* Copyright (c) 2018-2021, The Tor Project, Inc. */ 2 /* See LICENSE for licensing information */ 3 4 /** 5 * \file test_prob_distr.c 6 * \brief Test probability distributions. 7 * \detail 8 * 9 * For each probability distribution we do two kinds of tests: 10 * 11 * a) We do numerical deterministic testing of their cdf/icdf/sf/isf functions 12 * and the various relationships between them for each distribution. We also 13 * do deterministic tests on their sampling functions. Test vectors for 14 * these tests were computed from alternative implementations and were 15 * eyeballed to make sure they make sense 16 * (e.g. src/test/prob_distr_mpfr_ref.c computes logit(p) using GNU mpfr 17 * with 200-bit precision and is then tested in test_logit_logistic()). 18 * 19 * b) We do stochastic hypothesis testing (G-test) to ensure that sampling from 20 * the given distributions is distributed properly. The stochastic tests are 21 * slow and their false positive rate is not well suited for CI, so they are 22 * currently disabled-by-default and put into 'tests-slow'. 23 */ 24 25 #define PROB_DISTR_PRIVATE 26 27 #include "orconfig.h" 28 29 #include "test/test.h" 30 31 #include "core/or/or.h" 32 33 #include "lib/math/prob_distr.h" 34 #include "lib/math/fp.h" 35 #include "lib/crypt_ops/crypto_rand.h" 36 #include "test/rng_test_helpers.h" 37 38 #include <float.h> 39 #include <math.h> 40 #include <stdbool.h> 41 #include <stddef.h> 42 #include <stdint.h> 43 #include <stdio.h> 44 #include <stdlib.h> 45 46 /** 47 * Return floor(d) converted to size_t, as a workaround for complaints 48 * under -Wbad-function-cast for (size_t)floor(d). 49 */ 50 static size_t 51 floor_to_size_t(double d) 52 { 53 double integral_d = floor(d); 54 return (size_t)integral_d; 55 } 56 57 /** 58 * Return ceil(d) converted to size_t, as a workaround for complaints 59 * under -Wbad-function-cast for (size_t)ceil(d). 
60 */ 61 static size_t 62 ceil_to_size_t(double d) 63 { 64 double integral_d = ceil(d); 65 return (size_t)integral_d; 66 } 67 68 /* 69 * Geometric(p) distribution, supported on {1, 2, 3, ...}. 70 * 71 * Compute the probability mass function Geom(n; p) of the number of 72 * trials before the first success when success has probability p. 73 */ 74 static double 75 logpmf_geometric(unsigned n, double p) 76 { 77 /* This is actually a check against 1, but we do >= so that the compiler 78 does not raise a -Wfloat-equal */ 79 if (p >= 1) { 80 if (n == 1) 81 return 0; 82 else 83 return -HUGE_VAL; 84 } 85 return (n - 1)*log1p(-p) + log(p); 86 } 87 88 /** 89 * Compute the logistic function, translated in output by 1/2: 90 * logistichalf(x) = logistic(x) - 1/2. Well-conditioned on the entire 91 * real plane, with maximum condition number 1 at 0. 92 * 93 * This implementation gives relative error bounded by 5 eps. 94 */ 95 static double 96 logistichalf(double x) 97 { 98 /* 99 * Rewrite this with the identity 100 * 101 * 1/(1 + e^{-x}) - 1/2 102 * = (1 - 1/2 - e^{-x}/2)/(1 + e^{-x}) 103 * = (1/2 - e^{-x}/2)/(1 + e^{-x}) 104 * = (1 - e^{-x})/[2 (1 + e^{-x})] 105 * = -(e^{-x} - 1)/[2 (1 + e^{-x})], 106 * 107 * which we can evaluate by -expm1(-x)/[2 (1 + exp(-x))]. 108 * 109 * Suppose exp has error d0, + has error d1, expm1 has error 110 * d2, and / has error d3, so we evaluate 111 * 112 * -(1 + d2) (1 + d3) (e^{-x} - 1) 113 * / [2 (1 + d1) (1 + (1 + d0) e^{-x})]. 114 * 115 * In the denominator, 116 * 117 * 1 + (1 + d0) e^{-x} 118 * = 1 + e^{-x} + d0 e^{-x} 119 * = (1 + e^{-x}) (1 + d0 e^{-x}/(1 + e^{-x})), 120 * 121 * so the relative error of the numerator is 122 * 123 * d' = d2 + d3 + d2 d3, 124 * and of the denominator, 125 * d'' = d1 + d0 e^{-x}/(1 + e^{-x}) + d0 d1 e^{-x}/(1 + e^{-x}) 126 * = d1 + d0 L(-x) + d0 d1 L(-x), 127 * 128 * where L(-x) is logistic(-x). 
By Lemma 1 the relative error 129 * of the quotient is bounded by 130 * 131 * 2|d2 + d3 + d2 d3 - d1 - d0 L(x) + d0 d1 L(x)|, 132 * 133 * Since 0 < L(x) < 1, this is bounded by 134 * 135 * 2|d2| + 2|d3| + 2|d2 d3| + 2|d1| + 2|d0| + 2|d0 d1| 136 * <= 4 eps + 2 eps^2. 137 */ 138 if (x < log(DBL_EPSILON/8)) { 139 /* 140 * Avoid overflow in e^{-x}. When x < log(eps/4), we 141 * we further have x < logit(eps/4), so that 142 * logistic(x) < eps/4. Hence the relative error of 143 * logistic(x) - 1/2 from -1/2 is bounded by eps/2, and 144 * so the relative error of -1/2 from logistic(x) - 1/2 145 * is bounded by eps. 146 */ 147 return -0.5; 148 } else { 149 return -expm1(-x)/(2*(1 + exp(-x))); 150 } 151 } 152 153 /** 154 * Compute the log of the sum of the exps. Caller should arrange the 155 * array in descending order to minimize error because I don't want to 156 * deal with using temporary space and the one caller in this file 157 * arranges that anyway. 158 * 159 * Warning: This implementation does not handle infinite or NaN inputs 160 * sensibly, because I don't need that here at the moment. (NaN, or 161 * -inf and +inf together, should yield NaN; +inf and finite should 162 * yield +inf; otherwise all -inf should be ignored because exp(-inf) = 163 * 0.) 164 */ 165 static double 166 logsumexp(double *A, size_t n) 167 { 168 double maximum, sum; 169 size_t i; 170 171 if (n == 0) 172 return log(0); 173 174 maximum = A[0]; 175 for (i = 1; i < n; i++) { 176 if (A[i] > maximum) 177 maximum = A[i]; 178 } 179 180 sum = 0; 181 for (i = n; i --> 0;) 182 sum += exp(A[i] - maximum); 183 184 return log(sum) + maximum; 185 } 186 187 /** 188 * Compute log(1 - e^x). Defined only for negative x so that e^x < 1. 189 * This is the complement of a probability in log space. 190 */ 191 static double 192 log1mexp(double x) 193 { 194 195 /* 196 * We want to compute log on [0, 1/2) but log1p on [1/2, +inf), 197 * so partition x at -log(2) = log(1/2). 
198 */ 199 if (-log(2) < x) 200 return log(-expm1(x)); 201 else 202 return log1p(-exp(x)); 203 } 204 205 /* 206 * Tests of numerical errors in computing logit, logistic, and the 207 * various cdfs, sfs, icdfs, and isfs. 208 */ 209 210 #define arraycount(A) (sizeof(A)/sizeof(A[0])) 211 212 /** Return relative error between <b>actual</b> and <b>expected</b>. 213 * Special cases: If <b>expected</b> is zero or infinite, return 1 if 214 * <b>actual</b> is equal to <b>expected</b> and 0 if not, since the 215 * usual notion of relative error is undefined but we only use this 216 * for testing relerr(e, a) <= bound. If either is NaN, return NaN, 217 * which has the property that NaN <= bound is false no matter what 218 * bound is. 219 * 220 * Beware: if you test !(relerr(e, a) > bound), then then the result 221 * is true when a is NaN because NaN > bound is false too. See 222 * CHECK_RELERR for correct use to decide when to report failure. 223 */ 224 static double 225 relerr(double expected, double actual) 226 { 227 /* 228 * To silence -Wfloat-equal, we have to test for equality using 229 * inequalities: we have (fabs(expected) <= 0) iff (expected == 0), 230 * and (actual <= expected && actual >= expected) iff actual == 231 * expected whether expected is zero or infinite. 232 */ 233 if (fabs(expected) <= 0 || tor_isinf(expected)) { 234 if (actual <= expected && actual >= expected) 235 return 0; 236 else 237 return 1; 238 } else { 239 return fabs((expected - actual)/expected); 240 } 241 } 242 243 /** Check that relative error of <b>expected</b> and <b>actual</b> is within 244 * <b>relerr_bound</b>. Caller must arrange to have i and relerr_bound in 245 * scope. 
*/ 246 #define CHECK_RELERR(expected, actual) do { \ 247 double check_expected = (expected); \ 248 double check_actual = (actual); \ 249 const char *str_expected = #expected; \ 250 const char *str_actual = #actual; \ 251 double check_relerr = relerr(expected, actual); \ 252 if (!(relerr(check_expected, check_actual) <= relerr_bound)) { \ 253 log_warn(LD_GENERAL, "%s:%d: case %u: relerr(%s=%.17e, %s=%.17e)" \ 254 " = %.17e > %.17e\n", \ 255 __func__, __LINE__, (unsigned) i, \ 256 str_expected, check_expected, \ 257 str_actual, check_actual, \ 258 check_relerr, relerr_bound); \ 259 ok = false; \ 260 } \ 261 } while (0) 262 263 /* Check that a <= b. 264 * Caller must arrange to have i in scope. */ 265 #define CHECK_LE(a, b) do { \ 266 double check_a = (a); \ 267 double check_b = (b); \ 268 const char *str_a = #a; \ 269 const char *str_b = #b; \ 270 if (!(check_a <= check_b)) { \ 271 log_warn(LD_GENERAL, "%s:%d: case %u: %s=%.17e > %s=%.17e\n", \ 272 __func__, __LINE__, (unsigned) i, \ 273 str_a, check_a, str_b, check_b); \ 274 ok = false; \ 275 } \ 276 } while (0) 277 278 /** 279 * Test the logit and logistic functions. Confirm that they agree with 280 * the cdf, sf, icdf, and isf of the standard Logistic distribution. 281 * Confirm that the sampler for the standard logistic distribution maps 282 * [0, 1] into the right subinterval for the inverse transform, for 283 * this implementation. 
 */
static void
test_logit_logistic(void *arg)
{
  (void) arg;

  /* Deterministic test vectors: x = logit(p), p = logistic(x),
   * phalf = p - 1/2 = logistic(x) - 1/2.  See
   * src/test/prob_distr_mpfr_ref.c for how the high-precision entries
   * were computed. */
  static const struct {
    double x;                   /* x = logit(p) */
    double p;                   /* p = logistic(x) */
    double phalf;               /* p - 1/2 = logistic(x) - 1/2 */
  } cases[] = {
    { -HUGE_VAL, 0, -0.5 },
    { -1000, 0, -0.5 },
    { -710, 4.47628622567513e-309, -0.5 },
    { -708, 3.307553003638408e-308, -0.5 },
    { -2, .11920292202211755, -.3807970779778824 },
    { -1.0000001, .2689414017088022, -.23105859829119776 },
    { -1, .2689414213699951, -.23105857863000487 },
    { -0.9999999, .26894144103118883, -.2310585589688111 },
    /* see src/test/prob_distr_mpfr_ref.c for computation */
    { -4.000000000537333e-5, .49999, -1.0000000000010001e-5 },
    { -4.000000000533334e-5, .49999, -.00001 },
    { -4.000000108916878e-9, .499999999, -1.0000000272292198e-9 },
    { -4e-9, .499999999, -1e-9 },
    { -4e-16, .5, -1e-16 },
    { -4e-300, .5, -1e-300 },
    { 0, .5, 0 },
    { 4e-300, .5, 1e-300 },
    { 4e-16, .5, 1e-16 },
    { 3.999999886872274e-9, .500000001, 9.999999717180685e-10 },
    { 4e-9, .500000001, 1e-9 },
    { 4.0000000005333336e-5, .50001, .00001 },
    { 8.000042667076272e-3, .502, .002 },
    { 0.9999999, .7310585589688111, .2310585589688111 },
    { 1, .7310585786300049, .23105857863000487 },
    { 1.0000001, .7310585982911977, .23105859829119774 },
    { 2, .8807970779778823, .3807970779778824 },
    { 708, 1, .5 },
    { 710, 1, .5 },
    { 1000, 1, .5 },
    { HUGE_VAL, 1, .5 },
  };
  double relerr_bound = 3e-15;  /* >10eps */
  size_t i;
  bool ok = true;

  for (i = 0; i < arraycount(cases); i++) {
    double x = cases[i].x;
    double p = cases[i].p;
    double phalf = cases[i].phalf;

    /*
     * cdf is logistic, icdf is logit, and symmetry for
     * sf/isf.
     */
    CHECK_RELERR(logistic(x), cdf_logistic(x, 0, 1));
    CHECK_RELERR(logistic(-x), sf_logistic(x, 0, 1));
    CHECK_RELERR(logit(p), icdf_logistic(p, 0, 1));
    CHECK_RELERR(-logit(p), isf_logistic(p, 0, 1));

    /* Scaling x by sigma=2 must match scaling the argument. */
    CHECK_RELERR(cdf_logistic(x, 0, 1), cdf_logistic(x*2, 0, 2));
    CHECK_RELERR(sf_logistic(x, 0, 1), sf_logistic(x*2, 0, 2));
    CHECK_RELERR(icdf_logistic(p, 0, 1), icdf_logistic(p, 0, 2)/2);
    CHECK_RELERR(isf_logistic(p, 0, 1), isf_logistic(p, 0, 2)/2);

    /* ...and likewise for sigma=1/2. */
    CHECK_RELERR(cdf_logistic(x, 0, 1), cdf_logistic(x/2, 0, .5));
    CHECK_RELERR(sf_logistic(x, 0, 1), sf_logistic(x/2, 0, .5));
    CHECK_RELERR(icdf_logistic(p, 0, 1), icdf_logistic(p, 0, .5)*2);
    CHECK_RELERR(isf_logistic(p, 0, 1), isf_logistic(p, 0, .5)*2);

    /* Translation by mu=1 with sigma=2. */
    CHECK_RELERR(cdf_logistic(x, 0, 1), cdf_logistic(x*2 + 1, 1, 2));
    CHECK_RELERR(sf_logistic(x, 0, 1), sf_logistic(x*2 + 1, 1, 2));

    /*
     * For p near 0 and p near 1/2, the arithmetic of
     * translating by 1 loses precision.
     */
    if (fabs(p) > DBL_EPSILON && fabs(p) < 0.4) {
      CHECK_RELERR(icdf_logistic(p, 0, 1),
                   (icdf_logistic(p, 1, 2) - 1)/2);
      CHECK_RELERR(isf_logistic(p, 0, 1),
                   (isf_logistic(p, 1, 2) - 1)/2);
    }

    CHECK_RELERR(p, logistic(x));
    CHECK_RELERR(phalf, logistichalf(x));

    /*
     * On the interior floating-point numbers, either logit or
     * logithalf had better give the correct answer.
     *
     * For probabilities near 0, we can get much finer resolution with
     * logit, and for probabilities near 1/2, we can get much finer
     * resolution with logithalf by representing them using p - 1/2.
     *
     * E.g., we can write -.00001 for phalf, and .49999 for p, but the
     * difference 1/2 - .00001 gives 1.0000000000010001e-5 in binary64
     * arithmetic.  So test logit(.49999) which should give the same
     * answer as logithalf(-1.0000000000010001e-5), namely
     * -4.000000000537333e-5, and also test logithalf(-.00001) which
     * gives -4.000000000533334e-5 instead -- but don't expect
     * logit(.49999) to give -4.000000000533334e-5 even though it looks
     * like 1/2 - .00001.
     *
     * A naive implementation of logit will just use log(p/(1 - p)) and
     * give the answer -4.000000000551673e-05 for .49999, which is
     * wrong in a lot of digits, which happens because log is
     * ill-conditioned near 1 and thus amplifies whatever relative
     * error we made in computing p/(1 - p).
     */
    if ((0 < p && p < 1) || tor_isinf(x)) {
      /* Each "a >= b && a <= b" pair is an equality test that does not
       * trip -Wfloat-equal: only exercise logit/logithalf when the
       * table entry is exactly representable in that form. */
      if (phalf >= p - 0.5 && phalf <= p - 0.5)
        CHECK_RELERR(x, logit(p));
      if (p >= 0.5 + phalf && p <= 0.5 + phalf)
        CHECK_RELERR(x, logithalf(phalf));
    }

    /* Symmetry: logistic(-x) - 1/2 = -(logistic(x) - 1/2). */
    CHECK_RELERR(-phalf, logistichalf(-x));
    if (fabs(phalf) < 0.5 || tor_isinf(x))
      CHECK_RELERR(-x, logithalf(-phalf));
    if (p < 1 || tor_isinf(x)) {
      CHECK_RELERR(1 - p, logistic(-x));
      if (p > .75 || tor_isinf(x))
        CHECK_RELERR(-x, logit(1 - p));
    } else {
      CHECK_LE(logistic(-x), 1e-300);
    }
  }

  /* Deterministic check of the sampler: with fixed coin flips s in
   * {0,1} and t in {0,1}, sample_logistic must hit the expected branch
   * of the inverse transform for each p0 in [0, 1]. */
  for (i = 0; i <= 100; i++) {
    double p0 = (double)i/100;

    CHECK_RELERR(logit(p0/(1 + M_E)), sample_logistic(0, 0, p0));
    CHECK_RELERR(-logit(p0/(1 + M_E)), sample_logistic(1, 0, p0));
    CHECK_RELERR(logithalf(p0*(0.5 - 1/(1 + M_E))),
                 sample_logistic(0, 1, p0));
    CHECK_RELERR(-logithalf(p0*(0.5 - 1/(1 + M_E))),
                 sample_logistic(1, 1, p0));
  }

  if (!ok)
    printf("fail logit/logistic / logistic cdf/sf\n");

  tt_assert(ok);

 done:
  ;
}

/**
 * Test the cdf, sf, icdf, and isf of the LogLogistic distribution.
435 */ 436 static void 437 test_log_logistic(void *arg) 438 { 439 (void) arg; 440 441 static const struct { 442 /* x is a point in the support of the LogLogistic distribution */ 443 double x; 444 /* 'p' is the probability that a random variable X for a given LogLogistic 445 * probability distribution will take value less-or-equal to x */ 446 double p; 447 /* 'np' is the probability that a random variable X for a given LogLogistic 448 * probability distribution will take value greater-or-equal to x. */ 449 double np; 450 } cases[] = { 451 { 0, 0, 1 }, 452 { 1e-300, 1e-300, 1 }, 453 { 1e-17, 1e-17, 1 }, 454 { 1e-15, 1e-15, .999999999999999 }, 455 { .1, .09090909090909091, .90909090909090909 }, 456 { .25, .2, .8 }, 457 { .5, .33333333333333333, .66666666666666667 }, 458 { .75, .42857142857142855, .5714285714285714 }, 459 { .9999, .49997499874993756, .5000250012500626 }, 460 { .99999999, .49999999749999996, .5000000025 }, 461 { .999999999999999, .49999999999999994, .5000000000000002 }, 462 { 1, .5, .5 }, 463 }; 464 double relerr_bound = 3e-15; 465 size_t i; 466 bool ok = true; 467 468 for (i = 0; i < arraycount(cases); i++) { 469 double x = cases[i].x; 470 double p = cases[i].p; 471 double np = cases[i].np; 472 473 CHECK_RELERR(p, cdf_log_logistic(x, 1, 1)); 474 CHECK_RELERR(p, cdf_log_logistic(x/2, .5, 1)); 475 CHECK_RELERR(p, cdf_log_logistic(x*2, 2, 1)); 476 CHECK_RELERR(p, cdf_log_logistic(sqrt(x), 1, 2)); 477 CHECK_RELERR(p, cdf_log_logistic(sqrt(x)/2, .5, 2)); 478 CHECK_RELERR(p, cdf_log_logistic(sqrt(x)*2, 2, 2)); 479 if (2*sqrt(DBL_MIN) < x) { 480 CHECK_RELERR(p, cdf_log_logistic(x*x, 1, .5)); 481 CHECK_RELERR(p, cdf_log_logistic(x*x/2, .5, .5)); 482 CHECK_RELERR(p, cdf_log_logistic(x*x*2, 2, .5)); 483 } 484 485 CHECK_RELERR(np, sf_log_logistic(x, 1, 1)); 486 CHECK_RELERR(np, sf_log_logistic(x/2, .5, 1)); 487 CHECK_RELERR(np, sf_log_logistic(x*2, 2, 1)); 488 CHECK_RELERR(np, sf_log_logistic(sqrt(x), 1, 2)); 489 CHECK_RELERR(np, sf_log_logistic(sqrt(x)/2, .5, 
2)); 490 CHECK_RELERR(np, sf_log_logistic(sqrt(x)*2, 2, 2)); 491 if (2*sqrt(DBL_MIN) < x) { 492 CHECK_RELERR(np, sf_log_logistic(x*x, 1, .5)); 493 CHECK_RELERR(np, sf_log_logistic(x*x/2, .5, .5)); 494 CHECK_RELERR(np, sf_log_logistic(x*x*2, 2, .5)); 495 } 496 497 CHECK_RELERR(np, cdf_log_logistic(1/x, 1, 1)); 498 CHECK_RELERR(np, cdf_log_logistic(1/(2*x), .5, 1)); 499 CHECK_RELERR(np, cdf_log_logistic(2/x, 2, 1)); 500 CHECK_RELERR(np, cdf_log_logistic(1/sqrt(x), 1, 2)); 501 CHECK_RELERR(np, cdf_log_logistic(1/(2*sqrt(x)), .5, 2)); 502 CHECK_RELERR(np, cdf_log_logistic(2/sqrt(x), 2, 2)); 503 if (2*sqrt(DBL_MIN) < x && x < 1/(2*sqrt(DBL_MIN))) { 504 CHECK_RELERR(np, cdf_log_logistic(1/(x*x), 1, .5)); 505 CHECK_RELERR(np, cdf_log_logistic(1/(2*x*x), .5, .5)); 506 CHECK_RELERR(np, cdf_log_logistic(2/(x*x), 2, .5)); 507 } 508 509 CHECK_RELERR(p, sf_log_logistic(1/x, 1, 1)); 510 CHECK_RELERR(p, sf_log_logistic(1/(2*x), .5, 1)); 511 CHECK_RELERR(p, sf_log_logistic(2/x, 2, 1)); 512 CHECK_RELERR(p, sf_log_logistic(1/sqrt(x), 1, 2)); 513 CHECK_RELERR(p, sf_log_logistic(1/(2*sqrt(x)), .5, 2)); 514 CHECK_RELERR(p, sf_log_logistic(2/sqrt(x), 2, 2)); 515 if (2*sqrt(DBL_MIN) < x && x < 1/(2*sqrt(DBL_MIN))) { 516 CHECK_RELERR(p, sf_log_logistic(1/(x*x), 1, .5)); 517 CHECK_RELERR(p, sf_log_logistic(1/(2*x*x), .5, .5)); 518 CHECK_RELERR(p, sf_log_logistic(2/(x*x), 2, .5)); 519 } 520 521 CHECK_RELERR(x, icdf_log_logistic(p, 1, 1)); 522 CHECK_RELERR(x/2, icdf_log_logistic(p, .5, 1)); 523 CHECK_RELERR(x*2, icdf_log_logistic(p, 2, 1)); 524 CHECK_RELERR(x, icdf_log_logistic(p, 1, 1)); 525 CHECK_RELERR(sqrt(x)/2, icdf_log_logistic(p, .5, 2)); 526 CHECK_RELERR(sqrt(x)*2, icdf_log_logistic(p, 2, 2)); 527 CHECK_RELERR(sqrt(x), icdf_log_logistic(p, 1, 2)); 528 CHECK_RELERR(x*x/2, icdf_log_logistic(p, .5, .5)); 529 CHECK_RELERR(x*x*2, icdf_log_logistic(p, 2, .5)); 530 531 if (np < .9) { 532 CHECK_RELERR(x, isf_log_logistic(np, 1, 1)); 533 CHECK_RELERR(x/2, isf_log_logistic(np, .5, 1)); 534 
CHECK_RELERR(x*2, isf_log_logistic(np, 2, 1)); 535 CHECK_RELERR(sqrt(x), isf_log_logistic(np, 1, 2)); 536 CHECK_RELERR(sqrt(x)/2, isf_log_logistic(np, .5, 2)); 537 CHECK_RELERR(sqrt(x)*2, isf_log_logistic(np, 2, 2)); 538 CHECK_RELERR(x*x, isf_log_logistic(np, 1, .5)); 539 CHECK_RELERR(x*x/2, isf_log_logistic(np, .5, .5)); 540 CHECK_RELERR(x*x*2, isf_log_logistic(np, 2, .5)); 541 542 CHECK_RELERR(1/x, icdf_log_logistic(np, 1, 1)); 543 CHECK_RELERR(1/(2*x), icdf_log_logistic(np, .5, 1)); 544 CHECK_RELERR(2/x, icdf_log_logistic(np, 2, 1)); 545 CHECK_RELERR(1/sqrt(x), icdf_log_logistic(np, 1, 2)); 546 CHECK_RELERR(1/(2*sqrt(x)), 547 icdf_log_logistic(np, .5, 2)); 548 CHECK_RELERR(2/sqrt(x), icdf_log_logistic(np, 2, 2)); 549 CHECK_RELERR(1/(x*x), icdf_log_logistic(np, 1, .5)); 550 CHECK_RELERR(1/(2*x*x), icdf_log_logistic(np, .5, .5)); 551 CHECK_RELERR(2/(x*x), icdf_log_logistic(np, 2, .5)); 552 } 553 554 CHECK_RELERR(1/x, isf_log_logistic(p, 1, 1)); 555 CHECK_RELERR(1/(2*x), isf_log_logistic(p, .5, 1)); 556 CHECK_RELERR(2/x, isf_log_logistic(p, 2, 1)); 557 CHECK_RELERR(1/sqrt(x), isf_log_logistic(p, 1, 2)); 558 CHECK_RELERR(1/(2*sqrt(x)), isf_log_logistic(p, .5, 2)); 559 CHECK_RELERR(2/sqrt(x), isf_log_logistic(p, 2, 2)); 560 CHECK_RELERR(1/(x*x), isf_log_logistic(p, 1, .5)); 561 CHECK_RELERR(1/(2*x*x), isf_log_logistic(p, .5, .5)); 562 CHECK_RELERR(2/(x*x), isf_log_logistic(p, 2, .5)); 563 } 564 565 for (i = 0; i <= 100; i++) { 566 double p0 = (double)i/100; 567 568 CHECK_RELERR(0.5*p0/(1 - 0.5*p0), sample_log_logistic(0, p0)); 569 CHECK_RELERR((1 - 0.5*p0)/(0.5*p0), 570 sample_log_logistic(1, p0)); 571 } 572 573 if (!ok) 574 printf("fail log logistic cdf/sf\n"); 575 576 tt_assert(ok); 577 578 done: 579 ; 580 } 581 582 /** 583 * Test the cdf, sf, icdf, isf of the Weibull distribution. 
 */
static void
test_weibull(void *arg)
{
  (void) arg;

  static const struct {
    /* x is a point in the support of the Weibull distribution */
    double x;
    /* 'p' is the probability that a random variable X for a given Weibull
     * probability distribution will take value less-or-equal to x */
    double p;
    /* 'np' is the probability that a random variable X for a given Weibull
     * probability distribution will take value greater-or-equal to x. */
    double np;
  } cases[] = {
    { 0, 0, 1 },
    { 1e-300, 1e-300, 1 },
    { 1e-17, 1e-17, 1 },
    { .1, .09516258196404043, .9048374180359595 },
    { .5, .3934693402873666, .6065306597126334 },
    { .6931471805599453, .5, .5 },
    { 1, .6321205588285577, .36787944117144233 },
    { 10, .9999546000702375, 4.5399929762484854e-5 },
    { 36, .9999999999999998, 2.319522830243569e-16 },
    { 37, .9999999999999999, 8.533047625744066e-17 },
    { 38, 1, 3.1391327920480296e-17 },
    { 100, 1, 3.720075976020836e-44 },
    { 708, 1, 3.307553003638408e-308 },
    { 710, 1, 4.47628622567513e-309 },
    { 1000, 1, 0 },
    { HUGE_VAL, 1, 0 },
  };
  /* NOTE(review): relerr_bound is loosened inside the loop for x >= 10
   * and never reset, so later iterations' CDF/SF checks run with the
   * inflated bound from the previous large-x case -- confirm whether
   * this is intentional. */
  double relerr_bound = 3e-15;
  size_t i;
  bool ok = true;

  for (i = 0; i < arraycount(cases); i++) {
    double x = cases[i].x;
    double p = cases[i].p;
    double np = cases[i].np;

    CHECK_RELERR(p, cdf_weibull(x, 1, 1));
    CHECK_RELERR(p, cdf_weibull(x/2, .5, 1));
    CHECK_RELERR(p, cdf_weibull(x*2, 2, 1));
    /* For 0 < x < sqrt(DBL_MIN), x^2 loses lots of bits. */
    if (x <= 0 ||
        sqrt(DBL_MIN) <= x) {
      CHECK_RELERR(p, cdf_weibull(x*x, 1, .5));
      CHECK_RELERR(p, cdf_weibull(x*x/2, .5, .5));
      CHECK_RELERR(p, cdf_weibull(x*x*2, 2, .5));
    }
    CHECK_RELERR(p, cdf_weibull(sqrt(x), 1, 2));
    CHECK_RELERR(p, cdf_weibull(sqrt(x)/2, .5, 2));
    CHECK_RELERR(p, cdf_weibull(sqrt(x)*2, 2, 2));
    CHECK_RELERR(np, sf_weibull(x, 1, 1));
    CHECK_RELERR(np, sf_weibull(x/2, .5, 1));
    CHECK_RELERR(np, sf_weibull(x*2, 2, 1));
    CHECK_RELERR(np, sf_weibull(x*x, 1, .5));
    CHECK_RELERR(np, sf_weibull(x*x/2, .5, .5));
    CHECK_RELERR(np, sf_weibull(x*x*2, 2, .5));
    if (x >= 10) {
      /*
       * exp amplifies the error of sqrt(x)^2
       * proportionally to exp(x); for large inputs
       * this is significant.
       */
      double t = -expm1(-x*(2*DBL_EPSILON + DBL_EPSILON));
      relerr_bound = t + DBL_EPSILON + t*DBL_EPSILON;
      if (relerr_bound < 3e-15)
        /*
         * The tests are written only to 16
         * decimal places anyway even if your
         * `double' is, say, i387 binary80, for
         * whatever reason.
         */
        relerr_bound = 3e-15;
      CHECK_RELERR(np, sf_weibull(sqrt(x), 1, 2));
      CHECK_RELERR(np, sf_weibull(sqrt(x)/2, .5, 2));
      CHECK_RELERR(np, sf_weibull(sqrt(x)*2, 2, 2));
    }

    if (p <= 0.75) {
      /*
       * For p near 1, not enough precision near 1 to
       * recover x.
       */
      CHECK_RELERR(x, icdf_weibull(p, 1, 1));
      CHECK_RELERR(x/2, icdf_weibull(p, .5, 1));
      CHECK_RELERR(x*2, icdf_weibull(p, 2, 1));
    }
    if (p >= 0.25 && !tor_isinf(x) && np > 0) {
      /*
       * For p near 0, not enough precision in np
       * near 1 to recover x.  For 0, isf gives inf,
       * even if p is precise enough for the icdf to
       * work.
       */
      CHECK_RELERR(x, isf_weibull(np, 1, 1));
      CHECK_RELERR(x/2, isf_weibull(np, .5, 1));
      CHECK_RELERR(x*2, isf_weibull(np, 2, 1));
    }
  }

  /* Deterministic sampler check for lambda = 3, k = 2: the inverse
   * transform is 3*sqrt(-log(p0/2)) on one coin flip and
   * 3*sqrt(-log1p(-p0/2)) on the other. */
  for (i = 0; i <= 100; i++) {
    double p0 = (double)i/100;

    CHECK_RELERR(3*sqrt(-log(p0/2)), sample_weibull(0, p0, 3, 2));
    CHECK_RELERR(3*sqrt(-log1p(-p0/2)),
                 sample_weibull(1, p0, 3, 2));
  }

  if (!ok)
    printf("fail Weibull cdf/sf\n");

  tt_assert(ok);

 done:
  ;
}

/**
 * Test the cdf, sf, icdf, and isf of the generalized Pareto
 * distribution.
 */
static void
test_genpareto(void *arg)
{
  (void) arg;

  struct {
    /* xi is the 'xi' parameter of the generalized Pareto distribution, and the
     * rest are the same as in the above tests */
    double xi, x, p, np;
  } cases[] = {
    { 0, 0, 0, 1 },
    { 1e-300, .004, 3.992010656008528e-3, .9960079893439915 },
    { 1e-300, .1, .09516258196404043, .9048374180359595 },
    { 1e-300, 1, .6321205588285577, .36787944117144233 },
    { 1e-300, 10, .9999546000702375, 4.5399929762484854e-5 },
    { 1e-200, 1e-16, 9.999999999999999e-17, .9999999999999999 },
    { 1e-16, 1e-200, 9.999999999999998e-201, 1 },
    { 1e-16, 1e-16, 1e-16, 1 },
    { 1e-16, .004, 3.992010656008528e-3, .9960079893439915 },
    { 1e-16, .1, .09516258196404043, .9048374180359595 },
    { 1e-16, 1, .6321205588285577, .36787944117144233 },
    { 1e-16, 10, .9999546000702375, 4.539992976248509e-5 },
    { 1e-10, 1e-6, 9.999995000001667e-7, .9999990000005 },
    { 1e-8, 1e-8, 9.999999950000001e-9, .9999999900000001 },
    { 1, 1e-300, 1e-300, 1 },
    { 1, 1e-16, 1e-16, .9999999999999999 },
    { 1, .1, .09090909090909091, .9090909090909091 },
    { 1, 1, .5, .5 },
    { 1, 10, .9090909090909091, .0909090909090909 },
    { 1, 100, .9900990099009901, .0099009900990099 },
    { 1, 1000, .999000999000999, 9.990009990009992e-4 },
    { 10, 1e-300, 1e-300, 1 },
    { 10, 1e-16, 9.999999999999995e-17, .9999999999999999 },
    { 10, .1, .06696700846319258, .9330329915368074 },
    { 10, 1, .21320655780322778, .7867934421967723 },
    { 10, 10, .3696701667040189, .6303298332959811 },
    { 10, 100, .49886285755007337, .5011371424499267 },
    { 10, 1000, .6018968102992647, .3981031897007353 },
  };
  double xi_array[] = { -1.5, -1, -1e-30, 0, 1e-30, 1, 1.5 };
  size_t i, j;
  double relerr_bound = 3e-15;
  bool ok = true;

  for (i = 0; i < arraycount(cases); i++) {
    double xi = cases[i].xi;
    double x = cases[i].x;
    double p = cases[i].p;
    double np = cases[i].np;

    CHECK_RELERR(p, cdf_genpareto(x, 0, 1, xi));
    CHECK_RELERR(p, cdf_genpareto(x*2, 0, 2, xi));
    CHECK_RELERR(p, cdf_genpareto(x/2, 0, .5, xi));
    CHECK_RELERR(np, sf_genpareto(x, 0, 1, xi));
    CHECK_RELERR(np, sf_genpareto(x*2, 0, 2, xi));
    CHECK_RELERR(np, sf_genpareto(x/2, 0, .5, xi));

    if (p < .5) {
      CHECK_RELERR(x, icdf_genpareto(p, 0, 1, xi));
      CHECK_RELERR(x*2, icdf_genpareto(p, 0, 2, xi));
      CHECK_RELERR(x/2, icdf_genpareto(p, 0, .5, xi));
    }
    if (np < .5) {
      CHECK_RELERR(x, isf_genpareto(np, 0, 1, xi));
      CHECK_RELERR(x*2, isf_genpareto(np, 0, 2, xi));
      CHECK_RELERR(x/2, isf_genpareto(np, 0, .5, xi));
    }
  }

  for (i = 0; i < arraycount(xi_array); i++) {
    for (j = 0; j <= 100; j++) {
      /* j == 0 would give p0 = 0, whose logarithm blows up; use the
       * smallest normal-ish probability instead. */
      double p0 = (j == 0 ? 2*DBL_MIN : (double)j/100);

      /* This is actually a check against 0, but we do <= so that the compiler
         does not raise a -Wfloat-equal */
      if (fabs(xi_array[i]) <= 0) {
        /*
         * When xi == 0, the generalized Pareto
         * distribution reduces to an
         * exponential distribution.
         */
        CHECK_RELERR(-log(p0/2),
                     sample_genpareto(0, p0, 0));
        CHECK_RELERR(-log1p(-p0/2),
                     sample_genpareto(1, p0, 0));
      } else {
        CHECK_RELERR(expm1(-xi_array[i]*log(p0/2))/xi_array[i],
                     sample_genpareto(0, p0, xi_array[i]));
        CHECK_RELERR((j == 0 ? DBL_MIN :
                      expm1(-xi_array[i]*log1p(-p0/2))/xi_array[i]),
                     sample_genpareto(1, p0, xi_array[i]));
      }

      CHECK_RELERR(isf_genpareto(p0/2, 0, 1, xi_array[i]),
                   sample_genpareto(0, p0, xi_array[i]));
      CHECK_RELERR(icdf_genpareto(p0/2, 0, 1, xi_array[i]),
                   sample_genpareto(1, p0, xi_array[i]));
    }
  }

  tt_assert(ok);

 done:
  ;
}

/**
 * Test the deterministic sampler for uniform distribution on [a, b].
 *
 * This currently only tests whether the outcome lies within [a, b].
 */
static void
test_uniform_interval(void *arg)
{
  (void) arg;
  struct {
    /* Sample from a uniform distribution with parameters 'a' and 'b', using
     * 't' as the sampling index. */
    double t, a, b;
  } cases[] = {
    { 0, 0, 0 },
    { 0, 0, 1 },
    { 0, 1.0000000000000007, 3.999999999999995 },
    { 0, 4000, 4000 },
    { 0.42475836677491291, 4000, 4000 },
    { 0, -DBL_MAX, DBL_MAX },
    { 0.25, -DBL_MAX, DBL_MAX },
    { 0.5, -DBL_MAX, DBL_MAX },
  };
  size_t i = 0;
  bool ok = true;

  for (i = 0; i < arraycount(cases); i++) {
    double t = cases[i].t;
    double a = cases[i].a;
    double b = cases[i].b;

    /* The sample must land in [a, b] for both t and its complement,
     * and in the mirrored interval [-b, -a] likewise. */
    CHECK_LE(a, sample_uniform_interval(t, a, b));
    CHECK_LE(sample_uniform_interval(t, a, b), b);

    CHECK_LE(a, sample_uniform_interval(1 - t, a, b));
    CHECK_LE(sample_uniform_interval(1 - t, a, b), b);

    CHECK_LE(sample_uniform_interval(t, -b, -a), -a);
    CHECK_LE(-b, sample_uniform_interval(t, -b, -a));

    CHECK_LE(sample_uniform_interval(1 - t, -b, -a), -a);
    CHECK_LE(-b, sample_uniform_interval(1 - t, -b, -a));
  }

  tt_assert(ok);

 done:
  ;
}

/********************** Stochastic tests ****************************/

/*
 * Psi test, sometimes also called G-test.
 * The psi test statistic, suitably scaled, has chi^2 distribution,
 * but the psi test tends to have better statistical power in practice
 * to detect deviations than the chi^2 test does.  (The chi^2 test
 * statistic is the first term of the Taylor expansion of the psi test
 * statistic.)  The psi test is generic, for any CDF; particular
 * distributions might have higher-power tests to distinguish them from
 * predictable deviations or bugs.
 *
 * We choose the psi critical value so that a single psi test has
 * probability below alpha = 1% of spuriously failing even if all the
 * code is correct.  But the false positive rate for a suite of n tests
 * is higher: 1 - Binom(0; n, alpha) = 1 - (1 - alpha)^n.  For n = 10,
 * this is about 10%, and for n = 100 it is well over 50%.
 *
 * Given that these tests will run with every CI job, we want to drive down
 * the false positive rate.  We can drive it down by running each test four
 * times, and accepting it if it passes at least once; in that case, it is as
 * if we used Binom(4; 4, alpha) = alpha^4 as the false positive rate for each
 * test, and for n = 10 tests, it would be 9.99999959506e-08.  If each CI
 * build has 14 jobs, then the chance of a CI build failing is
 * 1.39999903326e-06, which means that a CI build will break with probability
 * 50% after about 495106 builds.
 *
 * The critical value for a chi^2 distribution with 100 degrees of
 * freedom and false positive rate alpha = 1% was taken from:
 *
 *  NIST/SEMATECH e-Handbook of Statistical Methods, Section
 *  1.3.6.7.4 `Critical Values of the Chi-Square Distribution',
 *  <https://www.itl.nist.gov/div898/handbook/eda/section3/eda3674.htm>,
 *  retrieved 2018-10-28.
 */

/* Number of samples drawn per psi-test trial. */
static const size_t NSAMPLES = 100000;
/* Number of chances we give to the test to succeed. */
static const unsigned NTRIALS = 4;
/* Number of times we want the test to pass per NTRIALS.
 */
static const unsigned NPASSES_MIN = 1;

#define PSI_DF 100 /* degrees of freedom */
static const double PSI_CRITICAL = 135.807; /* critical value, alpha = .01 */

/**
 * Perform a psi test on an array of sample counts, C, adding up to N
 * samples, and an array of log expected probabilities, logP,
 * representing the null hypothesis for the distribution of samples
 * counted.  Return false if the psi test rejects the null hypothesis,
 * true if otherwise.
 *
 * The statistic psi = 2 * sum_i C[i] * log(C[i]/(N*P[i])) is
 * accumulated with Kahan compensated summation to limit rounding
 * error over the 100 terms.
 */
static bool
psi_test(const size_t C[PSI_DF], const double logP[PSI_DF], size_t N)
{
  double psi = 0;
  double c = 0;   /* Kahan compensation */
  double t, u;
  size_t i;

  for (i = 0; i < PSI_DF; i++) {
    /*
     * c*log(c/(n*p)) = (1/n) * f*log(f/p) where f = c/n is
     * the frequency, and f*log(f/p) ---> 0 as f ---> 0, so
     * this is a reasonable choice.  Further, any mass that
     * _fails_ to turn up in this bin will inflate another
     * bin instead, so we don't really lose anything by
     * ignoring empty bins even if they have high
     * probability.
     */
    if (C[i] == 0)
      continue;
    /* Kahan step: t is the compensated term, c the running error. */
    t = C[i]*(log((double)C[i]/N) - logP[i]) - c;
    u = psi + t;
    c = (u - psi) - t;
    psi = u;
  }
  psi *= 2;

  return psi <= PSI_CRITICAL;
}

/**
 * Stochastic test of the Geometric(p) sampler: draw NSAMPLES samples,
 * bin them by value (all values above PSI_DF collapse into the last
 * bin), and psi-test the counts against the geometric pmf.  Up to
 * NTRIALS trials are attempted; NPASSES_MIN passes suffice.
 */
static bool
test_stochastic_geometric_impl(double p)
{
  const struct geometric_t geometric = {
    .base = GEOMETRIC(geometric),
    .p = p,
  };
  double logP[PSI_DF] = {0};
  unsigned ntry = NTRIALS, npass = 0;
  unsigned i;
  size_t j;

  /* Compute logP[i] = Geom(i + 1; p). */
  for (i = 0; i < PSI_DF - 1; i++)
    logP[i] = logpmf_geometric(i + 1, p);

  /* Compute logP[n-1] = log (1 - (P[0] + P[1] + ... + P[n-2])). */
  logP[PSI_DF - 1] = log1mexp(logsumexp(logP, PSI_DF - 1));

  while (ntry --> 0) {
    size_t C[PSI_DF] = {0};

    for (j = 0; j < NSAMPLES; j++) {
      double n_tmp = dist_sample(&geometric.base);

      /* Must be an integer. (XXX -Wfloat-equal) */
      tor_assert(ceil(n_tmp) <= n_tmp && ceil(n_tmp) >= n_tmp);

      /* Must be a positive integer. */
      tor_assert(n_tmp >= 1);

      /* Probability of getting a value in the billions is negligible. */
      tor_assert(n_tmp <= (double)UINT_MAX);

      unsigned n = (unsigned) n_tmp;

      /* Collapse the tail into the final bin. */
      if (n > PSI_DF)
        n = PSI_DF;
      C[n - 1]++;
    }

    if (psi_test(C, logP, NSAMPLES)) {
      if (++npass >= NPASSES_MIN)
        break;
    }
  }

  if (npass >= NPASSES_MIN) {
    /* printf("pass %s sampler\n", "geometric"); */
    return true;
  } else {
    printf("fail %s sampler\n", "geometric");
    return false;
  }
}

/**
 * Divide the support of <b>dist</b> into histogram bins in <b>logP</b>. Start
 * at the 1st percentile and ending at the 99th percentile. Pick the bin
 * boundaries using linear interpolation so that they are uniformly spaced.
 *
 * In each bin logP[i] we insert the expected log-probability that a sampled
 * value will fall into that bin. We will use this as the null hypothesis of
 * the psi test.
 *
 * Set logP[i] = log(CDF(x_i) - CDF(x_{i-1})), where x_-1 = -inf, x_n =
 * +inf, and x_i = i*(hi - lo)/(n - 2).
1013 */ 1014 static void 1015 bin_cdfs(const struct dist_t *dist, double lo, double hi, double *logP, 1016 size_t n) 1017 { 1018 #define CDF(x) dist_cdf(dist, x) 1019 #define SF(x) dist_sf(dist, x) 1020 const double w = (hi - lo)/(n - 2); 1021 double halfway = dist_icdf(dist, 0.5); 1022 double x_0, x_1; 1023 size_t i; 1024 size_t n2 = ceil_to_size_t((halfway - lo)/w); 1025 1026 tor_assert(lo <= halfway); 1027 tor_assert(halfway <= hi); 1028 tor_assert(n2 <= n); 1029 1030 x_1 = lo; 1031 logP[0] = log(CDF(x_1) - 0); /* 0 = CDF(-inf) */ 1032 for (i = 1; i < n2; i++) { 1033 x_0 = x_1; 1034 /* do the linear interpolation */ 1035 x_1 = (i <= n/2 ? lo + i*w : hi - (n - 2 - i)*w); 1036 /* set the expected log-probability */ 1037 logP[i] = log(CDF(x_1) - CDF(x_0)); 1038 } 1039 x_0 = hi; 1040 logP[n - 1] = log(SF(x_0) - 0); /* 0 = SF(+inf) = 1 - CDF(+inf) */ 1041 1042 /* In this loop we are filling out the high part of the array. We are using 1043 * SF because in these cases the CDF is near 1 where precision is lower. So 1044 * instead we are using SF near 0 where the precision is higher. We have 1045 * SF(t) = 1 - CDF(t). */ 1046 for (i = 1; i < n - n2; i++) { 1047 x_1 = x_0; 1048 /* do the linear interpolation */ 1049 x_0 = (i <= n/2 ? hi - i*w : lo + (n - 2 - i)*w); 1050 /* set the expected log-probability */ 1051 logP[n - i - 1] = log(SF(x_0) - SF(x_1)); 1052 } 1053 #undef SF 1054 #undef CDF 1055 } 1056 1057 /** 1058 * Draw NSAMPLES samples from dist, counting the number of samples x in 1059 * the ith bin C[i] if x_{i-1} <= x < x_i, where x_-1 = -inf, x_n = 1060 * +inf, and x_i = i*(hi - lo)/(n - 2). 
1061 */ 1062 static void 1063 bin_samples(const struct dist_t *dist, double lo, double hi, size_t *C, 1064 size_t n) 1065 { 1066 const double w = (hi - lo)/(n - 2); 1067 size_t i; 1068 1069 for (i = 0; i < NSAMPLES; i++) { 1070 double x = dist_sample(dist); 1071 size_t bin; 1072 1073 if (x < lo) 1074 bin = 0; 1075 else if (x < hi) 1076 bin = 1 + floor_to_size_t((x - lo)/w); 1077 else 1078 bin = n - 1; 1079 tor_assert(bin < n); 1080 C[bin]++; 1081 } 1082 } 1083 1084 /** 1085 * Carry out a Psi test on <b>dist</b>. 1086 * 1087 * Sample NSAMPLES from dist, putting them in bins from -inf to lo to 1088 * hi to +inf, and apply up to two psi tests. True if at least one psi 1089 * test passes; false if not. False positive rate should be bounded by 1090 * 0.01^2 = 0.0001. 1091 */ 1092 static bool 1093 test_psi_dist_sample(const struct dist_t *dist) 1094 { 1095 double logP[PSI_DF] = {0}; 1096 unsigned ntry = NTRIALS, npass = 0; 1097 double lo = dist_icdf(dist, 1/(double)(PSI_DF + 2)); 1098 double hi = dist_isf(dist, 1/(double)(PSI_DF + 2)); 1099 1100 /* Create the null hypothesis in logP */ 1101 bin_cdfs(dist, lo, hi, logP, PSI_DF); 1102 1103 /* Now run the test */ 1104 while (ntry --> 0) { 1105 size_t C[PSI_DF] = {0}; 1106 bin_samples(dist, lo, hi, C, PSI_DF); 1107 if (psi_test(C, logP, NSAMPLES)) { 1108 if (++npass >= NPASSES_MIN) 1109 break; 1110 } 1111 } 1112 1113 /* Did we fail or succeed? */ 1114 if (npass >= NPASSES_MIN) { 1115 /* printf("pass %s sampler\n", dist_name(dist));*/ 1116 return true; 1117 } else { 1118 printf("fail %s sampler\n", dist_name(dist)); 1119 return false; 1120 } 1121 } 1122 1123 static void 1124 write_stochastic_warning(void) 1125 { 1126 if (tinytest_cur_test_has_failed()) { 1127 printf("\n" 1128 "NOTE: This is a stochastic test, and we expect it to fail from\n" 1129 "time to time, with some low probability. 
If you see it fail more\n" 1130 "than one trial in 100, though, please tell us.\n\n"); 1131 } 1132 } 1133 1134 static void 1135 test_stochastic_uniform(void *arg) 1136 { 1137 (void) arg; 1138 1139 const struct uniform_t uniform01 = { 1140 .base = UNIFORM(uniform01), 1141 .a = 0, 1142 .b = 1, 1143 }; 1144 const struct uniform_t uniform_pos = { 1145 .base = UNIFORM(uniform_pos), 1146 .a = 1.23, 1147 .b = 4.56, 1148 }; 1149 const struct uniform_t uniform_neg = { 1150 .base = UNIFORM(uniform_neg), 1151 .a = -10, 1152 .b = -1, 1153 }; 1154 const struct uniform_t uniform_cross = { 1155 .base = UNIFORM(uniform_cross), 1156 .a = -1.23, 1157 .b = 4.56, 1158 }; 1159 const struct uniform_t uniform_subnormal = { 1160 .base = UNIFORM(uniform_subnormal), 1161 .a = 4e-324, 1162 .b = 4e-310, 1163 }; 1164 const struct uniform_t uniform_subnormal_cross = { 1165 .base = UNIFORM(uniform_subnormal_cross), 1166 .a = -4e-324, 1167 .b = 4e-310, 1168 }; 1169 bool ok = true, tests_failed = true; 1170 1171 testing_enable_reproducible_rng(); 1172 1173 ok &= test_psi_dist_sample(&uniform01.base); 1174 ok &= test_psi_dist_sample(&uniform_pos.base); 1175 ok &= test_psi_dist_sample(&uniform_neg.base); 1176 ok &= test_psi_dist_sample(&uniform_cross.base); 1177 ok &= test_psi_dist_sample(&uniform_subnormal.base); 1178 ok &= test_psi_dist_sample(&uniform_subnormal_cross.base); 1179 1180 tt_assert(ok); 1181 1182 tests_failed = false; 1183 1184 done: 1185 if (tests_failed) { 1186 write_stochastic_warning(); 1187 } 1188 testing_disable_reproducible_rng(); 1189 } 1190 1191 static bool 1192 test_stochastic_logistic_impl(double mu, double sigma) 1193 { 1194 const struct logistic_t dist = { 1195 .base = LOGISTIC(dist), 1196 .mu = mu, 1197 .sigma = sigma, 1198 }; 1199 1200 /* XXX Consider some fancier logistic test. 
*/ 1201 return test_psi_dist_sample(&dist.base); 1202 } 1203 1204 static bool 1205 test_stochastic_log_logistic_impl(double alpha, double beta) 1206 { 1207 const struct log_logistic_t dist = { 1208 .base = LOG_LOGISTIC(dist), 1209 .alpha = alpha, 1210 .beta = beta, 1211 }; 1212 1213 /* XXX Consider some fancier log logistic test. */ 1214 return test_psi_dist_sample(&dist.base); 1215 } 1216 1217 static bool 1218 test_stochastic_weibull_impl(double lambda, double k) 1219 { 1220 const struct weibull_t dist = { 1221 .base = WEIBULL(dist), 1222 .lambda = lambda, 1223 .k = k, 1224 }; 1225 1226 // clang-format off 1227 /* 1228 * XXX Consider applying a Tiku-Singh test: 1229 * 1230 * M.L. Tiku and M. Singh, `Testing the two-parameter 1231 * Weibull distribution', Communications in Statistics -- 1232 * Theory and Methods A10(9), 1981, 907--918. 1233 https://www.tandfonline.com/doi/pdf/10.1080/03610928108828082?needAccess=true 1234 */ 1235 // clang-format on 1236 return test_psi_dist_sample(&dist.base); 1237 } 1238 1239 static bool 1240 test_stochastic_genpareto_impl(double mu, double sigma, double xi) 1241 { 1242 const struct genpareto_t dist = { 1243 .base = GENPARETO(dist), 1244 .mu = mu, 1245 .sigma = sigma, 1246 .xi = xi, 1247 }; 1248 1249 /* XXX Consider some fancier GPD test. 
*/ 1250 return test_psi_dist_sample(&dist.base); 1251 } 1252 1253 static void 1254 test_stochastic_genpareto(void *arg) 1255 { 1256 bool ok = 0; 1257 bool tests_failed = true; 1258 (void) arg; 1259 1260 testing_enable_reproducible_rng(); 1261 1262 ok = test_stochastic_genpareto_impl(0, 1, -0.25); 1263 tt_assert(ok); 1264 ok = test_stochastic_genpareto_impl(0, 1, -1e-30); 1265 tt_assert(ok); 1266 ok = test_stochastic_genpareto_impl(0, 1, 0); 1267 tt_assert(ok); 1268 ok = test_stochastic_genpareto_impl(0, 1, 1e-30); 1269 tt_assert(ok); 1270 ok = test_stochastic_genpareto_impl(0, 1, 0.25); 1271 tt_assert(ok); 1272 ok = test_stochastic_genpareto_impl(-1, 1, -0.25); 1273 tt_assert(ok); 1274 ok = test_stochastic_genpareto_impl(1, 2, 0.25); 1275 tt_assert(ok); 1276 1277 tests_failed = false; 1278 1279 done: 1280 if (tests_failed) { 1281 write_stochastic_warning(); 1282 } 1283 testing_disable_reproducible_rng(); 1284 } 1285 1286 static void 1287 test_stochastic_geometric(void *arg) 1288 { 1289 bool ok = 0; 1290 bool tests_failed = true; 1291 1292 (void) arg; 1293 1294 testing_enable_reproducible_rng(); 1295 1296 ok = test_stochastic_geometric_impl(0.1); 1297 tt_assert(ok); 1298 ok = test_stochastic_geometric_impl(0.5); 1299 tt_assert(ok); 1300 ok = test_stochastic_geometric_impl(0.9); 1301 tt_assert(ok); 1302 ok = test_stochastic_geometric_impl(1); 1303 tt_assert(ok); 1304 1305 tests_failed = false; 1306 1307 done: 1308 if (tests_failed) { 1309 write_stochastic_warning(); 1310 } 1311 testing_disable_reproducible_rng(); 1312 } 1313 1314 static void 1315 test_stochastic_logistic(void *arg) 1316 { 1317 bool ok = 0; 1318 bool tests_failed = true; 1319 (void) arg; 1320 1321 testing_enable_reproducible_rng(); 1322 1323 ok = test_stochastic_logistic_impl(0, 1); 1324 tt_assert(ok); 1325 ok = test_stochastic_logistic_impl(0, 1e-16); 1326 tt_assert(ok); 1327 ok = test_stochastic_logistic_impl(1, 10); 1328 tt_assert(ok); 1329 ok = test_stochastic_logistic_impl(-10, 100); 1330 
tt_assert(ok); 1331 1332 tests_failed = false; 1333 1334 done: 1335 if (tests_failed) { 1336 write_stochastic_warning(); 1337 } 1338 testing_disable_reproducible_rng(); 1339 } 1340 1341 static void 1342 test_stochastic_log_logistic(void *arg) 1343 { 1344 bool ok = 0; 1345 (void) arg; 1346 1347 testing_enable_reproducible_rng(); 1348 1349 ok = test_stochastic_log_logistic_impl(1, 1); 1350 tt_assert(ok); 1351 ok = test_stochastic_log_logistic_impl(1, 10); 1352 tt_assert(ok); 1353 ok = test_stochastic_log_logistic_impl(M_E, 1e-1); 1354 tt_assert(ok); 1355 ok = test_stochastic_log_logistic_impl(exp(-10), 1e-2); 1356 tt_assert(ok); 1357 1358 done: 1359 write_stochastic_warning(); 1360 testing_disable_reproducible_rng(); 1361 } 1362 1363 static void 1364 test_stochastic_weibull(void *arg) 1365 { 1366 bool ok = 0; 1367 (void) arg; 1368 1369 testing_enable_reproducible_rng(); 1370 1371 ok = test_stochastic_weibull_impl(1, 0.5); 1372 tt_assert(ok); 1373 ok = test_stochastic_weibull_impl(1, 1); 1374 tt_assert(ok); 1375 ok = test_stochastic_weibull_impl(1, 1.5); 1376 tt_assert(ok); 1377 ok = test_stochastic_weibull_impl(1, 2); 1378 tt_assert(ok); 1379 ok = test_stochastic_weibull_impl(10, 1); 1380 tt_assert(ok); 1381 1382 done: 1383 write_stochastic_warning(); 1384 testing_disable_reproducible_rng(); 1385 UNMOCK(crypto_rand); 1386 } 1387 1388 struct testcase_t prob_distr_tests[] = { 1389 { "logit_logistics", test_logit_logistic, TT_FORK, NULL, NULL }, 1390 { "log_logistic", test_log_logistic, TT_FORK, NULL, NULL }, 1391 { "weibull", test_weibull, TT_FORK, NULL, NULL }, 1392 { "genpareto", test_genpareto, TT_FORK, NULL, NULL }, 1393 { "uniform_interval", test_uniform_interval, TT_FORK, NULL, NULL }, 1394 END_OF_TESTCASES 1395 }; 1396 1397 struct testcase_t slow_stochastic_prob_distr_tests[] = { 1398 { "stochastic_genpareto", test_stochastic_genpareto, TT_FORK, NULL, NULL }, 1399 { "stochastic_geometric", test_stochastic_geometric, TT_FORK, NULL, NULL }, 1400 { 
"stochastic_uniform", test_stochastic_uniform, TT_FORK, NULL, NULL }, 1401 { "stochastic_logistic", test_stochastic_logistic, TT_FORK, NULL, NULL }, 1402 { "stochastic_log_logistic", test_stochastic_log_logistic, TT_FORK, NULL, 1403 NULL }, 1404 { "stochastic_weibull", test_stochastic_weibull, TT_FORK, NULL, NULL }, 1405 END_OF_TESTCASES 1406 };