tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

sum_squares_test.cc (29565B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <cmath>
     13 #include <cstdlib>
     14 #include <string>
     15 #include <tuple>
     16 
     17 #include "gtest/gtest.h"
     18 
     19 #include "config/aom_config.h"
     20 #include "config/aom_dsp_rtcd.h"
     21 
     22 #include "aom_ports/mem.h"
     23 #include "av1/common/common_data.h"
     24 #include "test/acm_random.h"
     25 #include "test/register_state_check.h"
     26 #include "test/util.h"
     27 #include "test/function_equivalence_test.h"
     28 
     29 using libaom_test::ACMRandom;
     30 using libaom_test::FunctionEquivalenceTest;
     31 using ::testing::Combine;
     32 using ::testing::Range;
     33 using ::testing::Values;
     34 using ::testing::ValuesIn;
     35 
     36 namespace {
     37 const int kNumIterations = 10000;
     38 
     39 static const int16_t kInt13Max = (1 << 12) - 1;
     40 
     41 using SSI16Func = uint64_t (*)(const int16_t *src, int stride, int width,
     42                               int height);
     43 using TestFuncs = libaom_test::FuncParam<SSI16Func>;
     44 
     45 class SumSquaresTest : public ::testing::TestWithParam<TestFuncs> {
     46 public:
     47  ~SumSquaresTest() override = default;
     48  void SetUp() override {
     49    params_ = this->GetParam();
     50    rnd_.Reset(ACMRandom::DeterministicSeed());
     51    src_ = reinterpret_cast<int16_t *>(aom_memalign(16, 256 * 256 * 2));
     52    ASSERT_NE(src_, nullptr);
     53  }
     54 
     55  void TearDown() override { aom_free(src_); }
     56  void RunTest(bool is_random);
     57  void RunSpeedTest();
     58 
     59  void GenRandomData(int width, int height, int stride) {
     60    const int msb = 11;  // Up to 12 bit input
     61    const int limit = 1 << (msb + 1);
     62    for (int ii = 0; ii < height; ii++) {
     63      for (int jj = 0; jj < width; jj++) {
     64        src_[ii * stride + jj] = rnd_(2) ? rnd_(limit) : -rnd_(limit);
     65      }
     66    }
     67  }
     68 
     69  void GenExtremeData(int width, int height, int stride) {
     70    const int msb = 11;  // Up to 12 bit input
     71    const int limit = 1 << (msb + 1);
     72    const int val = rnd_(2) ? limit - 1 : -(limit - 1);
     73    for (int ii = 0; ii < height; ii++) {
     74      for (int jj = 0; jj < width; jj++) {
     75        src_[ii * stride + jj] = val;
     76      }
     77    }
     78  }
     79 
     80 protected:
     81  TestFuncs params_;
     82  int16_t *src_;
     83  ACMRandom rnd_;
     84 };
     85 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSquaresTest);
     86 
     87 void SumSquaresTest::RunTest(bool is_random) {
     88  int failed = 0;
     89  for (int k = 0; k < kNumIterations; k++) {
     90    const int width = 4 * (rnd_(31) + 1);   // Up to 128x128
     91    const int height = 4 * (rnd_(31) + 1);  // Up to 128x128
     92    int stride = 4 << rnd_(7);              // Up to 256 stride
     93    while (stride < width) {                // Make sure it's valid
     94      stride = 4 << rnd_(7);
     95    }
     96    if (is_random) {
     97      GenRandomData(width, height, stride);
     98    } else {
     99      GenExtremeData(width, height, stride);
    100    }
    101    const uint64_t res_ref = params_.ref_func(src_, stride, width, height);
    102    uint64_t res_tst;
    103    API_REGISTER_STATE_CHECK(res_tst =
    104                                 params_.tst_func(src_, stride, width, height));
    105 
    106    if (!failed) {
    107      failed = res_ref != res_tst;
    108      EXPECT_EQ(res_ref, res_tst)
    109          << "Error: Sum Squares Test [" << width << "x" << height
    110          << "] C output does not match optimized output.";
    111    }
    112  }
    113 }
    114 
    115 void SumSquaresTest::RunSpeedTest() {
    116  for (int block = BLOCK_4X4; block < BLOCK_SIZES_ALL; block++) {
    117    const int width = block_size_wide[block];   // Up to 128x128
    118    const int height = block_size_high[block];  // Up to 128x128
    119    int stride = 4 << rnd_(7);                  // Up to 256 stride
    120    while (stride < width) {                    // Make sure it's valid
    121      stride = 4 << rnd_(7);
    122    }
    123    GenExtremeData(width, height, stride);
    124    const int num_loops = 1000000000 / (width + height);
    125    aom_usec_timer timer;
    126    aom_usec_timer_start(&timer);
    127 
    128    for (int i = 0; i < num_loops; ++i)
    129      params_.ref_func(src_, stride, width, height);
    130 
    131    aom_usec_timer_mark(&timer);
    132    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
    133    printf("SumSquaresTest C %3dx%-3d: %7.2f ns\n", width, height,
    134           1000.0 * elapsed_time / num_loops);
    135 
    136    aom_usec_timer timer1;
    137    aom_usec_timer_start(&timer1);
    138    for (int i = 0; i < num_loops; ++i)
    139      params_.tst_func(src_, stride, width, height);
    140    aom_usec_timer_mark(&timer1);
    141    const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
    142    printf("SumSquaresTest Test %3dx%-3d: %7.2f ns\n", width, height,
    143           1000.0 * elapsed_time1 / num_loops);
    144  }
    145 }
    146 
    147 TEST_P(SumSquaresTest, OperationCheck) {
    148  RunTest(true);  // GenRandomData
    149 }
    150 
    151 TEST_P(SumSquaresTest, ExtremeValues) {
    152  RunTest(false);  // GenExtremeData
    153 }
    154 
    155 TEST_P(SumSquaresTest, DISABLED_Speed) { RunSpeedTest(); }
    156 
    157 #if HAVE_SSE2
    158 
    159 INSTANTIATE_TEST_SUITE_P(
    160    SSE2, SumSquaresTest,
    161    ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c,
    162                                &aom_sum_squares_2d_i16_sse2)));
    163 
    164 #endif  // HAVE_SSE2
    165 
    166 #if HAVE_NEON
    167 
    168 INSTANTIATE_TEST_SUITE_P(
    169    NEON, SumSquaresTest,
    170    ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c,
    171                                &aom_sum_squares_2d_i16_neon)));
    172 
    173 #endif  // HAVE_NEON
    174 
    175 #if HAVE_SVE
    176 INSTANTIATE_TEST_SUITE_P(
    177    SVE, SumSquaresTest,
    178    ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c,
    179                                &aom_sum_squares_2d_i16_sve)));
    180 
    181 #endif  // HAVE_SVE
    182 
    183 #if HAVE_AVX2
    184 INSTANTIATE_TEST_SUITE_P(
    185    AVX2, SumSquaresTest,
    186    ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c,
    187                                &aom_sum_squares_2d_i16_avx2)));
    188 #endif  // HAVE_AVX2
    189 
    190 //////////////////////////////////////////////////////////////////////////////
    191 // 1D version
    192 //////////////////////////////////////////////////////////////////////////////
    193 
    194 using F1D = uint64_t (*)(const int16_t *src, uint32_t n);
    195 using TestFuncs1D = libaom_test::FuncParam<F1D>;
    196 
    197 class SumSquares1DTest : public FunctionEquivalenceTest<F1D> {
    198 protected:
    199  static const int kIterations = 1000;
    200  static const int kMaxSize = 256;
    201 };
    202 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSquares1DTest);
    203 
    204 TEST_P(SumSquares1DTest, RandomValues) {
    205  DECLARE_ALIGNED(16, int16_t, src[kMaxSize * kMaxSize]);
    206 
    207  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
    208    for (int i = 0; i < kMaxSize * kMaxSize; ++i)
    209      src[i] = rng_(kInt13Max * 2 + 1) - kInt13Max;
    210 
    211    // Block size is between 64 and 128 * 128 and is always a multiple of 64.
    212    const int n = (rng_(255) + 1) * 64;
    213 
    214    const uint64_t ref_res = params_.ref_func(src, n);
    215    uint64_t tst_res;
    216    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, n));
    217 
    218    ASSERT_EQ(ref_res, tst_res);
    219  }
    220 }
    221 
    222 TEST_P(SumSquares1DTest, ExtremeValues) {
    223  DECLARE_ALIGNED(16, int16_t, src[kMaxSize * kMaxSize]);
    224 
    225  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
    226    if (rng_(2)) {
    227      for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = kInt13Max;
    228    } else {
    229      for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = -kInt13Max;
    230    }
    231 
    232    // Block size is between 64 and 128 * 128 and is always a multiple of 64.
    233    const int n = (rng_(255) + 1) * 64;
    234 
    235    const uint64_t ref_res = params_.ref_func(src, n);
    236    uint64_t tst_res;
    237    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, n));
    238 
    239    ASSERT_EQ(ref_res, tst_res);
    240  }
    241 }
    242 
    243 #if HAVE_SSE2
    244 INSTANTIATE_TEST_SUITE_P(SSE2, SumSquares1DTest,
    245                         ::testing::Values(TestFuncs1D(
    246                             aom_sum_squares_i16_c, aom_sum_squares_i16_sse2)));
    247 
    248 #endif  // HAVE_SSE2
    249 
    250 #if HAVE_NEON
    251 INSTANTIATE_TEST_SUITE_P(NEON, SumSquares1DTest,
    252                         ::testing::Values(TestFuncs1D(
    253                             aom_sum_squares_i16_c, aom_sum_squares_i16_neon)));
    254 
    255 #endif  // HAVE_NEON
    256 
    257 #if HAVE_SVE
    258 INSTANTIATE_TEST_SUITE_P(SVE, SumSquares1DTest,
    259                         ::testing::Values(TestFuncs1D(
    260                             aom_sum_squares_i16_c, aom_sum_squares_i16_sve)));
    261 
    262 #endif  // HAVE_SVE
    263 
    264 using SSEFunc = int64_t (*)(const uint8_t *a, int a_stride, const uint8_t *b,
    265                            int b_stride, int width, int height);
    266 using TestSSEFuncs = libaom_test::FuncParam<SSEFunc>;
    267 
    268 using SSETestParam = std::tuple<TestSSEFuncs, int>;
    269 
    270 class SSETest : public ::testing::TestWithParam<SSETestParam> {
    271 public:
    272  ~SSETest() override = default;
    273  void SetUp() override {
    274    params_ = GET_PARAM(0);
    275    width_ = GET_PARAM(1);
    276    is_hbd_ =
    277 #if CONFIG_AV1_HIGHBITDEPTH
    278        params_.ref_func == aom_highbd_sse_c;
    279 #else
    280        false;
    281 #endif
    282    rnd_.Reset(ACMRandom::DeterministicSeed());
    283    src_ = reinterpret_cast<uint8_t *>(aom_memalign(32, 256 * 256 * 2));
    284    ref_ = reinterpret_cast<uint8_t *>(aom_memalign(32, 256 * 256 * 2));
    285    ASSERT_NE(src_, nullptr);
    286    ASSERT_NE(ref_, nullptr);
    287  }
    288 
    289  void TearDown() override {
    290    aom_free(src_);
    291    aom_free(ref_);
    292  }
    293  void RunTest(bool is_random, int width, int height, int run_times);
    294 
    295  void GenRandomData(int width, int height, int stride) {
    296    uint16_t *src16 = reinterpret_cast<uint16_t *>(src_);
    297    uint16_t *ref16 = reinterpret_cast<uint16_t *>(ref_);
    298    const int msb = 11;  // Up to 12 bit input
    299    const int limit = 1 << (msb + 1);
    300    for (int ii = 0; ii < height; ii++) {
    301      for (int jj = 0; jj < width; jj++) {
    302        if (!is_hbd_) {
    303          src_[ii * stride + jj] = rnd_.Rand8();
    304          ref_[ii * stride + jj] = rnd_.Rand8();
    305        } else {
    306          src16[ii * stride + jj] = rnd_(limit);
    307          ref16[ii * stride + jj] = rnd_(limit);
    308        }
    309      }
    310    }
    311  }
    312 
    313  void GenExtremeData(int width, int height, int stride, uint8_t *data,
    314                      int16_t val) {
    315    uint16_t *data16 = reinterpret_cast<uint16_t *>(data);
    316    for (int ii = 0; ii < height; ii++) {
    317      for (int jj = 0; jj < width; jj++) {
    318        if (!is_hbd_) {
    319          data[ii * stride + jj] = static_cast<uint8_t>(val);
    320        } else {
    321          data16[ii * stride + jj] = val;
    322        }
    323      }
    324    }
    325  }
    326 
    327 protected:
    328  bool is_hbd_;
    329  int width_;
    330  TestSSEFuncs params_;
    331  uint8_t *src_;
    332  uint8_t *ref_;
    333  ACMRandom rnd_;
    334 };
    335 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSETest);
    336 
    337 void SSETest::RunTest(bool is_random, int width, int height, int run_times) {
    338  int failed = 0;
    339  aom_usec_timer ref_timer, test_timer;
    340  for (int k = 0; k < 3; k++) {
    341    int stride = 4 << rnd_(7);  // Up to 256 stride
    342    while (stride < width) {    // Make sure it's valid
    343      stride = 4 << rnd_(7);
    344    }
    345    if (is_random) {
    346      GenRandomData(width, height, stride);
    347    } else {
    348      const int msb = is_hbd_ ? 12 : 8;  // Up to 12 bit input
    349      const int limit = (1 << msb) - 1;
    350      if (k == 0) {
    351        GenExtremeData(width, height, stride, src_, 0);
    352        GenExtremeData(width, height, stride, ref_, limit);
    353      } else {
    354        GenExtremeData(width, height, stride, src_, limit);
    355        GenExtremeData(width, height, stride, ref_, 0);
    356      }
    357    }
    358    int64_t res_ref, res_tst;
    359    uint8_t *src = src_;
    360    uint8_t *ref = ref_;
    361    if (is_hbd_) {
    362      src = CONVERT_TO_BYTEPTR(src_);
    363      ref = CONVERT_TO_BYTEPTR(ref_);
    364    }
    365    res_ref = params_.ref_func(src, stride, ref, stride, width, height);
    366    res_tst = params_.tst_func(src, stride, ref, stride, width, height);
    367    if (run_times > 1) {
    368      aom_usec_timer_start(&ref_timer);
    369      for (int j = 0; j < run_times; j++) {
    370        params_.ref_func(src, stride, ref, stride, width, height);
    371      }
    372      aom_usec_timer_mark(&ref_timer);
    373      const int elapsed_time_c =
    374          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
    375 
    376      aom_usec_timer_start(&test_timer);
    377      for (int j = 0; j < run_times; j++) {
    378        params_.tst_func(src, stride, ref, stride, width, height);
    379      }
    380      aom_usec_timer_mark(&test_timer);
    381      const int elapsed_time_simd =
    382          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
    383 
    384      printf(
    385          "c_time=%d \t simd_time=%d \t "
    386          "gain=%d\n",
    387          elapsed_time_c, elapsed_time_simd,
    388          (elapsed_time_c / elapsed_time_simd));
    389    } else {
    390      if (!failed) {
    391        failed = res_ref != res_tst;
    392        EXPECT_EQ(res_ref, res_tst)
    393            << "Error:" << (is_hbd_ ? "hbd " : " ") << k << " SSE Test ["
    394            << width << "x" << height
    395            << "] C output does not match optimized output.";
    396      }
    397    }
    398  }
    399 }
    400 
    401 TEST_P(SSETest, OperationCheck) {
    402  for (int height = 4; height <= 128; height += 4) {
    403    RunTest(true, width_, height, 1);  // GenRandomData
    404  }
    405 }
    406 
    407 TEST_P(SSETest, ExtremeValues) {
    408  for (int height = 4; height <= 128; height += 4) {
    409    RunTest(false, width_, height, 1);
    410  }
    411 }
    412 
    413 TEST_P(SSETest, DISABLED_Speed) {
    414  for (int height = 4; height <= 128; height += 4) {
    415    RunTest(true, width_, height, 100);
    416  }
    417 }
    418 
    419 #if HAVE_NEON
    420 TestSSEFuncs sse_neon[] = {
    421  TestSSEFuncs(&aom_sse_c, &aom_sse_neon),
    422 #if CONFIG_AV1_HIGHBITDEPTH
    423  TestSSEFuncs(&aom_highbd_sse_c, &aom_highbd_sse_neon)
    424 #endif
    425 };
    426 INSTANTIATE_TEST_SUITE_P(NEON, SSETest,
    427                         Combine(ValuesIn(sse_neon), Range(4, 129, 4)));
    428 #endif  // HAVE_NEON
    429 
    430 #if HAVE_NEON_DOTPROD
    431 TestSSEFuncs sse_neon_dotprod[] = {
    432  TestSSEFuncs(&aom_sse_c, &aom_sse_neon_dotprod),
    433 };
    434 INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SSETest,
    435                         Combine(ValuesIn(sse_neon_dotprod), Range(4, 129, 4)));
    436 #endif  // HAVE_NEON_DOTPROD
    437 
    438 #if HAVE_SSE4_1
    439 TestSSEFuncs sse_sse4[] = {
    440  TestSSEFuncs(&aom_sse_c, &aom_sse_sse4_1),
    441 #if CONFIG_AV1_HIGHBITDEPTH
    442  TestSSEFuncs(&aom_highbd_sse_c, &aom_highbd_sse_sse4_1)
    443 #endif
    444 };
    445 INSTANTIATE_TEST_SUITE_P(SSE4_1, SSETest,
    446                         Combine(ValuesIn(sse_sse4), Range(4, 129, 4)));
    447 #endif  // HAVE_SSE4_1
    448 
    449 #if HAVE_AVX2
    450 
    451 TestSSEFuncs sse_avx2[] = {
    452  TestSSEFuncs(&aom_sse_c, &aom_sse_avx2),
    453 #if CONFIG_AV1_HIGHBITDEPTH
    454  TestSSEFuncs(&aom_highbd_sse_c, &aom_highbd_sse_avx2)
    455 #endif
    456 };
    457 INSTANTIATE_TEST_SUITE_P(AVX2, SSETest,
    458                         Combine(ValuesIn(sse_avx2), Range(4, 129, 4)));
    459 #endif  // HAVE_AVX2
    460 
    461 #if HAVE_SVE
    462 #if CONFIG_AV1_HIGHBITDEPTH
    463 TestSSEFuncs sse_sve[] = { TestSSEFuncs(&aom_highbd_sse_c,
    464                                        &aom_highbd_sse_sve) };
    465 INSTANTIATE_TEST_SUITE_P(SVE, SSETest,
    466                         Combine(ValuesIn(sse_sve), Range(4, 129, 4)));
    467 #endif
    468 #endif  // HAVE_SVE
    469 
    470 //////////////////////////////////////////////////////////////////////////////
    471 // get_blk sum squares test functions
    472 //////////////////////////////////////////////////////////////////////////////
    473 
    474 using sse_sum_func = void (*)(const int16_t *data, int stride, int bw, int bh,
    475                              int *x_sum, int64_t *x2_sum);
    476 using TestSSE_SumFuncs = libaom_test::FuncParam<sse_sum_func>;
    477 
    478 using SSE_SumTestParam = std::tuple<TestSSE_SumFuncs, TX_SIZE>;
    479 
    480 class SSE_Sum_Test : public ::testing::TestWithParam<SSE_SumTestParam> {
    481 public:
    482  ~SSE_Sum_Test() override = default;
    483  void SetUp() override {
    484    params_ = GET_PARAM(0);
    485    rnd_.Reset(ACMRandom::DeterministicSeed());
    486    src_ = reinterpret_cast<int16_t *>(aom_memalign(32, 256 * 256 * 2));
    487    ASSERT_NE(src_, nullptr);
    488  }
    489 
    490  void TearDown() override { aom_free(src_); }
    491  void RunTest(bool is_random, int tx_size, int run_times);
    492 
    493  void GenRandomData(int width, int height, int stride) {
    494    const int msb = 11;  // Up to 12 bit input
    495    const int limit = 1 << (msb + 1);
    496    for (int ii = 0; ii < height; ii++) {
    497      for (int jj = 0; jj < width; jj++) {
    498        src_[ii * stride + jj] = rnd_(limit);
    499      }
    500    }
    501  }
    502 
    503  void GenExtremeData(int width, int height, int stride, int16_t *data,
    504                      int16_t val) {
    505    for (int ii = 0; ii < height; ii++) {
    506      for (int jj = 0; jj < width; jj++) {
    507        data[ii * stride + jj] = val;
    508      }
    509    }
    510  }
    511 
    512 protected:
    513  TestSSE_SumFuncs params_;
    514  int16_t *src_;
    515  ACMRandom rnd_;
    516 };
    517 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSE_Sum_Test);
    518 
    519 void SSE_Sum_Test::RunTest(bool is_random, int tx_size, int run_times) {
    520  aom_usec_timer ref_timer, test_timer;
    521  int width = tx_size_wide[tx_size];
    522  int height = tx_size_high[tx_size];
    523  for (int k = 0; k < 3; k++) {
    524    int stride = 4 << rnd_(7);  // Up to 256 stride
    525    while (stride < width) {    // Make sure it's valid
    526      stride = 4 << rnd_(7);
    527    }
    528    if (is_random) {
    529      GenRandomData(width, height, stride);
    530    } else {
    531      const int msb = 12;  // Up to 12 bit input
    532      const int limit = (1 << msb) - 1;
    533      if (k == 0) {
    534        GenExtremeData(width, height, stride, src_, limit);
    535      } else {
    536        GenExtremeData(width, height, stride, src_, -limit);
    537      }
    538    }
    539    int sum_c = 0;
    540    int64_t sse_intr = 0;
    541    int sum_intr = 0;
    542    int64_t sse_c = 0;
    543 
    544    params_.ref_func(src_, stride, width, height, &sum_c, &sse_c);
    545    params_.tst_func(src_, stride, width, height, &sum_intr, &sse_intr);
    546 
    547    if (run_times > 1) {
    548      aom_usec_timer_start(&ref_timer);
    549      for (int j = 0; j < run_times; j++) {
    550        params_.ref_func(src_, stride, width, height, &sum_c, &sse_c);
    551      }
    552      aom_usec_timer_mark(&ref_timer);
    553      const int elapsed_time_c =
    554          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
    555 
    556      aom_usec_timer_start(&test_timer);
    557      for (int j = 0; j < run_times; j++) {
    558        params_.tst_func(src_, stride, width, height, &sum_intr, &sse_intr);
    559      }
    560      aom_usec_timer_mark(&test_timer);
    561      const int elapsed_time_simd =
    562          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
    563 
    564      printf(
    565          "c_time=%d \t simd_time=%d \t "
    566          "gain=%f\t width=%d\t height=%d \n",
    567          elapsed_time_c, elapsed_time_simd,
    568          (float)((float)elapsed_time_c / (float)elapsed_time_simd), width,
    569          height);
    570 
    571    } else {
    572      EXPECT_EQ(sum_c, sum_intr)
    573          << "Error:" << k << " SSE Sum Test [" << width << "x" << height
    574          << "] C output does not match optimized output.";
    575      EXPECT_EQ(sse_c, sse_intr)
    576          << "Error:" << k << " SSE Sum Test [" << width << "x" << height
    577          << "] C output does not match optimized output.";
    578    }
    579  }
    580 }
    581 
    582 TEST_P(SSE_Sum_Test, OperationCheck) {
    583  RunTest(true, GET_PARAM(1), 1);  // GenRandomData
    584 }
    585 
    586 TEST_P(SSE_Sum_Test, ExtremeValues) { RunTest(false, GET_PARAM(1), 1); }
    587 
    588 TEST_P(SSE_Sum_Test, DISABLED_Speed) { RunTest(true, GET_PARAM(1), 10000); }
    589 
    590 #if HAVE_SSE2 || HAVE_AVX2 || HAVE_NEON
    591 const TX_SIZE kValidBlockSize[] = { TX_4X4,   TX_8X8,   TX_16X16, TX_32X32,
    592                                    TX_64X64, TX_4X8,   TX_8X4,   TX_8X16,
    593                                    TX_16X8,  TX_16X32, TX_32X16, TX_64X32,
    594                                    TX_32X64, TX_4X16,  TX_16X4,  TX_8X32,
    595                                    TX_32X8,  TX_16X64, TX_64X16 };
    596 #endif
    597 
    598 #if HAVE_SSE2
    599 TestSSE_SumFuncs sse_sum_sse2[] = { TestSSE_SumFuncs(
    600    &aom_get_blk_sse_sum_c, &aom_get_blk_sse_sum_sse2) };
    601 INSTANTIATE_TEST_SUITE_P(SSE2, SSE_Sum_Test,
    602                         Combine(ValuesIn(sse_sum_sse2),
    603                                 ValuesIn(kValidBlockSize)));
    604 #endif  // HAVE_SSE2
    605 
    606 #if HAVE_AVX2
    607 TestSSE_SumFuncs sse_sum_avx2[] = { TestSSE_SumFuncs(
    608    &aom_get_blk_sse_sum_c, &aom_get_blk_sse_sum_avx2) };
    609 INSTANTIATE_TEST_SUITE_P(AVX2, SSE_Sum_Test,
    610                         Combine(ValuesIn(sse_sum_avx2),
    611                                 ValuesIn(kValidBlockSize)));
    612 #endif  // HAVE_AVX2
    613 
    614 #if HAVE_NEON
    615 TestSSE_SumFuncs sse_sum_neon[] = { TestSSE_SumFuncs(
    616    &aom_get_blk_sse_sum_c, &aom_get_blk_sse_sum_neon) };
    617 INSTANTIATE_TEST_SUITE_P(NEON, SSE_Sum_Test,
    618                         Combine(ValuesIn(sse_sum_neon),
    619                                 ValuesIn(kValidBlockSize)));
    620 #endif  // HAVE_NEON
    621 
    622 #if HAVE_SVE
    623 TestSSE_SumFuncs sse_sum_sve[] = { TestSSE_SumFuncs(&aom_get_blk_sse_sum_c,
    624                                                    &aom_get_blk_sse_sum_sve) };
    625 INSTANTIATE_TEST_SUITE_P(SVE, SSE_Sum_Test,
    626                         Combine(ValuesIn(sse_sum_sve),
    627                                 ValuesIn(kValidBlockSize)));
    628 #endif  // HAVE_SVE
    629 
    630 //////////////////////////////////////////////////////////////////////////////
    631 // 2D Variance test functions
    632 //////////////////////////////////////////////////////////////////////////////
    633 
    634 using Var2DFunc = uint64_t (*)(uint8_t *src, int stride, int width, int height);
    635 using TestFuncVar2D = libaom_test::FuncParam<Var2DFunc>;
    636 
    637 const uint16_t test_block_size[2] = { 128, 256 };
    638 
    639 class Lowbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
    640 public:
    641  ~Lowbd2dVarTest() override = default;
    642  void SetUp() override {
    643    params_ = this->GetParam();
    644    rnd_.Reset(ACMRandom::DeterministicSeed());
    645    src_ = reinterpret_cast<uint8_t *>(
    646        aom_memalign(16, 512 * 512 * sizeof(uint8_t)));
    647    ASSERT_NE(src_, nullptr);
    648  }
    649 
    650  void TearDown() override { aom_free(src_); }
    651  void RunTest(bool is_random);
    652  void RunSpeedTest();
    653 
    654  void GenRandomData(int width, int height, int stride) {
    655    const int msb = 7;  // Up to 8 bit input
    656    const int limit = 1 << (msb + 1);
    657    for (int ii = 0; ii < height; ii++) {
    658      for (int jj = 0; jj < width; jj++) {
    659        src_[ii * stride + jj] = rnd_(limit);
    660      }
    661    }
    662  }
    663 
    664  void GenExtremeData(int width, int height, int stride) {
    665    const int msb = 7;  // Up to 8 bit input
    666    const int limit = 1 << (msb + 1);
    667    const int val = rnd_(2) ? limit - 1 : 0;
    668    for (int ii = 0; ii < height; ii++) {
    669      for (int jj = 0; jj < width; jj++) {
    670        src_[ii * stride + jj] = val;
    671      }
    672    }
    673  }
    674 
    675 protected:
    676  TestFuncVar2D params_;
    677  uint8_t *src_;
    678  ACMRandom rnd_;
    679 };
    680 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Lowbd2dVarTest);
    681 
    682 void Lowbd2dVarTest::RunTest(bool is_random) {
    683  int failed = 0;
    684  for (int k = 0; k < kNumIterations; k++) {
    685    const int width = 4 * (rnd_(63) + 1);   // Up to 256x256
    686    const int height = 4 * (rnd_(63) + 1);  // Up to 256x256
    687    int stride = 4 << rnd_(8);              // Up to 512 stride
    688    while (stride < width) {                // Make sure it's valid
    689      stride = 4 << rnd_(8);
    690    }
    691    if (is_random) {
    692      GenRandomData(width, height, stride);
    693    } else {
    694      GenExtremeData(width, height, stride);
    695    }
    696 
    697    const uint64_t res_ref = params_.ref_func(src_, stride, width, height);
    698    uint64_t res_tst;
    699    API_REGISTER_STATE_CHECK(res_tst =
    700                                 params_.tst_func(src_, stride, width, height));
    701 
    702    if (!failed) {
    703      failed = res_ref != res_tst;
    704      EXPECT_EQ(res_ref, res_tst)
    705          << "Error: Sum Squares Test [" << width << "x" << height
    706          << "] C output does not match optimized output.";
    707    }
    708  }
    709 }
    710 
    711 void Lowbd2dVarTest::RunSpeedTest() {
    712  for (int block = 0; block < 2; block++) {
    713    const int width = test_block_size[block];
    714    const int height = test_block_size[block];
    715    int stride = 4 << rnd_(8);  // Up to 512 stride
    716    while (stride < width) {    // Make sure it's valid
    717      stride = 4 << rnd_(8);
    718    }
    719    GenExtremeData(width, height, stride);
    720    const int num_loops = 1000000000 / (width + height);
    721    aom_usec_timer timer;
    722    aom_usec_timer_start(&timer);
    723 
    724    for (int i = 0; i < num_loops; ++i)
    725      params_.ref_func(src_, stride, width, height);
    726 
    727    aom_usec_timer_mark(&timer);
    728    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
    729 
    730    aom_usec_timer timer1;
    731    aom_usec_timer_start(&timer1);
    732    for (int i = 0; i < num_loops; ++i)
    733      params_.tst_func(src_, stride, width, height);
    734    aom_usec_timer_mark(&timer1);
    735    const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
    736    printf("%3dx%-3d: Scaling = %.2f\n", width, height,
    737           (double)elapsed_time / elapsed_time1);
    738  }
    739 }
    740 
    741 TEST_P(Lowbd2dVarTest, OperationCheck) {
    742  RunTest(true);  // GenRandomData
    743 }
    744 
    745 TEST_P(Lowbd2dVarTest, ExtremeValues) {
    746  RunTest(false);  // GenExtremeData
    747 }
    748 
    749 TEST_P(Lowbd2dVarTest, DISABLED_Speed) { RunSpeedTest(); }
    750 
    751 #if HAVE_SSE2
    752 
    753 INSTANTIATE_TEST_SUITE_P(SSE2, Lowbd2dVarTest,
    754                         ::testing::Values(TestFuncVar2D(&aom_var_2d_u8_c,
    755                                                         &aom_var_2d_u8_sse2)));
    756 
    757 #endif  // HAVE_SSE2
    758 
    759 #if HAVE_AVX2
    760 
    761 INSTANTIATE_TEST_SUITE_P(AVX2, Lowbd2dVarTest,
    762                         ::testing::Values(TestFuncVar2D(&aom_var_2d_u8_c,
    763                                                         &aom_var_2d_u8_avx2)));
    764 
    765 #endif  // HAVE_SSE2
    766 
    767 #if HAVE_NEON
    768 
    769 INSTANTIATE_TEST_SUITE_P(NEON, Lowbd2dVarTest,
    770                         ::testing::Values(TestFuncVar2D(&aom_var_2d_u8_c,
    771                                                         &aom_var_2d_u8_neon)));
    772 
    773 #endif  // HAVE_NEON
    774 
    775 #if HAVE_NEON_DOTPROD
    776 
    777 INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, Lowbd2dVarTest,
    778                         ::testing::Values(TestFuncVar2D(
    779                             &aom_var_2d_u8_c, &aom_var_2d_u8_neon_dotprod)));
    780 
    781 #endif  // HAVE_NEON_DOTPROD
    782 
    783 #if CONFIG_AV1_HIGHBITDEPTH
    784 class Highbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
    785 public:
    786  ~Highbd2dVarTest() override = default;
    787  void SetUp() override {
    788    params_ = this->GetParam();
    789    rnd_.Reset(ACMRandom::DeterministicSeed());
    790    src_ = reinterpret_cast<uint16_t *>(
    791        aom_memalign(16, 512 * 512 * sizeof(uint16_t)));
    792    ASSERT_NE(src_, nullptr);
    793  }
    794 
    795  void TearDown() override { aom_free(src_); }
    796  void RunTest(bool is_random);
    797  void RunSpeedTest();
    798 
    799  void GenRandomData(int width, int height, int stride) {
    800    const int msb = 11;  // Up to 12 bit input
    801    const int limit = 1 << (msb + 1);
    802    for (int ii = 0; ii < height; ii++) {
    803      for (int jj = 0; jj < width; jj++) {
    804        src_[ii * stride + jj] = rnd_(limit);
    805      }
    806    }
    807  }
    808 
    809  void GenExtremeData(int width, int height, int stride) {
    810    const int msb = 11;  // Up to 12 bit input
    811    const int limit = 1 << (msb + 1);
    812    const int val = rnd_(2) ? limit - 1 : 0;
    813    for (int ii = 0; ii < height; ii++) {
    814      for (int jj = 0; jj < width; jj++) {
    815        src_[ii * stride + jj] = val;
    816      }
    817    }
    818  }
    819 
    820 protected:
    821  TestFuncVar2D params_;
    822  uint16_t *src_;
    823  ACMRandom rnd_;
    824 };
    825 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Highbd2dVarTest);
    826 
    827 void Highbd2dVarTest::RunTest(bool is_random) {
    828  int failed = 0;
    829  for (int k = 0; k < kNumIterations; k++) {
    830    const int width = 4 * (rnd_(63) + 1);   // Up to 256x256
    831    const int height = 4 * (rnd_(63) + 1);  // Up to 256x256
    832    int stride = 4 << rnd_(8);              // Up to 512 stride
    833    while (stride < width) {                // Make sure it's valid
    834      stride = 4 << rnd_(8);
    835    }
    836    if (is_random) {
    837      GenRandomData(width, height, stride);
    838    } else {
    839      GenExtremeData(width, height, stride);
    840    }
    841 
    842    const uint64_t res_ref =
    843        params_.ref_func(CONVERT_TO_BYTEPTR(src_), stride, width, height);
    844    uint64_t res_tst;
    845    API_REGISTER_STATE_CHECK(
    846        res_tst =
    847            params_.tst_func(CONVERT_TO_BYTEPTR(src_), stride, width, height));
    848 
    849    if (!failed) {
    850      failed = res_ref != res_tst;
    851      EXPECT_EQ(res_ref, res_tst)
    852          << "Error: Sum Squares Test [" << width << "x" << height
    853          << "] C output does not match optimized output.";
    854    }
    855  }
    856 }
    857 
    858 void Highbd2dVarTest::RunSpeedTest() {
    859  for (int block = 0; block < 2; block++) {
    860    const int width = test_block_size[block];
    861    const int height = test_block_size[block];
    862    int stride = 4 << rnd_(8);  // Up to 512 stride
    863    while (stride < width) {    // Make sure it's valid
    864      stride = 4 << rnd_(8);
    865    }
    866    GenExtremeData(width, height, stride);
    867    const int num_loops = 1000000000 / (width + height);
    868    aom_usec_timer timer;
    869    aom_usec_timer_start(&timer);
    870 
    871    for (int i = 0; i < num_loops; ++i)
    872      params_.ref_func(CONVERT_TO_BYTEPTR(src_), stride, width, height);
    873 
    874    aom_usec_timer_mark(&timer);
    875    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
    876 
    877    aom_usec_timer timer1;
    878    aom_usec_timer_start(&timer1);
    879    for (int i = 0; i < num_loops; ++i)
    880      params_.tst_func(CONVERT_TO_BYTEPTR(src_), stride, width, height);
    881    aom_usec_timer_mark(&timer1);
    882    const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
    883    printf("%3dx%-3d: Scaling = %.2f\n", width, height,
    884           (double)elapsed_time / elapsed_time1);
    885  }
    886 }
    887 
    888 TEST_P(Highbd2dVarTest, OperationCheck) {
    889  RunTest(true);  // GenRandomData
    890 }
    891 
    892 TEST_P(Highbd2dVarTest, ExtremeValues) {
    893  RunTest(false);  // GenExtremeData
    894 }
    895 
    896 TEST_P(Highbd2dVarTest, DISABLED_Speed) { RunSpeedTest(); }
    897 
    898 #if HAVE_SSE2
    899 
    900 INSTANTIATE_TEST_SUITE_P(
    901    SSE2, Highbd2dVarTest,
    902    ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c, &aom_var_2d_u16_sse2)));
    903 
    904 #endif  // HAVE_SSE2
    905 
    906 #if HAVE_AVX2
    907 
    908 INSTANTIATE_TEST_SUITE_P(
    909    AVX2, Highbd2dVarTest,
    910    ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c, &aom_var_2d_u16_avx2)));
    911 
    912 #endif  // HAVE_AVX2
    913 
    914 #if HAVE_NEON
    915 
    916 INSTANTIATE_TEST_SUITE_P(
    917    NEON, Highbd2dVarTest,
    918    ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c, &aom_var_2d_u16_neon)));
    919 
    920 #endif  // HAVE_NEON
    921 
    922 #if HAVE_SVE
    923 
    924 INSTANTIATE_TEST_SUITE_P(SVE, Highbd2dVarTest,
    925                         ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c,
    926                                                         &aom_var_2d_u16_sve)));
    927 
    928 #endif  // HAVE_SVE
    929 #endif  // CONFIG_AV1_HIGHBITDEPTH
    930 }  // namespace