tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

simd_cmp_impl.inc (94047B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <string>
     14 
     15 #include "config/aom_dsp_rtcd.h"
     16 
     17 #include "test/acm_random.h"
     18 // Inlining not forced for the compiler due to some tests calling
     19 // SIMD_INLINE functions via function pointers
     20 #undef SIMD_INLINE
     21 #define SIMD_INLINE static inline
     22 #include "aom_dsp/aom_simd.h"
     23 #include "aom_dsp/simd/v256_intrinsics_c.h"
     24 
     25 // Machine tuned code goes into this file. This file is included from
     26 // simd_cmp_sse2.cc, simd_cmp_ssse3.cc etc which define the macros
     27 // ARCH (=neon, sse2, ssse3, etc), SIMD_NAMESPACE and ARCH_POSTFIX().
     28 
     29 #ifdef _MSC_VER
     30 // Disable "value of intrinsic immediate argument 'value' is out of range
     31 // 'lowerbound - upperbound'" warning. Visual Studio emits this warning though
     32 // the parameters are conditionally checked in e.g., v256_shr_n_byte. Adding a
     33 // mask doesn't always appear to be sufficient.
     34 #pragma warning(disable : 4556)
     35 #endif
     36 
     37 using libaom_test::ACMRandom;
     38 
     39 namespace SIMD_NAMESPACE {
     40 
     41 // Wrap templates around intrinsics using immediate values
     42 template <int shift>
     43 v64 imm_v64_shl_n_byte(v64 a) {
     44  return v64_shl_n_byte(a, shift);
     45 }
     46 template <int shift>
     47 v64 imm_v64_shr_n_byte(v64 a) {
     48  return v64_shr_n_byte(a, shift);
     49 }
     50 template <int shift>
     51 v64 imm_v64_shl_n_8(v64 a) {
     52  return v64_shl_n_8(a, shift);
     53 }
     54 template <int shift>
     55 v64 imm_v64_shr_n_u8(v64 a) {
     56  return v64_shr_n_u8(a, shift);
     57 }
     58 template <int shift>
     59 v64 imm_v64_shr_n_s8(v64 a) {
     60  return v64_shr_n_s8(a, shift);
     61 }
     62 template <int shift>
     63 v64 imm_v64_shl_n_16(v64 a) {
     64  return v64_shl_n_16(a, shift);
     65 }
     66 template <int shift>
     67 v64 imm_v64_shr_n_u16(v64 a) {
     68  return v64_shr_n_u16(a, shift);
     69 }
     70 template <int shift>
     71 v64 imm_v64_shr_n_s16(v64 a) {
     72  return v64_shr_n_s16(a, shift);
     73 }
     74 template <int shift>
     75 v64 imm_v64_shl_n_32(v64 a) {
     76  return v64_shl_n_32(a, shift);
     77 }
     78 template <int shift>
     79 v64 imm_v64_shr_n_u32(v64 a) {
     80  return v64_shr_n_u32(a, shift);
     81 }
     82 template <int shift>
     83 v64 imm_v64_shr_n_s32(v64 a) {
     84  return v64_shr_n_s32(a, shift);
     85 }
     86 template <int shift>
     87 v64 imm_v64_align(v64 a, v64 b) {
     88  return v64_align(a, b, shift);
     89 }
     90 
     91 // Wrap templates around corresponding C implementations of the above
     92 template <int shift>
     93 c_v64 c_imm_v64_shl_n_byte(c_v64 a) {
     94  return c_v64_shl_n_byte(a, shift);
     95 }
     96 template <int shift>
     97 c_v64 c_imm_v64_shr_n_byte(c_v64 a) {
     98  return c_v64_shr_n_byte(a, shift);
     99 }
    100 template <int shift>
    101 c_v64 c_imm_v64_shl_n_8(c_v64 a) {
    102  return c_v64_shl_n_8(a, shift);
    103 }
    104 template <int shift>
    105 c_v64 c_imm_v64_shr_n_u8(c_v64 a) {
    106  return c_v64_shr_n_u8(a, shift);
    107 }
    108 template <int shift>
    109 c_v64 c_imm_v64_shr_n_s8(c_v64 a) {
    110  return c_v64_shr_n_s8(a, shift);
    111 }
    112 template <int shift>
    113 c_v64 c_imm_v64_shl_n_16(c_v64 a) {
    114  return c_v64_shl_n_16(a, shift);
    115 }
    116 template <int shift>
    117 c_v64 c_imm_v64_shr_n_u16(c_v64 a) {
    118  return c_v64_shr_n_u16(a, shift);
    119 }
    120 template <int shift>
    121 c_v64 c_imm_v64_shr_n_s16(c_v64 a) {
    122  return c_v64_shr_n_s16(a, shift);
    123 }
    124 template <int shift>
    125 c_v64 c_imm_v64_shl_n_32(c_v64 a) {
    126  return c_v64_shl_n_32(a, shift);
    127 }
    128 template <int shift>
    129 c_v64 c_imm_v64_shr_n_u32(c_v64 a) {
    130  return c_v64_shr_n_u32(a, shift);
    131 }
    132 template <int shift>
    133 c_v64 c_imm_v64_shr_n_s32(c_v64 a) {
    134  return c_v64_shr_n_s32(a, shift);
    135 }
    136 template <int shift>
    137 c_v64 c_imm_v64_align(c_v64 a, c_v64 b) {
    138  return c_v64_align(a, b, shift);
    139 }
    140 
    141 template <int shift>
    142 v128 imm_v128_shl_n_byte(v128 a) {
    143  return v128_shl_n_byte(a, shift);
    144 }
    145 template <int shift>
    146 v128 imm_v128_shr_n_byte(v128 a) {
    147  return v128_shr_n_byte(a, shift);
    148 }
    149 template <int shift>
    150 v128 imm_v128_shl_n_8(v128 a) {
    151  return v128_shl_n_8(a, shift);
    152 }
    153 template <int shift>
    154 v128 imm_v128_shr_n_u8(v128 a) {
    155  return v128_shr_n_u8(a, shift);
    156 }
    157 template <int shift>
    158 v128 imm_v128_shr_n_s8(v128 a) {
    159  return v128_shr_n_s8(a, shift);
    160 }
    161 template <int shift>
    162 v128 imm_v128_shl_n_16(v128 a) {
    163  return v128_shl_n_16(a, shift);
    164 }
    165 template <int shift>
    166 v128 imm_v128_shr_n_u16(v128 a) {
    167  return v128_shr_n_u16(a, shift);
    168 }
    169 template <int shift>
    170 v128 imm_v128_shr_n_s16(v128 a) {
    171  return v128_shr_n_s16(a, shift);
    172 }
    173 template <int shift>
    174 v128 imm_v128_shl_n_32(v128 a) {
    175  return v128_shl_n_32(a, shift);
    176 }
    177 template <int shift>
    178 v128 imm_v128_shr_n_u32(v128 a) {
    179  return v128_shr_n_u32(a, shift);
    180 }
    181 template <int shift>
    182 v128 imm_v128_shr_n_s32(v128 a) {
    183  return v128_shr_n_s32(a, shift);
    184 }
    185 template <int shift>
    186 v128 imm_v128_shl_n_64(v128 a) {
    187  return v128_shl_n_64(a, shift);
    188 }
    189 template <int shift>
    190 v128 imm_v128_shr_n_u64(v128 a) {
    191  return v128_shr_n_u64(a, shift);
    192 }
    193 template <int shift>
    194 v128 imm_v128_shr_n_s64(v128 a) {
    195  return v128_shr_n_s64(a, shift);
    196 }
    197 template <int shift>
    198 v128 imm_v128_align(v128 a, v128 b) {
    199  return v128_align(a, b, shift);
    200 }
    201 
    202 template <int shift>
    203 c_v128 c_imm_v128_shl_n_byte(c_v128 a) {
    204  return c_v128_shl_n_byte(a, shift);
    205 }
    206 template <int shift>
    207 c_v128 c_imm_v128_shr_n_byte(c_v128 a) {
    208  return c_v128_shr_n_byte(a, shift);
    209 }
    210 template <int shift>
    211 c_v128 c_imm_v128_shl_n_8(c_v128 a) {
    212  return c_v128_shl_n_8(a, shift);
    213 }
    214 template <int shift>
    215 c_v128 c_imm_v128_shr_n_u8(c_v128 a) {
    216  return c_v128_shr_n_u8(a, shift);
    217 }
    218 template <int shift>
    219 c_v128 c_imm_v128_shr_n_s8(c_v128 a) {
    220  return c_v128_shr_n_s8(a, shift);
    221 }
    222 template <int shift>
    223 c_v128 c_imm_v128_shl_n_16(c_v128 a) {
    224  return c_v128_shl_n_16(a, shift);
    225 }
    226 template <int shift>
    227 c_v128 c_imm_v128_shr_n_u16(c_v128 a) {
    228  return c_v128_shr_n_u16(a, shift);
    229 }
    230 template <int shift>
    231 c_v128 c_imm_v128_shr_n_s16(c_v128 a) {
    232  return c_v128_shr_n_s16(a, shift);
    233 }
    234 template <int shift>
    235 c_v128 c_imm_v128_shl_n_32(c_v128 a) {
    236  return c_v128_shl_n_32(a, shift);
    237 }
    238 template <int shift>
    239 c_v128 c_imm_v128_shr_n_u32(c_v128 a) {
    240  return c_v128_shr_n_u32(a, shift);
    241 }
    242 template <int shift>
    243 c_v128 c_imm_v128_shr_n_s32(c_v128 a) {
    244  return c_v128_shr_n_s32(a, shift);
    245 }
    246 template <int shift>
    247 c_v128 c_imm_v128_shl_n_64(c_v128 a) {
    248  return c_v128_shl_n_64(a, shift);
    249 }
    250 template <int shift>
    251 c_v128 c_imm_v128_shr_n_u64(c_v128 a) {
    252  return c_v128_shr_n_u64(a, shift);
    253 }
    254 template <int shift>
    255 c_v128 c_imm_v128_shr_n_s64(c_v128 a) {
    256  return c_v128_shr_n_s64(a, shift);
    257 }
    258 template <int shift>
    259 c_v128 c_imm_v128_align(c_v128 a, c_v128 b) {
    260  return c_v128_align(a, b, shift);
    261 }
    262 
    263 template <int shift>
    264 v256 imm_v256_shl_n_word(v256 a) {
    265  return v256_shl_n_word(a, shift);
    266 }
    267 template <int shift>
    268 v256 imm_v256_shr_n_word(v256 a) {
    269  return v256_shr_n_word(a, shift);
    270 }
    271 template <int shift>
    272 v256 imm_v256_shl_n_byte(v256 a) {
    273  return v256_shl_n_byte(a, shift);
    274 }
    275 template <int shift>
    276 v256 imm_v256_shr_n_byte(v256 a) {
    277  return v256_shr_n_byte(a, shift);
    278 }
    279 template <int shift>
    280 v256 imm_v256_shl_n_8(v256 a) {
    281  return v256_shl_n_8(a, shift);
    282 }
    283 template <int shift>
    284 v256 imm_v256_shr_n_u8(v256 a) {
    285  return v256_shr_n_u8(a, shift);
    286 }
    287 template <int shift>
    288 v256 imm_v256_shr_n_s8(v256 a) {
    289  return v256_shr_n_s8(a, shift);
    290 }
    291 template <int shift>
    292 v256 imm_v256_shl_n_16(v256 a) {
    293  return v256_shl_n_16(a, shift);
    294 }
    295 template <int shift>
    296 v256 imm_v256_shr_n_u16(v256 a) {
    297  return v256_shr_n_u16(a, shift);
    298 }
    299 template <int shift>
    300 v256 imm_v256_shr_n_s16(v256 a) {
    301  return v256_shr_n_s16(a, shift);
    302 }
    303 template <int shift>
    304 v256 imm_v256_shl_n_32(v256 a) {
    305  return v256_shl_n_32(a, shift);
    306 }
    307 template <int shift>
    308 v256 imm_v256_shr_n_u32(v256 a) {
    309  return v256_shr_n_u32(a, shift);
    310 }
    311 template <int shift>
    312 v256 imm_v256_shr_n_s32(v256 a) {
    313  return v256_shr_n_s32(a, shift);
    314 }
    315 template <int shift>
    316 v256 imm_v256_shl_n_64(v256 a) {
    317  return v256_shl_n_64(a, shift);
    318 }
    319 template <int shift>
    320 v256 imm_v256_shr_n_u64(v256 a) {
    321  return v256_shr_n_u64(a, shift);
    322 }
    323 template <int shift>
    324 v256 imm_v256_shr_n_s64(v256 a) {
    325  return v256_shr_n_s64(a, shift);
    326 }
    327 template <int shift>
    328 v256 imm_v256_align(v256 a, v256 b) {
    329  return v256_align(a, b, shift);
    330 }
    331 
    332 template <int shift>
    333 c_v256 c_imm_v256_shl_n_word(c_v256 a) {
    334  return c_v256_shl_n_word(a, shift);
    335 }
    336 template <int shift>
    337 c_v256 c_imm_v256_shr_n_word(c_v256 a) {
    338  return c_v256_shr_n_word(a, shift);
    339 }
    340 template <int shift>
    341 c_v256 c_imm_v256_shl_n_byte(c_v256 a) {
    342  return c_v256_shl_n_byte(a, shift);
    343 }
    344 template <int shift>
    345 c_v256 c_imm_v256_shr_n_byte(c_v256 a) {
    346  return c_v256_shr_n_byte(a, shift);
    347 }
    348 template <int shift>
    349 c_v256 c_imm_v256_shl_n_8(c_v256 a) {
    350  return c_v256_shl_n_8(a, shift);
    351 }
    352 template <int shift>
    353 c_v256 c_imm_v256_shr_n_u8(c_v256 a) {
    354  return c_v256_shr_n_u8(a, shift);
    355 }
    356 template <int shift>
    357 c_v256 c_imm_v256_shr_n_s8(c_v256 a) {
    358  return c_v256_shr_n_s8(a, shift);
    359 }
    360 template <int shift>
    361 c_v256 c_imm_v256_shl_n_16(c_v256 a) {
    362  return c_v256_shl_n_16(a, shift);
    363 }
    364 template <int shift>
    365 c_v256 c_imm_v256_shr_n_u16(c_v256 a) {
    366  return c_v256_shr_n_u16(a, shift);
    367 }
    368 template <int shift>
    369 c_v256 c_imm_v256_shr_n_s16(c_v256 a) {
    370  return c_v256_shr_n_s16(a, shift);
    371 }
    372 template <int shift>
    373 c_v256 c_imm_v256_shl_n_32(c_v256 a) {
    374  return c_v256_shl_n_32(a, shift);
    375 }
    376 template <int shift>
    377 c_v256 c_imm_v256_shr_n_u32(c_v256 a) {
    378  return c_v256_shr_n_u32(a, shift);
    379 }
    380 template <int shift>
    381 c_v256 c_imm_v256_shr_n_s32(c_v256 a) {
    382  return c_v256_shr_n_s32(a, shift);
    383 }
    384 template <int shift>
    385 c_v256 c_imm_v256_shl_n_64(c_v256 a) {
    386  return c_v256_shl_n_64(a, shift);
    387 }
    388 template <int shift>
    389 c_v256 c_imm_v256_shr_n_u64(c_v256 a) {
    390  return c_v256_shr_n_u64(a, shift);
    391 }
    392 template <int shift>
    393 c_v256 c_imm_v256_shr_n_s64(c_v256 a) {
    394  return c_v256_shr_n_s64(a, shift);
    395 }
    396 template <int shift>
    397 c_v256 c_imm_v256_align(c_v256 a, c_v256 b) {
    398  return c_v256_align(a, b, shift);
    399 }
    400 
    401 namespace {
    402 
    403 // Wrappers around the the SAD and SSD functions
    404 uint32_t v64_sad_u8(v64 a, v64 b) {
    405  return v64_sad_u8_sum(::v64_sad_u8(v64_sad_u8_init(), a, b));
    406 }
    407 uint32_t v64_ssd_u8(v64 a, v64 b) {
    408  return v64_ssd_u8_sum(::v64_ssd_u8(v64_ssd_u8_init(), a, b));
    409 }
    410 
    411 uint32_t c_v64_sad_u8(c_v64 a, c_v64 b) {
    412  return c_v64_sad_u8_sum(::c_v64_sad_u8(c_v64_sad_u8_init(), a, b));
    413 }
    414 uint32_t c_v64_ssd_u8(c_v64 a, c_v64 b) {
    415  return c_v64_ssd_u8_sum(::c_v64_ssd_u8(c_v64_ssd_u8_init(), a, b));
    416 }
    417 uint32_t v128_sad_u8(v128 a, v128 b) {
    418  return v128_sad_u8_sum(::v128_sad_u8(v128_sad_u8_init(), a, b));
    419 }
    420 uint32_t v128_ssd_u8(v128 a, v128 b) {
    421  return v128_ssd_u8_sum(::v128_ssd_u8(v128_ssd_u8_init(), a, b));
    422 }
    423 uint32_t c_v128_sad_u8(c_v128 a, c_v128 b) {
    424  return c_v128_sad_u8_sum(::c_v128_sad_u8(c_v128_sad_u8_init(), a, b));
    425 }
    426 uint32_t c_v128_ssd_u8(c_v128 a, c_v128 b) {
    427  return c_v128_ssd_u8_sum(::c_v128_ssd_u8(c_v128_ssd_u8_init(), a, b));
    428 }
    429 uint32_t v128_sad_u16(v128 a, v128 b) {
    430  return v128_sad_u16_sum(::v128_sad_u16(v128_sad_u16_init(), a, b));
    431 }
    432 uint64_t v128_ssd_s16(v128 a, v128 b) {
    433  return v128_ssd_s16_sum(::v128_ssd_s16(v128_ssd_s16_init(), a, b));
    434 }
    435 uint32_t c_v128_sad_u16(c_v128 a, c_v128 b) {
    436  return c_v128_sad_u16_sum(::c_v128_sad_u16(c_v128_sad_u16_init(), a, b));
    437 }
    438 uint64_t c_v128_ssd_s16(c_v128 a, c_v128 b) {
    439  return c_v128_ssd_s16_sum(::c_v128_ssd_s16(c_v128_ssd_s16_init(), a, b));
    440 }
    441 uint32_t v256_sad_u8(v256 a, v256 b) {
    442  return v256_sad_u8_sum(::v256_sad_u8(v256_sad_u8_init(), a, b));
    443 }
    444 uint32_t v256_ssd_u8(v256 a, v256 b) {
    445  return v256_ssd_u8_sum(::v256_ssd_u8(v256_ssd_u8_init(), a, b));
    446 }
    447 uint32_t c_v256_sad_u8(c_v256 a, c_v256 b) {
    448  return c_v256_sad_u8_sum(::c_v256_sad_u8(c_v256_sad_u8_init(), a, b));
    449 }
    450 uint32_t c_v256_ssd_u8(c_v256 a, c_v256 b) {
    451  return c_v256_ssd_u8_sum(::c_v256_ssd_u8(c_v256_ssd_u8_init(), a, b));
    452 }
    453 uint32_t v256_sad_u16(v256 a, v256 b) {
    454  return v256_sad_u16_sum(::v256_sad_u16(v256_sad_u16_init(), a, b));
    455 }
    456 uint64_t v256_ssd_s16(v256 a, v256 b) {
    457  return v256_ssd_s16_sum(::v256_ssd_s16(v256_ssd_s16_init(), a, b));
    458 }
    459 uint32_t c_v256_sad_u16(c_v256 a, c_v256 b) {
    460  return c_v256_sad_u16_sum(::c_v256_sad_u16(c_v256_sad_u16_init(), a, b));
    461 }
    462 uint64_t c_v256_ssd_s16(c_v256 a, c_v256 b) {
    463  return c_v256_ssd_s16_sum(::c_v256_ssd_s16(c_v256_ssd_s16_init(), a, b));
    464 }
    465 
    466 using fptr = void (*)();
    467 
    468 struct mapping {
    469  const char *name;
    470  fptr ref;
    471  fptr simd;
    472 };
    473 
    474 #define MAP(name) \
    475  { #name, reinterpret_cast < fptr>(c_##name), reinterpret_cast < fptr>(name) }
    476 
    477 const mapping m[] = { MAP(v64_sad_u8),
    478                      MAP(v64_ssd_u8),
    479                      MAP(v64_add_8),
    480                      MAP(v64_add_16),
    481                      MAP(v64_sadd_s8),
    482                      MAP(v64_sadd_u8),
    483                      MAP(v64_sadd_s16),
    484                      MAP(v64_add_32),
    485                      MAP(v64_sub_8),
    486                      MAP(v64_ssub_u8),
    487                      MAP(v64_ssub_s8),
    488                      MAP(v64_sub_16),
    489                      MAP(v64_ssub_s16),
    490                      MAP(v64_ssub_u16),
    491                      MAP(v64_sub_32),
    492                      MAP(v64_ziplo_8),
    493                      MAP(v64_ziphi_8),
    494                      MAP(v64_ziplo_16),
    495                      MAP(v64_ziphi_16),
    496                      MAP(v64_ziplo_32),
    497                      MAP(v64_ziphi_32),
    498                      MAP(v64_pack_s32_u16),
    499                      MAP(v64_pack_s32_s16),
    500                      MAP(v64_pack_s16_u8),
    501                      MAP(v64_pack_s16_s8),
    502                      MAP(v64_unziphi_8),
    503                      MAP(v64_unziplo_8),
    504                      MAP(v64_unziphi_16),
    505                      MAP(v64_unziplo_16),
    506                      MAP(v64_or),
    507                      MAP(v64_xor),
    508                      MAP(v64_and),
    509                      MAP(v64_andn),
    510                      MAP(v64_mullo_s16),
    511                      MAP(v64_mulhi_s16),
    512                      MAP(v64_mullo_s32),
    513                      MAP(v64_madd_s16),
    514                      MAP(v64_madd_us8),
    515                      MAP(v64_avg_u8),
    516                      MAP(v64_rdavg_u8),
    517                      MAP(v64_rdavg_u16),
    518                      MAP(v64_avg_u16),
    519                      MAP(v64_min_u8),
    520                      MAP(v64_max_u8),
    521                      MAP(v64_min_s8),
    522                      MAP(v64_max_s8),
    523                      MAP(v64_min_s16),
    524                      MAP(v64_max_s16),
    525                      MAP(v64_cmpgt_s8),
    526                      MAP(v64_cmplt_s8),
    527                      MAP(v64_cmpeq_8),
    528                      MAP(v64_cmpgt_s16),
    529                      MAP(v64_cmplt_s16),
    530                      MAP(v64_cmpeq_16),
    531                      MAP(v64_shuffle_8),
    532                      MAP(imm_v64_align<1>),
    533                      MAP(imm_v64_align<2>),
    534                      MAP(imm_v64_align<3>),
    535                      MAP(imm_v64_align<4>),
    536                      MAP(imm_v64_align<5>),
    537                      MAP(imm_v64_align<6>),
    538                      MAP(imm_v64_align<7>),
    539                      MAP(v64_abs_s8),
    540                      MAP(v64_abs_s16),
    541                      MAP(v64_unpacklo_u8_s16),
    542                      MAP(v64_unpackhi_u8_s16),
    543                      MAP(v64_unpacklo_s8_s16),
    544                      MAP(v64_unpackhi_s8_s16),
    545                      MAP(v64_unpacklo_u16_s32),
    546                      MAP(v64_unpacklo_s16_s32),
    547                      MAP(v64_unpackhi_u16_s32),
    548                      MAP(v64_unpackhi_s16_s32),
    549                      MAP(imm_v64_shr_n_byte<1>),
    550                      MAP(imm_v64_shr_n_byte<2>),
    551                      MAP(imm_v64_shr_n_byte<3>),
    552                      MAP(imm_v64_shr_n_byte<4>),
    553                      MAP(imm_v64_shr_n_byte<5>),
    554                      MAP(imm_v64_shr_n_byte<6>),
    555                      MAP(imm_v64_shr_n_byte<7>),
    556                      MAP(imm_v64_shl_n_byte<1>),
    557                      MAP(imm_v64_shl_n_byte<2>),
    558                      MAP(imm_v64_shl_n_byte<3>),
    559                      MAP(imm_v64_shl_n_byte<4>),
    560                      MAP(imm_v64_shl_n_byte<5>),
    561                      MAP(imm_v64_shl_n_byte<6>),
    562                      MAP(imm_v64_shl_n_byte<7>),
    563                      MAP(imm_v64_shl_n_8<1>),
    564                      MAP(imm_v64_shl_n_8<2>),
    565                      MAP(imm_v64_shl_n_8<3>),
    566                      MAP(imm_v64_shl_n_8<4>),
    567                      MAP(imm_v64_shl_n_8<5>),
    568                      MAP(imm_v64_shl_n_8<6>),
    569                      MAP(imm_v64_shl_n_8<7>),
    570                      MAP(imm_v64_shr_n_u8<1>),
    571                      MAP(imm_v64_shr_n_u8<2>),
    572                      MAP(imm_v64_shr_n_u8<3>),
    573                      MAP(imm_v64_shr_n_u8<4>),
    574                      MAP(imm_v64_shr_n_u8<5>),
    575                      MAP(imm_v64_shr_n_u8<6>),
    576                      MAP(imm_v64_shr_n_u8<7>),
    577                      MAP(imm_v64_shr_n_s8<1>),
    578                      MAP(imm_v64_shr_n_s8<2>),
    579                      MAP(imm_v64_shr_n_s8<3>),
    580                      MAP(imm_v64_shr_n_s8<4>),
    581                      MAP(imm_v64_shr_n_s8<5>),
    582                      MAP(imm_v64_shr_n_s8<6>),
    583                      MAP(imm_v64_shr_n_s8<7>),
    584                      MAP(imm_v64_shl_n_16<1>),
    585                      MAP(imm_v64_shl_n_16<2>),
    586                      MAP(imm_v64_shl_n_16<4>),
    587                      MAP(imm_v64_shl_n_16<6>),
    588                      MAP(imm_v64_shl_n_16<8>),
    589                      MAP(imm_v64_shl_n_16<10>),
    590                      MAP(imm_v64_shl_n_16<12>),
    591                      MAP(imm_v64_shl_n_16<14>),
    592                      MAP(imm_v64_shr_n_u16<1>),
    593                      MAP(imm_v64_shr_n_u16<2>),
    594                      MAP(imm_v64_shr_n_u16<4>),
    595                      MAP(imm_v64_shr_n_u16<6>),
    596                      MAP(imm_v64_shr_n_u16<8>),
    597                      MAP(imm_v64_shr_n_u16<10>),
    598                      MAP(imm_v64_shr_n_u16<12>),
    599                      MAP(imm_v64_shr_n_u16<14>),
    600                      MAP(imm_v64_shr_n_s16<1>),
    601                      MAP(imm_v64_shr_n_s16<2>),
    602                      MAP(imm_v64_shr_n_s16<4>),
    603                      MAP(imm_v64_shr_n_s16<6>),
    604                      MAP(imm_v64_shr_n_s16<8>),
    605                      MAP(imm_v64_shr_n_s16<10>),
    606                      MAP(imm_v64_shr_n_s16<12>),
    607                      MAP(imm_v64_shr_n_s16<14>),
    608                      MAP(imm_v64_shl_n_32<1>),
    609                      MAP(imm_v64_shl_n_32<4>),
    610                      MAP(imm_v64_shl_n_32<8>),
    611                      MAP(imm_v64_shl_n_32<12>),
    612                      MAP(imm_v64_shl_n_32<16>),
    613                      MAP(imm_v64_shl_n_32<20>),
    614                      MAP(imm_v64_shl_n_32<24>),
    615                      MAP(imm_v64_shl_n_32<28>),
    616                      MAP(imm_v64_shr_n_u32<1>),
    617                      MAP(imm_v64_shr_n_u32<4>),
    618                      MAP(imm_v64_shr_n_u32<8>),
    619                      MAP(imm_v64_shr_n_u32<12>),
    620                      MAP(imm_v64_shr_n_u32<16>),
    621                      MAP(imm_v64_shr_n_u32<20>),
    622                      MAP(imm_v64_shr_n_u32<24>),
    623                      MAP(imm_v64_shr_n_u32<28>),
    624                      MAP(imm_v64_shr_n_s32<1>),
    625                      MAP(imm_v64_shr_n_s32<4>),
    626                      MAP(imm_v64_shr_n_s32<8>),
    627                      MAP(imm_v64_shr_n_s32<12>),
    628                      MAP(imm_v64_shr_n_s32<16>),
    629                      MAP(imm_v64_shr_n_s32<20>),
    630                      MAP(imm_v64_shr_n_s32<24>),
    631                      MAP(imm_v64_shr_n_s32<28>),
    632                      MAP(v64_shl_8),
    633                      MAP(v64_shr_u8),
    634                      MAP(v64_shr_s8),
    635                      MAP(v64_shl_16),
    636                      MAP(v64_shr_u16),
    637                      MAP(v64_shr_s16),
    638                      MAP(v64_shl_32),
    639                      MAP(v64_shr_u32),
    640                      MAP(v64_shr_s32),
    641                      MAP(v64_hadd_u8),
    642                      MAP(v64_hadd_s16),
    643                      MAP(v64_dotp_s16),
    644                      MAP(v64_dotp_su8),
    645                      MAP(v64_u64),
    646                      MAP(v64_low_u32),
    647                      MAP(v64_high_u32),
    648                      MAP(v64_low_s32),
    649                      MAP(v64_high_s32),
    650                      MAP(v64_dup_8),
    651                      MAP(v64_dup_16),
    652                      MAP(v64_dup_32),
    653                      MAP(v64_from_32),
    654                      MAP(v64_zero),
    655                      MAP(v64_from_16),
    656                      MAP(v128_sad_u8),
    657                      MAP(v128_ssd_u8),
    658                      MAP(v128_sad_u16),
    659                      MAP(v128_ssd_s16),
    660                      MAP(v128_add_8),
    661                      MAP(v128_add_16),
    662                      MAP(v128_sadd_s8),
    663                      MAP(v128_sadd_u8),
    664                      MAP(v128_sadd_s16),
    665                      MAP(v128_add_32),
    666                      MAP(v128_add_64),
    667                      MAP(v128_sub_8),
    668                      MAP(v128_ssub_u8),
    669                      MAP(v128_ssub_s8),
    670                      MAP(v128_sub_16),
    671                      MAP(v128_ssub_s16),
    672                      MAP(v128_ssub_u16),
    673                      MAP(v128_sub_32),
    674                      MAP(v128_sub_64),
    675                      MAP(v128_ziplo_8),
    676                      MAP(v128_ziphi_8),
    677                      MAP(v128_ziplo_16),
    678                      MAP(v128_ziphi_16),
    679                      MAP(v128_ziplo_32),
    680                      MAP(v128_ziphi_32),
    681                      MAP(v128_ziplo_64),
    682                      MAP(v128_ziphi_64),
    683                      MAP(v128_unziphi_8),
    684                      MAP(v128_unziplo_8),
    685                      MAP(v128_unziphi_16),
    686                      MAP(v128_unziplo_16),
    687                      MAP(v128_unziphi_32),
    688                      MAP(v128_unziplo_32),
    689                      MAP(v128_pack_s32_u16),
    690                      MAP(v128_pack_s32_s16),
    691                      MAP(v128_pack_s16_u8),
    692                      MAP(v128_pack_s16_s8),
    693                      MAP(v128_or),
    694                      MAP(v128_xor),
    695                      MAP(v128_and),
    696                      MAP(v128_andn),
    697                      MAP(v128_mullo_s16),
    698                      MAP(v128_mulhi_s16),
    699                      MAP(v128_mullo_s32),
    700                      MAP(v128_madd_s16),
    701                      MAP(v128_madd_us8),
    702                      MAP(v128_avg_u8),
    703                      MAP(v128_rdavg_u8),
    704                      MAP(v128_rdavg_u16),
    705                      MAP(v128_avg_u16),
    706                      MAP(v128_min_u8),
    707                      MAP(v128_max_u8),
    708                      MAP(v128_min_s8),
    709                      MAP(v128_max_s8),
    710                      MAP(v128_min_s16),
    711                      MAP(v128_max_s16),
    712                      MAP(v128_min_s32),
    713                      MAP(v128_max_s32),
    714                      MAP(v128_cmpgt_s8),
    715                      MAP(v128_cmplt_s8),
    716                      MAP(v128_cmpeq_8),
    717                      MAP(v128_cmpgt_s16),
    718                      MAP(v128_cmpeq_16),
    719                      MAP(v128_cmplt_s16),
    720                      MAP(v128_cmpgt_s32),
    721                      MAP(v128_cmpeq_32),
    722                      MAP(v128_cmplt_s32),
    723                      MAP(v128_shuffle_8),
    724                      MAP(imm_v128_align<1>),
    725                      MAP(imm_v128_align<2>),
    726                      MAP(imm_v128_align<3>),
    727                      MAP(imm_v128_align<4>),
    728                      MAP(imm_v128_align<5>),
    729                      MAP(imm_v128_align<6>),
    730                      MAP(imm_v128_align<7>),
    731                      MAP(imm_v128_align<8>),
    732                      MAP(imm_v128_align<9>),
    733                      MAP(imm_v128_align<10>),
    734                      MAP(imm_v128_align<11>),
    735                      MAP(imm_v128_align<12>),
    736                      MAP(imm_v128_align<13>),
    737                      MAP(imm_v128_align<14>),
    738                      MAP(imm_v128_align<15>),
    739                      MAP(v128_abs_s8),
    740                      MAP(v128_abs_s16),
    741                      MAP(v128_padd_u8),
    742                      MAP(v128_padd_s16),
    743                      MAP(v128_unpacklo_u16_s32),
    744                      MAP(v128_unpacklo_s16_s32),
    745                      MAP(v128_unpackhi_u16_s32),
    746                      MAP(v128_unpackhi_s16_s32),
    747                      MAP(imm_v128_shr_n_byte<1>),
    748                      MAP(imm_v128_shr_n_byte<2>),
    749                      MAP(imm_v128_shr_n_byte<3>),
    750                      MAP(imm_v128_shr_n_byte<4>),
    751                      MAP(imm_v128_shr_n_byte<5>),
    752                      MAP(imm_v128_shr_n_byte<6>),
    753                      MAP(imm_v128_shr_n_byte<7>),
    754                      MAP(imm_v128_shr_n_byte<8>),
    755                      MAP(imm_v128_shr_n_byte<9>),
    756                      MAP(imm_v128_shr_n_byte<10>),
    757                      MAP(imm_v128_shr_n_byte<11>),
    758                      MAP(imm_v128_shr_n_byte<12>),
    759                      MAP(imm_v128_shr_n_byte<13>),
    760                      MAP(imm_v128_shr_n_byte<14>),
    761                      MAP(imm_v128_shr_n_byte<15>),
    762                      MAP(imm_v128_shl_n_byte<1>),
    763                      MAP(imm_v128_shl_n_byte<2>),
    764                      MAP(imm_v128_shl_n_byte<3>),
    765                      MAP(imm_v128_shl_n_byte<4>),
    766                      MAP(imm_v128_shl_n_byte<5>),
    767                      MAP(imm_v128_shl_n_byte<6>),
    768                      MAP(imm_v128_shl_n_byte<7>),
    769                      MAP(imm_v128_shl_n_byte<8>),
    770                      MAP(imm_v128_shl_n_byte<9>),
    771                      MAP(imm_v128_shl_n_byte<10>),
    772                      MAP(imm_v128_shl_n_byte<11>),
    773                      MAP(imm_v128_shl_n_byte<12>),
    774                      MAP(imm_v128_shl_n_byte<13>),
    775                      MAP(imm_v128_shl_n_byte<14>),
    776                      MAP(imm_v128_shl_n_byte<15>),
    777                      MAP(imm_v128_shl_n_8<1>),
    778                      MAP(imm_v128_shl_n_8<2>),
    779                      MAP(imm_v128_shl_n_8<3>),
    780                      MAP(imm_v128_shl_n_8<4>),
    781                      MAP(imm_v128_shl_n_8<5>),
    782                      MAP(imm_v128_shl_n_8<6>),
    783                      MAP(imm_v128_shl_n_8<7>),
    784                      MAP(imm_v128_shr_n_u8<1>),
    785                      MAP(imm_v128_shr_n_u8<2>),
    786                      MAP(imm_v128_shr_n_u8<3>),
    787                      MAP(imm_v128_shr_n_u8<4>),
    788                      MAP(imm_v128_shr_n_u8<5>),
    789                      MAP(imm_v128_shr_n_u8<6>),
    790                      MAP(imm_v128_shr_n_u8<7>),
    791                      MAP(imm_v128_shr_n_s8<1>),
    792                      MAP(imm_v128_shr_n_s8<2>),
    793                      MAP(imm_v128_shr_n_s8<3>),
    794                      MAP(imm_v128_shr_n_s8<4>),
    795                      MAP(imm_v128_shr_n_s8<5>),
    796                      MAP(imm_v128_shr_n_s8<6>),
    797                      MAP(imm_v128_shr_n_s8<7>),
    798                      MAP(imm_v128_shl_n_16<1>),
    799                      MAP(imm_v128_shl_n_16<2>),
    800                      MAP(imm_v128_shl_n_16<4>),
    801                      MAP(imm_v128_shl_n_16<6>),
    802                      MAP(imm_v128_shl_n_16<8>),
    803                      MAP(imm_v128_shl_n_16<10>),
    804                      MAP(imm_v128_shl_n_16<12>),
    805                      MAP(imm_v128_shl_n_16<14>),
    806                      MAP(imm_v128_shr_n_u16<1>),
    807                      MAP(imm_v128_shr_n_u16<2>),
    808                      MAP(imm_v128_shr_n_u16<4>),
    809                      MAP(imm_v128_shr_n_u16<6>),
    810                      MAP(imm_v128_shr_n_u16<8>),
    811                      MAP(imm_v128_shr_n_u16<10>),
    812                      MAP(imm_v128_shr_n_u16<12>),
    813                      MAP(imm_v128_shr_n_u16<14>),
    814                      MAP(imm_v128_shr_n_s16<1>),
    815                      MAP(imm_v128_shr_n_s16<2>),
    816                      MAP(imm_v128_shr_n_s16<4>),
    817                      MAP(imm_v128_shr_n_s16<6>),
    818                      MAP(imm_v128_shr_n_s16<8>),
    819                      MAP(imm_v128_shr_n_s16<10>),
    820                      MAP(imm_v128_shr_n_s16<12>),
    821                      MAP(imm_v128_shr_n_s16<14>),
    822                      MAP(imm_v128_shl_n_32<1>),
    823                      MAP(imm_v128_shl_n_32<4>),
    824                      MAP(imm_v128_shl_n_32<8>),
    825                      MAP(imm_v128_shl_n_32<12>),
    826                      MAP(imm_v128_shl_n_32<16>),
    827                      MAP(imm_v128_shl_n_32<20>),
    828                      MAP(imm_v128_shl_n_32<24>),
    829                      MAP(imm_v128_shl_n_32<28>),
    830                      MAP(imm_v128_shr_n_u32<1>),
    831                      MAP(imm_v128_shr_n_u32<4>),
    832                      MAP(imm_v128_shr_n_u32<8>),
    833                      MAP(imm_v128_shr_n_u32<12>),
    834                      MAP(imm_v128_shr_n_u32<16>),
    835                      MAP(imm_v128_shr_n_u32<20>),
    836                      MAP(imm_v128_shr_n_u32<24>),
    837                      MAP(imm_v128_shr_n_u32<28>),
    838                      MAP(imm_v128_shr_n_s32<1>),
    839                      MAP(imm_v128_shr_n_s32<4>),
    840                      MAP(imm_v128_shr_n_s32<8>),
    841                      MAP(imm_v128_shr_n_s32<12>),
    842                      MAP(imm_v128_shr_n_s32<16>),
    843                      MAP(imm_v128_shr_n_s32<20>),
    844                      MAP(imm_v128_shr_n_s32<24>),
    845                      MAP(imm_v128_shr_n_s32<28>),
    846                      MAP(imm_v128_shl_n_64<1>),
    847                      MAP(imm_v128_shl_n_64<4>),
    848                      MAP(imm_v128_shl_n_64<8>),
    849                      MAP(imm_v128_shl_n_64<12>),
    850                      MAP(imm_v128_shl_n_64<16>),
    851                      MAP(imm_v128_shl_n_64<20>),
    852                      MAP(imm_v128_shl_n_64<24>),
    853                      MAP(imm_v128_shl_n_64<28>),
    854                      MAP(imm_v128_shl_n_64<32>),
    855                      MAP(imm_v128_shl_n_64<36>),
    856                      MAP(imm_v128_shl_n_64<40>),
    857                      MAP(imm_v128_shl_n_64<44>),
    858                      MAP(imm_v128_shl_n_64<48>),
    859                      MAP(imm_v128_shl_n_64<52>),
    860                      MAP(imm_v128_shl_n_64<56>),
    861                      MAP(imm_v128_shl_n_64<60>),
    862                      MAP(imm_v128_shr_n_u64<1>),
    863                      MAP(imm_v128_shr_n_u64<4>),
    864                      MAP(imm_v128_shr_n_u64<8>),
    865                      MAP(imm_v128_shr_n_u64<12>),
    866                      MAP(imm_v128_shr_n_u64<16>),
    867                      MAP(imm_v128_shr_n_u64<20>),
    868                      MAP(imm_v128_shr_n_u64<24>),
    869                      MAP(imm_v128_shr_n_u64<28>),
    870                      MAP(imm_v128_shr_n_u64<32>),
    871                      MAP(imm_v128_shr_n_u64<36>),
    872                      MAP(imm_v128_shr_n_u64<40>),
    873                      MAP(imm_v128_shr_n_u64<44>),
    874                      MAP(imm_v128_shr_n_u64<48>),
    875                      MAP(imm_v128_shr_n_u64<52>),
    876                      MAP(imm_v128_shr_n_u64<56>),
    877                      MAP(imm_v128_shr_n_u64<60>),
    878                      MAP(imm_v128_shr_n_s64<1>),
    879                      MAP(imm_v128_shr_n_s64<4>),
    880                      MAP(imm_v128_shr_n_s64<8>),
    881                      MAP(imm_v128_shr_n_s64<12>),
    882                      MAP(imm_v128_shr_n_s64<16>),
    883                      MAP(imm_v128_shr_n_s64<20>),
    884                      MAP(imm_v128_shr_n_s64<24>),
    885                      MAP(imm_v128_shr_n_s64<28>),
    886                      MAP(imm_v128_shr_n_s64<32>),
    887                      MAP(imm_v128_shr_n_s64<36>),
    888                      MAP(imm_v128_shr_n_s64<40>),
    889                      MAP(imm_v128_shr_n_s64<44>),
    890                      MAP(imm_v128_shr_n_s64<48>),
    891                      MAP(imm_v128_shr_n_s64<52>),
    892                      MAP(imm_v128_shr_n_s64<56>),
    893                      MAP(imm_v128_shr_n_s64<60>),
    894                      MAP(v128_from_v64),
    895                      MAP(v128_zip_8),
    896                      MAP(v128_zip_16),
    897                      MAP(v128_zip_32),
    898                      MAP(v128_mul_s16),
    899                      MAP(v128_unpack_u8_s16),
    900                      MAP(v128_unpack_s8_s16),
    901                      MAP(v128_unpack_u16_s32),
    902                      MAP(v128_unpack_s16_s32),
    903                      MAP(v128_shl_8),
    904                      MAP(v128_shr_u8),
    905                      MAP(v128_shr_s8),
    906                      MAP(v128_shl_16),
    907                      MAP(v128_shr_u16),
    908                      MAP(v128_shr_s16),
    909                      MAP(v128_shl_32),
    910                      MAP(v128_shr_u32),
    911                      MAP(v128_shr_s32),
    912                      MAP(v128_shl_64),
    913                      MAP(v128_shr_u64),
    914                      MAP(v128_shr_s64),
    915                      MAP(v128_hadd_u8),
    916                      MAP(v128_dotp_su8),
    917                      MAP(v128_dotp_s16),
    918                      MAP(v128_dotp_s32),
    919                      MAP(v128_low_u32),
    920                      MAP(v128_low_v64),
    921                      MAP(v128_high_v64),
    922                      MAP(v128_from_64),
    923                      MAP(v128_from_32),
    924                      MAP(v128_movemask_8),
    925                      MAP(v128_zero),
    926                      MAP(v128_dup_8),
    927                      MAP(v128_dup_16),
    928                      MAP(v128_dup_32),
    929                      MAP(v128_dup_64),
    930                      MAP(v128_unpacklo_u8_s16),
    931                      MAP(v128_unpackhi_u8_s16),
    932                      MAP(v128_unpacklo_s8_s16),
    933                      MAP(v128_unpackhi_s8_s16),
    934                      MAP(v128_blend_8),
    935                      MAP(u32_load_unaligned),
    936                      MAP(u32_store_unaligned),
    937                      MAP(v64_load_unaligned),
    938                      MAP(v64_store_unaligned),
    939                      MAP(v128_load_unaligned),
    940                      MAP(v128_store_unaligned),
    941                      MAP(v256_sad_u8),
    942                      MAP(v256_ssd_u8),
    943                      MAP(v256_sad_u16),
    944                      MAP(v256_ssd_s16),
    945                      MAP(v256_hadd_u8),
    946                      MAP(v256_low_u64),
    947                      MAP(v256_dotp_su8),
    948                      MAP(v256_dotp_s16),
    949                      MAP(v256_dotp_s32),
    950                      MAP(v256_add_8),
    951                      MAP(v256_add_16),
    952                      MAP(v256_sadd_s8),
    953                      MAP(v256_sadd_u8),
    954                      MAP(v256_sadd_s16),
    955                      MAP(v256_add_32),
    956                      MAP(v256_add_64),
    957                      MAP(v256_sub_8),
    958                      MAP(v256_ssub_u8),
    959                      MAP(v256_ssub_s8),
    960                      MAP(v256_sub_16),
    961                      MAP(v256_ssub_u16),
    962                      MAP(v256_ssub_s16),
    963                      MAP(v256_sub_32),
    964                      MAP(v256_sub_64),
    965                      MAP(v256_ziplo_8),
    966                      MAP(v256_ziphi_8),
    967                      MAP(v256_ziplo_16),
    968                      MAP(v256_ziphi_16),
    969                      MAP(v256_ziplo_32),
    970                      MAP(v256_ziphi_32),
    971                      MAP(v256_ziplo_64),
    972                      MAP(v256_ziphi_64),
    973                      MAP(v256_unziphi_8),
    974                      MAP(v256_unziplo_8),
    975                      MAP(v256_unziphi_16),
    976                      MAP(v256_unziplo_16),
    977                      MAP(v256_unziphi_32),
    978                      MAP(v256_unziplo_32),
    979                      MAP(v256_unziphi_64),
    980                      MAP(v256_unziplo_64),
    981                      MAP(v256_pack_s32_u16),
    982                      MAP(v256_pack_s32_s16),
    983                      MAP(v256_pack_s16_u8),
    984                      MAP(v256_pack_s16_s8),
    985                      MAP(v256_or),
    986                      MAP(v256_xor),
    987                      MAP(v256_and),
    988                      MAP(v256_andn),
    989                      MAP(v256_mullo_s16),
    990                      MAP(v256_mulhi_s16),
    991                      MAP(v256_mullo_s32),
    992                      MAP(v256_madd_s16),
    993                      MAP(v256_madd_us8),
    994                      MAP(v256_avg_u8),
    995                      MAP(v256_rdavg_u8),
    996                      MAP(v256_rdavg_u16),
    997                      MAP(v256_avg_u16),
    998                      MAP(v256_min_u8),
    999                      MAP(v256_max_u8),
   1000                      MAP(v256_min_s8),
   1001                      MAP(v256_max_s8),
   1002                      MAP(v256_min_s16),
   1003                      MAP(v256_max_s16),
   1004                      MAP(v256_min_s32),
   1005                      MAP(v256_max_s32),
   1006                      MAP(v256_cmpgt_s8),
   1007                      MAP(v256_cmplt_s8),
   1008                      MAP(v256_cmpeq_8),
   1009                      MAP(v256_cmpgt_s16),
   1010                      MAP(v256_cmplt_s16),
   1011                      MAP(v256_cmpeq_16),
   1012                      MAP(v256_cmpgt_s32),
   1013                      MAP(v256_cmplt_s32),
   1014                      MAP(v256_cmpeq_32),
   1015                      MAP(v256_shuffle_8),
   1016                      MAP(v256_pshuffle_8),
   1017                      MAP(v256_wideshuffle_8),
   1018                      MAP(imm_v256_align<1>),
   1019                      MAP(imm_v256_align<2>),
   1020                      MAP(imm_v256_align<3>),
   1021                      MAP(imm_v256_align<4>),
   1022                      MAP(imm_v256_align<5>),
   1023                      MAP(imm_v256_align<6>),
   1024                      MAP(imm_v256_align<7>),
   1025                      MAP(imm_v256_align<8>),
   1026                      MAP(imm_v256_align<9>),
   1027                      MAP(imm_v256_align<10>),
   1028                      MAP(imm_v256_align<11>),
   1029                      MAP(imm_v256_align<12>),
   1030                      MAP(imm_v256_align<13>),
   1031                      MAP(imm_v256_align<14>),
   1032                      MAP(imm_v256_align<15>),
   1033                      MAP(imm_v256_align<16>),
   1034                      MAP(imm_v256_align<17>),
   1035                      MAP(imm_v256_align<18>),
   1036                      MAP(imm_v256_align<19>),
   1037                      MAP(imm_v256_align<20>),
   1038                      MAP(imm_v256_align<21>),
   1039                      MAP(imm_v256_align<22>),
   1040                      MAP(imm_v256_align<23>),
   1041                      MAP(imm_v256_align<24>),
   1042                      MAP(imm_v256_align<25>),
   1043                      MAP(imm_v256_align<26>),
   1044                      MAP(imm_v256_align<27>),
   1045                      MAP(imm_v256_align<28>),
   1046                      MAP(imm_v256_align<29>),
   1047                      MAP(imm_v256_align<30>),
   1048                      MAP(imm_v256_align<31>),
   1049                      MAP(v256_from_v128),
   1050                      MAP(v256_zip_8),
   1051                      MAP(v256_zip_16),
   1052                      MAP(v256_zip_32),
   1053                      MAP(v256_mul_s16),
   1054                      MAP(v256_unpack_u8_s16),
   1055                      MAP(v256_unpack_s8_s16),
   1056                      MAP(v256_unpack_u16_s32),
   1057                      MAP(v256_unpack_s16_s32),
   1058                      MAP(v256_shl_8),
   1059                      MAP(v256_shr_u8),
   1060                      MAP(v256_shr_s8),
   1061                      MAP(v256_shl_16),
   1062                      MAP(v256_shr_u16),
   1063                      MAP(v256_shr_s16),
   1064                      MAP(v256_shl_32),
   1065                      MAP(v256_shr_u32),
   1066                      MAP(v256_shr_s32),
   1067                      MAP(v256_shl_64),
   1068                      MAP(v256_shr_u64),
   1069                      MAP(v256_shr_s64),
   1070                      MAP(v256_abs_s8),
   1071                      MAP(v256_abs_s16),
   1072                      MAP(v256_padd_u8),
   1073                      MAP(v256_padd_s16),
   1074                      MAP(v256_unpacklo_u16_s32),
   1075                      MAP(v256_unpacklo_s16_s32),
   1076                      MAP(v256_unpackhi_u16_s32),
   1077                      MAP(v256_unpackhi_s16_s32),
   1078                      MAP(imm_v256_shr_n_word<1>),
   1079                      MAP(imm_v256_shr_n_word<2>),
   1080                      MAP(imm_v256_shr_n_word<3>),
   1081                      MAP(imm_v256_shr_n_word<4>),
   1082                      MAP(imm_v256_shr_n_word<5>),
   1083                      MAP(imm_v256_shr_n_word<6>),
   1084                      MAP(imm_v256_shr_n_word<7>),
   1085                      MAP(imm_v256_shr_n_word<8>),
   1086                      MAP(imm_v256_shr_n_word<9>),
   1087                      MAP(imm_v256_shr_n_word<10>),
   1088                      MAP(imm_v256_shr_n_word<11>),
   1089                      MAP(imm_v256_shr_n_word<12>),
   1090                      MAP(imm_v256_shr_n_word<13>),
   1091                      MAP(imm_v256_shr_n_word<14>),
   1092                      MAP(imm_v256_shr_n_word<15>),
   1093                      MAP(imm_v256_shl_n_word<1>),
   1094                      MAP(imm_v256_shl_n_word<2>),
   1095                      MAP(imm_v256_shl_n_word<3>),
   1096                      MAP(imm_v256_shl_n_word<4>),
   1097                      MAP(imm_v256_shl_n_word<5>),
   1098                      MAP(imm_v256_shl_n_word<6>),
   1099                      MAP(imm_v256_shl_n_word<7>),
   1100                      MAP(imm_v256_shl_n_word<8>),
   1101                      MAP(imm_v256_shl_n_word<9>),
   1102                      MAP(imm_v256_shl_n_word<10>),
   1103                      MAP(imm_v256_shl_n_word<11>),
   1104                      MAP(imm_v256_shl_n_word<12>),
   1105                      MAP(imm_v256_shl_n_word<13>),
   1106                      MAP(imm_v256_shl_n_word<14>),
   1107                      MAP(imm_v256_shl_n_word<15>),
   1108                      MAP(imm_v256_shr_n_byte<1>),
   1109                      MAP(imm_v256_shr_n_byte<2>),
   1110                      MAP(imm_v256_shr_n_byte<3>),
   1111                      MAP(imm_v256_shr_n_byte<4>),
   1112                      MAP(imm_v256_shr_n_byte<5>),
   1113                      MAP(imm_v256_shr_n_byte<6>),
   1114                      MAP(imm_v256_shr_n_byte<7>),
   1115                      MAP(imm_v256_shr_n_byte<8>),
   1116                      MAP(imm_v256_shr_n_byte<9>),
   1117                      MAP(imm_v256_shr_n_byte<10>),
   1118                      MAP(imm_v256_shr_n_byte<11>),
   1119                      MAP(imm_v256_shr_n_byte<12>),
   1120                      MAP(imm_v256_shr_n_byte<13>),
   1121                      MAP(imm_v256_shr_n_byte<14>),
   1122                      MAP(imm_v256_shr_n_byte<15>),
   1123                      MAP(imm_v256_shr_n_byte<16>),
   1124                      MAP(imm_v256_shr_n_byte<17>),
   1125                      MAP(imm_v256_shr_n_byte<18>),
   1126                      MAP(imm_v256_shr_n_byte<19>),
   1127                      MAP(imm_v256_shr_n_byte<20>),
   1128                      MAP(imm_v256_shr_n_byte<21>),
   1129                      MAP(imm_v256_shr_n_byte<22>),
   1130                      MAP(imm_v256_shr_n_byte<23>),
   1131                      MAP(imm_v256_shr_n_byte<24>),
   1132                      MAP(imm_v256_shr_n_byte<25>),
   1133                      MAP(imm_v256_shr_n_byte<26>),
   1134                      MAP(imm_v256_shr_n_byte<27>),
   1135                      MAP(imm_v256_shr_n_byte<28>),
   1136                      MAP(imm_v256_shr_n_byte<29>),
   1137                      MAP(imm_v256_shr_n_byte<30>),
   1138                      MAP(imm_v256_shr_n_byte<31>),
   1139                      MAP(imm_v256_shl_n_byte<1>),
   1140                      MAP(imm_v256_shl_n_byte<2>),
   1141                      MAP(imm_v256_shl_n_byte<3>),
   1142                      MAP(imm_v256_shl_n_byte<4>),
   1143                      MAP(imm_v256_shl_n_byte<5>),
   1144                      MAP(imm_v256_shl_n_byte<6>),
   1145                      MAP(imm_v256_shl_n_byte<7>),
   1146                      MAP(imm_v256_shl_n_byte<8>),
   1147                      MAP(imm_v256_shl_n_byte<9>),
   1148                      MAP(imm_v256_shl_n_byte<10>),
   1149                      MAP(imm_v256_shl_n_byte<11>),
   1150                      MAP(imm_v256_shl_n_byte<12>),
   1151                      MAP(imm_v256_shl_n_byte<13>),
   1152                      MAP(imm_v256_shl_n_byte<14>),
   1153                      MAP(imm_v256_shl_n_byte<15>),
   1154                      MAP(imm_v256_shl_n_byte<16>),
   1155                      MAP(imm_v256_shl_n_byte<17>),
   1156                      MAP(imm_v256_shl_n_byte<18>),
   1157                      MAP(imm_v256_shl_n_byte<19>),
   1158                      MAP(imm_v256_shl_n_byte<20>),
   1159                      MAP(imm_v256_shl_n_byte<21>),
   1160                      MAP(imm_v256_shl_n_byte<22>),
   1161                      MAP(imm_v256_shl_n_byte<23>),
   1162                      MAP(imm_v256_shl_n_byte<24>),
   1163                      MAP(imm_v256_shl_n_byte<25>),
   1164                      MAP(imm_v256_shl_n_byte<26>),
   1165                      MAP(imm_v256_shl_n_byte<27>),
   1166                      MAP(imm_v256_shl_n_byte<28>),
   1167                      MAP(imm_v256_shl_n_byte<29>),
   1168                      MAP(imm_v256_shl_n_byte<30>),
   1169                      MAP(imm_v256_shl_n_byte<31>),
   1170                      MAP(imm_v256_shl_n_8<1>),
   1171                      MAP(imm_v256_shl_n_8<2>),
   1172                      MAP(imm_v256_shl_n_8<3>),
   1173                      MAP(imm_v256_shl_n_8<4>),
   1174                      MAP(imm_v256_shl_n_8<5>),
   1175                      MAP(imm_v256_shl_n_8<6>),
   1176                      MAP(imm_v256_shl_n_8<7>),
   1177                      MAP(imm_v256_shr_n_u8<1>),
   1178                      MAP(imm_v256_shr_n_u8<2>),
   1179                      MAP(imm_v256_shr_n_u8<3>),
   1180                      MAP(imm_v256_shr_n_u8<4>),
   1181                      MAP(imm_v256_shr_n_u8<5>),
   1182                      MAP(imm_v256_shr_n_u8<6>),
   1183                      MAP(imm_v256_shr_n_u8<7>),
   1184                      MAP(imm_v256_shr_n_s8<1>),
   1185                      MAP(imm_v256_shr_n_s8<2>),
   1186                      MAP(imm_v256_shr_n_s8<3>),
   1187                      MAP(imm_v256_shr_n_s8<4>),
   1188                      MAP(imm_v256_shr_n_s8<5>),
   1189                      MAP(imm_v256_shr_n_s8<6>),
   1190                      MAP(imm_v256_shr_n_s8<7>),
   1191                      MAP(imm_v256_shl_n_16<1>),
   1192                      MAP(imm_v256_shl_n_16<2>),
   1193                      MAP(imm_v256_shl_n_16<4>),
   1194                      MAP(imm_v256_shl_n_16<6>),
   1195                      MAP(imm_v256_shl_n_16<8>),
   1196                      MAP(imm_v256_shl_n_16<10>),
   1197                      MAP(imm_v256_shl_n_16<12>),
   1198                      MAP(imm_v256_shl_n_16<14>),
   1199                      MAP(imm_v256_shr_n_u16<1>),
   1200                      MAP(imm_v256_shr_n_u16<2>),
   1201                      MAP(imm_v256_shr_n_u16<4>),
   1202                      MAP(imm_v256_shr_n_u16<6>),
   1203                      MAP(imm_v256_shr_n_u16<8>),
   1204                      MAP(imm_v256_shr_n_u16<10>),
   1205                      MAP(imm_v256_shr_n_u16<12>),
   1206                      MAP(imm_v256_shr_n_u16<14>),
   1207                      MAP(imm_v256_shr_n_s16<1>),
   1208                      MAP(imm_v256_shr_n_s16<2>),
   1209                      MAP(imm_v256_shr_n_s16<4>),
   1210                      MAP(imm_v256_shr_n_s16<6>),
   1211                      MAP(imm_v256_shr_n_s16<8>),
   1212                      MAP(imm_v256_shr_n_s16<10>),
   1213                      MAP(imm_v256_shr_n_s16<12>),
   1214                      MAP(imm_v256_shr_n_s16<14>),
   1215                      MAP(imm_v256_shl_n_32<1>),
   1216                      MAP(imm_v256_shl_n_32<4>),
   1217                      MAP(imm_v256_shl_n_32<8>),
   1218                      MAP(imm_v256_shl_n_32<12>),
   1219                      MAP(imm_v256_shl_n_32<16>),
   1220                      MAP(imm_v256_shl_n_32<20>),
   1221                      MAP(imm_v256_shl_n_32<24>),
   1222                      MAP(imm_v256_shl_n_32<28>),
   1223                      MAP(imm_v256_shr_n_u32<1>),
   1224                      MAP(imm_v256_shr_n_u32<4>),
   1225                      MAP(imm_v256_shr_n_u32<8>),
   1226                      MAP(imm_v256_shr_n_u32<12>),
   1227                      MAP(imm_v256_shr_n_u32<16>),
   1228                      MAP(imm_v256_shr_n_u32<20>),
   1229                      MAP(imm_v256_shr_n_u32<24>),
   1230                      MAP(imm_v256_shr_n_u32<28>),
   1231                      MAP(imm_v256_shr_n_s32<1>),
   1232                      MAP(imm_v256_shr_n_s32<4>),
   1233                      MAP(imm_v256_shr_n_s32<8>),
   1234                      MAP(imm_v256_shr_n_s32<12>),
   1235                      MAP(imm_v256_shr_n_s32<16>),
   1236                      MAP(imm_v256_shr_n_s32<20>),
   1237                      MAP(imm_v256_shr_n_s32<24>),
   1238                      MAP(imm_v256_shr_n_s32<28>),
   1239                      MAP(imm_v256_shl_n_64<1>),
   1240                      MAP(imm_v256_shl_n_64<4>),
   1241                      MAP(imm_v256_shl_n_64<8>),
   1242                      MAP(imm_v256_shl_n_64<12>),
   1243                      MAP(imm_v256_shl_n_64<16>),
   1244                      MAP(imm_v256_shl_n_64<20>),
   1245                      MAP(imm_v256_shl_n_64<24>),
   1246                      MAP(imm_v256_shl_n_64<28>),
   1247                      MAP(imm_v256_shl_n_64<32>),
   1248                      MAP(imm_v256_shl_n_64<36>),
   1249                      MAP(imm_v256_shl_n_64<40>),
   1250                      MAP(imm_v256_shl_n_64<44>),
   1251                      MAP(imm_v256_shl_n_64<48>),
   1252                      MAP(imm_v256_shl_n_64<52>),
   1253                      MAP(imm_v256_shl_n_64<56>),
   1254                      MAP(imm_v256_shl_n_64<60>),
   1255                      MAP(imm_v256_shr_n_u64<1>),
   1256                      MAP(imm_v256_shr_n_u64<4>),
   1257                      MAP(imm_v256_shr_n_u64<8>),
   1258                      MAP(imm_v256_shr_n_u64<12>),
   1259                      MAP(imm_v256_shr_n_u64<16>),
   1260                      MAP(imm_v256_shr_n_u64<20>),
   1261                      MAP(imm_v256_shr_n_u64<24>),
   1262                      MAP(imm_v256_shr_n_u64<28>),
   1263                      MAP(imm_v256_shr_n_u64<32>),
   1264                      MAP(imm_v256_shr_n_u64<36>),
   1265                      MAP(imm_v256_shr_n_u64<40>),
   1266                      MAP(imm_v256_shr_n_u64<44>),
   1267                      MAP(imm_v256_shr_n_u64<48>),
   1268                      MAP(imm_v256_shr_n_u64<52>),
   1269                      MAP(imm_v256_shr_n_u64<56>),
   1270                      MAP(imm_v256_shr_n_u64<60>),
   1271                      MAP(imm_v256_shr_n_s64<1>),
   1272                      MAP(imm_v256_shr_n_s64<4>),
   1273                      MAP(imm_v256_shr_n_s64<8>),
   1274                      MAP(imm_v256_shr_n_s64<12>),
   1275                      MAP(imm_v256_shr_n_s64<16>),
   1276                      MAP(imm_v256_shr_n_s64<20>),
   1277                      MAP(imm_v256_shr_n_s64<24>),
   1278                      MAP(imm_v256_shr_n_s64<28>),
   1279                      MAP(imm_v256_shr_n_s64<32>),
   1280                      MAP(imm_v256_shr_n_s64<36>),
   1281                      MAP(imm_v256_shr_n_s64<40>),
   1282                      MAP(imm_v256_shr_n_s64<44>),
   1283                      MAP(imm_v256_shr_n_s64<48>),
   1284                      MAP(imm_v256_shr_n_s64<52>),
   1285                      MAP(imm_v256_shr_n_s64<56>),
   1286                      MAP(imm_v256_shr_n_s64<60>),
   1287                      MAP(v256_movemask_8),
   1288                      MAP(v256_zero),
   1289                      MAP(v256_dup_8),
   1290                      MAP(v256_dup_16),
   1291                      MAP(v256_dup_32),
   1292                      MAP(v256_dup_64),
   1293                      MAP(v256_low_u32),
   1294                      MAP(v256_low_v64),
   1295                      MAP(v256_from_64),
   1296                      MAP(v256_from_v64),
   1297                      MAP(v256_ziplo_128),
   1298                      MAP(v256_ziphi_128),
   1299                      MAP(v256_unpacklo_u8_s16),
   1300                      MAP(v256_unpackhi_u8_s16),
   1301                      MAP(v256_unpacklo_s8_s16),
   1302                      MAP(v256_unpackhi_s8_s16),
   1303                      MAP(v256_blend_8),
   1304                      { nullptr, nullptr, nullptr } };
   1305 #undef MAP
   1306 
   1307 // Map reference functions to machine tuned functions. Since the
   1308 // functions depend on machine tuned types, the non-machine tuned
   1309 // instantiations of the test can't refer to these functions directly,
   1310 // so we refer to them by name and do the mapping here.
   1311 void Map(const char *name, fptr *ref, fptr *simd) {
   1312  unsigned int i;
   1313  for (i = 0; m[i].name && strcmp(name, m[i].name); i++) {
   1314  }
   1315 
   1316  *ref = m[i].ref;
   1317  *simd = m[i].simd;
   1318 }
   1319 
   1320 // Used for printing errors in TestSimd1Arg, TestSimd2Args and TestSimd3Args
   1321 std::string Print(const uint8_t *a, int size) {
   1322  std::string text = "0x";
   1323  for (int i = 0; i < size; i++) {
   1324    const uint8_t c = a[!CONFIG_BIG_ENDIAN ? size - 1 - i : i];
   1325    // Same as snprintf(..., ..., "%02x", c)
   1326    text += (c >> 4) + '0' + ((c >> 4) > 9) * ('a' - '0' - 10);
   1327    text += (c & 15) + '0' + ((c & 15) > 9) * ('a' - '0' - 10);
   1328  }
   1329 
   1330  return text;
   1331 }
   1332 
   1333 // Used in TestSimd1Arg, TestSimd2Args and TestSimd3Args to restrict argument
   1334 // ranges
   1335 void SetMask(uint8_t *s, int size, uint32_t mask, uint32_t maskwidth) {
   1336  switch (maskwidth) {
   1337    case 0: {
   1338      break;
   1339    }
   1340    case 8: {
   1341      for (int i = 0; i < size; i++) s[i] &= mask;
   1342      break;
   1343    }
   1344    case 16: {
   1345      uint16_t *t = reinterpret_cast<uint16_t *>(s);
   1346      assert(!(reinterpret_cast<uintptr_t>(s) & 1));
   1347      for (int i = 0; i < size / 2; i++) t[i] &= mask;
   1348      break;
   1349    }
   1350    case 32: {
   1351      uint32_t *t = reinterpret_cast<uint32_t *>(s);
   1352      assert(!(reinterpret_cast<uintptr_t>(s) & 3));
   1353      for (int i = 0; i < size / 4; i++) t[i] &= mask;
   1354      break;
   1355    }
   1356    case 64: {
   1357      uint64_t *t = reinterpret_cast<uint64_t *>(s);
   1358      assert(!(reinterpret_cast<uintptr_t>(s) & 7));
   1359      for (int i = 0; i < size / 8; i++) t[i] &= mask;
   1360      break;
   1361    }
   1362    default: {
   1363      FAIL() << "Unsupported mask width";
   1364      break;
   1365    }
   1366  }
   1367 }
   1368 
   1369 // We need some extra load/store functions
   1370 void u64_store_aligned(void *p, uint64_t a) {
   1371  v64_store_aligned(p, v64_from_64(a));
   1372 }
   1373 void s32_store_aligned(void *p, int32_t a) {
   1374  u32_store_aligned(p, static_cast<uint32_t>(a));
   1375 }
   1376 void s64_store_aligned(void *p, int64_t a) {
   1377  v64_store_aligned(p, v64_from_64(static_cast<uint64_t>(a)));
   1378 }
   1379 
   1380 void c_u64_store_aligned(void *p, uint64_t a) {
   1381  c_v64_store_aligned(p, c_v64_from_64(a));
   1382 }
   1383 
   1384 void c_s32_store_aligned(void *p, int32_t a) {
   1385  c_u32_store_aligned(p, static_cast<uint32_t>(a));
   1386 }
   1387 
   1388 void c_s64_store_aligned(void *p, int64_t a) {
   1389  c_v64_store_aligned(p, c_v64_from_64(static_cast<uint64_t>(a)));
   1390 }
   1391 
   1392 uint64_t u64_load_aligned(const void *p) {
   1393  return v64_u64(v64_load_aligned(p));
   1394 }
   1395 uint16_t u16_load_aligned(const void *p) {
   1396  return *(reinterpret_cast<const uint16_t *>(p));
   1397 }
   1398 uint8_t u8_load_aligned(const void *p) {
   1399  return *(reinterpret_cast<const uint8_t *>(p));
   1400 }
   1401 
   1402 uint64_t c_u64_load_aligned(const void *p) {
   1403  return c_v64_u64(c_v64_load_aligned(p));
   1404 }
   1405 uint16_t c_u16_load_aligned(const void *p) {
   1406  return *(reinterpret_cast<const uint16_t *>(p));
   1407 }
   1408 uint8_t c_u8_load_aligned(const void *p) {
   1409  return *(reinterpret_cast<const uint8_t *>(p));
   1410 }
   1411 
   1412 // CompareSimd1Arg, CompareSimd2Args and CompareSimd3Args compare
   1413 // intrinsics taking 1, 2 or 3 arguments respectively with their
   1414 // corresponding C reference.  Ideally, the loads and stores should
   1415 // have gone into the template parameter list, but v64 and v128 could
   1416 // be typedef'ed to the same type (which is the case on x86) and then
   1417 // we can't instantiate both v64 and v128, so the function return and
   1418 // argument types, including the always differing types in the C
   1419 // equivalent are used instead.  The function arguments must be void
   1420 // pointers and then go through a cast to avoid matching errors in the
   1421 // branches eliminated by the typeid tests in the calling function.
   1422 template <typename Ret, typename Arg, typename CRet, typename CArg>
   1423 int CompareSimd1Arg(fptr store, fptr load, fptr simd, void *d, fptr c_store,
   1424                    fptr c_load, fptr c_simd, void *ref_d, const void *a) {
   1425  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
   1426  Arg (*const my_load)(const void *) = (Arg(*const)(const void *))load;
   1427  Ret (*const my_simd)(Arg) = (Ret(*const)(Arg))simd;
   1428  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
   1429  CArg (*const my_c_load)(const void *) = (CArg(*const)(const void *))c_load;
   1430  CRet (*const my_c_simd)(CArg) = (CRet(*const)(CArg))c_simd;
   1431 
   1432  // Call reference and intrinsic
   1433  my_c_store(ref_d, my_c_simd(my_c_load(a)));
   1434  my_store(d, my_simd(my_load(a)));
   1435 
   1436  // Compare results
   1437  return memcmp(ref_d, d, sizeof(CRet));
   1438 }
   1439 
   1440 template <typename Ret, typename Arg1, typename Arg2, typename CRet,
   1441          typename CArg1, typename CArg2>
   1442 int CompareSimd2Args(fptr store, fptr load1, fptr load2, fptr simd, void *d,
   1443                     fptr c_store, fptr c_load1, fptr c_load2, fptr c_simd,
   1444                     void *ref_d, const void *a, const void *b) {
   1445  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
   1446  Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
   1447  Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
   1448  Ret (*const my_simd)(Arg1, Arg2) = (Ret(*const)(Arg1, Arg2))simd;
   1449  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
   1450  CArg1 (*const my_c_load1)(const void *) =
   1451      (CArg1(*const)(const void *))c_load1;
   1452  CArg2 (*const my_c_load2)(const void *) =
   1453      (CArg2(*const)(const void *))c_load2;
   1454  CRet (*const my_c_simd)(CArg1, CArg2) = (CRet(*const)(CArg1, CArg2))c_simd;
   1455 
   1456  // Call reference and intrinsic
   1457  my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b)));
   1458  my_store(d, my_simd(my_load1(a), my_load2(b)));
   1459 
   1460  // Compare results
   1461  return memcmp(ref_d, d, sizeof(CRet));
   1462 }
   1463 
   1464 template <typename Ret, typename Arg1, typename Arg2, typename Arg3,
   1465          typename CRet, typename CArg1, typename CArg2, typename CArg3>
   1466 int CompareSimd3Args(fptr store, fptr load1, fptr load2, fptr load3, fptr simd,
   1467                     void *d, fptr c_store, fptr c_load1, fptr c_load2,
   1468                     fptr c_load3, fptr c_simd, void *ref_d, const void *a,
   1469                     const void *b, const void *c) {
   1470  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
   1471  Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
   1472  Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
   1473  Arg3 (*const my_load3)(const void *) = (Arg3(*const)(const void *))load3;
   1474  Ret (*const my_simd)(Arg1, Arg2, Arg3) = (Ret(*const)(Arg1, Arg2, Arg3))simd;
   1475  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
   1476  CArg1 (*const my_c_load1)(const void *) =
   1477      (CArg1(*const)(const void *))c_load1;
   1478  CArg2 (*const my_c_load2)(const void *) =
   1479      (CArg2(*const)(const void *))c_load2;
   1480  CArg3 (*const my_c_load3)(const void *) =
   1481      (CArg3(*const)(const void *))c_load3;
   1482  CRet (*const my_c_simd)(CArg1, CArg2, CArg3) =
   1483      (CRet(*const)(CArg1, CArg2, CArg3))c_simd;
   1484 
   1485  // Call reference and intrinsic
   1486  my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b), my_c_load3(c)));
   1487  my_store(d, my_simd(my_load1(a), my_load2(b), my_load3(c)));
   1488 
   1489  // Compare results
   1490  return memcmp(ref_d, d, sizeof(CRet));
   1491 }
   1492 
   1493 }  // namespace
   1494 
// Test the one-argument intrinsic named |name| against its C reference
// implementation over |iterations| random inputs.  When |maskwidth| is
// non-zero, each |maskwidth|-bit element of the input is first ANDed
// with |mask| (via SetMask) to restrict the tested value range.  The
// intrinsic/reference pair is resolved by name through Map(), and one
// of the branches below is selected from the C reference's return and
// argument types; the branches differ only in the template
// instantiation types and the matching load/store helper functions.
template <typename CRet, typename CArg>
void TestSimd1Arg(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
                  const char *name) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  fptr ref_simd;
  fptr simd;
  int error = 0;
  // 32-byte aligned scratch buffers: s holds the random input, d the
  // SIMD result and ref_d the reference result.
  DECLARE_ALIGNED(32, uint8_t, s[32]);
  DECLARE_ALIGNED(32, uint8_t, d[32]);
  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
  assert(sizeof(CArg) <= 32 && sizeof(CRet) <= 32);
  memset(ref_d, 0, sizeof(ref_d));
  memset(d, 0, sizeof(d));

  // Look up the SIMD intrinsic and its C reference by name.
  Map(name, &ref_simd, &simd);
  if (simd == nullptr || ref_simd == nullptr) {
    FAIL() << "Internal error: Unknown intrinsic function " << name;
  }
  // Stop at the first mismatch or at the first gtest failure.
  for (unsigned int count = 0;
       count < iterations && !error && !testing::Test::HasFailure(); count++) {
    // Fresh random input bytes for every iteration.
    for (unsigned int c = 0; c < sizeof(CArg); c++) s[c] = rnd.Rand8();

    if (maskwidth) {
      SetMask(s, sizeof(CArg), mask, maskwidth);
    }

    if (typeid(CRet) == typeid(c_v64) && typeid(CArg) == typeid(c_v64)) {
      // V64_V64
      error = CompareSimd1Arg<v64, v64, c_v64, c_v64>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(uint8_t)) {
      // V64_U8
      error = CompareSimd1Arg<v64, uint8_t, c_v64, uint8_t>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(uint16_t)) {
      // V64_U16
      error = CompareSimd1Arg<v64, uint16_t, c_v64, uint16_t>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(uint32_t)) {
      // V64_U32
      error = CompareSimd1Arg<v64, uint32_t, c_v64, uint32_t>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg) == typeid(c_v64)) {
      // U64_V64
      error = CompareSimd1Arg<uint64_t, v64, uint64_t, c_v64>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(int64_t) &&
               typeid(CArg) == typeid(c_v64)) {
      // S64_V64
      error = CompareSimd1Arg<int64_t, v64, int64_t, c_v64>(
          reinterpret_cast<fptr>(s64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg) == typeid(c_v64)) {
      // U32_V64
      error = CompareSimd1Arg<uint32_t, v64, uint32_t, c_v64>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(int32_t) &&
               typeid(CArg) == typeid(c_v64)) {
      // S32_V64
      error = CompareSimd1Arg<int32_t, v64, int32_t, c_v64>(
          reinterpret_cast<fptr>(s32_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s32_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg) == typeid(c_v128)) {
      // U32_V128
      error = CompareSimd1Arg<uint32_t, v128, uint32_t, c_v128>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg) == typeid(c_v128)) {
      // U64_V128
      error = CompareSimd1Arg<uint64_t, v128, uint64_t, c_v128>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg) == typeid(c_v256)) {
      // U64_V256
      error = CompareSimd1Arg<uint64_t, v256, uint64_t, c_v256>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(c_v128)) {
      // V64_V128
      error = CompareSimd1Arg<v64, v128, c_v64, c_v128>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(c_v128)) {
      // V128_V128
      error = CompareSimd1Arg<v128, v128, c_v128, c_v128>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(c_v64)) {
      // V128_V64
      error = CompareSimd1Arg<v128, v64, c_v128, c_v64>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(uint8_t)) {
      // V128_U8
      error = CompareSimd1Arg<v128, uint8_t, c_v128, uint8_t>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(uint16_t)) {
      // V128_U16
      error = CompareSimd1Arg<v128, uint16_t, c_v128, uint16_t>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(uint32_t)) {
      // V128_U32
      error = CompareSimd1Arg<v128, uint32_t, c_v128, uint32_t>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(uint64_t)) {
      // V128_U64
      error = CompareSimd1Arg<v128, uint64_t, c_v128, uint64_t>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(c_v256)) {
      // V256_V256
      error = CompareSimd1Arg<v256, v256, c_v256, c_v256>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(c_v128)) {
      // V256_V128
      error = CompareSimd1Arg<v256, v128, c_v256, c_v128>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(uint8_t)) {
      // V256_U8
      error = CompareSimd1Arg<v256, uint8_t, c_v256, uint8_t>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(uint16_t)) {
      // V256_U16
      error = CompareSimd1Arg<v256, uint16_t, c_v256, uint16_t>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(uint32_t)) {
      // V256_U32
      error = CompareSimd1Arg<v256, uint32_t, c_v256, uint32_t>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(uint64_t)) {
      // V256_U64
      error = CompareSimd1Arg<v256, uint64_t, c_v256, uint64_t>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg) == typeid(c_v256)) {
      // U32_V256
      error = CompareSimd1Arg<uint32_t, v256, uint32_t, c_v256>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(c_v256)) {
      // V64_V256
      error = CompareSimd1Arg<v64, v256, c_v64, c_v256>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
    } else {
      // No branch matched the C reference's type signature.
      FAIL() << "Internal error: Unknown intrinsic function "
             << typeid(CRet).name() << " " << name << "(" << typeid(CArg).name()
             << ")";
    }
  }

  // On mismatch, report the input and both the SIMD and reference outputs.
  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
                      << Print(s, sizeof(CArg)) << ") -> "
                      << Print(d, sizeof(CRet)) << " (simd), "
                      << Print(ref_d, sizeof(CRet)) << " (ref)";
}
   1740 
   1741 template <typename CRet, typename CArg1, typename CArg2>
   1742 void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
   1743                   const char *name) {
   1744  ACMRandom rnd(ACMRandom::DeterministicSeed());
   1745  fptr ref_simd;
   1746  fptr simd;
   1747  int error = 0;
   1748  DECLARE_ALIGNED(32, uint8_t, s1[32]);
   1749  DECLARE_ALIGNED(32, uint8_t, s2[32]);
   1750  DECLARE_ALIGNED(32, uint8_t, d[32]);
   1751  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
   1752  assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CRet) <= 32);
   1753  memset(ref_d, 0, sizeof(ref_d));
   1754  memset(d, 0, sizeof(d));
   1755 
   1756  Map(name, &ref_simd, &simd);
   1757  if (simd == nullptr || ref_simd == nullptr) {
   1758    FAIL() << "Internal error: Unknown intrinsic function " << name;
   1759  }
   1760 
   1761  for (unsigned int count = 0;
   1762       count < iterations && !error && !testing::Test::HasFailure(); count++) {
   1763    for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();
   1764 
   1765    for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();
   1766 
   1767    if (maskwidth) SetMask(s2, sizeof(CArg2), mask, maskwidth);
   1768 
   1769    if (typeid(CRet) == typeid(c_v64) && typeid(CArg1) == typeid(c_v64) &&
   1770        typeid(CArg2) == typeid(c_v64)) {
   1771      // V64_V64V64
   1772      error = CompareSimd2Args<v64, v64, v64, c_v64, c_v64, c_v64>(
   1773          reinterpret_cast<fptr>(v64_store_aligned),
   1774          reinterpret_cast<fptr>(v64_load_aligned),
   1775          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
   1776          reinterpret_cast<fptr>(c_v64_store_aligned),
   1777          reinterpret_cast<fptr>(c_v64_load_aligned),
   1778          reinterpret_cast<fptr>(c_v64_load_aligned),
   1779          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1780    } else if (typeid(CRet) == typeid(c_v64) &&
   1781               typeid(CArg1) == typeid(uint32_t) &&
   1782               typeid(CArg2) == typeid(uint32_t)) {
   1783      // V64_U32U32
   1784      error =
   1785          CompareSimd2Args<v64, uint32_t, uint32_t, c_v64, uint32_t, uint32_t>(
   1786              reinterpret_cast<fptr>(v64_store_aligned),
   1787              reinterpret_cast<fptr>(u32_load_aligned),
   1788              reinterpret_cast<fptr>(u32_load_aligned), simd, d,
   1789              reinterpret_cast<fptr>(c_v64_store_aligned),
   1790              reinterpret_cast<fptr>(c_u32_load_aligned),
   1791              reinterpret_cast<fptr>(c_u32_load_aligned),
   1792              reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1793    } else if (typeid(CRet) == typeid(uint32_t) &&
   1794               typeid(CArg1) == typeid(c_v64) &&
   1795               typeid(CArg2) == typeid(c_v64)) {
   1796      // U32_V64V64
   1797      error = CompareSimd2Args<uint32_t, v64, v64, uint32_t, c_v64, c_v64>(
   1798          reinterpret_cast<fptr>(u32_store_aligned),
   1799          reinterpret_cast<fptr>(v64_load_aligned),
   1800          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
   1801          reinterpret_cast<fptr>(c_u32_store_aligned),
   1802          reinterpret_cast<fptr>(c_v64_load_aligned),
   1803          reinterpret_cast<fptr>(c_v64_load_aligned),
   1804          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1805    } else if (typeid(CRet) == typeid(int64_t) &&
   1806               typeid(CArg1) == typeid(c_v64) &&
   1807               typeid(CArg2) == typeid(c_v64)) {
   1808      // S64_V64V64
   1809      error = CompareSimd2Args<int64_t, v64, v64, int64_t, c_v64, c_v64>(
   1810          reinterpret_cast<fptr>(s64_store_aligned),
   1811          reinterpret_cast<fptr>(v64_load_aligned),
   1812          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
   1813          reinterpret_cast<fptr>(c_s64_store_aligned),
   1814          reinterpret_cast<fptr>(c_v64_load_aligned),
   1815          reinterpret_cast<fptr>(c_v64_load_aligned),
   1816          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1817    } else if (typeid(CRet) == typeid(c_v64) &&
   1818               typeid(CArg1) == typeid(c_v64) &&
   1819               typeid(CArg2) == typeid(uint32_t)) {
   1820      // V64_V64U32
   1821      error = CompareSimd2Args<v64, v64, uint32_t, c_v64, c_v64, uint32_t>(
   1822          reinterpret_cast<fptr>(v64_store_aligned),
   1823          reinterpret_cast<fptr>(v64_load_aligned),
   1824          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
   1825          reinterpret_cast<fptr>(c_v64_store_aligned),
   1826          reinterpret_cast<fptr>(c_v64_load_aligned),
   1827          reinterpret_cast<fptr>(c_u32_load_aligned),
   1828          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1829    } else if (typeid(CRet) == typeid(c_v128) &&
   1830               typeid(CArg1) == typeid(c_v128) &&
   1831               typeid(CArg2) == typeid(c_v128)) {
   1832      // V128_V128V128
   1833      error = CompareSimd2Args<v128, v128, v128, c_v128, c_v128, c_v128>(
   1834          reinterpret_cast<fptr>(v128_store_aligned),
   1835          reinterpret_cast<fptr>(v128_load_aligned),
   1836          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
   1837          reinterpret_cast<fptr>(c_v128_store_aligned),
   1838          reinterpret_cast<fptr>(c_v128_load_aligned),
   1839          reinterpret_cast<fptr>(c_v128_load_aligned),
   1840          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1841    } else if (typeid(CRet) == typeid(uint32_t) &&
   1842               typeid(CArg1) == typeid(c_v128) &&
   1843               typeid(CArg2) == typeid(c_v128)) {
   1844      // U32_V128V128
   1845      error = CompareSimd2Args<uint32_t, v128, v128, uint32_t, c_v128, c_v128>(
   1846          reinterpret_cast<fptr>(u32_store_aligned),
   1847          reinterpret_cast<fptr>(v128_load_aligned),
   1848          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
   1849          reinterpret_cast<fptr>(c_u32_store_aligned),
   1850          reinterpret_cast<fptr>(c_v128_load_aligned),
   1851          reinterpret_cast<fptr>(c_v128_load_aligned),
   1852          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1853    } else if (typeid(CRet) == typeid(uint64_t) &&
   1854               typeid(CArg1) == typeid(c_v128) &&
   1855               typeid(CArg2) == typeid(c_v128)) {
   1856      // U64_V128V128
   1857      error = CompareSimd2Args<uint64_t, v128, v128, uint64_t, c_v128, c_v128>(
   1858          reinterpret_cast<fptr>(u64_store_aligned),
   1859          reinterpret_cast<fptr>(v128_load_aligned),
   1860          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
   1861          reinterpret_cast<fptr>(c_u64_store_aligned),
   1862          reinterpret_cast<fptr>(c_v128_load_aligned),
   1863          reinterpret_cast<fptr>(c_v128_load_aligned),
   1864          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1865    } else if (typeid(CRet) == typeid(int64_t) &&
   1866               typeid(CArg1) == typeid(c_v128) &&
   1867               typeid(CArg2) == typeid(c_v128)) {
   1868      // S64_V128V128
   1869      error = CompareSimd2Args<int64_t, v128, v128, int64_t, c_v128, c_v128>(
   1870          reinterpret_cast<fptr>(s64_store_aligned),
   1871          reinterpret_cast<fptr>(v128_load_aligned),
   1872          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
   1873          reinterpret_cast<fptr>(c_s64_store_aligned),
   1874          reinterpret_cast<fptr>(c_v128_load_aligned),
   1875          reinterpret_cast<fptr>(c_v128_load_aligned),
   1876          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1877    } else if (typeid(CRet) == typeid(c_v128) &&
   1878               typeid(CArg1) == typeid(uint64_t) &&
   1879               typeid(CArg2) == typeid(uint64_t)) {
   1880      // V128_U64U64
   1881      error = CompareSimd2Args<v128, uint64_t, uint64_t, c_v128, uint64_t,
   1882                               uint64_t>(
   1883          reinterpret_cast<fptr>(v128_store_aligned),
   1884          reinterpret_cast<fptr>(u64_load_aligned),
   1885          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
   1886          reinterpret_cast<fptr>(c_v128_store_aligned),
   1887          reinterpret_cast<fptr>(c_u64_load_aligned),
   1888          reinterpret_cast<fptr>(c_u64_load_aligned),
   1889          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1890    } else if (typeid(CRet) == typeid(c_v128) &&
   1891               typeid(CArg1) == typeid(c_v64) &&
   1892               typeid(CArg2) == typeid(c_v64)) {
   1893      // V128_V64V64
   1894      error = CompareSimd2Args<v128, v64, v64, c_v128, c_v64, c_v64>(
   1895          reinterpret_cast<fptr>(v128_store_aligned),
   1896          reinterpret_cast<fptr>(v64_load_aligned),
   1897          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
   1898          reinterpret_cast<fptr>(c_v128_store_aligned),
   1899          reinterpret_cast<fptr>(c_v64_load_aligned),
   1900          reinterpret_cast<fptr>(c_v64_load_aligned),
   1901          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1902    } else if (typeid(CRet) == typeid(c_v128) &&
   1903               typeid(CArg1) == typeid(c_v128) &&
   1904               typeid(CArg2) == typeid(uint32_t)) {
   1905      // V128_V128U32
   1906      error = CompareSimd2Args<v128, v128, uint32_t, c_v128, c_v128, uint32_t>(
   1907          reinterpret_cast<fptr>(v128_store_aligned),
   1908          reinterpret_cast<fptr>(v128_load_aligned),
   1909          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
   1910          reinterpret_cast<fptr>(c_v128_store_aligned),
   1911          reinterpret_cast<fptr>(c_v128_load_aligned),
   1912          reinterpret_cast<fptr>(c_u32_load_aligned),
   1913          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1914    } else if (typeid(CRet) == typeid(c_v256) &&
   1915               typeid(CArg1) == typeid(c_v256) &&
   1916               typeid(CArg2) == typeid(c_v256)) {
   1917      // V256_V256V256
   1918      error = CompareSimd2Args<v256, v256, v256, c_v256, c_v256, c_v256>(
   1919          reinterpret_cast<fptr>(v256_store_aligned),
   1920          reinterpret_cast<fptr>(v256_load_aligned),
   1921          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
   1922          reinterpret_cast<fptr>(c_v256_store_aligned),
   1923          reinterpret_cast<fptr>(c_v256_load_aligned),
   1924          reinterpret_cast<fptr>(c_v256_load_aligned),
   1925          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1926    } else if (typeid(CRet) == typeid(uint64_t) &&
   1927               typeid(CArg1) == typeid(c_v256) &&
   1928               typeid(CArg2) == typeid(c_v256)) {
   1929      // U64_V256V256
   1930      error = CompareSimd2Args<uint64_t, v256, v256, uint64_t, c_v256, c_v256>(
   1931          reinterpret_cast<fptr>(u64_store_aligned),
   1932          reinterpret_cast<fptr>(v256_load_aligned),
   1933          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
   1934          reinterpret_cast<fptr>(c_u64_store_aligned),
   1935          reinterpret_cast<fptr>(c_v256_load_aligned),
   1936          reinterpret_cast<fptr>(c_v256_load_aligned),
   1937          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1938    } else if (typeid(CRet) == typeid(int64_t) &&
   1939               typeid(CArg1) == typeid(c_v256) &&
   1940               typeid(CArg2) == typeid(c_v256)) {
   1941      // S64_V256V256
   1942      error = CompareSimd2Args<int64_t, v256, v256, int64_t, c_v256, c_v256>(
   1943          reinterpret_cast<fptr>(s64_store_aligned),
   1944          reinterpret_cast<fptr>(v256_load_aligned),
   1945          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
   1946          reinterpret_cast<fptr>(c_s64_store_aligned),
   1947          reinterpret_cast<fptr>(c_v256_load_aligned),
   1948          reinterpret_cast<fptr>(c_v256_load_aligned),
   1949          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1950    } else if (typeid(CRet) == typeid(uint32_t) &&
   1951               typeid(CArg1) == typeid(c_v256) &&
   1952               typeid(CArg2) == typeid(c_v256)) {
   1953      // U32_V256V256
   1954      error = CompareSimd2Args<uint32_t, v256, v256, uint32_t, c_v256, c_v256>(
   1955          reinterpret_cast<fptr>(u32_store_aligned),
   1956          reinterpret_cast<fptr>(v256_load_aligned),
   1957          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
   1958          reinterpret_cast<fptr>(c_u32_store_aligned),
   1959          reinterpret_cast<fptr>(c_v256_load_aligned),
   1960          reinterpret_cast<fptr>(c_v256_load_aligned),
   1961          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1962    } else if (typeid(CRet) == typeid(c_v256) &&
   1963               typeid(CArg1) == typeid(c_v128) &&
   1964               typeid(CArg2) == typeid(c_v128)) {
   1965      // V256_V128V128
   1966      error = CompareSimd2Args<v256, v128, v128, c_v256, c_v128, c_v128>(
   1967          reinterpret_cast<fptr>(v256_store_aligned),
   1968          reinterpret_cast<fptr>(v128_load_aligned),
   1969          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
   1970          reinterpret_cast<fptr>(c_v256_store_aligned),
   1971          reinterpret_cast<fptr>(c_v128_load_aligned),
   1972          reinterpret_cast<fptr>(c_v128_load_aligned),
   1973          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1974    } else if (typeid(CRet) == typeid(c_v256) &&
   1975               typeid(CArg1) == typeid(c_v256) &&
   1976               typeid(CArg2) == typeid(uint32_t)) {
   1977      // V256_V256U32
   1978      error = CompareSimd2Args<v256, v256, uint32_t, c_v256, c_v256, uint32_t>(
   1979          reinterpret_cast<fptr>(v256_store_aligned),
   1980          reinterpret_cast<fptr>(v256_load_aligned),
   1981          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
   1982          reinterpret_cast<fptr>(c_v256_store_aligned),
   1983          reinterpret_cast<fptr>(c_v256_load_aligned),
   1984          reinterpret_cast<fptr>(c_u32_load_aligned),
   1985          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
   1986 
   1987    } else {
   1988      FAIL() << "Internal error: Unknown intrinsic function "
   1989             << typeid(CRet).name() << " " << name << "("
   1990             << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ")";
   1991    }
   1992  }
   1993 
   1994  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
   1995                      << Print(s1, sizeof(CArg1)) << ", "
   1996                      << Print(s2, sizeof(CArg2)) << ") -> "
   1997                      << Print(d, sizeof(CRet)) << " (simd), "
   1998                      << Print(ref_d, sizeof(CRet)) << " (ref)";
   1999 }
   2000 
// Exhaustively compares a three-argument SIMD intrinsic named |name| against
// its plain-C reference implementation over |iterations| random inputs.
// |mask|/|maskwidth|, when non-zero, constrain the bytes of the third
// argument via SetMask() so that only legal values (e.g. shift amounts) are
// generated.  The template parameters are the C-reference signature; they
// select which CompareSimd3Args instantiation is used below.
// Fails the enclosing gtest on an unknown intrinsic, an unsupported
// signature, or any SIMD-vs-reference mismatch.
template <typename CRet, typename CArg1, typename CArg2, typename CArg3>
void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
                   const char *name) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  fptr ref_simd;
  fptr simd;
  int error = 0;
  // Raw byte buffers for the three inputs and the two outputs; 32 bytes is
  // wide enough for the largest supported type (a 256-bit vector), as the
  // assert below checks.
  DECLARE_ALIGNED(32, uint8_t, s1[32]);
  DECLARE_ALIGNED(32, uint8_t, s2[32]);
  DECLARE_ALIGNED(32, uint8_t, s3[32]);
  DECLARE_ALIGNED(32, uint8_t, d[32]);
  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
  assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CArg3) <= 32 &&
         sizeof(CRet) <= 32);
  // Zero both result buffers so the bytes beyond sizeof(CRet) compare equal
  // and the failure message prints deterministic contents.
  memset(ref_d, 0, sizeof(ref_d));
  memset(d, 0, sizeof(d));

  // Resolve |name| to the (reference, machine-tuned) implementation pair.
  Map(name, &ref_simd, &simd);
  if (simd == nullptr || ref_simd == nullptr) {
    FAIL() << "Internal error: Unknown intrinsic function " << name;
  }

  // Stop early on the first mismatch or on any gtest failure raised inside
  // the comparison helpers.
  for (unsigned int count = 0;
       count < iterations && !error && !testing::Test::HasFailure(); count++) {
    // Fill each argument with fresh random bytes.
    for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();

    for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();

    for (unsigned int c = 0; c < sizeof(CArg3); c++) s3[c] = rnd.Rand8();

    // Constrain the third argument's bits when a mask was requested.
    if (maskwidth) SetMask(s3, sizeof(CArg3), mask, maskwidth);

    // Dispatch on the reference signature.  The comparison helper takes all
    // functions type-erased as |fptr|, so each store/load wrapper and the two
    // implementations are reinterpret_cast here; the wrapper order must match
    // CompareSimd3Args' parameter order exactly (SIMD store, three SIMD
    // loads, SIMD fn, output, then the C-reference equivalents and inputs).
    if (typeid(CRet) == typeid(c_v128) && typeid(CArg1) == typeid(c_v128) &&
        typeid(CArg2) == typeid(c_v128) && typeid(CArg3) == typeid(c_v128)) {
      // V128_V128V128V128
      error = CompareSimd3Args<v128, v128, v128, v128, c_v128, c_v128, c_v128,
                               c_v128>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(c_v256) &&
               typeid(CArg3) == typeid(c_v256)) {
      // V256_V256V256V256
      error = CompareSimd3Args<v256, v256, v256, v256, c_v256, c_v256, c_v256,
                               c_v256>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
    } else {
      // Only the two signatures above have CompareSimd3Args wiring.
      FAIL() << "Internal error: Unknown intrinsic function "
             << typeid(CRet).name() << " " << name << "("
             << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ", "
             << typeid(CArg3).name() << ")";
    }
  }

  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
                      << Print(s1, sizeof(CArg1)) << ", "
                      << Print(s2, sizeof(CArg2)) << ", "
                      << Print(s3, sizeof(CArg3)) << ") -> "
                      << Print(d, sizeof(CRet)) << " (simd), "
                      << Print(ref_d, sizeof(CRet)) << " (ref)";
}
   2078 
// Explicit instantiations to make the test functions callable from other
// files (simd_cmp_sse2.cc, simd_cmp_neon.cc, etc.), grouped by the widest
// vector type appearing in the signature.

// v64 signatures.
template void TestSimd1Arg<c_v64, uint8_t>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<c_v64, uint16_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<c_v64, uint32_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                         const char *);
template void TestSimd1Arg<uint32_t, c_v64>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<int32_t, c_v64>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<uint64_t, c_v64>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<int64_t, c_v64>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd2Args<c_v64, uint32_t, uint32_t>(uint32_t, uint32_t,
                                                       uint32_t, const char *);
template void TestSimd2Args<c_v64, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                                 const char *);
template void TestSimd2Args<c_v64, c_v64, uint32_t>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd2Args<int64_t, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                                   const char *);
template void TestSimd2Args<uint32_t, c_v64, c_v64>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
// v128 signatures.
template void TestSimd1Arg<c_v128, c_v128>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<c_v128, uint8_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<c_v128, uint16_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v128, uint32_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v128, uint64_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v128, c_v64>(uint32_t, uint32_t, uint32_t,
                                          const char *);
template void TestSimd1Arg<uint32_t, c_v128>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<uint64_t, c_v128>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v64, c_v128>(uint32_t, uint32_t, uint32_t,
                                          const char *);
template void TestSimd2Args<c_v128, c_v128, c_v128>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd2Args<c_v128, c_v128, uint32_t>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd2Args<c_v128, uint64_t, uint64_t>(uint32_t, uint32_t,
                                                        uint32_t, const char *);
template void TestSimd2Args<c_v128, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                                  const char *);
template void TestSimd2Args<uint64_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd2Args<int64_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                     uint32_t, const char *);
template void TestSimd2Args<uint32_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(uint32_t, uint32_t,
                                                            uint32_t,
                                                            const char *);
// v256 signatures.
template void TestSimd1Arg<c_v256, c_v128>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<c_v256, c_v256>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<uint64_t, c_v256>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v256, uint8_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<c_v256, uint16_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v256, uint32_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v256, uint64_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<uint32_t, c_v256>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v64, c_v256>(uint32_t, uint32_t, uint32_t,
                                          const char *);
template void TestSimd2Args<c_v256, c_v128, c_v128>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd2Args<c_v256, c_v256, c_v256>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd2Args<c_v256, c_v256, uint32_t>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd2Args<uint64_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd2Args<int64_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                     uint32_t, const char *);
template void TestSimd2Args<uint32_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(uint32_t, uint32_t,
                                                            uint32_t,
                                                            const char *);
   2174 
   2175 }  // namespace SIMD_NAMESPACE