tor-browser
The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

v64_intrinsics_c.h


/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_

/* Note: This implements the intrinsics in plain, unoptimised C.
   Intended for reference, porting or debugging. */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "config/aom_config.h"

typedef union {
  uint8_t u8[8];
  uint16_t u16[4];
  uint32_t u32[2];
  uint64_t u64;
  int8_t s8[8];
  int16_t s16[4];
  int32_t s32[2];
  int64_t s64;
} c_v64;
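
/* Example (editor's sketch, not part of the upstream header): the union
   overlays one 64-bit value with 8-, 16- and 32-bit lane views, so on a
   little-endian build:

     c_v64 v = c_v64_from_64(0x0123456789abcdefULL);
     // v.u8[0] == 0xef, v.u16[0] == 0xcdef, v.u32[1] == 0x01234567
*/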

SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) {
  return a.u32[!!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) {
  return a.u32[!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) {
  return a.s32[!!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
  return a.s32[!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) {
  c_v64 t;
  t.u32[!CONFIG_BIG_ENDIAN] = x;
  t.u32[!!CONFIG_BIG_ENDIAN] = y;
  return t;
}

SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) {
  c_v64 t;
  t.u64 = x;
  return t;
}

SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; }

SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c,
                                uint16_t d) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    t.u16[0] = a;
    t.u16[1] = b;
    t.u16[2] = c;
    t.u16[3] = d;
  } else {
    t.u16[3] = a;
    t.u16[2] = b;
    t.u16[1] = c;
    t.u16[0] = d;
  }
  return t;
}
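
/* Example (editor's sketch): the first argument is always the most
   significant part, regardless of host endianness:

     c_v64 v = c_v64_from_16(1, 2, 3, 4);
     // c_v64_u64(v) == 0x0001000200030004ULL
     // c_v64_high_u32(c_v64_from_32(0xaa, 0xbb)) == 0xaa
*/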

SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) {
  uint32_t t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 4; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) {
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&a;
  int c;
  for (c = 0; c < 4; c++) pp[c] = q[c];
}

SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 3) {
    fprintf(stderr, "Error: Unaligned u32 load at %p\n", p);
    abort();
  }
  return c_u32_load_unaligned(p);
}

SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) {
  if (SIMD_CHECK && (uintptr_t)p & 3) {
    fprintf(stderr, "Error: Unaligned u32 store at %p\n", p);
    abort();
  }
  c_u32_store_unaligned(p, a);
}

SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) {
  c_v64 t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 8; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 7) {
    fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p);
    abort();
  }
  return c_v64_load_unaligned(p);
}

SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) {
  uint8_t *q = (uint8_t *)p;
  uint8_t *r = (uint8_t *)&a;
  int c;
  for (c = 0; c < 8; c++) q[c] = r[c];
}

SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) {
  if (SIMD_CHECK && (uintptr_t)p & 7) {
    fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p);
    abort();
  }
  c_v64_store_unaligned(p, a);
}
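
/* Example (editor's sketch): in this reference implementation the _aligned
   variants differ from the _unaligned ones only when SIMD_CHECK is enabled,
   in which case a misaligned pointer aborts:

     uint8_t buf[16];
     c_v64_store_unaligned(buf + 1, c_v64_dup_8(0x80));  // always allowed
     c_v64 v = c_v64_load_aligned(buf + 1);  // aborts under SIMD_CHECK
*/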

SIMD_INLINE c_v64 c_v64_zero(void) {
  c_v64 t;
  t.u64 = 0;
  return t;
}

SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) {
  c_v64 t;
  t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] =
      t.u8[7] = x;
  return t;
}

SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) {
  c_v64 t;
  t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x;
  return t;
}

SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) {
  c_v64 t;
  t.u32[0] = t.u32[1] = x;
  return t;
}

SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] + b.u8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] + b.u16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_sadd_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++)
    t.u8[c] = SIMD_CLAMP((int16_t)a.u8[c] + (int16_t)b.u8[c], 0, 255);
  return t;
}

SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++)
    t.s8[c] = SIMD_CLAMP((int16_t)a.s8[c] + (int16_t)b.s8[c], -128, 127);
  return t;
}

SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.s16[c] = SIMD_CLAMP((int32_t)a.s16[c] + (int32_t)b.s16[c], -32768, 32767);
  return t;
}

SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]);
  t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]);
  return t;
}
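
/* Example (editor's sketch): the plain adds wrap modulo the lane width,
   while the saturating forms clamp to the lane's range:

     c_v64 x = c_v64_dup_8(200), y = c_v64_dup_8(100);
     // c_v64_add_8(x, y).u8[0]   == 44   (300 mod 256)
     // c_v64_sadd_u8(x, y).u8[0] == 255  (clamped)
*/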

SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] - b.u8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] < b.u8[c] ? 0 : a.u8[c] - b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) {
    int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c];
    t.s8[c] = SIMD_CLAMP(d, -128, 127);
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] - b.u16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.s16[c] = SIMD_CLAMP((int32_t)a.s16[c] - (int32_t)b.s16[c], -32768, 32767);
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.u16[c] =
        (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 0 : a.u16[c] - b.u16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]);
  t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]);
  return t;
}

SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.u16[c] = (uint16_t)((int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++)
    t.u8[c] = (uint8_t)((int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c]);
  return t;
}

SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u8[7] = a.u8[7];
    t.u8[6] = b.u8[7];
    t.u8[5] = a.u8[6];
    t.u8[4] = b.u8[6];
    t.u8[3] = a.u8[5];
    t.u8[2] = b.u8[5];
    t.u8[1] = a.u8[4];
    t.u8[0] = b.u8[4];
  } else {
    t.u8[7] = a.u8[3];
    t.u8[6] = b.u8[3];
    t.u8[5] = a.u8[2];
    t.u8[4] = b.u8[2];
    t.u8[3] = a.u8[1];
    t.u8[2] = b.u8[1];
    t.u8[1] = a.u8[0];
    t.u8[0] = b.u8[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1);
}
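
/* Example (editor's sketch): ziplo_8 interleaves the low halves of its two
   arguments, with each lane of b placed below the matching lane of a:

     c_v64 a = c_v64_from_64(0x0706050403020100ULL);
     c_v64 b = c_v64_from_64(0x0f0e0d0c0b0a0908ULL);
     // c_v64_u64(c_v64_ziplo_8(a, b)) == 0x030b020a01090008ULL
*/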

SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u16[3] = a.u16[3];
    t.u16[2] = b.u16[3];
    t.u16[1] = a.u16[2];
    t.u16[0] = b.u16[2];
  } else {
    t.u16[3] = a.u16[1];
    t.u16[2] = b.u16[1];
    t.u16[1] = a.u16[0];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u32[1] = a.u32[1];
    t.u32[0] = b.u32[1];
  } else {
    t.u32[1] = a.u32[0];
    t.u32[0] = b.u32[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u8[7] = b.u8[7];
    t.u8[6] = b.u8[5];
    t.u8[5] = b.u8[3];
    t.u8[4] = b.u8[1];
    t.u8[3] = a.u8[7];
    t.u8[2] = a.u8[5];
    t.u8[1] = a.u8[3];
    t.u8[0] = a.u8[1];
  } else {
    t.u8[7] = a.u8[6];
    t.u8[6] = a.u8[4];
    t.u8[5] = a.u8[2];
    t.u8[4] = a.u8[0];
    t.u8[3] = b.u8[6];
    t.u8[2] = b.u8[4];
    t.u8[1] = b.u8[2];
    t.u8[0] = b.u8[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1);
}
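
/* Example (editor's sketch): unziplo_8 keeps the even-numbered byte lanes
   of the pair b:a, which undoes a ziplo_8/ziphi_8 interleave:

     c_v64 a = c_v64_from_64(0x0706050403020100ULL);
     c_v64 b = c_v64_from_64(0x0f0e0d0c0b0a0908ULL);
     // c_v64_u64(c_v64_unziplo_8(a, b)) == 0x060402000e0c0a08ULL
*/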

SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u16[3] = b.u16[3];
    t.u16[2] = b.u16[1];
    t.u16[1] = a.u16[3];
    t.u16[0] = a.u16[1];
  } else {
    t.u16[3] = a.u16[2];
    t.u16[2] = a.u16[0];
    t.u16[1] = b.u16[2];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1)
                           : _c_v64_unzip_16(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(b, a, 0)
                           : _c_v64_unzip_16(b, a, 1);
}

SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.u8[3 + endian];
  t.s16[2] = (int16_t)a.u8[2 + endian];
  t.s16[1] = (int16_t)a.u8[1 + endian];
  t.s16[0] = (int16_t)a.u8[0 + endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.u8[7 - endian];
  t.s16[2] = (int16_t)a.u8[6 - endian];
  t.s16[1] = (int16_t)a.u8[5 - endian];
  t.s16[0] = (int16_t)a.u8[4 - endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.s8[3 + endian];
  t.s16[2] = (int16_t)a.s8[2 + endian];
  t.s16[1] = (int16_t)a.s8[1 + endian];
  t.s16[0] = (int16_t)a.s8[0 + endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.s8[7 - endian];
  t.s16[2] = (int16_t)a.s8[6 - endian];
  t.s16[1] = (int16_t)a.s8[5 - endian];
  t.s16[0] = (int16_t)a.s8[4 - endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.s16[3] = SIMD_CLAMP(a.s32[1], -32768, 32767);
  t.s16[2] = SIMD_CLAMP(a.s32[0], -32768, 32767);
  t.s16[1] = SIMD_CLAMP(b.s32[1], -32768, 32767);
  t.s16[0] = SIMD_CLAMP(b.s32[0], -32768, 32767);
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.u16[3] = SIMD_CLAMP(a.s32[1], 0, 65535);
  t.u16[2] = SIMD_CLAMP(a.s32[0], 0, 65535);
  t.u16[1] = SIMD_CLAMP(b.s32[1], 0, 65535);
  t.u16[0] = SIMD_CLAMP(b.s32[0], 0, 65535);
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.u8[7] = SIMD_CLAMP(a.s16[3], 0, 255);
  t.u8[6] = SIMD_CLAMP(a.s16[2], 0, 255);
  t.u8[5] = SIMD_CLAMP(a.s16[1], 0, 255);
  t.u8[4] = SIMD_CLAMP(a.s16[0], 0, 255);
  t.u8[3] = SIMD_CLAMP(b.s16[3], 0, 255);
  t.u8[2] = SIMD_CLAMP(b.s16[2], 0, 255);
  t.u8[1] = SIMD_CLAMP(b.s16[1], 0, 255);
  t.u8[0] = SIMD_CLAMP(b.s16[0], 0, 255);
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.s8[7] = SIMD_CLAMP(a.s16[3], -128, 127);
  t.s8[6] = SIMD_CLAMP(a.s16[2], -128, 127);
  t.s8[5] = SIMD_CLAMP(a.s16[1], -128, 127);
  t.s8[4] = SIMD_CLAMP(a.s16[0], -128, 127);
  t.s8[3] = SIMD_CLAMP(b.s16[3], -128, 127);
  t.s8[2] = SIMD_CLAMP(b.s16[2], -128, 127);
  t.s8[1] = SIMD_CLAMP(b.s16[1], -128, 127);
  t.s8[0] = SIMD_CLAMP(b.s16[0], -128, 127);
  return t;
}
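
/* Example (editor's sketch): the pack functions narrow two vectors into
   one with saturation; a supplies the high lanes, b the low lanes:

     c_v64 a = c_v64_from_16(300, 2, 1, 0);
     // c_v64_pack_s16_u8(a, a).u8[7] == 255  (300 clamped to a byte)
     // c_v64_pack_s16_u8(a, a).u8[4] == 0
*/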

SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) {
    if (SIMD_CHECK && (pattern.u8[c] & ~7)) {
      fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n",
              pattern.u8[c], c);
      abort();
    }
    t.u8[c] =
        a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7];
  }
  return t;
}
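
/* Example (editor's sketch): each byte of the pattern selects one source
   lane; indices outside 0..7 abort when SIMD_CHECK is enabled:

     c_v64 rev = c_v64_from_64(0x0001020304050607ULL);  // lane c picks 7 - c
     // c_v64_shuffle_8(a, rev) reverses the bytes of a
*/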

SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) {
  return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] +
         a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] +
         a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0];
}

SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) {
  return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) +
         (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]);
}
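
/* Example (editor's sketch): the dot products multiply matching lanes and
   sum across the whole vector:

     c_v64 a = c_v64_from_16(1, 2, 3, 4);
     // c_v64_dotp_s16(a, a) == 16 + 9 + 4 + 1 == 30
*/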

SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) {
  return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] +
         a.u8[0];
}

SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) {
  return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0];
}

typedef struct {
  uint32_t val;
  int count;
} c_sad64_internal;

SIMD_INLINE c_sad64_internal c_v64_sad_u8_init(void) {
  c_sad64_internal t;
  t.val = t.count = 0;
  return t;
}

/* Implementation dependent return value. Result must be finalised with
   v64_sad_u8_sum(). The result for more than 32 v64_sad_u8() calls is
   undefined. */
SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a,
                                          c_v64 b) {
  int c;
  for (c = 0; c < 8; c++)
    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
  s.count++;
  if (SIMD_CHECK && s.count > 32) {
    fprintf(stderr,
            "Error: sad called more than 32 times, result is undefined\n");
    abort();
  }
  return s;
}

SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s.val; }
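
/* Example (editor's sketch, src/ref/stride hypothetical): the intended
   pattern is init, accumulate at most 32 rows, then finalise:

     c_sad64_internal s = c_v64_sad_u8_init();
     for (int r = 0; r < 8; r++)
       s = c_v64_sad_u8(s, c_v64_load_unaligned(src + r * stride),
                        c_v64_load_unaligned(ref + r * stride));
     uint32_t sad = c_v64_sad_u8_sum(s);
*/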

typedef uint32_t c_ssd64_internal;

/* Implementation dependent return value. Result must be finalised with
   v64_ssd_u8_sum(). */
SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init(void) { return 0; }

SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a,
                                          c_v64 b) {
  int c;
  for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
  return s;
}

SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; }
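
/* Example (editor's sketch): same init/accumulate/finalise pattern as SAD,
   but summing squared byte differences:

     c_ssd64_internal s = c_v64_ssd_u8_init();
     s = c_v64_ssd_u8(s, a, b);  // a, b: two rows of 8 pixels
     uint32_t ssd = c_v64_ssd_u8_sum(s);
*/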

SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 | b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 ^ b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 & b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 & ~b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16;
  return t;
}

SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]);
  t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]);
  return t;
}

SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1];
  t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3];
  return t;
}
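
/* Example (editor's sketch): madd_s16 multiplies matching 16-bit lanes and
   adds adjacent products into 32-bit lanes:

     c_v64 a = c_v64_from_16(4, 3, 2, 1);  // s16 lanes: [3]=4 [2]=3 [1]=2 [0]=1
     // c_v64_madd_s16(a, a).s32[0] == 1*1 + 2*2 == 5
     // c_v64_madd_s16(a, a).s32[1] == 3*3 + 4*4 == 25
*/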

SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) {
  c_v64 t;
  int32_t u;
  u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1];
  t.s16[0] = SIMD_CLAMP(u, -32768, 32767);
  u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3];
  t.s16[1] = SIMD_CLAMP(u, -32768, 32767);
  u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5];
  t.s16[2] = SIMD_CLAMP(u, -32768, 32767);
  u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7];
  t.s16[3] = SIMD_CLAMP(u, -32768, 32767);
  return t;
}

SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c]) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1;
  return t;
}
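
/* Example (editor's sketch): avg rounds the halved sum up, rdavg rounds it
   down; they differ only when the lane sum is odd:

     // c_v64_avg_u8(c_v64_dup_8(1), c_v64_dup_8(2)).u8[0]   == 2
     // c_v64_rdavg_u8(c_v64_dup_8(1), c_v64_dup_8(2)).u8[0] == 1
*/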

SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? a.s16[c] : b.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]);
  return t;
}
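
/* Example (editor's sketch): the comparisons return all-ones (-1) or
   all-zero lane masks, which compose with and/andn for blending:

     c_v64 m = c_v64_cmpgt_s8(a, b);
     // per-lane max(a, b) == (a & m) | (b & ~m):
     c_v64 mx = c_v64_or(c_v64_and(a, m), c_v64_andn(b, m));
*/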

SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 7) {
    fprintf(stderr, "Error: Undefined u8 shift left %u\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.s8[c] = (int8_t)(a.u8[c] << n);
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 7) {
    fprintf(stderr, "Error: Undefined u8 shift right %u\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 7) {
    fprintf(stderr, "Error: Undefined s8 shift right %u\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 15) {
    fprintf(stderr, "Error: Undefined u16 shift left %u\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] << n);
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 15) {
    fprintf(stderr, "Error: Undefined u16 shift right %u\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 15) {
    fprintf(stderr, "Error: Undefined s16 shift right %u\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (SIMD_CHECK && n > 31) {
    fprintf(stderr, "Error: Undefined u32 shift left %u\n", n);
    abort();
  }
  t.u32[1] = a.u32[1] << n;
  t.u32[0] = a.u32[0] << n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (SIMD_CHECK && n > 31) {
    fprintf(stderr, "Error: Undefined u32 shift right %u\n", n);
    abort();
  }
  t.u32[1] = a.u32[1] >> n;
  t.u32[0] = a.u32[0] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (SIMD_CHECK && n > 31) {
    fprintf(stderr, "Error: Undefined s32 shift right %u\n", n);
    abort();
  }
  t.s32[1] = a.s32[1] >> n;
  t.s32[0] = a.s32[0] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) {
  c_v64 t;
  t.u64 = x.u64 >> i * 8;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) {
  c_v64 t;
  t.u64 = x.u64 << i * 8;
  return t;
}

SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) {
  if (SIMD_CHECK && c > 7) {
    fprintf(stderr, "Error: Undefined alignment %u\n", c);
    abort();
  }
  return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b;
}
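
/* Example (editor's sketch): align extracts 8 bytes starting c bytes into
   the 16-byte concatenation a:b, like a funnel shift:

     c_v64 a = c_v64_from_64(0x0f0e0d0c0b0a0908ULL);
     c_v64 b = c_v64_from_64(0x0706050403020100ULL);
     // c_v64_u64(c_v64_align(a, b, 3)) == 0x0a09080706050403ULL
*/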

SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) {
  return c_v64_shl_8(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) {
  return c_v64_shr_u8(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) {
  return c_v64_shr_s8(a, c);
}

SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) {
  return c_v64_shl_16(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) {
  return c_v64_shr_u16(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) {
  return c_v64_shr_s16(a, c);
}

SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) {
  return c_v64_shl_32(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) {
  return c_v64_shr_u32(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) {
  return c_v64_shr_s32(a, c);
}

#endif  // AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_