tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

v256_intrinsics_v128.h (27920B)


/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_

#include "config/aom_config.h"

#if HAVE_NEON
#error "Do not use this file for Neon"
#endif

#if HAVE_SSE2
#include "aom_dsp/simd/v128_intrinsics_x86.h"
#else
#include "aom_dsp/simd/v128_intrinsics.h"
#endif

typedef struct {
  v128 val[2];
} v256;
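
/* val[0] holds the low 128 bits and val[1] the high 128 bits (see
   v256_low_v128() and v256_high_v128() below), so nearly every operation in
   this header simply applies the matching v128 intrinsic to each half. */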

SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); }

SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); }

SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }

SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; }

SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; }

SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
  v256 t;
  t.val[1] = hi;
  t.val[0] = lo;
  return t;
}

SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
  return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
}

SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
}
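
/* Note the argument order: both v256_from_v128() and v128_from_64() take
   (hi, lo), so v256_from_64(a, b, c, d) puts a in the most significant
   64-bit lane and d in the least significant.  Illustrative sketch, not part
   of the original header:

     v256 x = v256_from_64(3, 2, 1, 0);  // hypothetical values
     // v256_low_u64(x) == 0
*/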

SIMD_INLINE v256 v256_load_unaligned(const void *p) {
  return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16),
                        v128_load_unaligned(p));
}

SIMD_INLINE v256 v256_load_aligned(const void *p) {
  return v256_from_v128(v128_load_aligned((uint8_t *)p + 16),
                        v128_load_aligned(p));
}

SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
  v128_store_unaligned(p, a.val[0]);
  v128_store_unaligned((uint8_t *)p + 16, a.val[1]);
}

SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
  v128_store_aligned(p, a.val[0]);
  v128_store_aligned((uint8_t *)p + 16, a.val[1]);
}
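
/* Illustrative usage sketch, not part of the original header: the first 16
   bytes at p become the low half and the next 16 the high half, so a
   load/store pair round-trips all 32 bytes (buf is a hypothetical array):

     uint8_t buf[32];
     v256 v = v256_load_unaligned(buf);  // buf[0..15] -> val[0]
     v256_store_unaligned(buf, v);       // writes the same 32 bytes back
*/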

SIMD_INLINE v256 v256_zero(void) {
  return v256_from_v128(v128_zero(), v128_zero());
}

SIMD_INLINE v256 v256_dup_8(uint8_t x) {
  v128 t = v128_dup_8(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_16(uint16_t x) {
  v128 t = v128_dup_16(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_32(uint32_t x) {
  v128 t = v128_dup_32(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_64(uint64_t x) {
  v128 t = v128_dup_64(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
  return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]);
}

SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
  return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]);
}

SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
  return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]);
}

SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
  return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]);
}

typedef struct {
  sad128_internal val[2];
} sad256_internal;

SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
  sad256_internal t;
  t.val[1] = v128_sad_u8_init();
  t.val[0] = v128_sad_u8_init();
  return t;
}

/* Implementation dependent return value.  Result must be finalised with
   v256_sad_u8_sum().
   The result for more than 16 v256_sad_u8() calls is undefined. */
SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
  sad256_internal t;
  t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
  return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]);
}
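
/* Illustrative usage sketch, not part of the original header: the SAD
   accumulator follows an init/accumulate/finalise protocol, with at most 16
   v256_sad_u8() calls between init and sum (src, ref and the loop bound are
   hypothetical):

     sad256_internal acc = v256_sad_u8_init();
     for (int i = 0; i < 16; i++)
       acc = v256_sad_u8(acc, v256_load_unaligned(src + 32 * i),
                         v256_load_unaligned(ref + 32 * i));
     uint32_t sad = v256_sad_u8_sum(acc);
*/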

typedef struct {
  ssd128_internal val[2];
} ssd256_internal;

SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
  ssd256_internal t;
  t.val[1] = v128_ssd_u8_init();
  t.val[0] = v128_ssd_u8_init();
  return t;
}

/* Implementation dependent return value.  Result must be finalised with
 * v256_ssd_u8_sum(). */
SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
  ssd256_internal t;
  t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
  return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]);
}
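
/* The SSD accumulator follows the same init/accumulate/finalise protocol as
   the SAD accumulator sketched above, summing squared differences instead. */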

SIMD_INLINE v256 v256_or(v256 a, v256 b) {
  return v256_from_v128(v128_or(a.val[1], b.val[1]),
                        v128_or(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
  return v256_from_v128(v128_xor(a.val[1], b.val[1]),
                        v128_xor(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_and(v256 a, v256 b) {
  return v256_from_v128(v128_and(a.val[1], b.val[1]),
                        v128_and(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
  return v256_from_v128(v128_andn(a.val[1], b.val[1]),
                        v128_andn(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
  return v256_from_v128(v128_add_8(a.val[1], b.val[1]),
                        v128_add_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
  return v256_from_v128(v128_add_16(a.val[1], b.val[1]),
                        v128_add_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) {
  return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]),
                        v128_sadd_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) {
  return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]),
                        v128_sadd_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
  return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]),
                        v128_sadd_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
  return v256_from_v128(v128_add_32(a.val[1], b.val[1]),
                        v128_add_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_64(v256 a, v256 b) {
  return v256_from_v128(v128_add_64(a.val[1], b.val[1]),
                        v128_add_64(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_padd_u8(v256 a) {
  return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0]));
}

SIMD_INLINE v256 v256_padd_s16(v256 a) {
  return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0]));
}

SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
  return v256_from_v128(v128_sub_8(a.val[1], b.val[1]),
                        v128_sub_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]),
                        v128_ssub_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]),
                        v128_ssub_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
  return v256_from_v128(v128_sub_16(a.val[1], b.val[1]),
                        v128_sub_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]),
                        v128_ssub_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]),
                        v128_ssub_u16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
  return v256_from_v128(v128_sub_32(a.val[1], b.val[1]),
                        v128_sub_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) {
  return v256_from_v128(v128_sub_64(a.val[1], b.val[1]),
                        v128_sub_64(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_abs_s16(v256 a) {
  return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0]));
}

SIMD_INLINE v256 v256_abs_s8(v256 a) {
  return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0]));
}

SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
  v128 lo_bits = v128_mullo_s16(a, b);
  v128 hi_bits = v128_mulhi_s16(a, b);
  return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
                        v128_ziplo_16(hi_bits, lo_bits));
}
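
/* The widening multiply above is assembled from two half-width results:
   v128_mullo_s16() and v128_mulhi_s16() produce the low and high 16 bits of
   each 16x16-bit product, and the zips interleave those halves into full
   32-bit lanes. */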

SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
  return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]),
                        v128_mullo_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
  return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]),
                        v128_mulhi_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
  return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]),
                        v128_mullo_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
  return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]),
                        v128_madd_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
  return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]),
                        v128_madd_us8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
  return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]),
                        v128_avg_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
  return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]),
                        v128_rdavg_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
  return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]),
                        v128_rdavg_u16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
  return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]),
                        v128_avg_u16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
  return v256_from_v128(v128_min_u8(a.val[1], b.val[1]),
                        v128_min_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
  return v256_from_v128(v128_max_u8(a.val[1], b.val[1]),
                        v128_max_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
  return v256_from_v128(v128_min_s8(a.val[1], b.val[1]),
                        v128_min_s8(a.val[0], b.val[0]));
}

SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
  return (v128_movemask_8(v256_high_v128(a)) << 16) |
         v128_movemask_8(v256_low_v128(a));
}

SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
  return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]),
                        v128_blend_8(a.val[0], b.val[0], c.val[0]));
}

SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
  return v256_from_v128(v128_max_s8(a.val[1], b.val[1]),
                        v128_max_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
  return v256_from_v128(v128_min_s16(a.val[1], b.val[1]),
                        v128_min_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
  return v256_from_v128(v128_max_s16(a.val[1], b.val[1]),
                        v128_max_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) {
  return v256_from_v128(v128_min_s32(a.val[1], b.val[1]),
                        v128_min_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) {
  return v256_from_v128(v128_max_s32(a.val[1], b.val[1]),
                        v128_max_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]),
                        v128_ziplo_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]),
                        v128_ziplo_8(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]),
                        v128_ziplo_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]),
                        v128_ziplo_16(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]),
                        v128_ziplo_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]),
                        v128_ziplo_32(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]),
                        v128_ziplo_64(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]),
                        v128_ziplo_64(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
  return v256_from_v128(a.val[0], b.val[0]);
}

SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
  return v256_from_v128(a.val[1], b.val[1]);
}

SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
}

SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
}

SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
}

SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]),
                        v128_unziplo_8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]),
                        v128_unziphi_8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]),
                        v128_unziplo_16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]),
                        v128_unziphi_16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]),
                        v128_unziplo_32(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]),
                        v128_unziphi_32(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
#if HAVE_SSE2
  return v256_from_v128(
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
                                      _mm_castsi128_pd(a.val[1]), 0)),
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
                                      _mm_castsi128_pd(b.val[1]), 0)));
#else
  return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]),
                       v128_low_v64(b.val[1]), v128_low_v64(b.val[0]));
#endif
}

SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
#if HAVE_SSE2
  return v256_from_v128(
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
                                      _mm_castsi128_pd(a.val[1]), 3)),
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
                                      _mm_castsi128_pd(b.val[1]), 3)));
#else
  return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]),
                       v128_high_v64(b.val[1]), v128_high_v64(b.val[0]));
#endif
}
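
/* On SSE2 the 64-bit unzips above map directly onto _mm_shuffle_pd, which
   selects one 64-bit lane from each 128-bit input (immediate 0 picks both
   low lanes, 3 both high lanes); the generic path rebuilds the vector from
   the four selected v64 values instead. */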

SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
}

SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]),
                        v128_unpacklo_u8_s16(a.val[0]));
}

SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]),
                        v128_unpacklo_u8_s16(a.val[1]));
}

SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
}

SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]),
                        v128_unpacklo_s8_s16(a.val[0]));
}

SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]),
                        v128_unpacklo_s8_s16(a.val[1]));
}

SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]),
                        v128_pack_s32_s16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]),
                        v128_pack_s32_u16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]),
                        v128_pack_s16_u8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]),
                        v128_pack_s16_s8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
}

SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
}

SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]),
                        v128_unpacklo_u16_s32(a.val[0]));
}

SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]),
                        v128_unpacklo_s16_s32(a.val[0]));
}

SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]),
                        v128_unpacklo_u16_s32(a.val[1]));
}

SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]),
                        v128_unpacklo_s16_s32(a.val[1]));
}

SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]),
                        v128_cmpgt_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]),
                        v128_cmplt_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]),
                        v128_cmpeq_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]),
                        v128_cmpgt_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]),
                        v128_cmplt_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]),
                        v128_cmpeq_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]),
                        v128_cmpgt_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]),
                        v128_cmplt_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]),
                        v128_cmpeq_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) {
  v128 c16 = v128_dup_8(16);
  v128 maskhi = v128_cmplt_s8(pattern.val[1], c16);
  v128 masklo = v128_cmplt_s8(pattern.val[0], c16);
  return v256_from_v128(
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)),
                   v128_shuffle_8(x.val[0], pattern.val[1]), maskhi),
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)),
                   v128_shuffle_8(x.val[0], pattern.val[0]), masklo));
}
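
/* v128_shuffle_8() can only index 16 bytes, so the 32-byte shuffle above is
   emulated: indices 0-15 select from the low half and 16-31 from the high
   half, each half is shuffled with both the raw and the rebased (minus 16)
   pattern, and the compare-against-16 masks blend in the correct bytes. */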

SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) {
  v128 c16 = v128_dup_8(16);
  v128 c32 = v128_dup_8(32);
  v128 c48 = v128_dup_8(48);
  v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]);
  v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]);
  v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]);
  v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]);
  v256 r1 = v256_from_v128(
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)),
                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)),
                   maskhi48),
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)),
                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)),
                   masklo48));
  v256 r2 = v256_from_v128(
      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)),
                   v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16),
      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)),
                   v128_shuffle_8(y.val[0], pattern.val[0]), masklo16));
  return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern));
}

SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
  return v256_from_v128(
      v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
      v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
}

SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c));
}

SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c));
}

SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c));
}

SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c));
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define v256_shl_n_byte(a, n)                                                \
  ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n),           \
                                     v128_shr_n_byte(a.val[0], 16 - (n))),   \
                             v128_shl_n_byte(a.val[0], (n)))                 \
            : v256_from_v128(                                                \
                  (n) > 16 ? v128_shl_n_byte(a.val[0], (n) - 16) : a.val[0], \
                  v128_zero()))

#define v256_shr_n_byte(a, n)                                                 \
  (n == 0                                                                     \
       ? a                                                                    \
       : ((n) < 16                                                            \
              ? v256_from_v128(v128_shr_n_byte(a.val[1], n),                  \
                               v128_or(v128_shr_n_byte(a.val[0], n),          \
                                       v128_shl_n_byte(a.val[1], 16 - (n))))  \
              : v256_from_v128(v128_zero(),                                   \
                               (n) > 16 ? v128_shr_n_byte(a.val[1], (n) - 16) \
                                        : a.val[1])))

#define v256_align(a, b, c) \
  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)

#define v256_shl_n_8(a, n) \
  v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n))
#define v256_shl_n_16(a, n) \
  v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n))
#define v256_shl_n_32(a, n) \
  v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n))
#define v256_shl_n_64(a, n) \
  v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n))
#define v256_shr_n_u8(a, n) \
  v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n))
#define v256_shr_n_u16(a, n) \
  v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n))
#define v256_shr_n_u32(a, n) \
  v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n))
#define v256_shr_n_u64(a, n) \
  v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n))
#define v256_shr_n_s8(a, n) \
  v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n))
#define v256_shr_n_s16(a, n) \
  v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n))
#define v256_shr_n_s32(a, n) \
  v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n))
#define v256_shr_n_s64(a, n) \
  v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n))

#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
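
/* Illustrative usage sketch, not part of the original header: the _n_ forms
   are macros precisely so that the shift amount is a compile-time constant;
   use the function forms for runtime counts:

     v256 x = v256_dup_16(0x1234);   // hypothetical input
     v256 y = v256_shr_n_u16(x, 4);  // fine: literal immediate
     unsigned int c = 4;
     v256 z = v256_shr_u16(x, c);    // runtime count: use the function form
*/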

typedef struct {
  sad128_internal_u16 val[2];
} sad256_internal_u16;

SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) {
  sad256_internal_u16 t;
  t.val[1] = v128_sad_u16_init();
  t.val[0] = v128_sad_u16_init();
  return t;
}

/* Implementation dependent return value.  Result must be finalised with
   v256_sad_u16_sum().
   The result for more than 16 v256_sad_u16() calls is undefined. */
SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
                                             v256 b) {
  sad256_internal_u16 t;
  t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
  return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]);
}

typedef struct {
  ssd128_internal_s16 val[2];
} ssd256_internal_s16;

SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) {
  ssd256_internal_s16 t;
  t.val[1] = v128_ssd_s16_init();
  t.val[0] = v128_ssd_s16_init();
  return t;
}

/* Implementation dependent return value.  Result must be finalised with
 * v256_ssd_s16_sum(). */
SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
                                             v256 b) {
  ssd256_internal_s16 t;
  t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
  return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]);
}
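
/* Unlike the 8-bit accumulators above, v256_ssd_s16_sum() returns uint64_t:
   a single squared 16-bit difference can approach 2^32, so a 32-bit total
   would overflow almost immediately. */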

#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_