tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

v128_intrinsics_x86.h (20903B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
     13 #define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
     14 
     15 #include <stdint.h>
     16 #include "aom_dsp/simd/v64_intrinsics_x86.h"
     17 
     18 typedef __m128i v128;
     19 
     20 SIMD_INLINE uint32_t v128_low_u32(v128 a) {
     21  return (uint32_t)_mm_cvtsi128_si32(a);
     22 }
     23 
     24 SIMD_INLINE v64 v128_low_v64(v128 a) {
     25  return _mm_unpacklo_epi64(a, v64_zero());
     26 }
     27 
     28 SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); }
     29 
     30 SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
     31  return _mm_unpacklo_epi64(b, a);
     32 }
     33 
     34 SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
     35  return v128_from_v64(v64_from_64(a), v64_from_64(b));
     36 }
     37 
     38 SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
     39  return _mm_set_epi32((int)a, (int)b, (int)c, (int)d);
     40 }
     41 
     42 SIMD_INLINE v128 v128_load_aligned(const void *p) {
     43  return _mm_load_si128((__m128i *)p);
     44 }
     45 
     46 SIMD_INLINE v128 v128_load_unaligned(const void *p) {
     47 #if defined(__SSSE3__)
     48  return _mm_lddqu_si128((__m128i *)p);
     49 #else
     50  return _mm_loadu_si128((__m128i *)p);
     51 #endif
     52 }
     53 
     54 SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
     55  _mm_store_si128((__m128i *)p, a);
     56 }
     57 
     58 SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
     59  _mm_storeu_si128((__m128i *)p, a);
     60 }
     61 
     62 // The following function requires an immediate.
      63 // Some compilers will check this during optimisation, others won't.
     64 #if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
     65 #if defined(__SSSE3__)
     66 SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
     67  return c ? _mm_alignr_epi8(a, b, c) : b;
     68 }
     69 #else
     70 #define v128_align(a, b, c) \
     71  ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
     72 #endif
     73 #else
     74 #if defined(__SSSE3__)
     75 #define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
     76 #else
     77 #define v128_align(a, b, c) \
     78  ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
     79 #endif
     80 #endif
     81 
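/* Usage sketch (illustrative; not part of the upstream header).  v128_align()
   concatenates {a:b} with a in the upper half and returns the 16 bytes that
   start c bytes into b.  As noted above, c must be a compile-time constant so
   that the immediate forms of the underlying intrinsics can be used.  The
   helper name below is hypothetical. */
SIMD_INLINE v128 example_align_by_4(v128 a, v128 b) {
  return v128_align(a, b, 4);  // literal count satisfies the immediate requirement
}
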
     82 SIMD_INLINE v128 v128_zero(void) { return _mm_setzero_si128(); }
     83 
     84 SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); }
     85 
     86 SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); }
     87 
     88 SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); }
     89 
     90 SIMD_INLINE v128 v128_dup_64(uint64_t x) {
     91  // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers
     92  return _mm_set_epi32((int32_t)(x >> 32), (int32_t)x, (int32_t)(x >> 32),
     93                       (int32_t)x);
     94 }
     95 
     96 SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); }
     97 
     98 SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }
     99 
    100 SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); }
    101 
    102 SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); }
    103 
    104 SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); }
    105 
    106 SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }
    107 
    108 SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); }
    109 
    110 SIMD_INLINE v128 v128_padd_s16(v128 a) {
    111  return _mm_madd_epi16(a, _mm_set1_epi16(1));
    112 }
    113 
    114 SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); }
    115 
    116 SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); }
    117 
    118 SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); }
    119 
    120 SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); }
    121 
    122 SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); }
    123 
    124 SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); }
    125 
    126 SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); }
    127 
    128 SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); }
    129 
    130 SIMD_INLINE v128 v128_abs_s16(v128 a) {
    131 #if defined(__SSSE3__)
    132  return _mm_abs_epi16(a);
    133 #else
    134  return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
    135 #endif
    136 }
    137 
    138 SIMD_INLINE v128 v128_abs_s8(v128 a) {
    139 #if defined(__SSSE3__)
    140  return _mm_abs_epi8(a);
    141 #else
    142  v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
    143  return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
    144 #endif
    145 }
    146 
    147 SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) {
    148  return _mm_unpacklo_epi8(b, a);
    149 }
    150 
    151 SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) {
    152  return _mm_unpackhi_epi8(b, a);
    153 }
    154 
    155 SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) {
    156  return _mm_unpacklo_epi16(b, a);
    157 }
    158 
    159 SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) {
    160  return _mm_unpackhi_epi16(b, a);
    161 }
    162 
    163 SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) {
    164  return _mm_unpacklo_epi32(b, a);
    165 }
    166 
    167 SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) {
    168  return _mm_unpackhi_epi32(b, a);
    169 }
    170 
    171 SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
    172  return _mm_unpacklo_epi64(b, a);
    173 }
    174 
    175 SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
    176  return _mm_unpackhi_epi64(b, a);
    177 }
    178 
    179 SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
    180 
    181 SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }
    182 
    183 SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }
    184 
    185 SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
    186  return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8));
    187 }
    188 
    189 SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
    190 #if defined(__SSSE3__)
    191 #ifdef __x86_64__
    192  v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL);
    193 #else
    194  v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200);
    195 #endif
    196  return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
    197                            _mm_shuffle_epi8(a, order));
    198 #else
    199  return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
    200 #endif
    201 }
    202 
    203 SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
    204  return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16));
    205 }
    206 
    207 SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
    208 #if defined(__SSSE3__)
    209 #ifdef __x86_64__
    210  v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL);
    211 #else
    212  v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100);
    213 #endif
    214  return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
    215                            _mm_shuffle_epi8(a, order));
    216 #else
    217  return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
    218 #endif
    219 }
    220 
    221 SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
    222  return _mm_castps_si128(_mm_shuffle_ps(
    223      _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1)));
    224 }
    225 
    226 SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
    227  return _mm_castps_si128(_mm_shuffle_ps(
    228      _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0)));
    229 }
    230 
    231 SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
    232  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
    233 }
    234 
    235 SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
    236  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
    237 }
    238 
    239 SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
    240  return _mm_unpackhi_epi8(a, _mm_setzero_si128());
    241 }
    242 
    243 SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
    244  return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
    245 }
    246 
    247 SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
    248  return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
    249 }
    250 
    251 SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
    252  return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8);
    253 }
    254 
    255 SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
    256  return _mm_packs_epi32(b, a);
    257 }
    258 
    259 SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
    260 #if defined(__SSE4_1__)
    261  return _mm_packus_epi32(b, a);
    262 #else
    263  return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)),
    264                       v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b)));
    265 #endif
    266 }
    267 
    268 SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
    269  return _mm_packus_epi16(b, a);
    270 }
    271 
    272 SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
    273  return _mm_packs_epi16(b, a);
    274 }
    275 
    276 SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
    277  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
    278 }
    279 
    280 SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
    281  return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
    282 }
    283 
    284 SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
    285  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
    286 }
    287 
    288 SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
    289  return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
    290 }
    291 
    292 SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
    293  return _mm_unpackhi_epi16(a, _mm_setzero_si128());
    294 }
    295 
    296 SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
    297  return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
    298 }
    299 
    300 SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
    301 #if defined(__SSSE3__)
    302  return _mm_shuffle_epi8(x, pattern);
    303 #else
    304  v128 output;
    305  unsigned char *input = (unsigned char *)&x;
    306  unsigned char *index = (unsigned char *)&pattern;
    307  unsigned char *selected = (unsigned char *)&output;
    308  int counter;
    309 
    310  for (counter = 0; counter < 16; counter++) {
    311    selected[counter] = input[index[counter] & 15];
    312  }
    313 
    314  return output;
    315 #endif
    316 }
    317 
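/* Usage sketch (illustrative; not part of the upstream header): reverse the
   16 bytes of a vector with a constant shuffle pattern.  The scalar fallback
   above takes indices modulo 16, so the portable contract of v128_shuffle_8()
   assumes every index lies in 0..15.  The helper name is hypothetical. */
SIMD_INLINE v128 example_reverse_bytes(v128 x) {
  // pattern byte i holds 15 - i, so output byte i = input byte 15 - i
  const v128 rev = v128_from_32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
  return v128_shuffle_8(x, rev);
}
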
    318 SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
    319  v128 t1 = _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b));
    320  v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b));
    321  v128 t = v128_add_32(t1, t2);
    322  t = v128_add_32(t, _mm_srli_si128(t, 8));
    323  t = v128_add_32(t, _mm_srli_si128(t, 4));
    324  return (int32_t)v128_low_u32(t);
    325 }
    326 
    327 SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
    328  v128 r = _mm_madd_epi16(a, b);
    329 #if defined(__SSE4_1__) && defined(__x86_64__)
    330  v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r),
    331                         _mm_cvtepi32_epi64(_mm_srli_si128(r, 8)));
    332  return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8)));
    333 #else
    334  return (int64_t)_mm_cvtsi128_si32(r) +
    335         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
    336         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
    337         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
    338 #endif
    339 }
    340 
    341 SIMD_INLINE uint64_t v128_hadd_u8(v128 a) {
    342  v128 t = _mm_sad_epu8(a, _mm_setzero_si128());
    343  return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t));
    344 }
    345 
    346 typedef v128 sad128_internal;
    347 
    348 SIMD_INLINE sad128_internal v128_sad_u8_init(void) {
    349  return _mm_setzero_si128();
    350 }
    351 
    352 /* Implementation dependent return value.  Result must be finalised with
     353   v128_sad_u8_sum().
    354   The result for more than 32 v128_sad_u8() calls is undefined. */
    355 SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
    356  return _mm_add_epi64(s, _mm_sad_epu8(a, b));
    357 }
    358 
    359 SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
    360  return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
    361 }
    362 
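/* Usage sketch (illustrative; not part of the upstream header): sum of
   absolute differences over a 16x16 block of 8-bit samples.  Per the comment
   above, the accumulator must be finalised with v128_sad_u8_sum(), and at
   most 32 v128_sad_u8() calls may be folded into one state (16 rows here).
   The helper name and parameters are hypothetical. */
SIMD_INLINE uint32_t example_sad_16x16(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride) {
  sad128_internal s = v128_sad_u8_init();
  int i;
  for (i = 0; i < 16; i++) {
    s = v128_sad_u8(s, v128_load_unaligned(src + i * src_stride),
                    v128_load_unaligned(ref + i * ref_stride));
  }
  return v128_sad_u8_sum(s);
}
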
    363 typedef int32_t ssd128_internal;
    364 
    365 SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) { return 0; }
    366 
    367 /* Implementation dependent return value.  Result must be finalised with
     368 * v128_ssd_u8_sum(). */
    369 SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
    370  v128 z = _mm_setzero_si128();
    371  v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z));
    372  v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z));
    373  v128 rl = _mm_madd_epi16(l, l);
    374  v128 rh = _mm_madd_epi16(h, h);
    375  v128 r = _mm_add_epi32(rl, rh);
    376  r = _mm_add_epi32(r, _mm_srli_si128(r, 8));
    377  r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
    378  return s + _mm_cvtsi128_si32(r);
    379 }
    380 
    381 SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; }
    382 
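/* Usage sketch (illustrative; not part of the upstream header): sum of squared
   differences for one 16-byte row, following the same init / accumulate /
   finalise pattern.  The _u16 and _s16 accumulators further below follow the
   same shape.  The helper name is hypothetical. */
SIMD_INLINE int32_t example_ssd_row(const uint8_t *src, const uint8_t *ref) {
  ssd128_internal s = v128_ssd_u8_init();
  s = v128_ssd_u8(s, v128_load_unaligned(src), v128_load_unaligned(ref));
  return v128_ssd_u8_sum(s);
}
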
    383 SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); }
    384 
    385 SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); }
    386 
    387 SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); }
    388 
    389 SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); }
    390 
    391 SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
    392  v64 lo_bits = v64_mullo_s16(a, b);
    393  v64 hi_bits = v64_mulhi_s16(a, b);
    394  return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits),
    395                       v64_ziplo_16(hi_bits, lo_bits));
    396 }
    397 
    398 SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
    399  return _mm_mullo_epi16(a, b);
    400 }
    401 
    402 SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
    403  return _mm_mulhi_epi16(a, b);
    404 }
    405 
    406 SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
    407 #if defined(__SSE4_1__)
    408  return _mm_mullo_epi32(a, b);
    409 #else
    410  return _mm_unpacklo_epi32(
    411      _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8),
    412      _mm_shuffle_epi32(
    413          _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8));
    414 #endif
    415 }
    416 
    417 SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
    418  v128 r = v128_mullo_s32(a, b);
    419  return (int64_t)_mm_cvtsi128_si32(r) +
    420         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
    421         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
    422         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
    423 }
    424 
    425 SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }
    426 
    427 SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
    428 #if defined(__SSSE3__)
    429  return _mm_maddubs_epi16(a, b);
    430 #else
    431  return _mm_packs_epi32(
    432      _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
    433                     _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)),
    434      _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
    435                     _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8)));
    436 #endif
    437 }
    438 
    439 SIMD_INLINE v128 v128_padd_u8(v128 a) {
    440  return v128_madd_us8(a, _mm_set1_epi8(1));
    441 }
    442 
    443 SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); }
    444 
    445 SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
    446  return _mm_sub_epi8(_mm_avg_epu8(a, b),
    447                      _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1)));
    448 }
    449 
    450 SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
    451  return _mm_sub_epi16(_mm_avg_epu16(a, b),
    452                       _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1)));
    453 }
    454 
    455 SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); }
    456 
    457 SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); }
    458 
    459 SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); }
    460 
    461 SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) {
    462 #if defined(__SSE4_1__)
    463  return _mm_min_epi8(a, b);
    464 #else
    465  v128 mask = _mm_cmplt_epi8(a, b);
    466  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
    467 #endif
    468 }
    469 
    470 SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); }
    471 
    472 SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
    473 #if defined(__SSE4_1__)
    474  return _mm_blendv_epi8(a, b, c);
    475 #else
    476  c = _mm_cmplt_epi8(c, v128_zero());
    477  return v128_or(v128_and(b, c), v128_andn(a, c));
    478 #endif
    479 }
    480 
    481 SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) {
    482 #if defined(__SSE4_1__)
    483  return _mm_max_epi8(a, b);
    484 #else
    485  v128 mask = _mm_cmplt_epi8(b, a);
    486  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
    487 #endif
    488 }
    489 
    490 SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); }
    491 
    492 SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); }
    493 
    494 SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) {
    495 #if defined(__SSE4_1__)
    496  return _mm_min_epi32(a, b);
    497 #else
    498  v128 mask = _mm_cmplt_epi32(a, b);
    499  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
    500 #endif
    501 }
    502 
    503 SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) {
    504 #if defined(__SSE4_1__)
    505  return _mm_max_epi32(a, b);
    506 #else
    507  v128 mask = _mm_cmplt_epi32(b, a);
    508  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
    509 #endif
    510 }
    511 
    512 SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); }
    513 
    514 SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); }
    515 
    516 SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); }
    517 
    518 SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
    519  return _mm_cmpgt_epi16(a, b);
    520 }
    521 
    522 SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
    523  return _mm_cmplt_epi16(a, b);
    524 }
    525 
    526 SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); }
    527 
    528 SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
    529  return _mm_cmpgt_epi32(a, b);
    530 }
    531 
    532 SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
    533  return _mm_cmplt_epi32(a, b);
    534 }
    535 
    536 SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }
    537 
    538 SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
    539  return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)),
    540                       _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
    541 }
    542 
    543 SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
    544  return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)),
    545                       _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
    546 }
    547 
    548 SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
    549  __m128i x = _mm_cvtsi32_si128((int)(c + 8));
    550  return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
    551                         _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
    552 }
    553 
    554 SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
    555  return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c));
    556 }
    557 
    558 SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
    559  return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c));
    560 }
    561 
    562 SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
    563  return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
    564 }
    565 
    566 SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
    567  return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c));
    568 }
    569 
    570 SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
    571  return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c));
    572 }
    573 
    574 SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
    575  return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c));
    576 }
    577 
    578 SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
    579  return _mm_sll_epi64(a, _mm_cvtsi32_si128((int)c));
    580 }
    581 
    582 SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
    583  return _mm_srl_epi64(a, _mm_cvtsi32_si128((int)c));
    584 }
    585 
    586 SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
    587  // _mm_sra_epi64 is missing in gcc?
    588  return v128_from_64((uint64_t)((int64_t)v64_u64(v128_high_v64(a)) >> c),
    589                      (uint64_t)((int64_t)v64_u64(v128_low_v64(a)) >> c));
    590  // return _mm_sra_epi64(a, _mm_cvtsi32_si128((int)c));
    591 }
    592 
    593 /* These intrinsics require immediate values, so we must use #defines
    594   to enforce that. */
    595 #define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
    596 #define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
    597 #define v128_shl_n_8(a, c) \
    598  _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c))
    599 #define v128_shr_n_u8(a, c) \
    600  _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c))
    601 #define v128_shr_n_s8(a, c)                                         \
    602  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
    603                  _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
    604 #define v128_shl_n_16(a, c) _mm_slli_epi16(a, c)
    605 #define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c)
    606 #define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c)
    607 #define v128_shl_n_32(a, c) _mm_slli_epi32(a, c)
    608 #define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c)
    609 #define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c)
    610 #define v128_shl_n_64(a, c) _mm_slli_epi64(a, c)
    611 #define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c)
    612 #define v128_shr_n_s64(a, c) \
    613  v128_shr_s64(a, c)  // _mm_srai_epi64 missing in gcc?
    614 
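/* Usage sketch (illustrative; not part of the upstream header): the count
   passed to the *_n_* shift macros must be a literal (or otherwise a
   compile-time constant) so that the underlying _mm_slli/_mm_srli/_mm_srai
   intrinsics receive an immediate.  The helper name is hypothetical. */
SIMD_INLINE v128 example_round_shift_right_3(v128 a) {
  // (a + 4) >> 3 on each signed 16-bit lane; 3 is a literal, as required
  return v128_shr_n_s16(v128_add_16(a, v128_dup_16(4)), 3);
}
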
    615 typedef v128 sad128_internal_u16;
    616 
    617 SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { return v128_zero(); }
    618 
    619 /* Implementation dependent return value.  Result must be finalised with
    620 * v128_sad_u16_sum(). */
    621 SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
    622                                             v128 b) {
    623 #if defined(__SSE4_1__)
    624  v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b));
    625 #else
    626  v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)),
    627                          v128_xor(b, v128_dup_16(32768)));
    628  t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)),
    629                  v128_or(v128_and(a, t), v128_andn(b, t)));
    630 #endif
    631  return v128_add_32(
    632      s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t)));
    633 }
    634 
    635 SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
    636  return v128_low_u32(s) + v128_low_u32(v128_shr_n_byte(s, 4)) +
    637         v128_low_u32(v128_shr_n_byte(s, 8)) +
    638         v128_low_u32(v128_shr_n_byte(s, 12));
    639 }
    640 
    641 typedef v128 ssd128_internal_s16;
    642 
    643 SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); }
    644 
    645 /* Implementation dependent return value.  Result must be finalised with
    646 * v128_ssd_s16_sum(). */
    647 SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
    648                                             v128 b) {
    649  v128 d = v128_sub_16(a, b);
    650  d = v128_madd_s16(d, d);
    651  return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()),
    652                                    _mm_unpacklo_epi32(d, v128_zero())));
    653 }
    654 
    655 SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
    656  return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
    657 }
    658 
    659 #endif  // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_