tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

v256_intrinsics_x86.h (26504B)


/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_

#if !defined(__AVX2__)

#include "aom_dsp/simd/v256_intrinsics_v128.h"

#else

// The __m256i type seems to cause problems for g++'s mangling prior to
// version 5, but adding -fabi-version=0 fixes this.
#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 && \
    defined(__AVX2__) && defined(__cplusplus)
#pragma GCC optimize "-fabi-version=0"
#endif

#include <immintrin.h>

#include "aom_dsp/simd/v128_intrinsics_x86.h"

typedef __m256i v256;

SIMD_INLINE uint32_t v256_low_u32(v256 a) {
  return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0));
}

SIMD_INLINE v64 v256_low_v64(v256 a) {
  return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
}

SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }

SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); }

SIMD_INLINE v128 v256_high_v128(v256 a) {
  return _mm256_extracti128_si256(a, 1);
}

SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
  // gcc seems to be missing _mm256_set_m128i()
  return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1);
}
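// On compilers that do provide it, _mm256_set_m128i(a, b) is the direct
// equivalent of the cast+insert sequence above: a becomes the high 128-bit
// lane and b the low one.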

SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
}

SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
  return _mm256_set_epi64x((int64_t)a, (int64_t)b, (int64_t)c, (int64_t)d);
}

SIMD_INLINE v256 v256_load_aligned(const void *p) {
  return _mm256_load_si256((const __m256i *)p);
}

SIMD_INLINE v256 v256_load_unaligned(const void *p) {
  return _mm256_loadu_si256((const __m256i *)p);
}

SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
  _mm256_store_si256((__m256i *)p, a);
}

SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
  _mm256_storeu_si256((__m256i *)p, a);
}

SIMD_INLINE v256 v256_zero(void) { return _mm256_setzero_si256(); }

SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8((char)x); }

SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16((short)x); }

SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32((int)x); }

SIMD_INLINE v256 v256_dup_64(uint64_t x) {
  return _mm256_set1_epi64x((int64_t)x);
}

SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }

SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }

SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return _mm256_adds_epu8(a, b); }

SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return _mm256_adds_epi8(a, b); }

SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
  return _mm256_adds_epi16(a, b);
}

SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); }

SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return _mm256_add_epi64(a, b); }

SIMD_INLINE v256 v256_padd_u8(v256 a) {
  return _mm256_maddubs_epi16(a, _mm256_set1_epi8(1));
}

SIMD_INLINE v256 v256_padd_s16(v256 a) {
  return _mm256_madd_epi16(a, _mm256_set1_epi16(1));
}

SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); }

SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); }

SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); }

SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); }

SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
  return _mm256_subs_epi16(a, b);
}

SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
  return _mm256_subs_epu16(a, b);
}

SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }

SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return _mm256_sub_epi64(a, b); }

SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }

SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); }

// AVX2 doesn't have direct intrinsics to zip/unzip the 8-, 16- or 32-bit
// lanes of the lower or upper halves of a 256-bit vector, because the
// unpack/pack intrinsics treat the 256-bit input as two independent
// 128-bit vectors.
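// The workaround below pre-permutes each operand's 64-bit quadwords with
// _MM_SHUFFLE(3, 1, 2, 0), i.e. (q3, q2, q1, q0) -> (q3, q1, q2, q0), so
// that the low 8 bytes of the two 128-bit lanes are q0 and q1 (the
// operand's low half) and the high 8 bytes are q2 and q3 (its high half);
// an in-lane unpacklo/unpackhi then produces a true 256-bit zip of the
// low/high halves.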
SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
  return _mm256_unpacklo_epi8(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
  return _mm256_unpackhi_epi8(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
  return _mm256_unpacklo_epi16(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
  return _mm256_unpackhi_epi16(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
  return _mm256_unpacklo_epi32(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
  return _mm256_unpackhi_epi32(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
  return _mm256_unpacklo_epi64(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
  return _mm256_unpackhi_epi64(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
  return _mm256_permute2x128_si256(a, b, 0x02);
}

SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
  return _mm256_permute2x128_si256(a, b, 0x13);
}
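// _mm256_permute2x128_si256 selects each output lane from the four input
// lanes (0 = a.lo, 1 = a.hi, 2 = b.lo, 3 = b.hi), second selector in the
// high nibble: control 0x02 yields (a.lo : b.lo) and 0x13 yields
// (a.hi : b.hi).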

SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
}

SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
}

SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
}

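// The unzips (deinterleaves) below work by arithmetically shifting each
// wider lane right to sign-extend its high element, then saturating-packing;
// since every shifted value is already in range the saturation is exact,
// and the final 64-bit permute undoes the pack's per-lane ordering. The
// "lo" variants first shift the even elements into the odd positions.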
SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_packs_epi16(_mm256_srai_epi16(b, 8), _mm256_srai_epi16(a, 8)),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
  return v256_unziphi_8(_mm256_slli_si256(a, 1), _mm256_slli_si256(b, 1));
}

SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_packs_epi32(_mm256_srai_epi32(b, 16), _mm256_srai_epi32(a, 16)),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
  return v256_unziphi_16(_mm256_slli_si256(a, 2), _mm256_slli_si256(b, 2));
}

SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
                                            _mm256_castsi256_ps(a),
                                            _MM_SHUFFLE(3, 1, 3, 1))),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
                                            _mm256_castsi256_ps(a),
                                            _MM_SHUFFLE(2, 0, 2, 0))),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b),
                                            _mm256_castsi256_pd(a), 15)),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_castpd_si256(
          _mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 0)),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return _mm256_cvtepu8_epi16(a); }

SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
  return _mm256_unpacklo_epi8(
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_setzero_si256());
}

SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
  return _mm256_unpackhi_epi8(
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_setzero_si256());
}

SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
}

SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
  return _mm256_srai_epi16(
      _mm256_unpacklo_epi8(
          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
      8);
}

SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
  return _mm256_srai_epi16(
      _mm256_unpackhi_epi8(
          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
      8);
}

SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(_mm256_packs_epi32(b, a),
                                  _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(_mm256_packus_epi32(b, a),
                                  _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a),
                                  _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(_mm256_packs_epi16(b, a),
                                  _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
  return _mm256_cvtepu16_epi32(a);
}

SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
  return _mm256_cvtepi16_epi32(a);
}

SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
  return _mm256_unpacklo_epi16(
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_setzero_si256());
}

SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
  return _mm256_srai_epi32(
      _mm256_unpacklo_epi16(
          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
      16);
}

SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
  return _mm256_unpackhi_epi16(
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_setzero_si256());
}

SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
  return _mm256_srai_epi32(
      _mm256_unpackhi_epi16(
          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
      16);
}

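// Emulates a full 32-byte shuffle: look the pattern up in both a broadcast
// of a's high lane and a broadcast of a's low lane (_mm256_shuffle_epi8 only
// uses the low four index bits within each lane), then select per byte on
// pattern < 16.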
SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
  return _mm256_blendv_epi8(
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1)), pattern),
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0)), pattern),
      _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
}

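// The same trick extended to a 64-byte lookup table: pattern values 0..31
// select bytes from b and 32..63 select bytes from a, using one shuffle per
// 16-byte quarter of the table and three blends to combine the results.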
SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
  v256 c32 = v256_dup_8(32);
  v256 p32 = v256_sub_8(pattern, c32);
  v256 r1 = _mm256_blendv_epi8(
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1)), p32),
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0)), p32),
      _mm256_cmpgt_epi8(v256_dup_8(48), pattern));
  v256 r2 = _mm256_blendv_epi8(
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3)), pattern),
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2)), pattern),
      _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
  return _mm256_blendv_epi8(r1, r2, _mm256_cmpgt_epi8(c32, pattern));
}

SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
  return _mm256_shuffle_epi8(a, pattern);
}

SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
  v256 t1 = _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b));
  v256 t2 = _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b));
  t1 = _mm256_add_epi32(t1, t2);
  v128 t = _mm_add_epi32(_mm256_extracti128_si256(t1, 0),
                         _mm256_extracti128_si256(t1, 1));
  t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
  t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
  return (int32_t)v128_low_u32(t);
}

SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
  v256 r = _mm256_madd_epi16(a, b);
#if defined(__x86_64__)
  v128 t;
  r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
                       _mm256_cvtepi32_epi64(v256_low_v128(r)));
  t = v256_low_v128(_mm256_add_epi64(
      r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
  return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
#else
  v128 l = v256_low_v128(r);
  v128 h = v256_high_v128(r);
  return (int64_t)_mm_cvtsi128_si32(l) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
         (int64_t)_mm_cvtsi128_si32(h) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
#endif
}

SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
  v256 r = _mm256_mullo_epi32(a, b);
#if defined(__x86_64__)
  v128 t;
  r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
                       _mm256_cvtepi32_epi64(v256_low_v128(r)));
  t = v256_low_v128(_mm256_add_epi64(
      r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
  return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
#else
  v128 l = v256_low_v128(r);
  v128 h = v256_high_v128(r);
  return (int64_t)_mm_cvtsi128_si32(l) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
         (int64_t)_mm_cvtsi128_si32(h) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
#endif
}

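// _mm256_sad_epu8 against zero sums each group of eight bytes into one of
// four 64-bit counters, so a horizontal byte sum needs only two more
// (32-bit-safe) additions.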
SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
  v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
  v128 lo = v256_low_v128(t);
  v128 hi = v256_high_v128(t);
  lo = v128_add_32(lo, hi);
  return v64_low_u32(v128_low_v64(lo)) + v128_low_u32(v128_high_v64(lo));
}

typedef v256 sad256_internal;

SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
  return _mm256_setzero_si256();
}

/* Implementation dependent return value.  Result must be finalised with
   v256_sad_u8_sum().
   The result for more than 32 v256_sad_u8() calls is undefined. */
SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
  return _mm256_add_epi64(s, _mm256_sad_epu8(a, b));
}

SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
  v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
  return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
}
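
/* Typical accumulator usage (a sketch only; `src`, `ref`, `stride` and
   `rows` are hypothetical row pointers and dimensions, not part of this
   header):

     sad256_internal acc = v256_sad_u8_init();
     for (int i = 0; i < rows; i++)
       acc = v256_sad_u8(acc, v256_load_unaligned(src + i * stride),
                         v256_load_unaligned(ref + i * stride));
     uint32_t sad = v256_sad_u8_sum(acc);

   The same init/accumulate/sum protocol applies to the ssd_u8, sad_u16 and
   ssd_s16 accumulators below. */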

typedef v256 ssd256_internal;

SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
  return _mm256_setzero_si256();
}

/* Implementation dependent return value.  Result must be finalised with
   v256_ssd_u8_sum(). */
SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
  v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()),
                            _mm256_unpacklo_epi8(b, _mm256_setzero_si256()));
  v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()),
                            _mm256_unpackhi_epi8(b, _mm256_setzero_si256()));
  v256 rl = _mm256_madd_epi16(l, l);
  v256 rh = _mm256_madd_epi16(h, h);
  v128 c = _mm_cvtsi32_si128(32);
  rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8));
  rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4));
  rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8));
  rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4));
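  // Shifting left then right by 32 zero-extends the 32-bit partial sums in
  // the low half of each 64-bit element before the 64-bit accumulate.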
  return _mm256_add_epi64(
      s,
      _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c));
}

SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
  v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
  return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
}

SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); }

SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); }

SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); }

SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); }

SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) {
  v128 lo_bits = v128_mullo_s16(a, b);
  v128 hi_bits = v128_mulhi_s16(a, b);
  return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
                        v128_ziplo_16(hi_bits, lo_bits));
}

SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
  return _mm256_mullo_epi16(a, b);
}

SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
  return _mm256_mulhi_epi16(a, b);
}

SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
  return _mm256_mullo_epi32(a, b);
}

SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
  return _mm256_madd_epi16(a, b);
}

SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
  return _mm256_maddubs_epi16(a, b);
}

SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); }

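// _mm256_avg_epu8/epu16 round up, computing (a + b + 1) >> 1; subtracting
// (a ^ b) & 1, which is set exactly when a + b is odd, gives the
// round-down average (a + b) >> 1.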
SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
  return _mm256_sub_epi8(
      _mm256_avg_epu8(a, b),
      _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1)));
}

SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
  return _mm256_sub_epi16(
      _mm256_avg_epu16(a, b),
      _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1)));
}

SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); }

SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); }

SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }

SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); }

SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
  return (uint32_t)_mm256_movemask_epi8(a);
}

SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
  return _mm256_blendv_epi8(a, b, c);
}

SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); }

SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }

SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }

SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return _mm256_min_epi32(a, b); }

SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return _mm256_max_epi32(a, b); }

SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
  return _mm256_cmpgt_epi8(a, b);
}

SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
  return _mm256_cmpgt_epi8(b, a);
}

SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
  return _mm256_cmpeq_epi8(a, b);
}

SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
  return _mm256_cmpgt_epi16(a, b);
}

SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
  return _mm256_cmpgt_epi16(b, a);
}

SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
  return _mm256_cmpeq_epi16(a, b);
}

SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
  return _mm256_cmpgt_epi32(a, b);
}

SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
  return _mm256_cmpgt_epi32(b, a);
}

SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
  return _mm256_cmpeq_epi32(a, b);
}

SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
  return _mm256_and_si256(_mm256_set1_epi8((char)(0xff << c)),
                          _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
}

SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
  return _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> c)),
                          _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
}

SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
  __m128i x = _mm_cvtsi32_si128((int)(c + 8));
  return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x),
                            _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x));
}

SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
  return _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
  return _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
  return _mm256_sra_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
  return _mm256_sll_epi32(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
  return _mm256_srl_epi32(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
  return _mm256_sra_epi32(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
  return _mm256_sll_epi64(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
  return _mm256_srl_epi64(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
#if defined(__AVX512VL__)
  return _mm256_sra_epi64(a, _mm_cvtsi32_si128((int)c));
#else
  return v256_from_v128(v128_shr_s64(v256_high_v128(a), c),
                        v128_shr_s64(v256_low_v128(a), c));
#endif
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
// _mm256_slli_si256 works on 128 bit lanes and can't be used
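// here: v256_shl_n_byte(x, 4), for example, must carry bytes from the low
// 128-bit lane into the high one, which a per-lane byte shift cannot do,
// so the shift is assembled from v128_align/v128_shl_n_byte on the halves.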
#define v256_shl_n_byte(a, n)                                                \
  ((n) < 16 ? v256_from_v128(                                                \
                  v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \
                  v128_shl_n_byte(v256_low_v128(a), n))                      \
            : _mm256_inserti128_si256(                                       \
                  _mm256_setzero_si256(),                                    \
                  v128_shl_n_byte(v256_low_v128(a), (n) - 16), 1))

// _mm256_srli_si256 works on 128 bit lanes and can't be used
#define v256_shr_n_byte(a, n)                                                 \
  ((n) < 16                                                                   \
       ? _mm256_alignr_epi8(                                                  \
             _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n)  \
       : ((n) == 16 ? _mm256_permute2x128_si256(_mm256_setzero_si256(), a, 3) \
                    : _mm256_inserti128_si256(                                \
                          _mm256_setzero_si256(),                             \
                          v128_shr_n_byte(v256_high_v128(a), (n) - 16), 0)))

// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
#define v256_align(a, b, c) \
  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)

#define v256_shl_n_8(a, c)                                \
  _mm256_and_si256(_mm256_set1_epi8((char)(0xff << (c))), \
                   _mm256_slli_epi16(a, c))
#define v256_shr_n_u8(a, c)                               \
  _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> (c))), \
                   _mm256_srli_epi16(a, c))
#define v256_shr_n_s8(a, c)                                                  \
  _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \
                     _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8))
#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c)
#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c)
#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c)
#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c)
#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c)
#define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c)
#define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c)
#define v256_shr_n_s64(a, c) \
  v256_shr_s64((a), (c))  // _mm256_srai_epi64 broken in gcc?
#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))

typedef v256 sad256_internal_u16;

SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { return v256_zero(); }

/* Implementation dependent return value.  Result must be finalised with
   v256_sad_u16_sum(). */
SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
                                             v256 b) {
#if defined(__SSE4_1__)
  v256 t = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b));
#else
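  // No unsigned 16-bit min/max here: biasing both operands by 0x8000 makes
  // a signed compare order them as unsigned, and the two blends then form
  // max(a, b) - min(a, b).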
  v256 t = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)),
                          v256_xor(b, v256_dup_16(32768)));
  t = v256_sub_16(v256_or(v256_and(b, t), v256_andn(a, t)),
                  v256_or(v256_and(a, t), v256_andn(b, t)));
#endif
  return v256_add_32(
      s, v256_add_32(v256_unpackhi_u16_s32(t), v256_unpacklo_u16_s32(t)));
}

SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
  v128 t = v128_add_32(v256_high_v128(s), v256_low_v128(s));
  return v128_low_u32(t) + v128_low_u32(v128_shr_n_byte(t, 4)) +
         v128_low_u32(v128_shr_n_byte(t, 8)) +
         v128_low_u32(v128_shr_n_byte(t, 12));
}

typedef v256 ssd256_internal_s16;

SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) { return v256_zero(); }

/* Implementation dependent return value.  Result must be finalised with
   v256_ssd_s16_sum(). */
SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
                                             v256 b) {
  v256 d = v256_sub_16(a, b);
  d = v256_madd_s16(d, d);
  return v256_add_64(s, v256_add_64(_mm256_unpackhi_epi32(d, v256_zero()),
                                    _mm256_unpacklo_epi32(d, v256_zero())));
}

SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
  v128 t = v128_add_64(v256_high_v128(s), v256_low_v128(s));
  return v64_u64(v128_low_v64(t)) + v64_u64(v128_high_v64(t));
}

#endif

#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_