tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

aom_subpixel_8t_intrin_avx2.c (60817B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <immintrin.h>
     13 
     14 #include "config/aom_dsp_rtcd.h"
     15 
     16 #include "aom_dsp/x86/convolve.h"
     17 #include "aom_dsp/x86/convolve_avx2.h"
     18 #include "aom_dsp/x86/synonyms_avx2.h"
     19 #include "aom_ports/mem.h"
     20 
     21 #if defined(__clang__)
     22 #if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
     23    (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
     24    (defined(__APPLE__) && defined(__apple_build_version__) && \
     25     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
     26      (__clang_major__ == 5 && __clang_minor__ == 0)))
     27 #define MM256_BROADCASTSI128_SI256(x) \
     28  _mm_broadcastsi128_si256((__m128i const *)&(x))
     29 #else  // clang > 3.3, and not 5.0 on macosx.
     30 #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
     31 #endif  // clang <= 3.3
     32 #elif defined(__GNUC__)
     33 #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
     34 #define MM256_BROADCASTSI128_SI256(x) \
     35  _mm_broadcastsi128_si256((__m128i const *)&(x))
     36 #elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
     37 #define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
     38 #else  // gcc > 4.7
     39 #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
     40 #endif  // gcc <= 4.6
     41 #else   // !(gcc || clang)
     42 #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
     43 #endif  // __clang__
     44 
     45 static inline void xx_storeu2_epi32(const uint8_t *output_ptr,
     46                                    const ptrdiff_t stride, const __m256i *a) {
     47  *((int *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
     48  *((int *)(output_ptr + stride)) =
     49      _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
     50 }
     51 
     52 static inline __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
     53  __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
     54  a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
     55  return a;
     56 }
     57 
     58 static inline void xx_storeu2_epi64(const uint8_t *output_ptr,
     59                                    const ptrdiff_t stride, const __m256i *a) {
     60  _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
     61  _mm_storel_epi64((__m128i *)(output_ptr + stride),
     62                   _mm256_extractf128_si256(*a, 1));
     63 }
     64 
     65 static inline void xx_store2_mi128(const uint8_t *output_ptr,
     66                                   const ptrdiff_t stride, const __m256i *a) {
     67  _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
     68  _mm_store_si128((__m128i *)(output_ptr + stride),
     69                  _mm256_extractf128_si256(*a, 1));
     70 }
     71 
// Horizontal 4-tap convolution of a 4-pixel-wide block (AVX2).
// Two rows are produced per loop iteration, one per 128-bit lane; a
// trailing odd row is handled with 128-bit code after the loop.
// filter points to 8 16-bit taps; the 0x05040302 shuffle replicates filter
// bytes 2..5, i.e. a 4-tap kernel stored in the middle of the 8-tap array.
// Taps are pre-shifted right by 1 so they pack into signed bytes for
// maddubs; the final shift is 6 rather than 7 to compensate.
static void aom_filter_block1d4_h4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;  // step back so reads cover the full tap window
  addFilterReg32 = _mm256_set1_epi16(32);  // rounding bias for the >> 6 below
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve the taps so they fit in signed 8 bits after packing
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // replicate filter bytes 2..5 (the 4 active taps) across the register
  firstFilters =
      _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
  filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source (row n in lane 0, row n+1 in lane 1)
    srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    // horizontal saturating add folds the pairwise sums into the final
    // 4-tap dot products
    srcRegFilt32b1_1 =
        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    // round, then shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt32b1_1 =
        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    src_ptr += src_stride;

    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of strides is odd.
  // process only 4 bytes
  if (i > 0) {
    __m128i srcReg1, srcRegFilt1_1;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));

    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
    // round, then shift by 6 bit each 16 bit
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());

    // save 4 bytes
    *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
  }
}
    152 
// Horizontal 8-tap convolution of a 4-pixel-wide block (AVX2).
// Two rows are produced per loop iteration, one per 128-bit lane; a
// trailing odd row is handled with 128-bit code after the loop.
// The 8 taps are split into two 4-tap groups (bytes 0..3 and 4..7 of the
// packed filter); each group is applied to its own shuffled view of the
// source and the partial sums are combined with a saturating add.
// Taps are pre-shifted right by 1 to fit signed bytes; the final shift is
// 6 rather than 7 to compensate.
static void aom_filter_block1d4_h8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt1Reg, filt2Reg;
  __m256i firstFilters, secondFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2;
  __m256i srcReg32b1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;  // step back so reads cover the full 8-tap window
  addFilterReg32 = _mm256_set1_epi16(32);  // rounding bias for the >> 6 below
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve the taps so they fit in signed 8 bits after packing
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 32 bits (taps 0..3 as bytes)
  firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
  // duplicate only the second 32 bits (taps 4..7 as bytes)
  secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);

  filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source (row n in lane 0, row n+1 in lane 1)
    srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    // filter the source buffer
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);

    // combine the two 4-tap partial sums (saturating)
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // horizontal saturating add folds the pairwise sums into the final
    // 8-tap dot products
    srcRegFilt32b1_1 =
        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    // round, then shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt32b1_1 =
        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    src_ptr += src_stride;

    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of strides is odd.
  // process only 4 bytes
  if (i > 0) {
    __m128i srcReg1, srcRegFilt1_1;
    __m128i srcRegFilt2;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));

    // filter the source buffer
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));

    // combine the two 4-tap partial sums, then finish the dot products
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
    // round, then shift by 6 bit each 16 bit
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());

    // save 4 bytes
    *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
  }
}
    257 
// Horizontal 4-tap convolution of an 8-pixel-wide block (AVX2).
// Two rows are produced per loop iteration, one per 128-bit lane; a
// trailing odd row is handled with 128-bit code after the loop.
// Only the middle taps of the 8-tap filter array are used: bytes 2..3
// (secondFilters) and 4..5 (thirdFilters) of the packed filter — a 4-tap
// kernel stored in an 8-tap slot, so filt1/filt4 shuffles are skipped.
// Taps are pre-shifted right by 1 to fit signed bytes; the final shift is
// 6 rather than 7 to compensate.
static void aom_filter_block1d8_h4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt2Reg, filt3Reg;
  __m256i secondFilters, thirdFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;  // step back so reads cover the full tap window
  addFilterReg32 = _mm256_set1_epi16(32);  // rounding bias for the >> 6 below
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve the taps so they fit in signed 8 bits after packing
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));

  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source (row n in lane 0, row n+1 in lane 1)
    srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // combine the two 2-tap partial sums (saturating)
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // round, then shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink to 8 bit each 16 bits
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);

    src_ptr += src_stride;

    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of strides is odd.
  // process only 8 bytes
  if (i > 0) {
    __m128i srcReg1, srcRegFilt1_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);

    // round, then shift by 6 bit each 16 bit
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    // shrink to 8 bit each 16 bits
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());

    // save 8 bytes
    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
  }
}
    351 
// Horizontal 8-tap convolution of an 8-pixel-wide block (AVX2).
// Two rows are produced per loop iteration, one per 128-bit lane; a
// trailing odd row is handled with 128-bit code after the loop.
// The 8 taps are split into four 2-tap pairs (first/second/third/forth),
// each applied via pshufb + pmaddubsw, and the four partial sums are
// combined with saturating adds. Taps are pre-shifted right by 1 to fit
// signed bytes; the final shift is 6 rather than 7 to compensate.
static void aom_filter_block1d8_h8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;  // step back so reads cover the full 8-tap window
  addFilterReg32 = _mm256_set1_epi16(32);  // rounding bias for the >> 6 below
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve the taps so they fit in signed 8 bits after packing
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source (row n in lane 0, row n+1 in lane 1)
    srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer (outer tap pairs 0-1 and 6-7)
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer (inner tap pairs 2-3 and 4-5)
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // sum inner pairs first, then fold into the outer-pair total
    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);

    // round, then shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt32b1_1 =
        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    src_ptr += src_stride;

    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of strides is odd.
  // process only 8 bytes
  if (i > 0) {
    __m128i srcReg1, srcRegFilt1_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer (outer tap pairs)
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer (inner tap pairs)
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));

    // round, then shift by 6 bit each 16 bit
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());

    // save 8 bytes
    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
  }
}
    483 
// Horizontal 4-tap convolution of a 16-pixel-wide block (AVX2).
// Two rows are produced per loop iteration (one per 128-bit lane), each
// row processed as two 8-pixel halves; a trailing odd row is handled with
// a single 256-bit pass after the loop. Only the middle taps of the 8-tap
// filter array are used: bytes 2..3 (secondFilters) and 4..5
// (thirdFilters) of the packed filter. Taps are pre-shifted right by 1 to
// fit signed bytes; the final shift is 6 rather than 7 to compensate.
static void aom_filter_block1d16_h4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt2Reg, filt3Reg;
  __m256i secondFilters, thirdFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;  // step back so reads cover the full tap window
  addFilterReg32 = _mm256_set1_epi16(32);  // rounding bias for the >> 6 below
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve the taps so they fit in signed 8 bits after packing
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));

  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source (row n in lane 0, row n+1 in lane 1)
    srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // first 8 output pixels of each row
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // reading 2 strides of the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg32b2 = yy_loadu2_128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together (second 8 output pixels)
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // round, then shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);

    src_ptr += src_stride;

    xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of strides is odd.
  // process only 16 bytes
  if (i > 0) {
    __m256i srcReg1, srcReg12;
    __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;

    srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
    // 0x94 duplicates the middle 64-bit chunks so each 128-bit lane holds
    // the source bytes needed for its 8 output pixels
    srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);

    // filter the source buffer
    srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
    srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
    srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);

    // add and saturate the results together
    srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);

    // round, then shift by 6 bit each 16 bit
    srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
    srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
    // gather the low 64 bits of each lane into the low 128 bits
    srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt1_1));
  }
}
    597 
    598 static void aom_filter_block1d16_h8_avx2(
    599    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    600    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
    601  __m128i filtersReg;
    602  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
    603  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
    604  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
    605  __m256i srcReg32b1, srcReg32b2, filtersReg32;
    606  unsigned int i;
    607  ptrdiff_t src_stride, dst_stride;
    608  src_ptr -= 3;
    609  addFilterReg32 = _mm256_set1_epi16(32);
    610  filtersReg = _mm_loadu_si128((const __m128i *)filter);
    611  filtersReg = _mm_srai_epi16(filtersReg, 1);
    612  // converting the 16 bit (short) to 8 bit (byte) and have the same data
    613  // in both lanes of 128 bit register.
    614  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
    615  // have the same data in both lanes of a 256 bit register
    616  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
    617 
    618  // duplicate only the first 16 bits (first and second byte)
    619  // across 256 bit register
    620  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
    621  // duplicate only the second 16 bits (third and forth byte)
    622  // across 256 bit register
    623  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
    624  // duplicate only the third 16 bits (fifth and sixth byte)
    625  // across 256 bit register
    626  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
    627  // duplicate only the forth 16 bits (seventh and eighth byte)
    628  // across 256 bit register
    629  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
    630 
    631  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
    632  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
    633  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    634  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
    635 
    636  // multiple the size of the source and destination stride by two
    637  src_stride = src_pixels_per_line << 1;
    638  dst_stride = output_pitch << 1;
    639  for (i = output_height; i > 1; i -= 2) {
    640    // load the 2 strides of source
    641    srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr);
    642 
    643    // filter the source buffer
    644    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    645    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
    646 
    647    // multiply 2 adjacent elements with the filter and add the result
    648    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    649    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
    650 
    651    // add and saturate the results together
    652    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
    653 
    654    // filter the source buffer
    655    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    656    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
    657 
    658    // multiply 2 adjacent elements with the filter and add the result
    659    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    660    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
    661 
    662    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
    663    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
    664 
    665    // reading 2 strides of the next 16 bytes
    666    // (part of it was being read by earlier read)
    667    srcReg32b2 = yy_loadu2_128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
    668 
    669    // filter the source buffer
    670    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    671    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
    672 
    673    // multiply 2 adjacent elements with the filter and add the result
    674    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    675    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
    676 
    677    // add and saturate the results together
    678    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
    679 
    680    // filter the source buffer
    681    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    682    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
    683 
    684    // multiply 2 adjacent elements with the filter and add the result
    685    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    686    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
    687 
    688    // add and saturate the results together
    689    srcRegFilt32b2_1 = _mm256_adds_epi16(
    690        srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2));
    691 
    692    // shift by 6 bit each 16 bit
    693    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    694    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
    695    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
    696    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
    697 
    698    // shrink to 8 bit each 16 bits, the first lane contain the first
    699    // convolve result and the second lane contain the second convolve result
    700    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
    701 
    702    src_ptr += src_stride;
    703 
    704    xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
    705    output_ptr += dst_stride;
    706  }
    707 
    708  // if the number of strides is odd.
    709  // process only 16 bytes
    710  if (i > 0) {
    711    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    712    __m128i srcRegFilt2, srcRegFilt3;
    713 
    714    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
    715 
    716    // filter the source buffer
    717    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
    718    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
    719 
    720    // multiply 2 adjacent elements with the filter and add the result
    721    srcRegFilt1_1 =
    722        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
    723    srcRegFilt2 =
    724        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
    725 
    726    // add and saturate the results together
    727    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
    728 
    729    // filter the source buffer
    730    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    731    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
    732 
    733    // multiply 2 adjacent elements with the filter and add the result
    734    srcRegFilt3 =
    735        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    736    srcRegFilt2 =
    737        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
    738 
    739    // add and saturate the results together
    740    srcRegFilt1_1 =
    741        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
    742 
    743    // reading the next 16 bytes
    744    // (part of it was being read by earlier read)
    745    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
    746 
    747    // filter the source buffer
    748    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
    749    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
    750 
    751    // multiply 2 adjacent elements with the filter and add the result
    752    srcRegFilt2_1 =
    753        _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
    754    srcRegFilt2 =
    755        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
    756 
    757    // add and saturate the results together
    758    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
    759 
    760    // filter the source buffer
    761    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
    762    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
    763 
    764    // multiply 2 adjacent elements with the filter and add the result
    765    srcRegFilt3 =
    766        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    767    srcRegFilt2 =
    768        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
    769 
    770    // add and saturate the results together
    771    srcRegFilt2_1 =
    772        _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
    773 
    774    // shift by 6 bit each 16 bit
    775    srcRegFilt1_1 =
    776        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    777    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
    778 
    779    srcRegFilt2_1 =
    780        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32));
    781    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6);
    782 
    783    // shrink to 8 bit each 16 bits, the first lane contain the first
    784    // convolve result and the second lane contain the second convolve
    785    // result
    786    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
    787 
    788    // save 16 bytes
    789    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
    790  }
    791 }
    792 
    793 static void aom_filter_block1d8_v4_avx2(
    794    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    795    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
    796  __m128i filtersReg;
    797  __m256i filtersReg32, addFilterReg32;
    798  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
    799  __m256i srcReg23_34_lo, srcReg45_56_lo;
    800  __m256i resReg23_34_lo, resReg45_56_lo;
    801  __m256i resReglo, resReg;
    802  __m256i secondFilters, thirdFilters;
    803  unsigned int i;
    804  ptrdiff_t src_stride, dst_stride;
    805 
    806  addFilterReg32 = _mm256_set1_epi16(32);
    807  filtersReg = _mm_loadu_si128((const __m128i *)filter);
    808  // converting the 16 bit (short) to  8 bit (byte) and have the
    809  // same data in both lanes of 128 bit register.
    810  filtersReg = _mm_srai_epi16(filtersReg, 1);
    811  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
    812  // have the same data in both lanes of a 256 bit register
    813  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
    814 
    815  // duplicate only the second 16 bits (third and forth byte)
    816  // across 256 bit register
    817  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
    818  // duplicate only the third 16 bits (fifth and sixth byte)
    819  // across 256 bit register
    820  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
    821 
    822  // multiple the size of the source and destination stride by two
    823  src_stride = src_pitch << 1;
    824  dst_stride = out_pitch << 1;
    825 
    826  srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
    827  srcReg4x = _mm256_castsi128_si256(
    828      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
    829 
    830  // have consecutive loads on the same 256 register
    831  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
    832 
    833  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
    834 
    835  for (i = output_height; i > 1; i -= 2) {
    836    // load the last 2 loads of 16 bytes and have every two
    837    // consecutive loads in the same 256 bit register
    838    srcReg5x = _mm256_castsi128_si256(
    839        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
    840    srcReg45 =
    841        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
    842 
    843    srcReg6x = _mm256_castsi128_si256(
    844        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
    845    srcReg56 =
    846        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
    847 
    848    // merge every two consecutive registers
    849    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
    850 
    851    // multiply 2 adjacent elements with the filter and add the result
    852    resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
    853    resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
    854 
    855    // add and saturate the results together
    856    resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
    857 
    858    // shift by 6 bit each 16 bit
    859    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
    860    resReglo = _mm256_srai_epi16(resReglo, 6);
    861 
    862    // shrink to 8 bit each 16 bits, the first lane contain the first
    863    // convolve result and the second lane contain the second convolve
    864    // result
    865    resReg = _mm256_packus_epi16(resReglo, resReglo);
    866 
    867    src_ptr += src_stride;
    868 
    869    xx_storeu2_epi64(output_ptr, out_pitch, &resReg);
    870 
    871    output_ptr += dst_stride;
    872 
    873    // save part of the registers for next strides
    874    srcReg23_34_lo = srcReg45_56_lo;
    875    srcReg4x = srcReg6x;
    876  }
    877 }
    878 
// Vertical 8-tap convolution of an 8-pixel-wide column. The main loop emits
// two output rows per iteration by keeping a pipeline of byte-interleaved
// row pairs in 256-bit registers and rotating them at the bottom of the
// loop; an odd trailing row is handled by the scalar 128-bit tail, which
// reuses the registers the loop left behind.
static void aom_filter_block1d8_v8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // Rounding constant: +32 before the >>6 at the end of each row pair.
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to  8 bit (byte) and have the
  // same data in both lanes of 128 bit register.
  // (Taps are halved first so they survive the 16->8 bit pack.)
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  // multiple the size of the source and destination stride by two
  // (the loop consumes/produces two rows per iteration)
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes 7 times in stride of src_pitch
  // (rows 0..6; only the low 8 bytes of each row are used)
  srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
  srcReg32b3 =
      xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
  srcReg32b5 =
      xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
  srcReg32b7 = _mm256_castsi128_si256(
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));

  // have each consecutive loads on the same 256 register
  // (builds the shifted-by-one-row views: rows 1/2, 3/4, 5/6)
  srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
  srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
  srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);

  for (i = output_height; i > 1; i -= 2) {
    // load the last 2 loads of 16 bytes and have every two
    // consecutive loads in the same 256 bit register
    srcReg32b8 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                                         _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                                         _mm256_castsi256_si128(srcReg32b9), 1);

    // merge every two consecutive registers
    // save
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));

    // shift by 6 bit each 16 bit
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());

    src_ptr += src_stride;

    xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);

    output_ptr += dst_stride;

    // save part of the registers for next strides
    // (rotate the pipeline: each interleaved pair moves up two rows)
    srcReg32b10 = srcReg32b11;
    srcReg32b11 = srcReg32b2;
    srcReg32b2 = srcReg32b4;
    srcReg32b7 = srcReg32b9;
  }
  // Odd output_height: one remaining row, computed with 128-bit ops on the
  // low lanes of the registers the loop carried forward.
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
    // load the last 16 bytes
    srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 results together
    srcRegFilt4 =
        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 =
        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                                    _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));

    // shift by 6 bit each 16 bit
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());

    // save 8 bytes
    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
  }
}
   1030 
   1031 static void aom_filter_block1d16_v4_avx2(
   1032    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
   1033    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
   1034  __m128i filtersReg;
   1035  __m256i filtersReg32, addFilterReg32;
   1036  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
   1037  __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
   1038  __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
   1039  __m256i resReglo, resReghi, resReg;
   1040  __m256i secondFilters, thirdFilters;
   1041  unsigned int i;
   1042  ptrdiff_t src_stride, dst_stride;
   1043 
   1044  addFilterReg32 = _mm256_set1_epi16(32);
   1045  filtersReg = _mm_loadu_si128((const __m128i *)filter);
   1046  // converting the 16 bit (short) to  8 bit (byte) and have the
   1047  // same data in both lanes of 128 bit register.
   1048  filtersReg = _mm_srai_epi16(filtersReg, 1);
   1049  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
   1050  // have the same data in both lanes of a 256 bit register
   1051  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
   1052 
   1053  // duplicate only the second 16 bits (third and forth byte)
   1054  // across 256 bit register
   1055  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
   1056  // duplicate only the third 16 bits (fifth and sixth byte)
   1057  // across 256 bit register
   1058  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
   1059 
   1060  // multiple the size of the source and destination stride by two
   1061  src_stride = src_pitch << 1;
   1062  dst_stride = out_pitch << 1;
   1063 
   1064  srcReg23 = yy_loadu2_128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
   1065  srcReg4x = _mm256_castsi128_si256(
   1066      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
   1067 
   1068  // have consecutive loads on the same 256 register
   1069  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
   1070 
   1071  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
   1072  srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
   1073 
   1074  for (i = output_height; i > 1; i -= 2) {
   1075    // load the last 2 loads of 16 bytes and have every two
   1076    // consecutive loads in the same 256 bit register
   1077    srcReg5x = _mm256_castsi128_si256(
   1078        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
   1079    srcReg45 =
   1080        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
   1081 
   1082    srcReg6x = _mm256_castsi128_si256(
   1083        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
   1084    srcReg56 =
   1085        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
   1086 
   1087    // merge every two consecutive registers
   1088    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
   1089    srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
   1090 
   1091    // multiply 2 adjacent elements with the filter and add the result
   1092    resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
   1093    resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
   1094 
   1095    // add and saturate the results together
   1096    resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
   1097 
   1098    // multiply 2 adjacent elements with the filter and add the result
   1099    resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
   1100    resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
   1101 
   1102    // add and saturate the results together
   1103    resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
   1104 
   1105    // shift by 6 bit each 16 bit
   1106    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
   1107    resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
   1108    resReglo = _mm256_srai_epi16(resReglo, 6);
   1109    resReghi = _mm256_srai_epi16(resReghi, 6);
   1110 
   1111    // shrink to 8 bit each 16 bits, the first lane contain the first
   1112    // convolve result and the second lane contain the second convolve
   1113    // result
   1114    resReg = _mm256_packus_epi16(resReglo, resReghi);
   1115 
   1116    src_ptr += src_stride;
   1117 
   1118    xx_store2_mi128(output_ptr, out_pitch, &resReg);
   1119 
   1120    output_ptr += dst_stride;
   1121 
   1122    // save part of the registers for next strides
   1123    srcReg23_34_lo = srcReg45_56_lo;
   1124    srcReg23_34_hi = srcReg45_56_hi;
   1125    srcReg4x = srcReg6x;
   1126  }
   1127 }
   1128 
// Vertical 8-tap convolution of a 16-pixel-wide column. Two output rows are
// produced per loop iteration from a pipeline of byte-interleaved row pairs
// (low and high halves kept separately) that is rotated at the bottom of the
// loop; an odd trailing row is handled by the 128-bit tail, which reuses the
// registers the loop left behind.
static void aom_filter_block1d16_v8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // Rounding constant: +32 before the >>6 at the end of each row pair.
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to  8 bit (byte) and have the
  // same data in both lanes of 128 bit register.
  // (Taps are halved first so they survive the 16->8 bit pack.)
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  // multiple the size of the source and destination stride by two
  // (the loop consumes/produces two rows per iteration)
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes 7 times in stride of src_pitch
  // (rows 0..6 of the 8-tap support)
  srcReg32b1 = yy_loadu2_128(src_ptr + src_pitch, src_ptr);
  srcReg32b3 = yy_loadu2_128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
  srcReg32b5 = yy_loadu2_128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
  srcReg32b7 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));

  // have each consecutive loads on the same 256 register
  // (builds the shifted-by-one-row views: rows 1/2, 3/4, 5/6)
  srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
  srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
  srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);

  // save
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);

  for (i = output_height; i > 1; i -= 2) {
    // load the last 2 loads of 16 bytes and have every two
    // consecutive loads in the same 256 bit register
    srcReg32b8 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                                         _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                                         _mm256_castsi256_si128(srcReg32b9), 1);

    // merge every two consecutive registers
    // save
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

    // Low 8 bytes of each row:
    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));

    // High 8 bytes of each row:
    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

    // add and saturate the results together
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_adds_epi16(srcReg32b8, srcReg32b12));

    // shift by 6 bit each 16 bit
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

    src_ptr += src_stride;

    xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);

    output_ptr += dst_stride;

    // save part of the registers for next strides
    // (rotate the low/high interleaved pipelines up by two rows)
    srcReg32b10 = srcReg32b11;
    srcReg32b1 = srcReg32b3;
    srcReg32b11 = srcReg32b2;
    srcReg32b3 = srcReg32b5;
    srcReg32b2 = srcReg32b4;
    srcReg32b5 = srcReg32b7;
    srcReg32b7 = srcReg32b9;
  }
  // Odd output_height: one remaining row, computed with 128-bit ops on the
  // low lanes of the registers the loop carried forward.
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load the last 16 bytes
    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 results together
    srcRegFilt4 =
        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 =
        _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 =
        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 =
        _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                                    _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                                    _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                                    _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));

    // shift by 6 bit each 16 bit
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
  }
}
   1321 
// Vertical 4-tap convolution for a 4-pixel-wide column, producing two output
// rows per loop iteration with AVX2.
//
// src_ptr        points src_pitch*2 rows above the first output row; rows
//                2..6 (relative to src_ptr) are read as the filter support.
// src_pitch      source stride in bytes.
// output_ptr     destination; 4 bytes written per row.
// out_pitch      destination stride in bytes.
// output_height  number of rows to produce. The loop runs while i > 1 and
//                steps by 2, so only full pairs of rows are written; callers
//                presumably pass an even height — TODO confirm.
// filter         8 16-bit taps; only the middle four (taps 2..5) are used,
//                selected by the 0x5040302 shuffle mask below.
static void aom_filter_block1d4_v4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i filtersReg32, addFilterReg32;
  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
  __m256i srcReg23_34_lo, srcReg45_56_lo;
  __m256i srcReg2345_3456_lo;
  __m256i resReglo, resReg;
  __m256i firstFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // Rounding offset for the final >> 6: 32 == 1 << (6 - 1).
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to  8 bit (byte) and have the
  // same data in both lanes of 128 bit register.
  // Taps are halved first so the unsigned*signed byte products of
  // maddubs stay within 16 bits; the halving is compensated by shifting
  // the accumulated result by 6 instead of 7.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // Byte-shuffle mask 0x05040302 replicates filter bytes 2,3,4,5 (the
  // middle four taps) across every 32-bit lane.
  firstFilters =
      _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));

  // multiple the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // Prime the pipeline: rows 2 and 3 in the two 128-bit lanes of srcReg23,
  // row 4 in the low lane of srcReg4x.
  srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
  srcReg4x = _mm256_castsi128_si256(
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));

  // have consecutive loads on the same 256 register
  // srcReg34 = (row3 | row4): rows shifted down by one relative to srcReg23.
  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);

  // Interleave vertically adjacent rows byte-wise: lane0 = rows 2&3,
  // lane1 = rows 3&4.
  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);

  for (i = output_height; i > 1; i -= 2) {
    // load the last 2 loads of 16 bytes and have every two
    // consecutive loads in the same 256 bit register
    srcReg5x = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
    srcReg45 =
        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);

    srcReg6x = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
    srcReg56 =
        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);

    // merge every two consecutive registers
    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);

    // Interleave 16-bit pairs so each 32-bit lane holds the four vertical
    // taps for one pixel: lane0 covers rows 2-5, lane1 covers rows 3-6.
    srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);

    // multiply 2 adjacent elements with the filter and add the result
    resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);

    // Horizontal add pairs to finish the 4-tap sum per pixel.
    resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());

    // shift by 6 bit each 16 bit
    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
    resReglo = _mm256_srai_epi16(resReglo, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    resReg = _mm256_packus_epi16(resReglo, resReglo);

    src_ptr += src_stride;

    // Store 4 bytes to output_ptr (lane 0) and 4 bytes to
    // output_ptr + out_pitch (lane 1).
    xx_storeu2_epi32(output_ptr, out_pitch, &resReg);

    output_ptr += dst_stride;

    // save part of the registers for next strides
    srcReg23_34_lo = srcReg45_56_lo;
    srcReg4x = srcReg6x;
  }
}
   1403 
#if HAVE_AVX2 && HAVE_SSSE3
// The 2-tap variants (and the 4-wide 8-tap vertical) have no dedicated AVX2
// implementation; alias them to the SSSE3 versions declared below so the
// FUN_CONV_1D dispatch tables are fully populated.
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
filter8_1dfunction aom_filter_block1d8_v2_ssse3;
filter8_1dfunction aom_filter_block1d8_h2_ssse3;
filter8_1dfunction aom_filter_block1d4_v2_ssse3;
filter8_1dfunction aom_filter_block1d4_h2_ssse3;
#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
// FUN_CONV_1D expands to the public entry points with these signatures:
// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const int16_t *filter_x, int x_step_q4,
//                                const int16_t *filter_y, int y_step_q4,
//                                int w, int h);
// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// The vertical path starts 3 rows above dst's first row to center the
// 8-tap support.
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)

#endif  // HAVE_AVX2 && HAVE_SSSE3