tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_convolve_avx2.c (47756B)


      1 /*
      2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 #include <immintrin.h>
     12 #include <string.h>
     13 
     14 #include "config/av1_rtcd.h"
     15 
     16 #include "aom_dsp/x86/convolve.h"
     17 #include "aom_dsp/x86/convolve_avx2.h"
     18 #include "aom_dsp/x86/synonyms.h"
     19 
     20 // -----------------------------------------------------------------------------
     21 // Copy and average
     22 
// Byte-shuffle control masks for _mm256_shuffle_epi8 in the 4-tap horizontal
// paths below. Each byte pair selects one 16-bit pixel; the 16-byte pattern is
// duplicated so both 128-bit lanes see the same shuffle. ip_shuffle_f2f3
// gathers the sliding pixel pairs multiplied by filter taps 2/3,
// ip_shuffle_f4f5 the pairs multiplied by taps 4/5.
static const uint8_t ip_shuffle_f2f3[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                             7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                             4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
static const uint8_t ip_shuffle_f4f5[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
                                             8, 9, 10, 11, 10, 11, 12, 13,
                                             4, 5, 6,  7,  6,  7,  8,  9,
                                             8, 9, 10, 11, 10, 11, 12, 13 };
     30 
// Forward declarations of the SSSE3 implementations (defined elsewhere).
// The AVX2 functions below fall back to these when the filter has 12 taps.
void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const int subpel_x_qn,
                                    ConvolveParams *conv_params, int bd);
void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_y_qn, int bd);
     40 
// Vertical single-reference convolution for high-bitdepth pixels.
// Processes the frame in 8-wide columns and two output rows per inner
// iteration: each __m256i holds one 8-pixel row in each 128-bit lane, and the
// unpacked row pairs in s[] are rotated down by one position per iteration so
// each new iteration only has to load/unpack the newest two rows.
// 12-tap filters are delegated to the SSSE3 implementation.
void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
                                  uint16_t *dst, int dst_stride, int w, int h,
                                  const InterpFilterParams *filter_params_y,
                                  const int subpel_y_qn, int bd) {
 if (filter_params_y->taps == 12) {
   av1_highbd_convolve_y_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
                                  filter_params_y, subpel_y_qn, bd);
   return;
 }
 int i, j;
 // Back the source pointer up so the filter window is centered on the row.
 const int fo_vert = filter_params_y->taps / 2 - 1;
 const uint16_t *const src_ptr = src - fo_vert * src_stride;

 // s[0..3]: low-half unpacked row pairs; s[4..7]: high-half row pairs.
 __m256i s[8], coeffs_y[4];

 const int bits = FILTER_BITS;

 const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
 const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
 // Clamp results to the valid pixel range for the bit depth (8/10/12).
 const __m256i clip_pixel =
     _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
 const __m256i zero = _mm256_setzero_si256();

 prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);

 for (j = 0; j < w; j += 8) {
   const uint16_t *data = &src_ptr[j];
   /* Vertical filter */
   {
     // Prologue: load the first 7 rows and build the adjacent-row pairs
     // sNM = (row N in lane 0, row M in lane 1), interleaved 16-bit.
     __m256i src6;
     __m256i s01 = _mm256_permute2x128_si256(
         _mm256_castsi128_si256(
             _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
         _mm256_castsi128_si256(
             _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
         0x20);
     __m256i s12 = _mm256_permute2x128_si256(
         _mm256_castsi128_si256(
             _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
         _mm256_castsi128_si256(
             _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
         0x20);
     __m256i s23 = _mm256_permute2x128_si256(
         _mm256_castsi128_si256(
             _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
         _mm256_castsi128_si256(
             _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
         0x20);
     __m256i s34 = _mm256_permute2x128_si256(
         _mm256_castsi128_si256(
             _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
         _mm256_castsi128_si256(
             _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
         0x20);
     __m256i s45 = _mm256_permute2x128_si256(
         _mm256_castsi128_si256(
             _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
         _mm256_castsi128_si256(
             _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
         0x20);
     // row 6 is kept in src6 so the loop below can pair it with row 7.
     src6 = _mm256_castsi128_si256(
         _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
     __m256i s56 = _mm256_permute2x128_si256(
         _mm256_castsi128_si256(
             _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
         src6, 0x20);

     s[0] = _mm256_unpacklo_epi16(s01, s12);
     s[1] = _mm256_unpacklo_epi16(s23, s34);
     s[2] = _mm256_unpacklo_epi16(s45, s56);

     s[4] = _mm256_unpackhi_epi16(s01, s12);
     s[5] = _mm256_unpackhi_epi16(s23, s34);
     s[6] = _mm256_unpackhi_epi16(s45, s56);

     for (i = 0; i < h; i += 2) {
       data = &src_ptr[i * src_stride + j];

       // Pair the carried-over row 6 with the freshly loaded row 7.
       const __m256i s67 = _mm256_permute2x128_si256(
           src6,
           _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
           0x20);

       // Carry row 8 into the next iteration's "row 6" slot.
       src6 = _mm256_castsi128_si256(
           _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));

       const __m256i s78 = _mm256_permute2x128_si256(
           _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
           src6, 0x20);

       s[3] = _mm256_unpacklo_epi16(s67, s78);
       s[7] = _mm256_unpackhi_epi16(s67, s78);

       // Filter the low 4 pixels of both output rows.
       const __m256i res_a = convolve(s, coeffs_y);

       __m256i res_a_round = _mm256_sra_epi32(
           _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

       if (w - j > 4) {
         // Full 8-wide column: also filter the high 4 pixels, then pack,
         // clamp and store two 8-pixel rows.
         const __m256i res_b = convolve(s + 4, coeffs_y);
         __m256i res_b_round = _mm256_sra_epi32(
             _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);

         __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
         res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
         res_16bit = _mm256_max_epi16(res_16bit, zero);

         _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
                          _mm256_castsi256_si128(res_16bit));
         _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                          _mm256_extracti128_si256(res_16bit, 1));
       } else if (w == 4) {
         // 4-wide column: store 4 pixels (64 bits) per row.
         res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
         res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
         res_a_round = _mm256_max_epi16(res_a_round, zero);

         _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
                          _mm256_castsi256_si128(res_a_round));
         _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                          _mm256_extracti128_si256(res_a_round, 1));
       } else {
         // w == 2: store 2 pixels (32 bits) per row.
         res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
         res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
         res_a_round = _mm256_max_epi16(res_a_round, zero);

         xx_storel_32(&dst[i * dst_stride + j],
                      _mm256_castsi256_si128(res_a_round));
         xx_storel_32(&dst[i * dst_stride + j + dst_stride],
                      _mm256_extracti128_si256(res_a_round, 1));
       }

       // Rotate the pipeline: drop the oldest row pair, keep the rest.
       s[0] = s[1];
       s[1] = s[2];
       s[2] = s[3];

       s[4] = s[5];
       s[5] = s[6];
       s[6] = s[7];
     }
   }
 }
}
    185 
// Horizontal single-reference convolution for high-bitdepth pixels.
// Processes 8-wide columns, two rows at a time; even and odd output pixels
// are filtered separately (via byte-aligned shifts of the source window) and
// re-interleaved with unpacklo. 12-tap filters are delegated to SSSE3.
void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
                                  uint16_t *dst, int dst_stride, int w, int h,
                                  const InterpFilterParams *filter_params_x,
                                  const int subpel_x_qn,
                                  ConvolveParams *conv_params, int bd) {
 if (filter_params_x->taps == 12) {
   av1_highbd_convolve_x_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
                                  filter_params_x, subpel_x_qn, conv_params,
                                  bd);
   return;
 }
 int i, j;
 // Back the source pointer up so the filter window is centered on the pixel.
 const int fo_horiz = filter_params_x->taps / 2 - 1;
 const uint16_t *const src_ptr = src - fo_horiz;

 // Check that, even with 12-bit input, the intermediate values will fit
 // into an unsigned 16-bit intermediate array.
 assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

 __m256i s[4], coeffs_x[4];

 // First rounding stage (round_0), applied right after the multiply-add.
 const __m256i round_const_x =
     _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
 const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);

 // Second rounding stage: the remaining FILTER_BITS - round_0 bits.
 const int bits = FILTER_BITS - conv_params->round_0;
 const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
 const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
 // Clamp results to the valid pixel range for the bit depth (8/10/12).
 const __m256i clip_pixel =
     _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
 const __m256i zero = _mm256_setzero_si256();

 assert(bits >= 0);
 assert((FILTER_BITS - conv_params->round_1) >= 0 ||
        ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

 prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);

 for (j = 0; j < w; j += 8) {
   /* Horizontal filter */
   for (i = 0; i < h; i += 2) {
     const __m256i row0 =
         _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
     __m256i row1 =
         _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);

     // r0: low 8 pixels of both rows; r1: high 8 pixels of both rows.
     const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
     const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

     // even pixels
     s[0] = _mm256_alignr_epi8(r1, r0, 0);
     s[1] = _mm256_alignr_epi8(r1, r0, 4);
     s[2] = _mm256_alignr_epi8(r1, r0, 8);
     s[3] = _mm256_alignr_epi8(r1, r0, 12);

     __m256i res_even = convolve(s, coeffs_x);
     res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
                                 round_shift_x);

     // odd pixels
     s[0] = _mm256_alignr_epi8(r1, r0, 2);
     s[1] = _mm256_alignr_epi8(r1, r0, 6);
     s[2] = _mm256_alignr_epi8(r1, r0, 10);
     s[3] = _mm256_alignr_epi8(r1, r0, 14);

     __m256i res_odd = convolve(s, coeffs_x);
     res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
                                round_shift_x);

     // Apply the second rounding stage to both halves.
     res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),
                                 round_shift_bits);
     res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
                                round_shift_bits);

     __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
     __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);

     // Re-interleave even/odd pixels back into spatial order, then clamp.
     __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
     res = _mm256_min_epi16(res, clip_pixel);
     res = _mm256_max_epi16(res, zero);

     if (w - j > 4) {
       // Full 8-wide column: one 128-bit store per row.
       _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
                        _mm256_castsi256_si128(res));
       _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                        _mm256_extracti128_si256(res, 1));
     } else if (w == 4) {
       _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
                        _mm256_castsi256_si128(res));
       _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                        _mm256_extracti128_si256(res, 1));
     } else {
       // w == 2: 32-bit stores.
       xx_storel_32(&dst[i * dst_stride + j], _mm256_castsi256_si128(res));
       xx_storel_32(&dst[i * dst_stride + j + dst_stride],
                    _mm256_extracti128_si256(res, 1));
     }
   }
 }
}
    285 
    286 #define CONV8_ROUNDING_BITS (7)
    287 
    288 // -----------------------------------------------------------------------------
    289 // Horizontal and vertical filtering
    290 
// Byte-shuffle control masks used by the pack_* helpers below. Each byte pair
// selects one 16-bit pixel; the 16-byte pattern is repeated so both 128-bit
// lanes apply the same shuffle. signal_pattern_0/1/2 gather the sliding pixel
// pairs for successive filter-tap pairs.
static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                             7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                             4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

static const uint8_t signal_pattern_1[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
                                             8, 9, 10, 11, 10, 11, 12, 13,
                                             4, 5, 6,  7,  6,  7,  8,  9,
                                             8, 9, 10, 11, 10, 11, 12, 13 };

static const uint8_t signal_pattern_2[32] = { 6,  7,  8,  9,  8,  9,  10, 11,
                                             10, 11, 12, 13, 12, 13, 14, 15,
                                             6,  7,  8,  9,  8,  9,  10, 11,
                                             10, 11, 12, 13, 12, 13, 14, 15 };

// 32-bit-lane permute indices for _mm256_permutevar8x32_epi32: rotates
// dwords 2..5 into both halves of the register (i.e. shifts the pixel
// window by 4 samples).
static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
    306 
    307 // -----------------------------------------------------------------------------
    308 // Horizontal Filtering
    309 
// Rearrange one 16-pixel register into the four interleaved pixel-pair
// layouts consumed by filter_8x1_pixels(). The permute (c) shifts the window
// by 4 samples so the same two shuffle patterns cover all four tap pairs.
static inline void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
 const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
 const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
 const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
 const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);

 p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
 p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
 p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
 p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
}
    321 
    322 // Note:
    323 //  Shared by 8x2 and 16x1 block
// Note:
//  Shared by 8x2 and 16x1 block
// Packs two 16-pixel registers (two rows, or two halves of one 16-wide row)
// into the eight signal registers expected by two filter_8x1_pixels() calls.
// x[4]/x[5] intentionally alias x[2]/x[3]: those pixel pairs are shared
// between the low and high filter invocations.
static inline void pack_16_pixels(const __m256i *s0, const __m256i *s1,
                                 __m256i *x /*x[8]*/) {
 __m256i pp[8];
 pack_pixels(s0, pp);
 pack_pixels(s1, &pp[4]);
 // Combine the low 128-bit lanes of both sources, then the high lanes.
 x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
 x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
 x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
 x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
 x[4] = x[2];
 x[5] = x[3];
 x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
 x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
}
    338 
// Load a single 16-pixel row and pack it into the four signal registers for
// one filter_8x1_pixels() call (used for the trailing odd row of a block).
static inline void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
 __m256i pp[8];
 __m256i s0;
 s0 = _mm256_loadu_si256((const __m256i *)src);
 pack_pixels(&s0, pp);
 // Cross-combine lanes so each output holds the pixel pairs for one tap pair.
 x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
 x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
 x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
 x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
}
    349 
    350 static inline void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
    351                                   __m256i *x) {
    352  __m256i s0, s1;
    353  s0 = _mm256_loadu_si256((const __m256i *)src);
    354  s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
    355  pack_16_pixels(&s0, &s1, x);
    356 }
    357 
    358 static inline void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
    359  __m256i s0, s1;
    360  s0 = _mm256_loadu_si256((const __m256i *)src);
    361  s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
    362  pack_16_pixels(&s0, &s1, x);
    363 }
    364 
    365 // Note:
    366 //  Shared by horizontal and vertical filtering
    367 static inline void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
    368  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
    369  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
    370  const __m256i p0 = _mm256_set1_epi32(0x03020100);
    371  const __m256i p1 = _mm256_set1_epi32(0x07060504);
    372  const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
    373  const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
    374  f[0] = _mm256_shuffle_epi8(hh, p0);
    375  f[1] = _mm256_shuffle_epi8(hh, p1);
    376  f[2] = _mm256_shuffle_epi8(hh, p2);
    377  f[3] = _mm256_shuffle_epi8(hh, p3);
    378 }
    379 
    380 static inline void pack_filters_4tap(const int16_t *filter,
    381                                     __m256i *f /*f[4]*/) {
    382  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
    383  const __m256i coeff = _mm256_broadcastsi128_si256(h);
    384 
    385  // coeffs 2 3 2 3 2 3 2 3
    386  f[0] = _mm256_shuffle_epi32(coeff, 0x55);
    387  // coeffs 4 5 4 5 4 5 4 5
    388  f[1] = _mm256_shuffle_epi32(coeff, 0xaa);
    389 }
    390 
// Apply the four packed coefficient pairs to the packed signal registers,
// producing 8 rounded 32-bit filter outputs in *y.
static inline void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
                                    const __m256i *fil /*fil[4]*/,
                                    __m256i *y) {
 __m256i a, a0, a1;

 a0 = _mm256_madd_epi16(fil[0], sig[0]);
 a1 = _mm256_madd_epi16(fil[3], sig[3]);
 a = _mm256_add_epi32(a0, a1);

 a0 = _mm256_madd_epi16(fil[1], sig[1]);
 a1 = _mm256_madd_epi16(fil[2], sig[2]);

 // min(a0, a1) + max(a0, a1) == a0 + a1, so the two blocks below add both
 // middle-tap products. NOTE(review): presumably the min-then-max ordering is
 // chosen to keep intermediate sums bit-exact with other SIMD
 // implementations — confirm before reordering.
 {
   const __m256i min = _mm256_min_epi32(a0, a1);
   a = _mm256_add_epi32(a, min);
 }
 {
   const __m256i max = _mm256_max_epi32(a0, a1);
   a = _mm256_add_epi32(a, max);
 }
 {
   // Round to nearest and drop the filter precision bits.
   const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
   a = _mm256_add_epi32(a, rounding);
   *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
 }
}
    417 
    418 static inline void store_8x1_pixels(const __m256i *y, const __m256i *mask,
    419                                    uint16_t *dst) {
    420  const __m128i a0 = _mm256_castsi256_si128(*y);
    421  const __m128i a1 = _mm256_extractf128_si256(*y, 1);
    422  __m128i res = _mm_packus_epi32(a0, a1);
    423  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
    424  _mm_storeu_si128((__m128i *)dst, res);
    425 }
    426 
    427 static inline void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
    428                                    const __m256i *mask, uint16_t *dst,
    429                                    ptrdiff_t pitch) {
    430  __m256i a = _mm256_packus_epi32(*y0, *y1);
    431  a = _mm256_min_epi16(a, *mask);
    432  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
    433  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
    434 }
    435 
    436 static inline void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
    437                                     const __m256i *mask, uint16_t *dst) {
    438  __m256i a = _mm256_packus_epi32(*y0, *y1);
    439  a = _mm256_min_epi16(a, *mask);
    440  _mm256_storeu_si256((__m256i *)dst, a);
    441 }
    442 
// 8-tap horizontal filtering of an 8-wide column, two rows per iteration,
// with a single-row epilogue for odd heights.
static void aom_highbd_filter_block1d8_h8_avx2(
   const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
   ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
 __m256i signal[8], res0, res1;
 // Maximum pixel value for the given bit depth.
 const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

 __m256i ff[4];
 pack_filters(filter, ff);

 src_ptr -= 3;  // center the 8-tap window on the output pixel
 // NOTE(review): the do/while always processes one 2-row pair, so it assumes
 // height >= 2; with unsigned height, height == 1 would also underflow the
 // counter — confirm callers never pass height < 2.
 do {
   pack_8x2_pixels(src_ptr, src_pitch, signal);
   filter_8x1_pixels(signal, ff, &res0);
   filter_8x1_pixels(&signal[4], ff, &res1);
   store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
   height -= 2;
   src_ptr += src_pitch << 1;
   dst_ptr += dst_pitch << 1;
 } while (height > 1);

 // Odd height: one remaining row.
 if (height > 0) {
   pack_8x1_pixels(src_ptr, signal);
   filter_8x1_pixels(signal, ff, &res0);
   store_8x1_pixels(&res0, &max, dst_ptr);
 }
}
    469 
// 8-tap horizontal filtering of a 16-wide column, one row per iteration.
static void aom_highbd_filter_block1d16_h8_avx2(
   const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
   ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
 __m256i signal[8], res0, res1;
 // Maximum pixel value for the given bit depth.
 const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

 __m256i ff[4];
 pack_filters(filter, ff);

 src_ptr -= 3;  // center the 8-tap window on the output pixel
 do {
   // One 16-wide row is filtered as two 8-pixel halves.
   pack_16x1_pixels(src_ptr, signal);
   filter_8x1_pixels(signal, ff, &res0);
   filter_8x1_pixels(&signal[4], ff, &res1);
   store_16x1_pixels(&res0, &res1, &max, dst_ptr);
   height -= 1;
   src_ptr += src_pitch;
   dst_ptr += dst_pitch;
 } while (height > 0);
}
    490 
    491 static void aom_highbd_filter_block1d4_h4_avx2(
    492    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    493    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
    494  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    495  __m256i ff[2], s[2];
    496  uint32_t i;
    497  const __m256i clip_pixel =
    498      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
    499  const __m256i zero = _mm256_setzero_si256();
    500 
    501  static const uint8_t shuffle_mask[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
    502                                            7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
    503                                            4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
    504 
    505  __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
    506  __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
    507  __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);
    508 
    509  pack_filters_4tap(filter, ff);
    510  src_ptr -= 3;
    511  for (i = 0; i <= (height - 2); i += 2) {
    512    __m256i row0 = _mm256_castsi128_si256(
    513        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
    514    __m256i row1 = _mm256_castsi128_si256(
    515        _mm_loadu_si128((__m128i *)&src_ptr[(i + 1) * src_pitch + 2]));
    516 
    517    s[0] = _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);
    518    s[1] = _mm256_alignr_epi8(s[0], s[0], 4);
    519 
    520    s[0] = _mm256_shuffle_epi8(s[0], mask);
    521    s[1] = _mm256_shuffle_epi8(s[1], mask);
    522 
    523    __m256i res = convolve_4tap(s, ff);
    524    res =
    525        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);
    526 
    527    res = _mm256_packs_epi32(res, res);
    528    res = _mm256_min_epi16(res, clip_pixel);
    529    res = _mm256_max_epi16(res, zero);
    530 
    531    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
    532                     _mm256_castsi256_si128(res));
    533    _mm_storel_epi64((__m128i *)&dst_ptr[(i + 1) * dst_pitch],
    534                     _mm256_extracti128_si256(res, 1));
    535  }
    536  if (height % 2 != 0) {
    537    i = height - 1;
    538    const __m256i row0_0 = _mm256_castsi128_si256(
    539        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
    540    const __m256i row0_1 = _mm256_castsi128_si256(
    541        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 6]));
    542 
    543    const __m256i r0 =
    544        _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);
    545 
    546    s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
    547    s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);
    548 
    549    __m256i res = convolve_4tap(s, ff);
    550    res =
    551        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);
    552 
    553    res = _mm256_packs_epi32(res, res);
    554    res = _mm256_min_epi16(res, clip_pixel);
    555    res = _mm256_max_epi16(res, zero);
    556 
    557    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
    558                     _mm256_castsi256_si128(res));
    559  }
    560 }
    561 
    562 static void aom_highbd_filter_block1d8_h4_avx2(
    563    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    564    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
    565  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    566  __m256i ff[2], s[2];
    567  uint32_t i = 0;
    568  const __m256i clip_pixel =
    569      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
    570  const __m256i zero = _mm256_setzero_si256();
    571 
    572  static const uint8_t shuffle_mask[32] = { 0, 1, 8,  9,  2, 3, 10, 11,
    573                                            4, 5, 12, 13, 6, 7, 14, 15,
    574                                            0, 1, 8,  9,  2, 3, 10, 11,
    575                                            4, 5, 12, 13, 6, 7, 14, 15 };
    576 
    577  __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
    578  __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
    579  __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);
    580 
    581  pack_filters_4tap(filter, ff);
    582  src_ptr -= 3;
    583 
    584  /* Horizontal filter */
    585 
    586  for (i = 0; i <= (height - 2); i += 2) {
    587    const __m256i row0 =
    588        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
    589    __m256i row1 =
    590        _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_pitch + 2]);
    591 
    592    const __m256i r0 =
    593        _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);
    594    const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
    595 
    596    // even pixels
    597    s[0] = r0;
    598    s[1] = _mm256_alignr_epi8(r1, r0, 4);
    599 
    600    __m256i res_even = convolve_4tap(s, ff);
    601    res_even = _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding),
    602                                 CONV8_ROUNDING_BITS);
    603 
    604    // odd pixels
    605    s[0] = _mm256_alignr_epi8(r1, r0, 2);
    606    s[1] = _mm256_alignr_epi8(r1, r0, 6);
    607 
    608    __m256i res_odd = convolve_4tap(s, ff);
    609    res_odd = _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding),
    610                                CONV8_ROUNDING_BITS);
    611 
    612    __m256i res = _mm256_packs_epi32(res_even, res_odd);
    613    res = _mm256_shuffle_epi8(res, mask);
    614 
    615    res = _mm256_min_epi16(res, clip_pixel);
    616    res = _mm256_max_epi16(res, zero);
    617 
    618    _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
    619                     _mm256_castsi256_si128(res));
    620    _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
    621                     _mm256_extracti128_si256(res, 1));
    622  }
    623 
    624  if (height % 2 != 0) {
    625    i = height - 1;
    626    const __m256i row0_0 =
    627        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
    628    const __m256i row0_1 =
    629        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 6]);
    630 
    631    const __m256i r0 =
    632        _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);
    633 
    634    s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
    635    s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);
    636 
    637    __m256i res = convolve_4tap(s, ff);
    638    res =
    639        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);
    640 
    641    res = _mm256_packs_epi32(res, res);
    642    res = _mm256_min_epi16(res, clip_pixel);
    643    res = _mm256_max_epi16(res, zero);
    644 
    645    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
    646                     _mm256_castsi256_si128(res));
    647    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + 4],
    648                     _mm256_extracti128_si256(res, 1));
    649  }
    650 }
    651 
    652 static void aom_highbd_filter_block1d16_h4_avx2(
    653    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    654    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
    655  aom_highbd_filter_block1d8_h4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch,
    656                                     height, filter, bd);
    657  aom_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8,
    658                                     dst_pitch, height, filter, bd);
    659 }
    660 
    661 // -----------------------------------------------------------------------------
    662 // 2-tap horizontal filtering
    663 
    664 static inline void pack_2t_filter(const int16_t *filter, __m256i *f) {
    665  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
    666  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
    667  const __m256i p = _mm256_set1_epi32(0x09080706);
    668  f[0] = _mm256_shuffle_epi8(hh, p);
    669 }
    670 
    671 // can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
    672 // the difference is s0/s1 specifies first and second rows or,
    673 // first 16 samples and 8-sample shifted 16 samples
// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
// the difference is s0/s1 specifies first and second rows or,
// first 16 samples and 8-sample shifted 16 samples
static inline void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
                                     __m256i *sig) {
  // signal_index / signal_pattern_2 are shared shuffle tables declared in
  // convolve_avx2.h (not visible here); presumably they pair each sample
  // with its right neighbor for the 2-tap madd — TODO confirm against that
  // header.
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  // In-lane pairing of adjacent 16-bit samples.
  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
  // Cross-lane rotate so the second half of each lane can be paired too.
  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  r1 = _mm256_shuffle_epi8(r1, sf2);
  // Interleave: sig[0] gets the low 128 bits of both inputs,
  // sig[1] the rotated counterparts.
  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
}
    687 
    688 static inline void pack_8x2_2t_pixels(const uint16_t *src,
    689                                      const ptrdiff_t pitch, __m256i *sig) {
    690  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
    691  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
    692  pack_16_2t_pixels(&r0, &r1, sig);
    693 }
    694 
    695 static inline void pack_16x1_2t_pixels(const uint16_t *src,
    696                                       __m256i *sig /*sig[2]*/) {
    697  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
    698  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
    699  pack_16_2t_pixels(&r0, &r1, sig);
    700 }
    701 
    702 static inline void pack_8x1_2t_pixels(const uint16_t *src,
    703                                      __m256i *sig /*sig[2]*/) {
    704  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
    705  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
    706  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
    707  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
    708  r0 = _mm256_permutevar8x32_epi32(r0, idx);
    709  r0 = _mm256_shuffle_epi8(r0, sf2);
    710  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
    711 }
    712 
    713 // can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
    714 static inline void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
    715                                       __m256i *y0, __m256i *y1) {
    716  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    717  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
    718  __m256i x1 = _mm256_madd_epi16(sig[1], *f);
    719  x0 = _mm256_add_epi32(x0, rounding);
    720  x1 = _mm256_add_epi32(x1, rounding);
    721  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
    722  *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
    723 }
    724 
    725 static inline void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
    726                                        __m256i *y0) {
    727  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    728  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
    729  x0 = _mm256_add_epi32(x0, rounding);
    730  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
    731 }
    732 
// Horizontal 2-tap filter over an 8-pixel-wide block: two rows per
// iteration, with a single-row tail for odd heights. Results are clamped
// to the bit-depth maximum.
static void aom_highbd_filter_block1d8_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  // pack_2t_filter() extracts taps 3/4 of the 8-tap array, so back the
  // source pointer up by 3 to align tap 3 with the first sample.
  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  // Odd height: one remaining row.
  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}
    758 
// Horizontal 2-tap filter over a 16-pixel-wide block, one row per
// iteration.
static void aom_highbd_filter_block1d16_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  // Align tap 3 of the 8-tap kernel array with the first source sample.
  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}
    778 
    779 // -----------------------------------------------------------------------------
    780 // Vertical Filtering
    781 
// Prime the 8-wide vertical 8-tap pipeline: load rows 0..6 (8 pixels
// each), pair consecutive rows within one 256-bit register, and
// interleave them for _mm256_madd_epi16.
static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
  __m256i s1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
  __m256i s2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
  __m256i s3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
  __m256i s4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
  __m256i s5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
  __m256i s6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));

  // After these inserts, sN holds row N in its low lane and row N+1 in
  // its high lane, so each register covers two output rows at once.
  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);

  // sig[0..2]: low halves of tap pairs (0,1) (2,3) (4,5);
  // sig[4..6]: the corresponding high halves. sig[3]/sig[7] are filled by
  // pack_8x9_pixels() each iteration; sig[8] carries the newest raw row.
  sig[0] = _mm256_unpacklo_epi16(s0, s1);
  sig[4] = _mm256_unpackhi_epi16(s0, s1);
  sig[1] = _mm256_unpacklo_epi16(s2, s3);
  sig[5] = _mm256_unpackhi_epi16(s2, s3);
  sig[2] = _mm256_unpacklo_epi16(s4, s5);
  sig[6] = _mm256_unpackhi_epi16(s4, s5);
  sig[8] = s6;
}
    812 
// Per-iteration update for the 8-wide vertical pipeline: bring in rows 7
// and 8, form the last tap pair, and save row 8 for the next iteration.
static inline void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                                   __m256i *sig) {
  // base + 7th row
  __m256i s0 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
  // base + 8th row
  __m256i s1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
  // s2 = rows (6,7), s3 = rows (7,8): the (6,7)/(7,8) tap pair for the two
  // output rows filtered this iteration.
  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  sig[3] = _mm256_unpacklo_epi16(s2, s3);
  sig[7] = _mm256_unpackhi_epi16(s2, s3);
  sig[8] = s1;
}
    827 
    828 static inline void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
    829                                     __m256i *y0, __m256i *y1) {
    830  filter_8x1_pixels(sig, f, y0);
    831  filter_8x1_pixels(&sig[4], f, y1);
    832 }
    833 
    834 static inline void update_pixels(__m256i *sig) {
    835  int i;
    836  for (i = 0; i < 3; ++i) {
    837    sig[i] = sig[i + 1];
    838    sig[i + 4] = sig[i + 5];
    839  }
    840 }
    841 
// Vertical 8-tap filter over an 8-pixel-wide block, producing two output
// rows per iteration via the sig[] sliding window.
// NOTE(review): the loop assumes an even height — confirm callers always
// pass one.
static void aom_highbd_filter_block1d8_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[9], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Preload rows 0..6 into the sliding window.
  pack_8x9_init(src_ptr, src_pitch, signal);

  do {
    // Bring in rows 7 and 8 relative to the current position.
    pack_8x9_pixels(src_ptr, src_pitch, signal);

    filter_8x9_pixels(signal, ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
    865 
// Prime the 16-wide vertical 8-tap pipeline: load rows 0..6 (16 pixels
// each), pair consecutive rows lane-wise, and interleave for madd.
// sig[0..7] covers the low 128 bits of each row, sig[8..15] the high
// 128 bits; sig[16] carries the newest raw row.
static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i u0, u1, u2, u3;
  // load 0-6 rows
  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));

  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high

  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high

  // Tap pair (0,1): low/high halves of the left 8 pixels, then of the
  // right 8 pixels.
  sig[0] = _mm256_unpacklo_epi16(u0, u2);
  sig[4] = _mm256_unpackhi_epi16(u0, u2);

  sig[8] = _mm256_unpacklo_epi16(u1, u3);
  sig[12] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);

  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);

  // Tap pair (2,3).
  sig[1] = _mm256_unpacklo_epi16(u0, u2);
  sig[5] = _mm256_unpackhi_epi16(u0, u2);

  sig[9] = _mm256_unpacklo_epi16(u1, u3);
  sig[13] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);

  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);

  // Tap pair (4,5).
  sig[2] = _mm256_unpacklo_epi16(u0, u2);
  sig[6] = _mm256_unpackhi_epi16(u0, u2);

  sig[10] = _mm256_unpacklo_epi16(u1, u3);
  sig[14] = _mm256_unpackhi_epi16(u1, u3);

  // Keep row 6 raw; pack_16x9_pixels() pairs it with row 7.
  sig[16] = s6;
}
    915 
// Per-iteration update for the 16-wide vertical pipeline: bring in rows 7
// and 8, form the (6,7)/(7,8) tap pair, and save row 8 for next time.
static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                             __m256i *sig) {
  // base + 7th row
  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
  // base + 8th row
  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));

  __m256i u0, u1, u2, u3;
  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);

  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);

  sig[3] = _mm256_unpacklo_epi16(u0, u2);
  sig[7] = _mm256_unpackhi_epi16(u0, u2);

  sig[11] = _mm256_unpacklo_epi16(u1, u3);
  sig[15] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s8;
}
    938 
    939 static inline void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
    940                                      __m256i *y0, __m256i *y1) {
    941  __m256i res[4];
    942  int i;
    943  for (i = 0; i < 4; ++i) {
    944    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
    945  }
    946 
    947  {
    948    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
    949    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
    950    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
    951    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
    952  }
    953 }
    954 
    955 static inline void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
    956                                     const __m256i *mask, uint16_t *dst,
    957                                     ptrdiff_t pitch) {
    958  __m256i p = _mm256_min_epi16(*y0, *mask);
    959  _mm256_storeu_si256((__m256i *)dst, p);
    960  p = _mm256_min_epi16(*y1, *mask);
    961  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
    962 }
    963 
    964 static void update_16x9_pixels(__m256i *sig) {
    965  update_pixels(&sig[0]);
    966  update_pixels(&sig[8]);
    967 }
    968 
// Vertical 8-tap filter over a 16-pixel-wide block, two output rows per
// iteration via the 17-entry sliding window.
// NOTE(review): the loop assumes an even height — confirm callers always
// pass one.
static void aom_highbd_filter_block1d16_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[17], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Preload rows 0..6.
  pack_16x9_init(src_ptr, src_pitch, signal);

  do {
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
    991 
// Vertical 4-tap filter over a 4-pixel-wide block, two rows per
// iteration. The 4 active taps live at indices 2..5 of the 8-tap filter
// array (hence the row-2 starting offset relative to the already-adjusted
// src_ptr).
static void aom_highbd_filter_block1d4_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  const __m256i clip_pixel =
      _mm256_set1_epi32(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();
  uint32_t i;
  __m256i s[2], ff[2];

  pack_filters_4tap(filter, ff);

  const uint16_t *data = src_ptr;
  /* Vertical filter */
  {
    // Prime the pipeline with rows 2..4 interleaved as (2,3)/(3,4) pairs.
    __m128i s2 = _mm_loadl_epi64((__m128i *)(data + 2 * src_pitch));
    __m128i s3 = _mm_loadl_epi64((__m128i *)(data + 3 * src_pitch));

    __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);

    __m128i s4 = _mm_loadl_epi64((__m128i *)(data + 4 * src_pitch));

    __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);

    s[0] = _mm256_unpacklo_epi16(s23, s34);

    for (i = 0; i < height; i += 2) {
      data = &src_ptr[i * src_pitch];

      // Two new rows per iteration complete the 4-tap window.
      __m128i s5 = _mm_loadl_epi64((__m128i *)(data + 5 * src_pitch));
      __m128i s6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_pitch));

      __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
      __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);

      s[1] = _mm256_unpacklo_epi16(s45, s56);

      const __m256i res_a = convolve_4tap(s, ff);

      // Round, shift, clamp to [0, clip_pixel], and narrow to 16 bits.
      __m256i res_a_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

      __m256i res_16bit = _mm256_min_epi32(res_a_round, clip_pixel);
      res_16bit = _mm256_max_epi32(res_16bit, zero);
      res_16bit = _mm256_packs_epi32(res_16bit, res_16bit);

      // Low lane holds the first output row, high lane the second.
      _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                       _mm256_castsi256_si128(res_16bit));
      _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                       _mm256_extracti128_si256(res_16bit, 1));

      // Rotate the window: reuse this iteration's pairs and newest row.
      s[0] = s[1];
      s4 = s6;
    }
  }
}
   1051 
// Vertical 4-tap filter over an 8-pixel-wide block, two rows per
// iteration. Same pipeline as the 4-wide variant but keeps both the low
// (s[0]/s[1]) and high (s[2]/s[3]) halves of each row pair.
static void aom_highbd_filter_block1d8_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();
  __m256i s[4], ff[2];
  uint32_t i;
  pack_filters_4tap(filter, ff);

  const uint16_t *data = src_ptr;
  /* Vertical filter */
  {
    // Prime the pipeline with rows 2..4 interleaved as (2,3)/(3,4) pairs.
    __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_pitch));
    __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_pitch));

    __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);

    __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_pitch));

    __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);

    s[0] = _mm256_unpacklo_epi16(s23, s34);
    s[2] = _mm256_unpackhi_epi16(s23, s34);

    for (i = 0; i < height; i += 2) {
      data = &src_ptr[i * src_pitch];

      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_pitch));
      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_pitch));

      __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
      __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);

      s[1] = _mm256_unpacklo_epi16(s45, s56);
      s[3] = _mm256_unpackhi_epi16(s45, s56);

      // Low 4 pixels of each row.
      const __m256i res_a = convolve_4tap(s, ff);

      __m256i res_a_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

      // High 4 pixels of each row.
      const __m256i res_b = convolve_4tap(s + 2, ff);
      __m256i res_b_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);

      // Narrow to 16 bits and clamp to [0, clip_pixel].
      __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
      res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
      res_16bit = _mm256_max_epi16(res_16bit, zero);

      // Low lane holds the first output row, high lane the second.
      _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
                       _mm256_castsi256_si128(res_16bit));
      _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                       _mm256_extracti128_si256(res_16bit, 1));

      // Rotate the window: reuse this iteration's pairs and newest row.
      s[0] = s[1];
      s[2] = s[3];
      s4 = s6;
    }
  }
}
   1117 
   1118 static void aom_highbd_filter_block1d16_v4_avx2(
   1119    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
   1120    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
   1121  aom_highbd_filter_block1d8_v4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch,
   1122                                     height, filter, bd);
   1123 
   1124  aom_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8,
   1125                                     dst_pitch, height, filter, bd);
   1126 }
   1127 
   1128 // -----------------------------------------------------------------------------
   1129 // 2-tap vertical filtering
   1130 
   1131 static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
   1132  sig[2] = _mm256_loadu_si256((const __m256i *)src);
   1133 }
   1134 
   1135 static inline void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
   1136                                       __m256i *sig) {
   1137  // load the next row
   1138  const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
   1139  sig[0] = _mm256_unpacklo_epi16(sig[2], u);
   1140  sig[1] = _mm256_unpackhi_epi16(sig[2], u);
   1141  sig[2] = u;
   1142 }
   1143 
   1144 static inline void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
   1145                                         __m256i *y0, __m256i *y1) {
   1146  filter_16_2t_pixels(sig, f, y0, y1);
   1147 }
   1148 
   1149 static void aom_highbd_filter_block1d16_v2_avx2(
   1150    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
   1151    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
   1152  __m256i signal[3], res0, res1;
   1153  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
   1154  __m256i ff;
   1155 
   1156  pack_2t_filter(filter, &ff);
   1157  pack_16x2_init(src_ptr, signal);
   1158 
   1159  do {
   1160    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
   1161    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
   1162    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
   1163 
   1164    src_ptr += src_pitch;
   1165    dst_ptr += dst_pitch;
   1166    height -= 1;
   1167  } while (height > 0);
   1168 }
   1169 
   1170 static inline void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
   1171  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
   1172  const __m128i p = _mm_set1_epi32(0x09080706);
   1173  f[0] = _mm_shuffle_epi8(h, p);
   1174 }
   1175 
   1176 static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
   1177  sig[2] = _mm_loadu_si128((const __m128i *)src);
   1178 }
   1179 
   1180 static inline void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
   1181                                          __m128i *sig) {
   1182  // load the next row
   1183  const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
   1184  sig[0] = _mm_unpacklo_epi16(sig[2], u);
   1185  sig[1] = _mm_unpackhi_epi16(sig[2], u);
   1186  sig[2] = u;
   1187 }
   1188 
   1189 static inline void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
   1190                                      __m128i *y0, __m128i *y1) {
   1191  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
   1192  __m128i x0 = _mm_madd_epi16(sig[0], *f);
   1193  __m128i x1 = _mm_madd_epi16(sig[1], *f);
   1194  x0 = _mm_add_epi32(x0, rounding);
   1195  x1 = _mm_add_epi32(x1, rounding);
   1196  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
   1197  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
   1198 }
   1199 
   1200 static inline void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
   1201                                           const __m128i *mask, uint16_t *dst) {
   1202  __m128i res = _mm_packus_epi32(*y0, *y1);
   1203  res = _mm_min_epi16(res, *mask);
   1204  _mm_storeu_si128((__m128i *)dst, res);
   1205 }
   1206 
   1207 static void aom_highbd_filter_block1d8_v2_avx2(
   1208    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
   1209    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
   1210  __m128i signal[3], res0, res1;
   1211  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
   1212  __m128i ff;
   1213 
   1214  pack_8x1_2t_filter(filter, &ff);
   1215  pack_8x2_init(src_ptr, signal);
   1216 
   1217  do {
   1218    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
   1219    filter_8_2t_pixels(signal, &ff, &res0, &res1);
   1220    store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
   1221 
   1222    src_ptr += src_pitch;
   1223    dst_ptr += dst_pitch;
   1224    height -= 1;
   1225  } while (height > 0);
   1226 }
   1227 
// 4-pixel-wide columns are too narrow to benefit from 256-bit registers;
// fall back to the SSE2 implementations (defined in another translation
// unit) for those block sizes.
void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
// Alias the "_avx2" names expected by the HIGH_FUN_CONV_1D macro below to
// the SSE2 versions.
#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2
#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2
#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2
#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2
   1244 
// Instantiate the public 1-D convolve entry points (see convolve.h) that
// dispatch to the block1d* kernels above; the vertical variant pre-adjusts
// the source pointer by 3 rows for the 8-tap filter's leading taps.
HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)

#undef HIGHBD_FUNC