tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_jnt_convolve_avx2.c (36207B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <immintrin.h>
     13 #include <assert.h>
     14 
     15 #include "config/av1_rtcd.h"
     16 
     17 #include "aom_dsp/x86/convolve_avx2.h"
     18 #include "aom_dsp/x86/convolve_common_intrin.h"
     19 #include "aom_dsp/x86/convolve_sse4_1.h"
     20 #include "aom_dsp/x86/synonyms.h"
     21 #include "aom_dsp/aom_dsp_common.h"
     22 #include "aom_dsp/aom_filter.h"
     23 #include "av1/common/convolve.h"
     24 
// Distance-weighted (compound) prediction, "2D copy" special case: both the
// horizontal and vertical filters are the identity, so no convolution is
// performed.  Source pixels are only shifted left by `bits` to match the
// precision of a fully filtered compound result, then either
//  - accumulated into the CONV_BUF_TYPE buffer conv_params->dst (first
//    reference, do_average == 0), or
//  - averaged (plain or distance-weighted, per use_dist_wtd_comp_avg) with
//    the values already in conv_params->dst, rounded, clipped to `bd` bits
//    and written to the pixel buffer dst0 (second reference, do_average == 1).
// Two code paths: w a multiple of 16 (full 256-bit rows) and w a multiple of
// 4 (two rows at a time, 8 or 4 pixels per row).
void av1_highbd_dist_wtd_convolve_2d_copy_avx2(const uint16_t *src,
                                               int src_stride, uint16_t *dst0,
                                               int dst_stride0, int w, int h,
                                               ConvolveParams *conv_params,
                                               int bd) {
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;

  // Left-shift that brings unfiltered pixels up to the intermediate
  // compound precision (what round_0 + round_1 would have left behind).
  const int bits =
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
  const __m128i left_shift = _mm_cvtsi32_si128(bits);
  const int do_average = conv_params->do_average;
  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  const int w0 = conv_params->fwd_offset;   // weight for the first reference
  const int w1 = conv_params->bck_offset;   // weight for the second reference
  const __m256i wt0 = _mm256_set1_epi32(w0);
  const __m256i wt1 = _mm256_set1_epi32(w1);
  const __m256i zero = _mm256_setzero_si256();
  int i, j;

  // Offset added to keep intermediate compound values unsigned; removed
  // again (together with rounding) by highbd_convolve_rounding().
  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m256i offset_const = _mm256_set1_epi32(offset);
  const __m256i offset_const_16b = _mm256_set1_epi16(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
  // Max pixel value for bd = 8/10/12.
  const __m256i clip_pixel_to_bd =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));

  // bits <= 4 guarantees the 16-bit left shift below cannot overflow a
  // 12-bit input, and that offset fits the saturating 16-bit add.
  assert(bits <= 4);

  if (!(w % 16)) {
    // Wide path: one full 256-bit register (16 pixels) per inner iteration.
    for (i = 0; i < h; i += 1) {
      for (j = 0; j < w; j += 16) {
        const __m256i src_16bit =
            _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j]));

        // Scale pixels up to compound precision.
        const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);

        if (do_average) {
          // Second pass: combine with the first reference already in dst.
          const __m256i data_0 =
              _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));

          // Widen both operands to 32 bits (lo/hi halves) for the average.
          const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero);
          const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero);

          const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
          const __m256i res_unsigned_lo =
              _mm256_add_epi32(res_32b_lo, offset_const);

          const __m256i comp_avg_res_lo =
              highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
                              use_dist_wtd_comp_avg);

          const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
          const __m256i res_unsigned_hi =
              _mm256_add_epi32(res_32b_hi, offset_const);

          const __m256i comp_avg_res_hi =
              highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
                              use_dist_wtd_comp_avg);

          // Remove the compound offset and apply final rounding.
          const __m256i round_result_lo = highbd_convolve_rounding(
              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
          const __m256i round_result_hi = highbd_convolve_rounding(
              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);

          // Narrow back to 16 bits and clip to the bit depth.
          const __m256i res_16b =
              _mm256_packus_epi32(round_result_lo, round_result_hi);
          const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);

          _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip);
        } else {
          // First pass: store offset-biased values into the compound buffer.
          const __m256i res_unsigned_16b =
              _mm256_adds_epu16(res, offset_const_16b);

          _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
                             res_unsigned_16b);
        }
      }
    }
  } else if (!(w % 4)) {
    // Narrow path: two rows per iteration, 8 (or, at the tail, 4) pixels
    // per row packed into one 256-bit register (row i in the low lane,
    // row i+1 in the high lane).
    for (i = 0; i < h; i += 2) {
      for (j = 0; j < w; j += 8) {
        const __m128i src_row_0 =
            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
        const __m128i src_row_1 =
            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride]));
        // since not all compilers yet support _mm256_set_m128i()
        const __m256i src_10 = _mm256_insertf128_si256(
            _mm256_castsi128_si256(src_row_0), src_row_1, 1);

        const __m256i res = _mm256_sll_epi16(src_10, left_shift);

        if (w - j < 8) {
          // 4-pixel tail: 64-bit loads/stores per row.
          if (do_average) {
            const __m256i data_0 = _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
            // Pack row i into the low lane, row i+1 into the high lane.
            const __m256i data_01 =
                _mm256_permute2x128_si256(data_0, data_1, 0x20);

            const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);

            const __m256i res_32b = _mm256_unpacklo_epi16(res, zero);
            const __m256i res_unsigned_lo =
                _mm256_add_epi32(res_32b, offset_const);

            const __m256i comp_avg_res =
                highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
                                use_dist_wtd_comp_avg);

            const __m256i round_result = highbd_convolve_rounding(
                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

            const __m256i res_16b =
                _mm256_packus_epi32(round_result, round_result);
            const __m256i res_clip =
                _mm256_min_epi16(res_16b, clip_pixel_to_bd);

            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
            _mm_storel_epi64(
                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
          } else {
            const __m256i res_unsigned_16b =
                _mm256_adds_epu16(res, offset_const_16b);

            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);

            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                             res_1);
          }
        } else {
          // Full 8-pixel column pair: 128-bit loads/stores per row.
          if (do_average) {
            const __m256i data_0 = _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
            const __m256i data_01 =
                _mm256_permute2x128_si256(data_0, data_1, 0x20);

            const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
            const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);

            const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
            const __m256i res_unsigned_lo =
                _mm256_add_epi32(res_32b_lo, offset_const);

            const __m256i comp_avg_res_lo =
                highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
                                use_dist_wtd_comp_avg);

            const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
            const __m256i res_unsigned_hi =
                _mm256_add_epi32(res_32b_hi, offset_const);

            const __m256i comp_avg_res_hi =
                highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
                                use_dist_wtd_comp_avg);

            const __m256i round_result_lo =
                highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
                                         &rounding_const, rounding_shift);
            const __m256i round_result_hi =
                highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
                                         &rounding_const, rounding_shift);

            const __m256i res_16b =
                _mm256_packus_epi32(round_result_lo, round_result_hi);
            const __m256i res_clip =
                _mm256_min_epi16(res_16b, clip_pixel_to_bd);

            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
            _mm_store_si128(
                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
          } else {
            const __m256i res_unsigned_16b =
                _mm256_adds_epu16(res, offset_const_16b);
            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);

            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                            res_1);
          }
        }
      }
    }
  }
}
    226 
// Distance-weighted (compound) 2D convolution, high bit depth, AVX2.
// Processes the block in 8-pixel-wide column strips.  For each strip it
// first runs the horizontal filter over im_h = h + taps - 1 rows into the
// 16-bit intermediate buffer im_block (shifted by round_0), then runs the
// vertical filter (shifted by round_1).  The result is either stored into
// the compound buffer conv_params->dst (do_average == 0) or averaged with
// it — plain or distance-weighted per use_dist_wtd_comp_avg — rounded,
// clipped to `bd` bits, and written to dst0 (do_average == 1).
void av1_highbd_dist_wtd_convolve_2d_avx2(
    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  // Extra rows above/below needed by the vertical filter taps.
  int im_h = h + filter_params_y->taps - 1;
  int im_stride = 8;  // the intermediate buffer is exactly one strip wide
  int i, j;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  // Back the source pointer up to the first tap position.
  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  __m256i s[8], coeffs_y[4], coeffs_x[4];
  const int do_average = conv_params->do_average;
  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;

  const int w0 = conv_params->fwd_offset;   // weight for the first reference
  const int w1 = conv_params->bck_offset;   // weight for the second reference
  const __m256i wt0 = _mm256_set1_epi32(w0);
  const __m256i wt1 = _mm256_set1_epi32(w1);
  const __m256i zero = _mm256_setzero_si256();

  // Horizontal rounding also adds a bias so intermediates stay positive.
  const __m256i round_const_x = _mm256_set1_epi32(
      ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);

  // Vertical rounding removes (part of) that bias again.
  const __m256i round_const_y = _mm256_set1_epi32(
      ((1 << conv_params->round_1) >> 1) -
      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);

  // Offset added to keep compound values unsigned; removed again (with
  // final rounding) by highbd_convolve_rounding().
  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m256i offset_const = _mm256_set1_epi32(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);

  // Max pixel value for bd = 8/10/12.
  const __m256i clip_pixel_to_bd =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));

  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);

  for (j = 0; j < w; j += 8) {
    /* Horizontal filter */
    {
      // Two rows per iteration: row i in the low 128-bit lane, row i+1 in
      // the high lane.
      for (i = 0; i < im_h; i += 2) {
        const __m256i row0 =
            _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
        __m256i row1 = _mm256_setzero_si256();
        // Guard against reading past the last intermediate row when im_h
        // is odd.
        if (i + 1 < im_h)
          row1 =
              _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);

        const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
        const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

        // even pixels
        s[0] = _mm256_alignr_epi8(r1, r0, 0);
        s[1] = _mm256_alignr_epi8(r1, r0, 4);
        s[2] = _mm256_alignr_epi8(r1, r0, 8);
        s[3] = _mm256_alignr_epi8(r1, r0, 12);

        __m256i res_even = convolve(s, coeffs_x);
        res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
                                    round_shift_x);

        // odd pixels
        s[0] = _mm256_alignr_epi8(r1, r0, 2);
        s[1] = _mm256_alignr_epi8(r1, r0, 6);
        s[2] = _mm256_alignr_epi8(r1, r0, 10);
        s[3] = _mm256_alignr_epi8(r1, r0, 14);

        __m256i res_odd = convolve(s, coeffs_x);
        res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
                                   round_shift_x);

        // Narrow to 16 bits and re-interleave even/odd back into pixel
        // order before storing to the intermediate buffer.
        __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
        __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
        __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);

        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
      }
    }

    /* Vertical filter */
    {
      // Prime the 8-entry register pipeline with the first 6 intermediate
      // rows; s[0..3] hold the low halves, s[4..7] the high halves.
      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
      __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
      __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));

      s[0] = _mm256_unpacklo_epi16(s0, s1);
      s[1] = _mm256_unpacklo_epi16(s2, s3);
      s[2] = _mm256_unpacklo_epi16(s4, s5);

      s[4] = _mm256_unpackhi_epi16(s0, s1);
      s[5] = _mm256_unpackhi_epi16(s2, s3);
      s[6] = _mm256_unpackhi_epi16(s4, s5);

      for (i = 0; i < h; i += 2) {
        const int16_t *data = &im_block[i * im_stride];

        // Load the two newest rows entering the filter window.
        const __m256i s6 =
            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
        const __m256i s7 =
            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));

        s[3] = _mm256_unpacklo_epi16(s6, s7);
        s[7] = _mm256_unpackhi_epi16(s6, s7);

        const __m256i res_a = convolve(s, coeffs_y);

        const __m256i res_a_round = _mm256_sra_epi32(
            _mm256_add_epi32(res_a, round_const_y), round_shift_y);

        const __m256i res_unsigned_lo =
            _mm256_add_epi32(res_a_round, offset_const);

        if (w - j < 8) {
          // 4-pixel tail: only the low halves are needed.
          if (do_average) {
            const __m256i data_0 = _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
            const __m256i data_01 =
                _mm256_permute2x128_si256(data_0, data_1, 0x20);

            const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);

            const __m256i comp_avg_res =
                highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
                                use_dist_wtd_comp_avg);

            const __m256i round_result = highbd_convolve_rounding(
                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

            const __m256i res_16b =
                _mm256_packus_epi32(round_result, round_result);
            const __m256i res_clip =
                _mm256_min_epi16(res_16b, clip_pixel_to_bd);

            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
            _mm_storel_epi64(
                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
          } else {
            __m256i res_16b =
                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);

            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                             res_1);
          }
        } else {
          // Full 8-pixel strip: also filter the high halves.
          const __m256i res_b = convolve(s + 4, coeffs_y);
          const __m256i res_b_round = _mm256_sra_epi32(
              _mm256_add_epi32(res_b, round_const_y), round_shift_y);

          __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);

          if (do_average) {
            const __m256i data_0 = _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
            const __m256i data_01 =
                _mm256_permute2x128_si256(data_0, data_1, 0x20);

            const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
            const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);

            const __m256i comp_avg_res_lo =
                highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
                                use_dist_wtd_comp_avg);
            const __m256i comp_avg_res_hi =
                highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
                                use_dist_wtd_comp_avg);

            const __m256i round_result_lo =
                highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
                                         &rounding_const, rounding_shift);
            const __m256i round_result_hi =
                highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
                                         &rounding_const, rounding_shift);

            const __m256i res_16b =
                _mm256_packus_epi32(round_result_lo, round_result_hi);
            const __m256i res_clip =
                _mm256_min_epi16(res_16b, clip_pixel_to_bd);

            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
            _mm_store_si128(
                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
          } else {
            __m256i res_16b =
                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);

            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                            res_1);
          }
        }

        // Slide the filter window down two rows for the next iteration.
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}
    462 
// Distance-weighted (compound) horizontal-only convolution, high bit depth,
// AVX2.  For each 8-pixel-wide column strip, two rows are filtered per
// iteration (row i in the low 128-bit lane, row i+1 in the high lane).
// Filtered values are shifted to compound precision and either stored into
// conv_params->dst (do_average == 0) or averaged with it — plain or
// distance-weighted per use_dist_wtd_comp_avg — rounded, clipped to `bd`
// bits and written to dst0 (do_average == 1).
// NOTE(review): h appears to be assumed even (the i+1 row is loaded
// unconditionally) — guaranteed by the callers, not checked here.
void av1_highbd_dist_wtd_convolve_x_avx2(
    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  // Back the source pointer up to the first horizontal tap.
  const uint16_t *const src_ptr = src - fo_horiz;
  // Extra left-shift applied after round_0 to reach compound precision.
  const int bits = FILTER_BITS - conv_params->round_1;

  int i, j;
  __m256i s[4], coeffs_x[4];

  const int do_average = conv_params->do_average;
  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  const int w0 = conv_params->fwd_offset;   // weight for the first reference
  const int w1 = conv_params->bck_offset;   // weight for the second reference
  const __m256i wt0 = _mm256_set1_epi32(w0);
  const __m256i wt1 = _mm256_set1_epi32(w1);
  const __m256i zero = _mm256_setzero_si256();

  const __m256i round_const_x =
      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);

  // Offset added to keep compound values unsigned; removed again (with
  // final rounding) by highbd_convolve_rounding().
  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m256i offset_const = _mm256_set1_epi32(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
  // Max pixel value for bd = 8/10/12.
  const __m256i clip_pixel_to_bd =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));

  assert(bits >= 0);
  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);

  for (j = 0; j < w; j += 8) {
    /* Horizontal filter */
    for (i = 0; i < h; i += 2) {
      const __m256i row0 =
          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
      __m256i row1 =
          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);

      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

      // even pixels
      s[0] = _mm256_alignr_epi8(r1, r0, 0);
      s[1] = _mm256_alignr_epi8(r1, r0, 4);
      s[2] = _mm256_alignr_epi8(r1, r0, 8);
      s[3] = _mm256_alignr_epi8(r1, r0, 12);

      __m256i res_even = convolve(s, coeffs_x);
      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
                                  round_shift_x);

      // odd pixels
      s[0] = _mm256_alignr_epi8(r1, r0, 2);
      s[1] = _mm256_alignr_epi8(r1, r0, 6);
      s[2] = _mm256_alignr_epi8(r1, r0, 10);
      s[3] = _mm256_alignr_epi8(r1, r0, 14);

      __m256i res_odd = convolve(s, coeffs_x);
      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
                                 round_shift_x);

      // Scale up to compound precision.
      res_even = _mm256_sll_epi32(res_even, round_shift_bits);
      res_odd = _mm256_sll_epi32(res_odd, round_shift_bits);

      // Re-interleave even/odd results back into pixel order (low half).
      __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd);

      __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const);

      if (w - j < 8) {
        // 4-pixel tail: 64-bit loads/stores per row.
        if (do_average) {
          const __m256i data_0 = _mm256_castsi128_si256(
              _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
          const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
              (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
          const __m256i data_01 =
              _mm256_permute2x128_si256(data_0, data_1, 0x20);

          const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);

          const __m256i comp_avg_res = highbd_comp_avg(
              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);

          const __m256i round_result = highbd_convolve_rounding(
              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

          const __m256i res_16b =
              _mm256_packus_epi32(round_result, round_result);
          const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);

          const __m128i res_0 = _mm256_castsi256_si128(res_clip);
          const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
          _mm_storel_epi64(
              (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
        } else {
          __m256i res_16b =
              _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
          const __m128i res_0 = _mm256_castsi256_si128(res_16b);
          const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);

          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                           res_1);
        }
      } else {
        // Full 8-pixel strip: also assemble the high half.
        __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd);
        __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const);

        if (do_average) {
          const __m256i data_0 = _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
          const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
              (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
          const __m256i data_01 =
              _mm256_permute2x128_si256(data_0, data_1, 0x20);

          const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
          const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);

          const __m256i comp_avg_res_lo =
              highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
                              use_dist_wtd_comp_avg);
          const __m256i comp_avg_res_hi =
              highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
                              use_dist_wtd_comp_avg);

          const __m256i round_result_lo = highbd_convolve_rounding(
              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
          const __m256i round_result_hi = highbd_convolve_rounding(
              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);

          const __m256i res_16b =
              _mm256_packus_epi32(round_result_lo, round_result_hi);
          const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);

          const __m128i res_0 = _mm256_castsi256_si128(res_clip);
          const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
                          res_1);
        } else {
          __m256i res_16b =
              _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
          const __m128i res_0 = _mm256_castsi256_si128(res_16b);
          const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);

          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                          res_1);
        }
      }
    }
  }
}
    628 
// Vertical-only high-bitdepth distance-weighted compound convolution (AVX2).
//
// Applies the vertical interpolation filter selected by filter_params_y /
// subpel_y_qn to `src`, then either:
//   - do_average == 0: writes offset intermediate sums to the compound buffer
//     conv_params->dst (CONV_BUF_TYPE), or
//   - do_average == 1: combines the new result with the values already in the
//     compound buffer (optionally distance-weighted via fwd_offset/bck_offset),
//     rounds back to pixel range, clips to `bd`, and writes pixels to dst0.
//
// The block is processed in strips of 8 columns, two output rows per loop
// iteration; each 256-bit register holds one row in its low 128-bit lane and
// the next row in its high lane.
void av1_highbd_dist_wtd_convolve_y_avx2(
    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
    ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  // The vertical filter needs taps/2 - 1 rows of context above each output
  // row, so start reading that far above `src`.
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_vert * src_stride;
  // Left-shift applied before round_1: compensates for the horizontal-pass
  // rounding that a 2D pipeline would have performed (round_0).
  const int bits = FILTER_BITS - conv_params->round_0;

  assert(bits >= 0);
  int i, j;
  // s[0..3]: low-half 16-bit interleaves of consecutive row pairs;
  // s[4..7]: the matching high-half interleaves. coeffs_y: paired filter taps.
  __m256i s[8], coeffs_y[4];
  const int do_average = conv_params->do_average;
  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;

  // Distance-compound weights for the forward/backward references.
  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m256i wt0 = _mm256_set1_epi32(w0);
  const __m256i wt1 = _mm256_set1_epi32(w1);
  const __m256i round_const_y =
      _mm256_set1_epi32(((1 << conv_params->round_1) >> 1));
  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);

  // Offset added so the intermediate sums stored in the compound buffer are
  // non-negative; it is removed again by highbd_convolve_rounding().
  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m256i offset_const = _mm256_set1_epi32(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
  // Final pixel clamp for the active bit depth (10-bit, 12-bit, else 8-bit).
  const __m256i clip_pixel_to_bd =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);

  for (j = 0; j < w; j += 8) {
    const uint16_t *data = &src_ptr[j];
    /* Vertical filter */
    {
      // src6 caches the most recently loaded "bottom" row so the next loop
      // iteration can reuse it without reloading from memory.
      __m256i src6;
      // sNM packs source row N into the low 128-bit lane and row M into the
      // high lane (permute2x128 selector 0x20 = low lane of each operand).
      // NOTE(review): these are full 128-bit (8-pixel) loads even on the
      // narrow-tail path below (w - j < 8) — presumably callers guarantee
      // readable padding to the right of the block; confirm.
      __m256i s01 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
          0x20);
      __m256i s12 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
          0x20);
      __m256i s23 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
          0x20);
      __m256i s34 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
          0x20);
      __m256i s45 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
          0x20);
      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      __m256i s56 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
          src6, 0x20);

      // Interleave adjacent row pairs into the 16-bit layout convolve()
      // expects; s[3]/s[7] are filled with the two new rows each iteration.
      s[0] = _mm256_unpacklo_epi16(s01, s12);
      s[1] = _mm256_unpacklo_epi16(s23, s34);
      s[2] = _mm256_unpacklo_epi16(s45, s56);

      s[4] = _mm256_unpackhi_epi16(s01, s12);
      s[5] = _mm256_unpackhi_epi16(s23, s34);
      s[6] = _mm256_unpackhi_epi16(s45, s56);

      // Two output rows per iteration.
      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];

        // Row 6 was cached in src6; pair it with the freshly loaded row 7.
        const __m256i s67 = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        // Cache row 8 for the next iteration's s67.
        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));

        const __m256i s78 = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi16(s67, s78);
        s[7] = _mm256_unpackhi_epi16(s67, s78);

        // Vertical filter on the low four pixels of both rows.
        const __m256i res_a = convolve(s, coeffs_y);

        // Scale up by `bits` (standing in for the skipped round_0 stage),
        // then apply the round_1 rounding shift.
        __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits);
        res_a_round = _mm256_sra_epi32(
            _mm256_add_epi32(res_a_round, round_const_y), round_shift_y);

        // Bias to unsigned range for compound-buffer storage.
        __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const);

        if (w - j < 8) {
          // Narrow tail: only 4 valid pixels per row (64-bit loads/stores).
          if (do_average) {
            // Fetch the partner prediction already in the compound buffer,
            // one row per lane, widened to 32 bits.
            const __m256i data_0 = _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
            const __m256i data_01 =
                _mm256_permute2x128_si256(data_0, data_1, 0x20);

            const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);

            // Plain or distance-weighted average of the two predictions.
            const __m256i comp_avg_res =
                highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
                                use_dist_wtd_comp_avg);

            // Remove the unsigned offset and round back to pixel precision.
            const __m256i round_result = highbd_convolve_rounding(
                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

            // Pack to 16 bits and clamp to the bit depth's maximum.
            const __m256i res_16b =
                _mm256_packus_epi32(round_result, round_result);
            const __m256i res_clip =
                _mm256_min_epi16(res_16b, clip_pixel_to_bd);

            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
            _mm_storel_epi64(
                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
          } else {
            // First pass of the compound pair: store intermediates to the
            // CONV_BUF for a later averaging call.
            __m256i res_16b =
                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);

            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                             res_1);
          }
        } else {
          // Full 8-pixel strip: also filter the high four pixels.
          const __m256i res_b = convolve(s + 4, coeffs_y);
          __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits);
          res_b_round = _mm256_sra_epi32(
              _mm256_add_epi32(res_b_round, round_const_y), round_shift_y);

          __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);

          if (do_average) {
            const __m256i data_0 = _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
            const __m256i data_01 =
                _mm256_permute2x128_si256(data_0, data_1, 0x20);

            // Widen the stored 16-bit intermediates to 32 bits.
            const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
            const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);

            const __m256i comp_avg_res_lo =
                highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
                                use_dist_wtd_comp_avg);
            const __m256i comp_avg_res_hi =
                highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
                                use_dist_wtd_comp_avg);

            const __m256i round_result_lo =
                highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
                                         &rounding_const, rounding_shift);
            const __m256i round_result_hi =
                highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
                                         &rounding_const, rounding_shift);

            const __m256i res_16b =
                _mm256_packus_epi32(round_result_lo, round_result_hi);
            const __m256i res_clip =
                _mm256_min_epi16(res_16b, clip_pixel_to_bd);

            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
            _mm_store_si128(
                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
          } else {
            __m256i res_16b =
                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);

            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                            res_1);
          }
        }
        // Slide the filter window down two rows for the next iteration.
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}