tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

convolve_avx2.c (39711B)


      1 /*
      2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <immintrin.h>
     13 
     14 #include "config/av1_rtcd.h"
     15 
     16 #if CONFIG_SVT_AV1
     17 #include "third_party/SVT-AV1/convolve_avx2.h"
     18 #endif
     19 
     20 #include "aom_dsp/aom_dsp_common.h"
     21 #include "aom_dsp/x86/convolve_avx2.h"
     22 #include "aom_dsp/x86/convolve_common_intrin.h"
     23 #include "aom_dsp/x86/synonyms.h"
     24 
// Vertical-only single-reference convolution (low bit depth, AVX2).
// Picks a 4/6/8/12-tap path from the filter parameters and processes the
// image two output rows per inner-loop iteration: each 256-bit register
// holds one row's pixels in its lower 128 bits and the next row's in its
// upper 128 bits, so one multiply-add sequence produces both rows.
// The 4/6/8-tap paths use filter coefficients pre-divided by 2
// (see prepare_coeffs_*_lowbd), hence the FILTER_BITS - 1 shift; the
// 12-tap path needs full 32-bit accumulation and shifts by FILTER_BITS.
static inline void av1_convolve_y_sr_general_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) {
  // right shift is F-1 because we are already dividing
  // filter co-efficients by 2
  const int right_shift_bits = (FILTER_BITS - 1);
  __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
  // Rounding offset: half of the divisor applied by the shift above.
  __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1);

  // coeffs: prepared filter taps; s: sliding window of interleaved source
  // row pairs; d: raw 128-bit row loads.
  __m256i coeffs[6], s[12];
  __m128i d[10];

  int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

  if (vert_tap == 6)
    prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
  else if (vert_tap == 12) {
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
  } else {
    prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
  }

  // vert_filt as 4 tap
  if (vert_tap == 4) {
    // Filter offset: the 4-tap kernel is centered one row above the output.
    const int fo_vert = 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    // Process the frame in 16-pixel-wide columns.
    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      // Prime the pipeline with the first 5 source rows.
      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      // Interleave vertically-adjacent row pairs byte-by-byte so
      // convolve_lowbd_4tap can use maddubs-style multiply-adds.
      // s[0..2] cover the low 8 pixels, s[3..5] the high 8 pixels.
      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
        const __m256i src_45a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

        // d[4] is recycled here to carry row i+6 into the next iteration
        // (where it is read as "row 4" relative to the advanced data ptr).
        d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          // Full 16-wide column: also convolve the high 8 pixels.
          const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          // Lane 0 is output row i, lane 1 is row i + 1.
          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          // Narrow tail: store 8, 4 or 2 bytes per row depending on w.
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        // Slide the source window down two rows.
        s[0] = s[1];
        s[1] = s[2];

        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 6) {
    // 6-tap: center offset is taps/2 - 1 = 2 rows above the output.
    const int fo_vert = vert_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      // src6 carries the farthest-loaded row across loop iterations.
      __m256i src6;

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
      const __m256i src_34a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);

      // Interleaved row pairs: s[0..2] low 8 pixels, s[3..5] high 8 pixels.
      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        // Row i+5 is loaded twice below (once per pairing) rather than
        // kept in a register.
        const __m256i src_45a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            src6, 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);

        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          // Full 16-wide column: also convolve the high 8 pixels.
          const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          // Narrow tail: store 8, 4 or 2 bytes per row depending on w.
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        // Slide the source window down two rows.
        s[0] = s[1];
        s[1] = s[2];
        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 12) {  // vert_tap == 12
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    const __m256i v_zero = _mm256_setzero_si256();
    // 12 taps exceed the precision of the 8-bit multiply-add path, so
    // samples are widened to 16 bits, accumulated in 32 bits, and the
    // full FILTER_BITS shift (with matching rounding offset) is applied.
    right_shift = _mm_cvtsi32_si128(FILTER_BITS);
    right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);

    // Only 8 pixels per column fit once samples are widened.
    for (int j = 0; j < w; j += 8) {
      const uint8_t *data = &src_ptr[j];
      // src10 carries the farthest-loaded row across loop iterations.
      __m256i src10;

      // Prime the pipeline with the first 11 source rows (8 bytes each).
      d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
      d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
      d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
      d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
      d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      const __m256i src_56a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);

      const __m256i src_67a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);

      const __m256i src_78a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);

      const __m256i src_89a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);

      src10 = _mm256_castsi128_si256(
          _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
      const __m256i src_910a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);

      // Zero-extend bytes to 16-bit samples.
      const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);
      const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
      const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
      const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
      const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
      const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
      const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
      const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
      const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
      const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);

      // Interleave 16-bit row pairs: s[0..5] cover the low 4 pixels,
      // s[6..11] the high 4 pixels.
      s[0] = _mm256_unpacklo_epi16(src_01, src_12);
      s[1] = _mm256_unpacklo_epi16(src_23, src_34);
      s[2] = _mm256_unpacklo_epi16(src_45, src_56);
      s[3] = _mm256_unpacklo_epi16(src_67, src_78);
      s[4] = _mm256_unpacklo_epi16(src_89, src_910);

      s[6] = _mm256_unpackhi_epi16(src_01, src_12);
      s[7] = _mm256_unpackhi_epi16(src_23, src_34);
      s[8] = _mm256_unpackhi_epi16(src_45, src_56);
      s[9] = _mm256_unpackhi_epi16(src_67, src_78);
      s[10] = _mm256_unpackhi_epi16(src_89, src_910);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        // Row i+11 is loaded twice below (once per pairing) rather than
        // kept in a register.
        const __m256i src_1011a = _mm256_permute2x128_si256(
            src10,
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            0x20);

        src10 = _mm256_castsi128_si256(
            _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));

        const __m256i src_1112a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            src10, 0x20);

        const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
        const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);

        s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
        s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);

        const __m256i res_lo = convolve_12taps(s, coeffs);

        // Round and shift the 32-bit accumulators by FILTER_BITS.
        const __m256i res_32b_lo = _mm256_sra_epi32(
            _mm256_add_epi32(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 4) {
          // Full 8-wide column: also convolve the high 4 pixels.
          const __m256i res_hi = convolve_12taps(s + 6, coeffs);

          const __m256i res_32b_hi = _mm256_sra_epi32(
              _mm256_add_epi32(res_hi, right_shift_const), right_shift);
          __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);

          // Lane 0 is output row i, lane 1 is row i + 1.
          const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          // Narrow tail: store 4 or 2 bytes per row depending on w.
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 2) {
            *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
            *(int *)&dst[i * dst_stride + j + dst_stride] =
                _mm_cvtsi128_si32(res_1);
          } else {
            *(uint16_t *)&dst[i * dst_stride + j] =
                (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)&dst[i * dst_stride + j + dst_stride] =
                (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        // Slide the source window down two rows.
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];
        s[3] = s[4];
        s[4] = s[5];

        s[6] = s[7];
        s[7] = s[8];
        s[8] = s[9];
        s[9] = s[10];
        s[10] = s[11];
      }
    }
  } else {
    // Default path: 8-tap filter.
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      // src6 carries the farthest-loaded row across loop iterations.
      __m256i src6;

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      const __m256i src_56a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);

      // Interleaved row pairs: s[0..3] low 8 pixels, s[4..7] high 8 pixels.
      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
      s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);

      s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
      s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        // Row i+7 is loaded twice below (once per pairing) rather than
        // kept in a register.
        const __m256i src_67a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
        const __m256i src_78a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);

        const __m256i res_lo = convolve_lowbd(s, coeffs);

        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          // Full 16-wide column: also convolve the high 8 pixels.
          const __m256i res_hi = convolve_lowbd(s + 4, coeffs);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          // Narrow tail: store 8, 4 or 2 bytes per row depending on w.
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        // Slide the source window down two rows.
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}
    512 
    513 void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
    514                            uint8_t *dst, int32_t dst_stride, int32_t w,
    515                            int32_t h,
    516                            const InterpFilterParams *filter_params_y,
    517                            const int32_t subpel_y_qn) {
    518 #if CONFIG_SVT_AV1
    519  const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
    520 
    521  if (vert_tap == 12) {
    522    av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
    523                                   filter_params_y, subpel_y_qn);
    524  } else {
    525    av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
    526                                       filter_params_y, subpel_y_qn);
    527  }
    528 #else
    529  av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
    530                                 filter_params_y, subpel_y_qn);
    531 #endif
    532 }
    533 
    534 static inline void av1_convolve_x_sr_general_avx2(
    535    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    536    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    537    ConvolveParams *conv_params) {
    538  const int bits = FILTER_BITS - conv_params->round_0;
    539  const __m128i round_shift = _mm_cvtsi32_si128(bits);
    540  __m256i round_0_const =
    541      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
    542  __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
    543  __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
    544  int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
    545 
    546  assert(bits >= 0);
    547  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
    548         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
    549  assert(conv_params->round_0 > 0);
    550 
    551  __m256i coeffs[6], filt[4];
    552  filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
    553  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
    554 
    555  if (horiz_tap == 6)
    556    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
    557  else if (horiz_tap == 12) {
    558    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs);
    559  } else {
    560    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
    561  }
    562 
    563  // horz_filt as 4 tap
    564  if (horiz_tap == 4) {
    565    const int fo_horiz = 1;
    566    const uint8_t *const src_ptr = src - fo_horiz;
    567    if (w <= 8) {
    568      for (i = 0; i < h; i += 2) {
    569        const __m256i data = _mm256_permute2x128_si256(
    570            _mm256_castsi128_si256(
    571                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
    572            _mm256_castsi128_si256(_mm_loadu_si128(
    573                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
    574            0x20);
    575 
    576        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
    577 
    578        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
    579                                   round_0_shift);
    580 
    581        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
    582                                   round_shift);
    583 
    584        /* rounding code */
    585        // 8 bit conversion and saturation to uint8
    586        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
    587 
    588        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
    589        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
    590 
    591        if (w > 4) {
    592          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
    593          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
    594        } else if (w > 2) {
    595          xx_storel_32(&dst[i * dst_stride], res_0);
    596          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
    597        } else {
    598          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
    599          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
    600          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
    601          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
    602        }
    603      }
    604    } else {
    605      for (i = 0; i < h; ++i) {
    606        for (int j = 0; j < w; j += 16) {
    607          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
    608          // 18 19 20 21 22 23
    609          const __m256i data = _mm256_inserti128_si256(
    610              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
    611              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
    612              1);
    613 
    614          __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
    615 
    616          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
    617                                     round_0_shift);
    618 
    619          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
    620                                     round_shift);
    621 
    622          /* rounding code */
    623          // 8 bit conversion and saturation to uint8
    624          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
    625 
    626          // Store values into the destination buffer
    627          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    628          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
    629          __m128i res = _mm256_castsi256_si128(res_8b);
    630          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
    631        }
    632      }
    633    }
    634  } else if (horiz_tap == 6) {
    635    const int fo_horiz = horiz_tap / 2 - 1;
    636    const uint8_t *const src_ptr = src - fo_horiz;
    637    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    638    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
    639 
    640    if (w <= 8) {
    641      for (i = 0; i < h; i += 2) {
    642        const __m256i data = _mm256_permute2x128_si256(
    643            _mm256_castsi128_si256(
    644                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
    645            _mm256_castsi128_si256(_mm_loadu_si128(
    646                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
    647            0x20);
    648 
    649        __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
    650 
    651        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
    652                                   round_0_shift);
    653 
    654        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
    655                                   round_shift);
    656 
    657        /* rounding code */
    658        // 8 bit conversion and saturation to uint8
    659        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
    660 
    661        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
    662        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
    663        if (w > 4) {
    664          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
    665          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
    666        } else if (w > 2) {
    667          xx_storel_32(&dst[i * dst_stride], res_0);
    668          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
    669        } else {
    670          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
    671          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
    672          *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
    673          *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
    674        }
    675      }
    676    } else {
    677      for (i = 0; i < h; ++i) {
    678        for (int j = 0; j < w; j += 16) {
    679          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
    680          // 18 19 20 21 22 23
    681          const __m256i data = _mm256_inserti128_si256(
    682              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
    683              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
    684              1);
    685 
    686          __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
    687 
    688          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
    689                                     round_0_shift);
    690 
    691          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
    692                                     round_shift);
    693 
    694          /* rounding code */
    695          // 8 bit conversion and saturation to uint8
    696          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
    697 
    698          // Store values into the destination buffer
    699          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    700          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
    701          __m128i res = _mm256_castsi256_si128(res_8b);
    702          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
    703        }
    704      }
    705    }
    706  } else if (horiz_tap == 12) {  // horiz_tap == 12
    707    const int fo_horiz = filter_params_x->taps / 2 - 1;
    708    const uint8_t *const src_ptr = src - fo_horiz;
    709    const __m256i v_zero = _mm256_setzero_si256();
    710    round_0_const = _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1);
    711    round_const = _mm256_set1_epi32((1 << bits) >> 1);
    712    round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
    713    __m256i s[6];
    714 
    715    if (w <= 4) {
    716      for (i = 0; i < h; i += 2) {
    717        const __m256i data = _mm256_permute2x128_si256(
    718            _mm256_castsi128_si256(
    719                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
    720            _mm256_castsi128_si256(_mm_loadu_si128(
    721                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
    722            0x20);
    723        // row0 0..7 row1 0..7
    724        const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
    725        // row0 8..F row1 8..F
    726        const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
    727 
    728        // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03
    729        const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
    730        // row0 04 04 .. 07 07 row1 04 04 .. 07 07
    731        const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
    732 
    733        // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B
    734        const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
    735        // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F
    736        const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
    737 
    738        // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14
    739        s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
    740        // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
    741        s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
    742        // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18
    743        s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
    744        // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A
    745        s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
    746        // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C
    747        s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
    748        // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E
    749        s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
    750 
    751        const __m256i res_lo = convolve_12taps(s, coeffs);
    752 
    753        __m256i res_32b_lo = _mm256_sra_epi32(
    754            _mm256_add_epi32(res_lo, round_0_const), round_0_shift);
    755 
    756        // 00 01 02 03 10 12 13 14
    757        res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const),
    758                                      round_shift);
    759        // 8 bit conversion and saturation to uint8
    760        // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13
    761        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
    762        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
    763        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
    764        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
    765 
    766        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
    767        const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
    768        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
    769        const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
    770        if (w > 2) {
    771          // 00 01 02 03
    772          *(int *)&dst[i * dst_stride] = _mm_cvtsi128_si32(res_0);
    773          // 10 11 12 13
    774          *(int *)&dst[i * dst_stride + dst_stride] = _mm_cvtsi128_si32(res_1);
    775        } else {
    776          // 00 01
    777          *(uint16_t *)&dst[i * dst_stride] =
    778              (uint16_t)_mm_cvtsi128_si32(res_0);
    779          // 10 11
    780          *(uint16_t *)&dst[i * dst_stride + dst_stride] =
    781              (uint16_t)_mm_cvtsi128_si32(res_1);
    782        }
    783      }
    784    } else {
    785      for (i = 0; i < h; i++) {
    786        for (int j = 0; j < w; j += 8) {
    787          const __m256i data = _mm256_permute2x128_si256(
    788              _mm256_castsi128_si256(
    789                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
    790              _mm256_castsi128_si256(_mm_loadu_si128(
    791                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
    792              0x20);
    793          // row0 0..7 4..B
    794          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
    795          // row0 8..F C..13
    796          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
    797 
    798          // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07
    799          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
    800          // row0 04 04 .. 07 07 08 08 .. 0B 0B
    801          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
    802 
    803          // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F
    804          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
    805          // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13
    806          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
    807 
    808          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
    809          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
    810          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
    811          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
    812          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
    813          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
    814 
    815          const __m256i res_lo = convolve_12taps(s, coeffs);
    816 
    817          __m256i res_32b_lo = _mm256_sra_epi32(
    818              _mm256_add_epi32(res_lo, round_0_const), round_0_shift);
    819 
    820          res_32b_lo = _mm256_sra_epi32(
    821              _mm256_add_epi32(res_32b_lo, round_const), round_shift);
    822          // 8 bit conversion and saturation to uint8
    823          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
    824          __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
    825          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
    826          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
    827          *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
    828          *(int *)&dst[i * dst_stride + j + 4] = _mm_cvtsi128_si32(res_1);
    829        }
    830      }
    831    }
    832  } else {
    833    const int fo_horiz = filter_params_x->taps / 2 - 1;
    834    const uint8_t *const src_ptr = src - fo_horiz;
    835    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    836    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
    837 
    838    if (w <= 8) {
    839      for (i = 0; i < h; i += 2) {
    840        const __m256i data = _mm256_permute2x128_si256(
    841            _mm256_castsi128_si256(
    842                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
    843            _mm256_castsi128_si256(_mm_loadu_si128(
    844                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
    845            0x20);
    846 
    847        __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
    848 
    849        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
    850                                   round_0_shift);
    851 
    852        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
    853                                   round_shift);
    854 
    855        /* rounding code */
    856        // 8 bit conversion and saturation to uint8
    857        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
    858 
    859        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
    860        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
    861        if (w > 4) {
    862          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
    863          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
    864        } else if (w > 2) {
    865          xx_storel_32(&dst[i * dst_stride], res_0);
    866          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
    867        } else {
    868          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
    869          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
    870          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
    871          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
    872        }
    873      }
    874    } else {
    875      for (i = 0; i < h; ++i) {
    876        for (int j = 0; j < w; j += 16) {
    877          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
    878          // 18 19 20 21 22 23
    879          const __m256i data = _mm256_inserti128_si256(
    880              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
    881              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
    882              1);
    883 
    884          __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
    885 
    886          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
    887                                     round_0_shift);
    888 
    889          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
    890                                     round_shift);
    891 
    892          /* rounding code */
    893          // 8 bit conversion and saturation to uint8
    894          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
    895 
    896          // Store values into the destination buffer
    897          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    898          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
    899          __m128i res = _mm256_castsi256_si128(res_8b);
    900          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
    901        }
    902      }
    903    }
    904  }
    905 }
    906 
    907 void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
    908                            uint8_t *dst, int32_t dst_stride, int32_t w,
    909                            int32_t h,
    910                            const InterpFilterParams *filter_params_x,
    911                            const int32_t subpel_x_qn,
    912                            ConvolveParams *conv_params) {
    913 #if CONFIG_SVT_AV1
    914  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
    915 
    916  if (horz_tap == 12) {
    917    av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
    918                                   filter_params_x, subpel_x_qn, conv_params);
    919  } else {
    920    av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
    921                                       filter_params_x, subpel_x_qn,
    922                                       conv_params);
    923  }
    924 #else
    925  av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
    926                                 filter_params_x, subpel_x_qn, conv_params);
    927 #endif
    928 }