tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

highbd_warp_affine_avx2.c (29103B)


/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <immintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

void av1_highbd_warp_affine_avx2(const int32_t *mat, const uint16_t *ref,
                                 int width, int height, int stride,
                                 uint16_t *pred, int p_col, int p_row,
                                 int p_width, int p_height, int p_stride,
                                 int subsampling_x, int subsampling_y, int bd,
                                 ConvolveParams *conv_params, int16_t alpha,
                                 int16_t beta, int16_t gamma, int16_t delta) {
  __m256i tmp[15];
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
  const __m256i reduce_bits_vert_const =
      _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
  const __m256i res_sub_const =
      _mm256_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
                        (1 << (offset_bits - conv_params->round_1 - 1)));
  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
  __m256i round_bits_const = _mm256_set1_epi32(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m256i wt0 = _mm256_set1_epi32(w0);
  const __m256i wt1 = _mm256_set1_epi32(w1);

  __m256i v_rbhoriz = _mm256_set1_epi32(1 << (reduce_bits_horiz - 1));
  __m256i v_zeros = _mm256_setzero_si256();
  int ohoriz = 1 << offset_bits_horiz;
  int mhoriz = 1 << max_bits_horiz;
  (void)mhoriz;
  int sx;

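  // Process the prediction block in 8x8 tiles. For each tile, the horizontal
  // pass filters the up-to-15 rows needed by the 8-tap vertical pass into
  // tmp[], and the vertical pass then produces the final 8x8 output.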
  for (int i = 0; i < p_height; i += 8) {
    for (int j = 0; j < p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      const int16_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      const int16_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      if (ix4 <= -7) {
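        // The entire 8-tap window lies to the left of the frame, so every
        // sample clamps to column 0 and each row filters to a single constant
        // value (the warp filter taps sum to 1 << FILTER_BITS).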
        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm256_cvtepi16_epi32(_mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))));
        }
      } else if (ix4 >= width + 6) {
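        // Symmetric case: the window lies entirely to the right of the frame,
        // so every sample clamps to the last column.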
        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm256_cvtepi16_epi32(
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz))));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
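        // The window straddles a frame edge: fall back to a scalar loop that
        // clamps each sample, storing results in the even/odd interleaved
        // order that the vertical pass expects.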
        int32_t tmp1[8];
        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
          const int iy = clamp(iy4 + k, 0, height - 1);

          sx = sx4 + beta * (k + 4);
          for (int l = -4; l < 4; ++l) {
            int ix = ix4 + l - 3;
            const int offs = sx >> WARPEDDIFF_PREC_BITS;
            const int16_t *coeffs = av1_warped_filter[offs];

            int32_t sum = 1 << offset_bits_horiz;
            for (int m = 0; m < 8; ++m) {
              const int sample_x = clamp(ix + m, 0, width - 1);
              sum += ref[iy * stride + sample_x] * coeffs[m];
            }
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
            tmp1[(l + 4) / 2 + ((l + 4) % 2) * 4] = sum;
            sx += alpha;
          }
          tmp[k + 7] = _mm256_loadu_si256((__m256i *)tmp1);
        }
      } else {
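        // Fully inside the frame: pick a specialization based on whether the
        // filter phase varies across columns (alpha) and/or rows (beta).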
        if (beta == 0 && alpha == 0) {
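          // One filter for the whole tile; broadcast its four coefficient
          // pairs once, outside the row loop.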
          sx = sx4;
          __m128i v_01 = _mm_loadu_si128(
              (__m128i *)
                  av1_warped_filter[sx >>
                                    WARPEDDIFF_PREC_BITS]);  // A7A6A5A4A3A2A1A0
          __m256i v_c01 = _mm256_broadcastd_epi32(v_01);     // A1A0A1A0A1A0A1A0
          __m256i v_c23 = _mm256_broadcastd_epi32(
              _mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
          __m256i v_c45 = _mm256_broadcastd_epi32(
              _mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
          __m256i v_c67 = _mm256_broadcastd_epi32(
              _mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6
          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
            int iy = iy4 + k;
            if (iy < 0)
              iy = 0;
            else if (iy > height - 1)
              iy = height - 1;
            iy = iy * stride;

            __m256i v_refl = _mm256_inserti128_si256(
                _mm256_setzero_si256(),
                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
            v_refl = _mm256_inserti128_si256(
                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
                1);  // R15 .. R0

            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);

            __m256i v_refu =
                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
            v_refl = _mm256_inserti128_si256(
                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
            v_refu = _mm256_inserti128_si256(
                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);

            __m256i v_sum = _mm256_set1_epi32(ohoriz);
            __m256i parsum = _mm256_madd_epi16(
                v_c01, _mm256_alignr_epi8(v_refu, v_refl,
                                          0));  // R8R7R6..R1R7R6R5..R1R0
            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);

            parsum = _mm256_madd_epi16(
                v_c23,
                _mm256_alignr_epi8(v_refu, v_refl, 4));  // R10R9..R3R9R8..R3R2
            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
            parsum = _mm256_madd_epi16(
                v_c45, _mm256_alignr_epi8(v_refu, v_refl,
                                          8));  // R12R11..R5R11R10..R5R4
            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
            parsum = _mm256_madd_epi16(
                v_c67, _mm256_alignr_epi8(v_refu, v_refl,
                                          12));  // R14R13..R7R13R12..R7R6
            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);

            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
                                           reduce_bits_horiz);
          }
        } else if (alpha == 0) {
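          // The filter phase is constant within each row but changes from row
          // to row (beta != 0), so reload and broadcast the coefficients once
          // per row.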
          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
            int iy = iy4 + k;
            if (iy < 0)
              iy = 0;
            else if (iy > height - 1)
              iy = height - 1;
            iy = iy * stride;

            sx = sx4 + beta * (k + 4);

            __m128i v_01 = _mm_loadu_si128(
                (__m128i *)av1_warped_filter
                    [sx >> WARPEDDIFF_PREC_BITS]);          // A7A6A5A4A3A2A1A0
            __m256i v_c01 = _mm256_broadcastd_epi32(v_01);  // A1A0A1A0A1A0A1A0
            __m256i v_c23 = _mm256_broadcastd_epi32(
                _mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
            __m256i v_c45 = _mm256_broadcastd_epi32(
                _mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
            __m256i v_c67 = _mm256_broadcastd_epi32(
                _mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6

            __m256i v_refl = _mm256_inserti128_si256(
                _mm256_setzero_si256(),
                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
            v_refl = _mm256_inserti128_si256(
                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
                1);  // R15 .. R0

            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);

            __m256i v_refu =
                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1

            v_refl = _mm256_inserti128_si256(
                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
            v_refu = _mm256_inserti128_si256(
                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);

            __m256i v_sum = _mm256_set1_epi32(ohoriz);
            __m256i parsum =
                _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);

            parsum =
                _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
            parsum =
                _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
            parsum = _mm256_madd_epi16(v_c67,
                                       _mm256_alignr_epi8(v_refu, v_refl, 12));
            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);

            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
                                           reduce_bits_horiz);
          }
        } else if (beta == 0) {
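          // The filter phase varies across columns but not rows: build the
          // eight per-column filters (A..H) once and transpose them into the
          // pair-interleaved layout _mm256_madd_epi16 needs.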
          sx = sx4;
          __m256i v_coeff01 = _mm256_inserti128_si256(
              v_zeros,
              _mm_loadu_si128(
                  (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
              0);
          v_coeff01 = _mm256_inserti128_si256(
              v_coeff01,
              _mm_loadu_si128(
                  (__m128i *)
                      av1_warped_filter[(sx + alpha) >> WARPEDDIFF_PREC_BITS]),
              1);  // B7B6..B1B0A7A6..A1A0
          __m256i v_coeff23 = _mm256_inserti128_si256(
              v_zeros,
              _mm_loadu_si128(
                  (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
                                               WARPEDDIFF_PREC_BITS]),
              0);
          v_coeff23 = _mm256_inserti128_si256(
              v_coeff23,
              _mm_loadu_si128(
                  (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
                                               WARPEDDIFF_PREC_BITS]),
              1);  // D7D6..D1D0C7C6..C1C0
          __m256i v_coeff45 = _mm256_inserti128_si256(
              v_zeros,
              _mm_loadu_si128(
                  (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
                                               WARPEDDIFF_PREC_BITS]),
              0);
          v_coeff45 = _mm256_inserti128_si256(
              v_coeff45,
              _mm_loadu_si128(
                  (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
                                               WARPEDDIFF_PREC_BITS]),
              1);  // F7F6..F1F0E7E6..E1E0
          __m256i v_coeff67 = _mm256_inserti128_si256(
              v_zeros,
              _mm_loadu_si128(
                  (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
                                               WARPEDDIFF_PREC_BITS]),
              0);
          v_coeff67 = _mm256_inserti128_si256(
              v_coeff67,
              _mm_loadu_si128(
                  (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
                                               WARPEDDIFF_PREC_BITS]),
              1);  // H7H6..H1H0G7G6..G1G0

          __m256i v_c0123 = _mm256_unpacklo_epi32(
              v_coeff01,
              v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
          __m256i v_c0123u = _mm256_unpackhi_epi32(
              v_coeff01,
              v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
          __m256i v_c4567 = _mm256_unpacklo_epi32(
              v_coeff45,
              v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
          __m256i v_c4567u = _mm256_unpackhi_epi32(
              v_coeff45,
              v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4

          __m256i v_c01 = _mm256_unpacklo_epi64(
              v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
          __m256i v_c23 =
              _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
          __m256i v_c45 =
              _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
          __m256i v_c67 =
              _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6

          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
            int iy = iy4 + k;
            if (iy < 0)
              iy = 0;
            else if (iy > height - 1)
              iy = height - 1;
            iy = iy * stride;

            __m256i v_refl = _mm256_inserti128_si256(
                _mm256_setzero_si256(),
                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
            v_refl = _mm256_inserti128_si256(
                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
                1);  // R15 .. R0

            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);

            __m256i v_refu =
                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1

            v_refl = _mm256_inserti128_si256(
                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
            v_refu = _mm256_inserti128_si256(
                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);

            __m256i v_sum = _mm256_set1_epi32(ohoriz);
            __m256i parsum = _mm256_madd_epi16(
                v_c01, _mm256_alignr_epi8(v_refu, v_refl,
                                          0));  // R8R7R6..R1R7R6R5..R1R0
            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);

            parsum = _mm256_madd_epi16(
                v_c23,
                _mm256_alignr_epi8(v_refu, v_refl, 4));  // R10R9..R3R9R8..R3R2
            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
            parsum = _mm256_madd_epi16(
                v_c45, _mm256_alignr_epi8(v_refu, v_refl,
                                          8));  // R12R11..R5R11R10..R5R4
            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
            parsum = _mm256_madd_epi16(
                v_c67, _mm256_alignr_epi8(v_refu, v_refl,
                                          12));  // R14R13..R7R13R12..R7R6
            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);

            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
                                           reduce_bits_horiz);
          }

        } else {
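          // General case: both alpha and beta are nonzero, so the eight
          // per-column filters must be rebuilt and transposed for every row.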
          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
            int iy = iy4 + k;
            if (iy < 0)
              iy = 0;
            else if (iy > height - 1)
              iy = height - 1;
            iy = iy * stride;

            sx = sx4 + beta * (k + 4);

            __m256i v_coeff01 = _mm256_inserti128_si256(
                v_zeros,
                _mm_loadu_si128(
                    (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
                0);
            v_coeff01 = _mm256_inserti128_si256(
                v_coeff01,
                _mm_loadu_si128(
                    (__m128i *)av1_warped_filter[(sx + alpha) >>
                                                 WARPEDDIFF_PREC_BITS]),
                1);  // B7B6..B1B0A7A6..A1A0
            __m256i v_coeff23 = _mm256_inserti128_si256(
                v_zeros,
                _mm_loadu_si128(
                    (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
                                                 WARPEDDIFF_PREC_BITS]),
                0);
            v_coeff23 = _mm256_inserti128_si256(
                v_coeff23,
                _mm_loadu_si128(
                    (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
                                                 WARPEDDIFF_PREC_BITS]),
                1);  // D7D6..D1D0C7C6..C1C0
            __m256i v_coeff45 = _mm256_inserti128_si256(
                v_zeros,
                _mm_loadu_si128(
                    (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
                                                 WARPEDDIFF_PREC_BITS]),
                0);
            v_coeff45 = _mm256_inserti128_si256(
                v_coeff45,
                _mm_loadu_si128(
                    (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
                                                 WARPEDDIFF_PREC_BITS]),
                1);  // F7F6..F1F0E7E6..E1E0
            __m256i v_coeff67 = _mm256_inserti128_si256(
                v_zeros,
                _mm_loadu_si128(
                    (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
                                                 WARPEDDIFF_PREC_BITS]),
                0);
            v_coeff67 = _mm256_inserti128_si256(
                v_coeff67,
                _mm_loadu_si128(
                    (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
                                                 WARPEDDIFF_PREC_BITS]),
                1);  // H7H6..H1H0G7G6..G1G0

            __m256i v_c0123 = _mm256_unpacklo_epi32(
                v_coeff01,
                v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
            __m256i v_c0123u = _mm256_unpackhi_epi32(
                v_coeff01,
                v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
            __m256i v_c4567 = _mm256_unpacklo_epi32(
                v_coeff45,
                v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
            __m256i v_c4567u = _mm256_unpackhi_epi32(
                v_coeff45,
                v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4

            __m256i v_c01 = _mm256_unpacklo_epi64(
                v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
            __m256i v_c23 =
                _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
            __m256i v_c45 =
                _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
            __m256i v_c67 =
                _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6

            __m256i v_refl = _mm256_inserti128_si256(
                _mm256_setzero_si256(),
                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
            v_refl = _mm256_inserti128_si256(
                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
                1);  // R15 .. R0

            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);

            __m256i v_refu =
                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1

            v_refl = _mm256_inserti128_si256(
                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
            v_refu = _mm256_inserti128_si256(
                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);

            __m256i v_sum = _mm256_set1_epi32(ohoriz);
            __m256i parsum =
                _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);

            parsum =
                _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
            parsum =
                _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
            parsum = _mm256_madd_epi16(v_c67,
                                       _mm256_alignr_epi8(v_refu, v_refl, 12));
            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);

            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
                                           reduce_bits_horiz);
          }
        }
      }

      // Vertical filter
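      // The 32-bit horizontal outputs in tmp[] are packed back down to
      // unsigned 16 bits (guaranteed to fit by the assert above), so each
      // pair of rows can be multiplied and accumulated with _mm256_madd_epi16.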
      for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        const __m256i *src = tmp + (k + 4);

        __m256i v_coeff01 = _mm256_inserti128_si256(
            v_zeros,
            _mm_loadu_si128(
                (__m128i *)av1_warped_filter[(sy) >> WARPEDDIFF_PREC_BITS]),
            0);
        v_coeff01 = _mm256_inserti128_si256(
            v_coeff01,
            _mm_loadu_si128(
                (__m128i *)
                    av1_warped_filter[(sy + gamma) >> WARPEDDIFF_PREC_BITS]),
            1);
        __m256i v_coeff23 = _mm256_inserti128_si256(
            v_zeros,
            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 2 * gamma) >>
                                                         WARPEDDIFF_PREC_BITS]),
            0);
        v_coeff23 = _mm256_inserti128_si256(
            v_coeff23,
            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 3 * gamma) >>
                                                         WARPEDDIFF_PREC_BITS]),
            1);
        __m256i v_coeff45 = _mm256_inserti128_si256(
            v_zeros,
            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 4 * gamma) >>
                                                         WARPEDDIFF_PREC_BITS]),
            0);
        v_coeff45 = _mm256_inserti128_si256(
            v_coeff45,
            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 5 * gamma) >>
                                                         WARPEDDIFF_PREC_BITS]),
            1);
        __m256i v_coeff67 = _mm256_inserti128_si256(
            v_zeros,
            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 6 * gamma) >>
                                                         WARPEDDIFF_PREC_BITS]),
            0);
        v_coeff67 = _mm256_inserti128_si256(
            v_coeff67,
            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 7 * gamma) >>
                                                         WARPEDDIFF_PREC_BITS]),
            1);

        __m256i v_c0123 = _mm256_unpacklo_epi32(
            v_coeff01,
            v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
        __m256i v_c0123u = _mm256_unpackhi_epi32(
            v_coeff01,
            v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
        __m256i v_c4567 = _mm256_unpacklo_epi32(
            v_coeff45,
            v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
        __m256i v_c4567u = _mm256_unpackhi_epi32(
            v_coeff45,
            v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4

        __m256i v_c01 = _mm256_unpacklo_epi64(
            v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
        __m256i v_c23 =
            _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
        __m256i v_c45 =
            _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
        __m256i v_c67 =
            _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6

        __m256i v_src01l =
            _mm256_unpacklo_epi32(src[0], src[1]);  // T13T03T11T01T12T02T10T00
        __m256i v_src01u =
            _mm256_unpackhi_epi32(src[0], src[1]);  // T17T07T15T05T16T06T14T04
        __m256i v_sum =
            _mm256_madd_epi16(_mm256_packus_epi32(v_src01l, v_src01u),
                              v_c01);  // S7S5S3S1S6S4S2S0

        __m256i v_src23l = _mm256_unpacklo_epi32(src[2], src[3]);
        __m256i v_src23u = _mm256_unpackhi_epi32(src[2], src[3]);
        v_sum = _mm256_add_epi32(
            v_sum,
            _mm256_madd_epi16(_mm256_packus_epi32(v_src23l, v_src23u), v_c23));

        __m256i v_src45l = _mm256_unpacklo_epi32(src[4], src[5]);
        __m256i v_src45u = _mm256_unpackhi_epi32(src[4], src[5]);
        v_sum = _mm256_add_epi32(
            v_sum,
            _mm256_madd_epi16(_mm256_packus_epi32(v_src45l, v_src45u), v_c45));

        __m256i v_src67l = _mm256_unpacklo_epi32(src[6], src[7]);
        __m256i v_src67u = _mm256_unpackhi_epi32(src[6], src[7]);
        v_sum = _mm256_add_epi32(
            v_sum,
            _mm256_madd_epi16(_mm256_packus_epi32(v_src67l, v_src67u), v_c67));

        // unpack S7S5S3S1S6S4S2S0 to S7S6S5S4S3S2S1S0

        __m256i v_suml =
            _mm256_permute4x64_epi64(v_sum, 0xD8);  // S7S5S6S4S3S1S2S0
        __m256i v_sumh =
            _mm256_permute4x64_epi64(v_sum, 0x32);      // S2S0S7S5S2S0S3S1
        v_sum = _mm256_unpacklo_epi32(v_suml, v_sumh);  // S7S6S5S4S3S2S1S0

        if (conv_params->is_compound) {
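          // Compound prediction: round into the intermediate buffer
          // conv_params->dst; when do_average is set, combine with the value
          // already stored there (distance-weighted if use_dist_wtd_comp_avg)
          // and write the final clipped pixels to pred instead.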
          __m128i *const p =
              (__m128i *)&conv_params
                  ->dst[(i + k + 4) * conv_params->dst_stride + j];

          v_sum = _mm256_add_epi32(v_sum, res_add_const);
          v_sum =
              _mm256_sra_epi32(_mm256_add_epi32(v_sum, reduce_bits_vert_const),
                               reduce_bits_vert_shift);
          if (conv_params->do_average) {
            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
            __m256i p_32 = _mm256_cvtepu16_epi32(_mm_loadu_si128(p));

            if (conv_params->use_dist_wtd_comp_avg) {
              v_sum = _mm256_add_epi32(_mm256_mullo_epi32(p_32, wt0),
                                       _mm256_mullo_epi32(v_sum, wt1));
              v_sum = _mm256_srai_epi32(v_sum, DIST_PRECISION_BITS);
            } else {
              v_sum = _mm256_srai_epi32(_mm256_add_epi32(p_32, v_sum), 1);
            }

            __m256i v_sum1 = _mm256_add_epi32(v_sum, res_sub_const);
            v_sum1 = _mm256_sra_epi32(
                _mm256_add_epi32(v_sum1, round_bits_const), round_bits_shift);

            __m256i v_sum16 = _mm256_packus_epi32(v_sum1, v_sum1);
            v_sum16 = _mm256_permute4x64_epi64(v_sum16, 0xD8);
            v_sum16 = _mm256_min_epi16(v_sum16, clip_pixel);
            _mm_storeu_si128(dst16, _mm256_extracti128_si256(v_sum16, 0));
          } else {
            v_sum = _mm256_packus_epi32(v_sum, v_sum);
            __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum, 0xD8);
            _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
          }
        } else {
          // Round and pack into 16 bits
          const __m256i round_const =
              _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                                ((1 << reduce_bits_vert) >> 1));

          __m256i v_sum1 = _mm256_srai_epi32(
              _mm256_add_epi32(v_sum, round_const), reduce_bits_vert);

          v_sum1 = _mm256_packus_epi32(v_sum1, v_sum1);
          __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum1, 0xD8);
          // Clamp res_16bit to the range [0, 2^bd - 1]
          const __m256i max_val = _mm256_set1_epi16((1 << bd) - 1);
          const __m256i zero = _mm256_setzero_si256();
          v_sum16 = _mm256_max_epi16(_mm256_min_epi16(v_sum16, max_val), zero);

          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

          _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
        }
      }
    }
  }
}