tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_warp_plane_sse4.c (27683B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <smmintrin.h>
     13 
     14 #include "config/av1_rtcd.h"
     15 
     16 #include "av1/common/warped_motion.h"
     17 
// Byte shuffle that splits 16-bit pixels into bytes: the low byte of each of
// the eight 16-bit lanes is gathered into the lower 64 bits of the register
// (indices 0,2,...,14) and the high byte into the upper 64 bits
// (indices 1,3,...,15). This lets the boundary-padding code shuffle whole
// pixels with the byte-granular warp_pad_left/warp_pad_right tables.
static const uint8_t warp_highbd_arrange_bytes[16] = { 0,  2,  4,  6, 8, 10,
                                                       12, 14, 1,  3, 5, 7,
                                                       9,  11, 13, 15 };

// Broadcast masks for the alpha == 0 case, where all eight pixels of a row
// share one 8-tap filter. Mask N replicates the N-th 32-bit lane (i.e. the
// N-th pair of 16-bit filter taps, bytes 4N..4N+3) across the full register.
static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
};
static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
};
static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8,  9,  10, 11, 8,  9,
                                                         10, 11, 8,  9,  10, 11,
                                                         8,  9,  10, 11 };
static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13,
                                                         14, 15, 12, 13, 14, 15,
                                                         12, 13, 14, 15 };
     34 
// Load the eight per-pixel horizontal filter kernels selected by sx and
// alpha, and transpose them into the layout expected by
// highbd_filter_src_pixels(): coeff[2t] holds tap pair (2t, 2t+1) for the
// even-index pixels 0,2,4,6 (one 32-bit lane per pixel), and coeff[2t+1]
// holds the same tap pair for the odd-index pixels 1,3,5,7.
static inline void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
                                                          __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));

  // Transpose the four 8-tap kernels (4x 32-bit tap pairs each) with two
  // levels of unpacks so each output register holds one tap pair for all
  // four pixels.
  // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
  coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
  coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
  coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
  coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  // Filter odd-index pixels: same transpose for kernels 1, 3, 5, 7.
  const __m128i tmp_1 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}
     93 
     94 static inline void highbd_prepare_horizontal_filter_coeff_alpha0(
     95    int sx, __m128i *coeff) {
     96  // Filter coeff
     97  const __m128i tmp_0 = _mm_loadu_si128(
     98      (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
     99 
    100  coeff[0] = _mm_shuffle_epi8(
    101      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
    102  coeff[2] = _mm_shuffle_epi8(
    103      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
    104  coeff[4] = _mm_shuffle_epi8(
    105      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
    106  coeff[6] = _mm_shuffle_epi8(
    107      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));
    108 
    109  coeff[1] = coeff[0];
    110  coeff[3] = coeff[2];
    111  coeff[5] = coeff[4];
    112  coeff[7] = coeff[6];
    113 }
    114 
    115 static inline void highbd_filter_src_pixels(
    116    const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
    117    const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
    118  const __m128i src_1 = *src;
    119  const __m128i src2_1 = *src2;
    120 
    121  const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
    122                                             ((1 << reduce_bits_horiz) >> 1));
    123 
    124  const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]);
    125  const __m128i res_2 =
    126      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]);
    127  const __m128i res_4 =
    128      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]);
    129  const __m128i res_6 =
    130      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]);
    131 
    132  __m128i res_even =
    133      _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
    134  res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
    135                           _mm_cvtsi32_si128(reduce_bits_horiz));
    136 
    137  const __m128i res_1 =
    138      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]);
    139  const __m128i res_3 =
    140      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]);
    141  const __m128i res_5 =
    142      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]);
    143  const __m128i res_7 =
    144      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]);
    145 
    146  __m128i res_odd =
    147      _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
    148  res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
    149                          _mm_cvtsi32_si128(reduce_bits_horiz));
    150 
    151  // Combine results into one register.
    152  // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
    153  // as this order helps with the vertical filter.
    154  tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
    155 }
    156 
    157 static inline void highbd_horiz_filter(const __m128i *src, const __m128i *src2,
    158                                       __m128i *tmp, int sx, int alpha, int k,
    159                                       const int offset_bits_horiz,
    160                                       const int reduce_bits_horiz) {
    161  __m128i coeff[8];
    162  highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff);
    163  highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz,
    164                           reduce_bits_horiz, k);
    165 }
    166 
    167 static inline void highbd_warp_horizontal_filter_alpha0_beta0(
    168    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    169    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    170    const int offset_bits_horiz, const int reduce_bits_horiz) {
    171  (void)beta;
    172  (void)alpha;
    173  int k;
    174 
    175  __m128i coeff[8];
    176  highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);
    177 
    178  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    179    int iy = iy4 + k;
    180    if (iy < 0)
    181      iy = 0;
    182    else if (iy > height - 1)
    183      iy = height - 1;
    184 
    185    // Load source pixels
    186    const __m128i src =
    187        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    188    const __m128i src2 =
    189        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    190    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
    191                             reduce_bits_horiz, k);
    192  }
    193 }
    194 
    195 static inline void highbd_warp_horizontal_filter_alpha0(
    196    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    197    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    198    const int offset_bits_horiz, const int reduce_bits_horiz) {
    199  (void)alpha;
    200  int k;
    201  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    202    int iy = iy4 + k;
    203    if (iy < 0)
    204      iy = 0;
    205    else if (iy > height - 1)
    206      iy = height - 1;
    207    int sx = sx4 + beta * (k + 4);
    208 
    209    // Load source pixels
    210    const __m128i src =
    211        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    212    const __m128i src2 =
    213        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    214 
    215    __m128i coeff[8];
    216    highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    217    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
    218                             reduce_bits_horiz, k);
    219  }
    220 }
    221 
    222 static inline void highbd_warp_horizontal_filter_beta0(
    223    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    224    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    225    const int offset_bits_horiz, const int reduce_bits_horiz) {
    226  (void)beta;
    227  int k;
    228  __m128i coeff[8];
    229  highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);
    230 
    231  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    232    int iy = iy4 + k;
    233    if (iy < 0)
    234      iy = 0;
    235    else if (iy > height - 1)
    236      iy = height - 1;
    237 
    238    // Load source pixels
    239    const __m128i src =
    240        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    241    const __m128i src2 =
    242        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    243    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
    244                             reduce_bits_horiz, k);
    245  }
    246 }
    247 
    248 static inline void highbd_warp_horizontal_filter(
    249    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    250    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    251    const int offset_bits_horiz, const int reduce_bits_horiz) {
    252  int k;
    253  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    254    int iy = iy4 + k;
    255    if (iy < 0)
    256      iy = 0;
    257    else if (iy > height - 1)
    258      iy = height - 1;
    259    int sx = sx4 + beta * (k + 4);
    260 
    261    // Load source pixels
    262    const __m128i src =
    263        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    264    const __m128i src2 =
    265        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    266 
    267    highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,
    268                        reduce_bits_horiz);
    269  }
    270 }
    271 
    272 static inline void highbd_prepare_warp_horizontal_filter(
    273    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    274    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    275    const int offset_bits_horiz, const int reduce_bits_horiz) {
    276  if (alpha == 0 && beta == 0)
    277    highbd_warp_horizontal_filter_alpha0_beta0(
    278        ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
    279        offset_bits_horiz, reduce_bits_horiz);
    280 
    281  else if (alpha == 0 && beta != 0)
    282    highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
    283                                         beta, p_height, height, i,
    284                                         offset_bits_horiz, reduce_bits_horiz);
    285 
    286  else if (alpha != 0 && beta == 0)
    287    highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
    288                                        beta, p_height, height, i,
    289                                        offset_bits_horiz, reduce_bits_horiz);
    290  else
    291    highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
    292                                  p_height, height, i, offset_bits_horiz,
    293                                  reduce_bits_horiz);
    294 }
    295 
// SSE4.1 high-bitdepth affine warp prediction. For each 8x8 (or smaller)
// output tile, the affine model `mat` is sampled at the tile centre; the 15
// source rows needed are horizontally filtered into `tmp` (offset signed
// 16-bit), then an 8-tap vertical filter produces the output — either final
// pixels in `pred`, or intermediate values in conv_params->dst for compound
// prediction (optionally averaged back into `pred`).
void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
                                   int width, int height, int stride,
                                   uint16_t *pred, int p_col, int p_row,
                                   int p_width, int p_height, int p_stride,
                                   int subsampling_x, int subsampling_y, int bd,
                                   ConvolveParams *conv_params, int16_t alpha,
                                   int16_t beta, int16_t gamma, int16_t delta) {
  // 15 horizontally-filtered rows: 8 output rows plus 7 rows of vertical
  // filter support.
  __m128i tmp[15];
  int i, j, k;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(!(bd == 12 && reduce_bits_horiz < 5));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i clip_pixel =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // Removes the accumulated horizontal+vertical offsets when converting the
  // compound intermediate back to pixel range.
  const __m128i res_sub_const =
      _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
  __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));

  // Weights for distance-weighted compound averaging.
  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);

  /* Note: For this code to work, the left/right frame borders need to be
  extended by at least 13 pixels each. By the time we get here, other
  code will have set up this border, but we allow an explicit check
  for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
  for (j = 0; j < 13; ++j) {
  assert(ref[i * stride - 13 + j] == ref[i * stride]);
  assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
  }
  }*/

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      // Project the centre of this tile through the affine model, at
      // WARPEDMODEL precision, then split into integer and subpel parts.
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        // Entirely off the left edge: every tap reads the clamped leftmost
        // pixel, so the filtered row is that pixel times the tap sum
        // (1 << FILTER_BITS) plus the horizontal offset.
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        // Entirely off the right edge: same shortcut with the rightmost
        // column.
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        // Straddles the left and/or right edge: load, then replicate the
        // boundary pixels into the out-of-range lanes via byte shuffles.
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;

        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          const __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          const __m128i src2 =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

          // Split the 16-bit pixels into low-byte and high-byte halves so
          // the 8-bit warp_pad_{left,right} shuffle tables can be applied,
          // then re-interleave into padded 16-bit pixels.
          const __m128i src_01 = _mm_shuffle_epi8(
              src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
          const __m128i src2_01 = _mm_shuffle_epi8(
              src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));

          __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01);
          __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01);

          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left);
          }

          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right);
          }

          const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
          const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);

          highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
                              offset_bits_horiz, reduce_bits_horiz);
        }
      } else {
        // Fully inside the frame: dispatch to the alpha/beta-specialized
        // horizontal filters.
        highbd_prepare_warp_horizontal_filter(
            ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
            offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);

        // Load from tmp and rearrange pairs of consecutive rows into the
        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
        const __m128i *src = tmp + (k + 4);
        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

        // Filter even-index pixels: gather and transpose the four kernels
        // exactly as in highbd_prepare_horizontal_filter_coeff().
        const __m128i tmp_0 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_2 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_4 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_6 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

        const __m128i tmp_1 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_3 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_5 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_7 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        if (conv_params->is_compound) {
          // Compound path: write/average 32-bit intermediates into
          // conv_params->dst, and when do_average is set, blend with the
          // existing prediction and convert back to clipped pixels in pred.
          __m128i *const p =
              (__m128i *)&conv_params
                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
          res_lo = _mm_add_epi32(res_lo, res_add_const);
          res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
                                 reduce_bits_vert_shift);

          if (conv_params->do_average) {
            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
            __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));

            if (conv_params->use_dist_wtd_comp_avg) {
              // Distance-weighted average with weights w0/w1.
              res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
                                     _mm_mullo_epi32(res_lo, wt1));
              res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
            } else {
              res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1);
            }

            // Remove the offset, round, and clip to the bit depth.
            __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const);
            res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const),
                                     round_bits_shift);

            __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo);
            res16_lo = _mm_min_epi16(res16_lo, clip_pixel);
            _mm_storel_epi64(dst16, res16_lo);
          } else {
            res_lo = _mm_packus_epi32(res_lo, res_lo);
            _mm_storel_epi64(p, res_lo);
          }
          if (p_width > 4) {
            // Same processing for output columns 4..7.
            __m128i *const p4 =
                (__m128i *)&conv_params
                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];

            res_hi = _mm_add_epi32(res_hi, res_add_const);
            res_hi =
                _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
                              reduce_bits_vert_shift);
            if (conv_params->do_average) {
              __m128i *const dst16_4 =
                  (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
              __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));

              if (conv_params->use_dist_wtd_comp_avg) {
                res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
                                       _mm_mullo_epi32(res_hi, wt1));
                res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
              } else {
                res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1);
              }

              __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const);
              res32_hi = _mm_sra_epi32(
                  _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift);
              __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi);
              res16_hi = _mm_min_epi16(res16_hi, clip_pixel);
              _mm_storel_epi64(dst16_4, res16_hi);
            } else {
              res_hi = _mm_packus_epi32(res_hi, res_hi);
              _mm_storel_epi64(p4, res_hi);
            }
          }
        } else {
          // Single-reference path: round, remove the offset, and clamp to
          // [0, 2^bd - 1] pixels.
          const __m128i round_const =
              _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                             ((1 << reduce_bits_vert) >> 1));

          const __m128i res_lo_round = _mm_srai_epi32(
              _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
          const __m128i res_hi_round = _mm_srai_epi32(
              _mm_add_epi32(res_hi, round_const), reduce_bits_vert);

          __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
          // Clamp res_16bit to the range [0, 2^bd - 1]
          const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
          const __m128i zero = _mm_setzero_si128();
          res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);

          // Store, blending with 'pred' if needed
          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

          // Note: If we're outputting a 4x4 block, we need to be very careful
          // to only output 4 pixels at this point, to avoid encode/decode
          // mismatches when encoding with multiple threads.
          if (p_width == 4) {
            _mm_storel_epi64(p, res_16bit);
          } else {
            _mm_storeu_si128(p, res_16bit);
          }
        }
      }
    }
  }
}