tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

warp_plane_sse4.c (42048B)


/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>
#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
   * Each coefficient is stored in 8 bits instead of 16 bits
   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7

     This is done in order to avoid overflow: Since the tap with the largest
     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
     convolve functions.

     Instead, we use the summation order
     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
     The rearrangement of coefficients in this table is so that we can get the
     coefficients into the correct order more quickly.
*/
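
/* Illustrative sketch only, not from the original aom source and not
   compiled; the helper name is hypothetical. A minimal scalar version of the
   pairing described above: grouping taps (0,2), (4,6), (1,3), (5,7) keeps
   each partial sum of two products within 16-bit range, whichever tap holds
   the largest coefficient. */
#if 0
static int16_t warp_dot8_sketch(const uint8_t p[8], const int8_t f[8]) {
  // Each pair mixes at most one large-magnitude tap with a small one.
  const int s02 = p[0] * f[0] + p[2] * f[2];
  const int s46 = p[4] * f[4] + p[6] * f[6];
  const int s13 = p[1] * f[1] + p[3] * f[3];
  const int s57 = p[5] * f[5] + p[7] * f[7];
  // Summation order ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
  return (int16_t)((s02 + s46) + (s13 + s57));
}
#endif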
/* clang-format off */
DECLARE_ALIGNED(8, const int8_t,
                av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
  // [-1, 0)
  { 0, 127,   0, 0,   0,   1, 0, 0}, { 0, 127,   0, 0,  -1,   2, 0, 0},
  { 1, 127,  -1, 0,  -3,   4, 0, 0}, { 1, 126,  -2, 0,  -4,   6, 1, 0},
  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 125,  -4, 0,  -6,  11, 1, 0},
  { 1, 124,  -4, 0,  -7,  13, 1, 0}, { 2, 123,  -5, 0,  -8,  15, 1, 0},
  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 121,  -6, 0, -10,  20, 1, 0},
  { 2, 120,  -7, 0, -11,  22, 2, 0}, { 2, 119,  -8, 0, -12,  25, 2, 0},
  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 116,  -9, 0, -13,  29, 2, 0},
  { 3, 114, -10, 0, -14,  32, 3, 0}, { 3, 113, -10, 0, -15,  35, 2, 0},
  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 109, -11, 0, -16,  40, 3, 0},
  { 3, 108, -12, 0, -16,  42, 3, 0}, { 4, 106, -13, 0, -17,  45, 3, 0},
  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 102, -14, 0, -17,  50, 3, 0},
  { 4, 100, -14, 0, -17,  52, 3, 0}, { 4,  98, -15, 0, -18,  55, 4, 0},
  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  94, -16, 0, -18,  60, 4, 0},
  { 4,  91, -16, 0, -18,  63, 4, 0}, { 4,  89, -16, 0, -18,  65, 4, 0},
  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  85, -17, 0, -18,  70, 4, 0},
  { 4,  82, -17, 0, -18,  73, 4, 0}, { 4,  80, -17, 0, -18,  75, 4, 0},
  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  75, -18, 0, -17,  80, 4, 0},
  { 4,  73, -18, 0, -17,  82, 4, 0}, { 4,  70, -18, 0, -17,  85, 4, 0},
  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  65, -18, 0, -16,  89, 4, 0},
  { 4,  63, -18, 0, -16,  91, 4, 0}, { 4,  60, -18, 0, -16,  94, 4, 0},
  { 3,  58, -18, 0, -15,  96, 4, 0}, { 4,  55, -18, 0, -15,  98, 4, 0},
  { 3,  52, -17, 0, -14, 100, 4, 0}, { 3,  50, -17, 0, -14, 102, 4, 0},
  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  45, -17, 0, -13, 106, 4, 0},
  { 3,  42, -16, 0, -12, 108, 3, 0}, { 3,  40, -16, 0, -11, 109, 3, 0},
  { 3,  37, -15, 0, -11, 111, 3, 0}, { 2,  35, -15, 0, -10, 113, 3, 0},
  { 3,  32, -14, 0, -10, 114, 3, 0}, { 2,  29, -13, 0,  -9, 116, 3, 0},
  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  25, -12, 0,  -8, 119, 2, 0},
  { 2,  22, -11, 0,  -7, 120, 2, 0}, { 1,  20, -10, 0,  -6, 121, 2, 0},
  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  15,  -8, 0,  -5, 123, 2, 0},
  { 1,  13,  -7, 0,  -4, 124, 1, 0}, { 1,  11,  -6, 0,  -4, 125, 1, 0},
  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 1,   6,  -4, 0,  -2, 126, 1, 0},
  { 0,   4,  -3, 0,  -1, 127, 1, 0}, { 0,   2,  -1, 0,   0, 127, 0, 0},
  // [0, 1)
  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -1,   2, 0, 0, 127,   0,  0},
  { 0,  -3,   4, 1, 1, 127,  -2,  0}, { 0,  -5,   6, 1, 1, 127,  -2,  0},
  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -7,  11, 2, 2, 126,  -4, -1},
  {-1,  -8,  13, 2, 3, 125,  -5, -1}, {-1, -10,  16, 3, 3, 124,  -6, -1},
  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -12,  20, 3, 4, 122,  -7, -1},
  {-1, -13,  23, 3, 4, 121,  -8, -1}, {-2, -14,  25, 4, 5, 120,  -9, -1},
  {-1, -15,  27, 4, 5, 119, -10, -1}, {-1, -16,  30, 4, 5, 118, -11, -1},
  {-2, -17,  33, 5, 6, 116, -12, -1}, {-2, -17,  35, 5, 6, 114, -12, -1},
  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  41, 6, 7, 111, -14, -2},
  {-2, -19,  43, 6, 7, 110, -15, -2}, {-2, -20,  46, 6, 7, 108, -15, -2},
  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  51, 7, 7, 104, -16, -2},
  {-2, -21,  54, 7, 7, 102, -17, -2}, {-2, -21,  56, 7, 8, 100, -18, -2},
  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  62, 7, 8,  96, -19, -2},
  {-2, -22,  64, 7, 8,  94, -19, -2}, {-2, -22,  67, 8, 8,  91, -20, -2},
  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -22,  72, 8, 8,  87, -21, -2},
  {-2, -21,  74, 8, 8,  84, -21, -2}, {-2, -22,  77, 8, 8,  82, -21, -2},
  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  82, 8, 8,  77, -22, -2},
  {-2, -21,  84, 8, 8,  74, -21, -2}, {-2, -21,  87, 8, 8,  72, -22, -2},
  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -20,  91, 8, 8,  67, -22, -2},
  {-2, -19,  94, 8, 7,  64, -22, -2}, {-2, -19,  96, 8, 7,  62, -22, -2},
  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -18, 100, 8, 7,  56, -21, -2},
  {-2, -17, 102, 7, 7,  54, -21, -2}, {-2, -16, 104, 7, 7,  51, -21, -2},
  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 108, 7, 6,  46, -20, -2},
  {-2, -15, 110, 7, 6,  43, -19, -2}, {-2, -14, 111, 7, 6,  41, -19, -2},
  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 114, 6, 5,  35, -17, -2},
  {-1, -12, 116, 6, 5,  33, -17, -2}, {-1, -11, 118, 5, 4,  30, -16, -1},
  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -9, 120, 5, 4,  25, -14, -2},
  {-1,  -8, 121, 4, 3,  23, -13, -1}, {-1,  -7, 122, 4, 3,  20, -12, -1},
  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -6, 124, 3, 3,  16, -10, -1},
  {-1,  -5, 125, 3, 2,  13,  -8, -1}, {-1,  -4, 126, 2, 2,  11,  -7, -1},
  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   6,  -5,  0},
  { 0,  -2, 127, 1, 1,   4,  -3,  0}, { 0,   0, 127, 0, 0,   2,  -1,  0},
  // [1, 2)
  { 0, 0, 127,   0, 0,   1,   0, 0}, { 0, 0, 127,   0, 0,  -1,   2, 0},
  { 0, 1, 127,  -1, 0,  -3,   4, 0}, { 0, 1, 126,  -2, 0,  -4,   6, 1},
  { 0, 1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 125,  -4, 0,  -6,  11, 1},
  { 0, 1, 124,  -4, 0,  -7,  13, 1}, { 0, 2, 123,  -5, 0,  -8,  15, 1},
  { 0, 2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 121,  -6, 0, -10,  20, 1},
  { 0, 2, 120,  -7, 0, -11,  22, 2}, { 0, 2, 119,  -8, 0, -12,  25, 2},
  { 0, 3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 116,  -9, 0, -13,  29, 2},
  { 0, 3, 114, -10, 0, -14,  32, 3}, { 0, 3, 113, -10, 0, -15,  35, 2},
  { 0, 3, 111, -11, 0, -15,  37, 3}, { 0, 3, 109, -11, 0, -16,  40, 3},
  { 0, 3, 108, -12, 0, -16,  42, 3}, { 0, 4, 106, -13, 0, -17,  45, 3},
  { 0, 4, 104, -13, 0, -17,  47, 3}, { 0, 4, 102, -14, 0, -17,  50, 3},
  { 0, 4, 100, -14, 0, -17,  52, 3}, { 0, 4,  98, -15, 0, -18,  55, 4},
  { 0, 4,  96, -15, 0, -18,  58, 3}, { 0, 4,  94, -16, 0, -18,  60, 4},
  { 0, 4,  91, -16, 0, -18,  63, 4}, { 0, 4,  89, -16, 0, -18,  65, 4},
  { 0, 4,  87, -17, 0, -18,  68, 4}, { 0, 4,  85, -17, 0, -18,  70, 4},
  { 0, 4,  82, -17, 0, -18,  73, 4}, { 0, 4,  80, -17, 0, -18,  75, 4},
  { 0, 4,  78, -18, 0, -18,  78, 4}, { 0, 4,  75, -18, 0, -17,  80, 4},
  { 0, 4,  73, -18, 0, -17,  82, 4}, { 0, 4,  70, -18, 0, -17,  85, 4},
  { 0, 4,  68, -18, 0, -17,  87, 4}, { 0, 4,  65, -18, 0, -16,  89, 4},
  { 0, 4,  63, -18, 0, -16,  91, 4}, { 0, 4,  60, -18, 0, -16,  94, 4},
  { 0, 3,  58, -18, 0, -15,  96, 4}, { 0, 4,  55, -18, 0, -15,  98, 4},
  { 0, 3,  52, -17, 0, -14, 100, 4}, { 0, 3,  50, -17, 0, -14, 102, 4},
  { 0, 3,  47, -17, 0, -13, 104, 4}, { 0, 3,  45, -17, 0, -13, 106, 4},
  { 0, 3,  42, -16, 0, -12, 108, 3}, { 0, 3,  40, -16, 0, -11, 109, 3},
  { 0, 3,  37, -15, 0, -11, 111, 3}, { 0, 2,  35, -15, 0, -10, 113, 3},
  { 0, 3,  32, -14, 0, -10, 114, 3}, { 0, 2,  29, -13, 0,  -9, 116, 3},
  { 0, 2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  25, -12, 0,  -8, 119, 2},
  { 0, 2,  22, -11, 0,  -7, 120, 2}, { 0, 1,  20, -10, 0,  -6, 121, 2},
  { 0, 1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  15,  -8, 0,  -5, 123, 2},
  { 0, 1,  13,  -7, 0,  -4, 124, 1}, { 0, 1,  11,  -6, 0,  -4, 125, 1},
  { 0, 1,   8,  -5, 0,  -3, 126, 1}, { 0, 1,   6,  -4, 0,  -2, 126, 1},
  { 0, 0,   4,  -3, 0,  -1, 127, 1}, { 0, 0,   2,  -1, 0,   0, 127, 0},
  // dummy (replicate row index 191)
  { 0, 0,   2,  -1, 0,   0, 127, 0},
};
/* clang-format on */
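
/* Illustrative sketch only, not part of the original file and not compiled;
   the helper name is hypothetical. It shows how a row of 'av1_filter_8bit' is
   selected for a subpixel offset 'sx' and how the 0, 2, 4, 6, 1, 3, 5, 7
   rearrangement can be undone to recover natural tap order. */
#if 0
static void read_filter_row_sketch(int sx, int8_t taps[8]) {
  const int8_t *row = av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS];
  // Row layout is taps 0, 2, 4, 6, 1, 3, 5, 7.
  taps[0] = row[0]; taps[2] = row[1]; taps[4] = row[2]; taps[6] = row[3];
  taps[1] = row[4]; taps[3] = row[5]; taps[5] = row[6]; taps[7] = row[7];
}
#endif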

#if !CONFIG_HIGHWAY

// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
// in an SSE register into two sequences:
// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
DECLARE_ALIGNED(16, static const uint8_t,
                even_mask[16]) = { 0, 2,  2,  4,  4,  6,  6,  8,
                                   8, 10, 10, 12, 12, 14, 14, 0 };

DECLARE_ALIGNED(16, static const uint8_t,
                odd_mask[16]) = { 1, 3,  3,  5,  5,  7,  7,  9,
                                  9, 11, 11, 13, 13, 15, 15, 0 };
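
/* Illustrative sketch only (not compiled): what the two masks produce when
   applied to the identity byte sequence with _mm_shuffle_epi8. */
#if 0
static void shuffle_masks_demo(void) {
  const __m128i bytes = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                      8, 9, 10, 11, 12, 13, 14, 15);
  // even: 0 2 2 4 4 6 6 8 8 10 10 12 12 14 14 <don't care>
  const __m128i even =
      _mm_shuffle_epi8(bytes, _mm_load_si128((__m128i *)even_mask));
  // odd:  1 3 3 5 5 7 7 9 9 11 11 13 13 15 15 <don't care>
  const __m128i odd =
      _mm_shuffle_epi8(bytes, _mm_load_si128((__m128i *)odd_mask));
  (void)even;
  (void)odd;
}
#endif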

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1,
                                               0, 1, 0, 1, 0, 1, 0, 1 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3,
                                               2, 3, 2, 3, 2, 3, 2, 3 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5,
                                               4, 5, 4, 5, 4, 5, 4, 5 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7,
                                               6, 7, 6, 7, 6, 7, 6, 7 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3,
                                              0, 1, 2, 3, 0, 1, 2, 3 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7,
                                              4, 5, 6, 7, 4, 5, 6, 7 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11,
                                              8, 9, 10, 11, 8, 9, 10, 11 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15,
                                              12, 13, 14, 15, 12, 13, 14, 15 };

static inline void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz, int k) {
  const __m128i src_even =
      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask));
  const __m128i src_odd =
      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask));
  // The pixel order we need for 'src' is:
  // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
  const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
  const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);
  // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
  const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
                                            _mm_srli_si128(src_odd, 4));
  const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
  // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
  const __m128i src_13 =
      _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
  const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
  // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
  const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
                                            _mm_srli_si128(src_even, 6));
  const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);

  const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
                                             ((1 << reduce_bits_horiz) >> 1));

  // Note: The values res_02 + res_46 and res_13 + res_57 both
  // fit into int16s at this point, but their sum may be too wide to fit
  // into an int16. However, once we also add round_const, the sum of
  // all of these fits into a uint16.
  //
  // The wrapping behaviour of _mm_add_* is used here to make sure we
  // get the correct result despite converting between different
  // (implicit) types.
  const __m128i res_even = _mm_add_epi16(res_02, res_46);
  const __m128i res_odd = _mm_add_epi16(res_13, res_57);
  const __m128i res =
      _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
  tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
}
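
/* Illustrative sketch only (hypothetical helper, not compiled): one 16-bit
   lane of _mm_maddubs_epi16 as used above multiplies an unsigned pixel byte
   by a signed coefficient byte, adds the adjacent pair of products, and
   saturates to the signed 16-bit range. */
#if 0
static int16_t maddubs_lane_sketch(uint8_t u0, int8_t s0, uint8_t u1,
                                   int8_t s1) {
  const int sum = u0 * s0 + u1 * s1;  // e.g. pixel_a * tap_0 + pixel_b * tap_2
  if (sum > 32767) return 32767;      // saturating, unlike _mm_add_epi16
  if (sum < -32768) return (int16_t)-32768;
  return (int16_t)sum;
}
#endif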

static inline void prepare_horizontal_filter_coeff(int alpha, int sx,
                                                   __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_1 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_2 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_3 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_4 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_5 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_6 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_7 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);

  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

static inline void prepare_horizontal_filter_coeff_alpha0(int sx,
                                                          __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 =
      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01));
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23));
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45));
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67));
}
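
/* Illustration: when alpha == 0 every column shares the same filter row, so
   the single 8-byte load plus the byte-pair broadcasts above reproduce what
   the eight loads and the unpack transpose in
   prepare_horizontal_filter_coeff() would compute. For example,
   shuffle_alpha0_mask01 turns a row c0 c1 c2 ... c7 into
   c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1. */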

static inline void horizontal_filter(__m128i src, __m128i *tmp, int sx,
                                     int alpha, int k,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz) {
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx, coeff);
  filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
}

static inline void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
                                          int stride, int32_t ix4, int32_t iy4,
                                          int32_t sx4, int alpha, int beta,
                                          int p_height, int height, int i,
                                          const int offset_bits_horiz,
                                          const int reduce_bits_horiz) {
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                      reduce_bits_horiz);
  }
}

static inline void warp_horizontal_filter_alpha0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)alpha;
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));

    __m128i coeff[4];
    prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static inline void warp_horizontal_filter_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  int k;
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static inline void warp_horizontal_filter_alpha0_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  (void)alpha;
  int k;

  __m128i coeff[4];
  prepare_horizontal_filter_coeff_alpha0(sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static inline void unpack_weights_and_set_round_const(
    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
    __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
  *res_sub_const =
      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi16((int16_t)w0);
  const __m128i wt1 = _mm_set1_epi16((int16_t)w1);
  *wt = _mm_unpacklo_epi16(wt0, wt1);
}
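
/* Illustrative sketch only (hypothetical helper, not compiled): the
   distance-weighted compound average that these constants feed. On the SIMD
   path, _mm_madd_epi16 over interleaved (prev, cur) 16-bit pairs and the
   interleaved (w0, w1) weights computes the same product-sum per pixel. */
#if 0
static int dist_wtd_avg_sketch(int prev, int cur, int w0, int w1) {
  return (prev * w0 + cur * w1) >> DIST_PRECISION_BITS;
}
#endif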

static inline void prepare_vertical_filter_coeffs(int gamma, int sy,
                                                  __m128i *coeffs) {
  const __m128i tmp_0 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // even coeffs
  coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  const __m128i tmp_1 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  // odd coeffs
  coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

static inline void prepare_vertical_filter_coeffs_gamma0(int sy,
                                                         __m128i *coeffs) {
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));

  // even coeffs
  coeffs[0] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0));
  coeffs[1] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1));
  coeffs[2] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2));
  coeffs[3] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3));

  // odd coeffs
  coeffs[4] = coeffs[0];
  coeffs[5] = coeffs[1];
  coeffs[6] = coeffs[2];
  coeffs[7] = coeffs[3];
}

static inline void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
                                              __m128i *res_lo, __m128i *res_hi,
                                              int k) {
  // Load from tmp and rearrange pairs of consecutive rows into the
  // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
  const __m128i *src = tmp + (k + 4);
  const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
  const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
  const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
  const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

  const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
  const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
  const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
  const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);

  const __m128i res_even =
      _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));

  // Filter odd-index pixels
  const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
  const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
  const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
  const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

  const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
  const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
  const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
  const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);

  const __m128i res_odd =
      _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));

  // Rearrange pixels back into the order 0 ... 7
  *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
  *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
}
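
/* Illustrative sketch only (hypothetical helper, not compiled): one 32-bit
   lane of _mm_madd_epi16 as used above accumulates two vertical taps of one
   column at 32-bit precision, so no overflow can occur before the final
   rounding shift. */
#if 0
static int32_t madd_lane_sketch(int16_t a0, int16_t b0, int16_t a1,
                                int16_t b1) {
  return (int32_t)a0 * b0 + (int32_t)a1 * b1;
}
#endif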

static inline void store_vertical_filter_output(
    __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
    const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
    uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
    const int reduce_bits_vert, int p_stride, int p_width,
    const int round_bits) {
  __m128i res_lo_1 = *res_lo;
  __m128i res_hi_1 = *res_hi;

  if (conv_params->is_compound) {
    __m128i *const p =
        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
    res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
                              reduce_bits_vert);
    const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
    __m128i res_lo_16;
    if (conv_params->do_average) {
      __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
      const __m128i p_16 = _mm_loadl_epi64(p);

      if (conv_params->use_dist_wtd_comp_avg) {
        const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
        const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
        const __m128i shifted_32 =
            _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
        res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
      } else {
        res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
      }

      res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);

      res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
                                 round_bits);
      __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
      *(int *)dst8 = _mm_cvtsi128_si32(res_8_lo);
    } else {
      _mm_storel_epi64(p, temp_lo_16);
    }
    if (p_width > 4) {
      __m128i *const p4 =
          (__m128i *)&conv_params
              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
      res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
                                reduce_bits_vert);
      const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
      __m128i res_hi_16;

      if (conv_params->do_average) {
        __m128i *const dst8_4 =
            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
        const __m128i p4_16 = _mm_loadl_epi64(p4);

        if (conv_params->use_dist_wtd_comp_avg) {
          const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
          const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
          const __m128i shifted_32 =
              _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
          res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
        } else {
          res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
        }
        res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);

        res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
                                   round_bits);
        __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
        *(int *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);

      } else {
        _mm_storel_epi64(p4, temp_hi_16);
      }
    }
  } else {
    const __m128i res_lo_round = _mm_srai_epi32(
        _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
    const __m128i res_hi_round = _mm_srai_epi32(
        _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);

    const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
    __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);

    // Store, blending with 'pred' if needed
    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

    // Note: If we're outputting a 4x4 block, we need to be very careful
    // to only output 4 pixels at this point, to avoid encode/decode
    // mismatches when encoding with multiple threads.
    if (p_width == 4) {
      *(int *)p = _mm_cvtsi128_si32(res_8bit);
    } else {
      _mm_storel_epi64(p, res_8bit);
    }
  }
}

static inline void warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs(gamma, sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static inline void warp_vertical_filter_gamma0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  (void)gamma;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs_gamma0(sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static inline void warp_vertical_filter_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static inline void warp_vertical_filter_gamma0_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  (void)gamma;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static inline void prepare_warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  if (gamma == 0 && delta == 0)
    warp_vertical_filter_gamma0_delta0(
        pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
        sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
  else if (gamma == 0 && delta != 0)
    warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else if (gamma != 0 && delta == 0)
    warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else
    warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
                         p_stride, p_width, i, j, sy4, reduce_bits_vert,
                         res_add_const, round_bits, offset_bits);
}

static inline void prepare_warp_horizontal_filter(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  if (alpha == 0 && beta == 0)
    warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                        beta, p_height, height, i,
                                        offset_bits_horiz, reduce_bits_horiz);
  else if (alpha == 0 && beta != 0)
    warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                  p_height, height, i, offset_bits_horiz,
                                  reduce_bits_horiz);
  else if (alpha != 0 && beta == 0)
    warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                 p_height, height, i, offset_bits_horiz,
                                 reduce_bits_horiz);
  else
    warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                           p_height, height, i, offset_bits_horiz,
                           reduce_bits_horiz);
}

void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
                            int height, int stride, uint8_t *pred, int p_col,
                            int p_row, int p_width, int p_height, int p_stride,
                            int subsampling_x, int subsampling_y,
                            ConvolveParams *conv_params, int16_t alpha,
                            int16_t beta, int16_t gamma, int16_t delta) {
  __m128i tmp[15];
  int i, j, k;
  const int bd = 8;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  /* Note: For this code to work, the left/right frame borders need to be
     extended by at least 13 pixels each. By the time we get here, other
     code will have set up this border, but we allow an explicit check
     for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
    for (j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }*/
  __m128i res_add_const_1;
  if (conv_params->is_compound == 1) {
    res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
  } else {
    res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                                     ((1 << reduce_bits_vert) >> 1));
  }
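
  /* Illustration: in the non-compound branch above, the single constant folds
     two steps into one add: the rounding term ((1 << reduce_bits_vert) >> 1)
     and the subtraction of the running offset (1 << (bd + reduce_bits_vert -
     1)) accumulated via offset_bits_horiz/offset_bits_vert, so one
     _mm_add_epi32 followed by _mm_srai_epi32 yields the final rounded
     sample. */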

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src = _mm_shuffle_epi8(src, shuffle_reg_left);
          }
          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src = _mm_shuffle_epi8(src, shuffle_reg_right);
          }
          horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                            reduce_bits_horiz);
        }
      } else {
        prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                       beta, p_height, height, i,
                                       offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      prepare_warp_vertical_filter(
          pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
          j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
    }
  }
}

#endif  // !CONFIG_HIGHWAY