warp_plane_sse4.c (42048B)
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>
#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
   * Each coefficient is stored in 8 bits instead of 16 bits
   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7

     This is done in order to avoid overflow: Since the tap with the largest
     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
     convolve functions.

     Instead, we use the summation order
     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
     The rearrangement of coefficients in this table is so that we can get the
     coefficients into the correct order more quickly.
*/
// Layout: 3 * WARPEDPIXEL_PREC_SHIFTS rows covering fractional offsets in
// [-1, 0), [0, 1) and [1, 2), plus one trailing dummy row so that an index
// one past the end is still valid.
/* clang-format off */
DECLARE_ALIGNED(8, const int8_t,
                av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
  // [-1, 0)
  { 0, 127,   0, 0,   0,   1, 0, 0}, { 0, 127,   0, 0,  -1,   2, 0, 0},
  { 1, 127,  -1, 0,  -3,   4, 0, 0}, { 1, 126,  -2, 0,  -4,   6, 1, 0},
  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 125,  -4, 0,  -6,  11, 1, 0},
  { 1, 124,  -4, 0,  -7,  13, 1, 0}, { 2, 123,  -5, 0,  -8,  15, 1, 0},
  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 121,  -6, 0, -10,  20, 1, 0},
  { 2, 120,  -7, 0, -11,  22, 2, 0}, { 2, 119,  -8, 0, -12,  25, 2, 0},
  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 116,  -9, 0, -13,  29, 2, 0},
  { 3, 114, -10, 0, -14,  32, 3, 0}, { 3, 113, -10, 0, -15,  35, 2, 0},
  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 109, -11, 0, -16,  40, 3, 0},
  { 3, 108, -12, 0, -16,  42, 3, 0}, { 4, 106, -13, 0, -17,  45, 3, 0},
  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 102, -14, 0, -17,  50, 3, 0},
  { 4, 100, -14, 0, -17,  52, 3, 0}, { 4,  98, -15, 0, -18,  55, 4, 0},
  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  94, -16, 0, -18,  60, 4, 0},
  { 4,  91, -16, 0, -18,  63, 4, 0}, { 4,  89, -16, 0, -18,  65, 4, 0},
  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  85, -17, 0, -18,  70, 4, 0},
  { 4,  82, -17, 0, -18,  73, 4, 0}, { 4,  80, -17, 0, -18,  75, 4, 0},
  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  75, -18, 0, -17,  80, 4, 0},
  { 4,  73, -18, 0, -17,  82, 4, 0}, { 4,  70, -18, 0, -17,  85, 4, 0},
  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  65, -18, 0, -16,  89, 4, 0},
  { 4,  63, -18, 0, -16,  91, 4, 0}, { 4,  60, -18, 0, -16,  94, 4, 0},
  { 3,  58, -18, 0, -15,  96, 4, 0}, { 4,  55, -18, 0, -15,  98, 4, 0},
  { 3,  52, -17, 0, -14, 100, 4, 0}, { 3,  50, -17, 0, -14, 102, 4, 0},
  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  45, -17, 0, -13, 106, 4, 0},
  { 3,  42, -16, 0, -12, 108, 3, 0}, { 3,  40, -16, 0, -11, 109, 3, 0},
  { 3,  37, -15, 0, -11, 111, 3, 0}, { 2,  35, -15, 0, -10, 113, 3, 0},
  { 3,  32, -14, 0, -10, 114, 3, 0}, { 2,  29, -13, 0,  -9, 116, 3, 0},
  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  25, -12, 0,  -8, 119, 2, 0},
  { 2,  22, -11, 0,  -7, 120, 2, 0}, { 1,  20, -10, 0,  -6, 121, 2, 0},
  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  15,  -8, 0,  -5, 123, 2, 0},
  { 1,  13,  -7, 0,  -4, 124, 1, 0}, { 1,  11,  -6, 0,  -4, 125, 1, 0},
  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 1,   6,  -4, 0,  -2, 126, 1, 0},
  { 0,   4,  -3, 0,  -1, 127, 1, 0}, { 0,   2,  -1, 0,   0, 127, 0, 0},
  // [0, 1)
  { 0,   0,   1, 0,  0, 127,   0,  0}, { 0,  -1,   2, 0,  0, 127,   0,  0},
  { 0,  -3,   4, 1,  1, 127,  -2,  0}, { 0,  -5,   6, 1,  1, 127,  -2,  0},
  { 0,  -6,   8, 1,  2, 126,  -3,  0}, {-1,  -7,  11, 2,  2, 126,  -4, -1},
  {-1,  -8,  13, 2,  3, 125,  -5, -1}, {-1, -10,  16, 3,  3, 124,  -6, -1},
  {-1, -11,  18, 3,  4, 123,  -7, -1}, {-1, -12,  20, 3,  4, 122,  -7, -1},
  {-1, -13,  23, 3,  4, 121,  -8, -1}, {-2, -14,  25, 4,  5, 120,  -9, -1},
  {-1, -15,  27, 4,  5, 119, -10, -1}, {-1, -16,  30, 4,  5, 118, -11, -1},
  {-2, -17,  33, 5,  6, 116, -12, -1}, {-2, -17,  35, 5,  6, 114, -12, -1},
  {-2, -18,  38, 5,  6, 113, -13, -1}, {-2, -19,  41, 6,  7, 111, -14, -2},
  {-2, -19,  43, 6,  7, 110, -15, -2}, {-2, -20,  46, 6,  7, 108, -15, -2},
  {-2, -20,  49, 6,  7, 106, -16, -2}, {-2, -21,  51, 7,  7, 104, -16, -2},
  {-2, -21,  54, 7,  7, 102, -17, -2}, {-2, -21,  56, 7,  8, 100, -18, -2},
  {-2, -22,  59, 7,  8,  98, -18, -2}, {-2, -22,  62, 7,  8,  96, -19, -2},
  {-2, -22,  64, 7,  8,  94, -19, -2}, {-2, -22,  67, 8,  8,  91, -20, -2},
  {-2, -22,  69, 8,  8,  89, -20, -2}, {-2, -22,  72, 8,  8,  87, -21, -2},
  {-2, -21,  74, 8,  8,  84, -21, -2}, {-2, -22,  77, 8,  8,  82, -21, -2},
  {-2, -21,  79, 8,  8,  79, -21, -2}, {-2, -21,  82, 8,  8,  77, -22, -2},
  {-2, -21,  84, 8,  8,  74, -21, -2}, {-2, -21,  87, 8,  8,  72, -22, -2},
  {-2, -20,  89, 8,  8,  69, -22, -2}, {-2, -20,  91, 8,  8,  67, -22, -2},
  {-2, -19,  94, 8,  7,  64, -22, -2}, {-2, -19,  96, 8,  7,  62, -22, -2},
  {-2, -18,  98, 8,  7,  59, -22, -2}, {-2, -18, 100, 8,  7,  56, -21, -2},
  {-2, -17, 102, 7,  7,  54, -21, -2}, {-2, -16, 104, 7,  7,  51, -21, -2},
  {-2, -16, 106, 7,  6,  49, -20, -2}, {-2, -15, 108, 7,  6,  46, -20, -2},
  {-2, -15, 110, 7,  6,  43, -19, -2}, {-2, -14, 111, 7,  6,  41, -19, -2},
  {-1, -13, 113, 6,  5,  38, -18, -2}, {-1, -12, 114, 6,  5,  35, -17, -2},
  {-1, -12, 116, 6,  5,  33, -17, -2}, {-1, -11, 118, 5,  4,  30, -16, -1},
  {-1, -10, 119, 5,  4,  27, -15, -1}, {-1,  -9, 120, 5,  4,  25, -14, -2},
  {-1,  -8, 121, 4,  3,  23, -13, -1}, {-1,  -7, 122, 4,  3,  20, -12, -1},
  {-1,  -7, 123, 4,  3,  18, -11, -1}, {-1,  -6, 124, 3,  3,  16, -10, -1},
  {-1,  -5, 125, 3,  2,  13,  -8, -1}, {-1,  -4, 126, 2,  2,  11,  -7, -1},
  { 0,  -3, 126, 2,  1,   8,  -6,  0}, { 0,  -2, 127, 1,  1,   6,  -5,  0},
  { 0,  -2, 127, 1,  1,   4,  -3,  0}, { 0,   0, 127, 0,  0,   2,  -1,  0},
  // [1, 2)
  { 0, 0, 127,   0, 0,   1,   0, 0}, { 0, 0, 127,   0, 0,  -1,   2, 0},
  { 0, 1, 127,  -1, 0,  -3,   4, 0}, { 0, 1, 126,  -2, 0,  -4,   6, 1},
  { 0, 1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 125,  -4, 0,  -6,  11, 1},
  { 0, 1, 124,  -4, 0,  -7,  13, 1}, { 0, 2, 123,  -5, 0,  -8,  15, 1},
  { 0, 2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 121,  -6, 0, -10,  20, 1},
  { 0, 2, 120,  -7, 0, -11,  22, 2}, { 0, 2, 119,  -8, 0, -12,  25, 2},
  { 0, 3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 116,  -9, 0, -13,  29, 2},
  { 0, 3, 114, -10, 0, -14,  32, 3}, { 0, 3, 113, -10, 0, -15,  35, 2},
  { 0, 3, 111, -11, 0, -15,  37, 3}, { 0, 3, 109, -11, 0, -16,  40, 3},
  { 0, 3, 108, -12, 0, -16,  42, 3}, { 0, 4, 106, -13, 0, -17,  45, 3},
  { 0, 4, 104, -13, 0, -17,  47, 3}, { 0, 4, 102, -14, 0, -17,  50, 3},
  { 0, 4, 100, -14, 0, -17,  52, 3}, { 0, 4,  98, -15, 0, -18,  55, 4},
  { 0, 4,  96, -15, 0, -18,  58, 3}, { 0, 4,  94, -16, 0, -18,  60, 4},
  { 0, 4,  91, -16, 0, -18,  63, 4}, { 0, 4,  89, -16, 0, -18,  65, 4},
  { 0, 4,  87, -17, 0, -18,  68, 4}, { 0, 4,  85, -17, 0, -18,  70, 4},
  { 0, 4,  82, -17, 0, -18,  73, 4}, { 0, 4,  80, -17, 0, -18,  75, 4},
  { 0, 4,  78, -18, 0, -18,  78, 4}, { 0, 4,  75, -18, 0, -17,  80, 4},
  { 0, 4,  73, -18, 0, -17,  82, 4}, { 0, 4,  70, -18, 0, -17,  85, 4},
  { 0, 4,  68, -18, 0, -17,  87, 4}, { 0, 4,  65, -18, 0, -16,  89, 4},
  { 0, 4,  63, -18, 0, -16,  91, 4}, { 0, 4,  60, -18, 0, -16,  94, 4},
  { 0, 3,  58, -18, 0, -15,  96, 4}, { 0, 4,  55, -18, 0, -15,  98, 4},
  { 0, 3,  52, -17, 0, -14, 100, 4}, { 0, 3,  50, -17, 0, -14, 102, 4},
  { 0, 3,  47, -17, 0, -13, 104, 4}, { 0, 3,  45, -17, 0, -13, 106, 4},
  { 0, 3,  42, -16, 0, -12, 108, 3}, { 0, 3,  40, -16, 0, -11, 109, 3},
  { 0, 3,  37, -15, 0, -11, 111, 3}, { 0, 2,  35, -15, 0, -10, 113, 3},
  { 0, 3,  32, -14, 0, -10, 114, 3}, { 0, 2,  29, -13, 0,  -9, 116, 3},
  { 0, 2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  25, -12, 0,  -8, 119, 2},
  { 0, 2,  22, -11, 0,  -7, 120, 2}, { 0, 1,  20, -10, 0,  -6, 121, 2},
  { 0, 1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  15,  -8, 0,  -5, 123, 2},
  { 0, 1,  13,  -7, 0,  -4, 124, 1}, { 0, 1,  11,  -6, 0,  -4, 125, 1},
  { 0, 1,   8,  -5, 0,  -3, 126, 1}, { 0, 1,   6,  -4, 0,  -2, 126, 1},
  { 0, 0,   4,  -3, 0,  -1, 127, 1}, { 0, 0,   2,  -1, 0,   0, 127, 0},
  // dummy (replicate row index 191)
  { 0, 0,   2,  -1, 0,   0, 127, 0},
};
/* clang-format on */

#if !CONFIG_HIGHWAY

// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
// in an SSE register into two sequences:
// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
DECLARE_ALIGNED(16, static const uint8_t,
                even_mask[16]) = { 0, 2, 2, 4, 4, 6, 6, 8,
                                   8, 10, 10, 12, 12, 14, 14, 0 };

DECLARE_ALIGNED(16, static const uint8_t,
                odd_mask[16]) = { 1, 3, 3, 5, 5, 7, 7, 9,
                                  9, 11, 11, 13, 13, 15, 15, 0 };

// Broadcast masks used when alpha == 0 (every pixel of a row shares one
// horizontal filter): splat one 16-bit coefficient pair across the register.
DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1,
                                               0, 1, 0, 1, 0, 1, 0, 1 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3,
                                               2, 3, 2, 3, 2, 3, 2, 3 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5,
                                               4, 5, 4, 5, 4, 5, 4, 5 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7,
                                               6, 7, 6, 7, 6, 7, 6, 7 };
// Broadcast masks used when gamma == 0 (every pixel of a column shares one
// vertical filter): splat one 32-bit coefficient pair across the register.
DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3,
                                              0, 1, 2, 3, 0, 1, 2, 3 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7,
                                              4, 5, 6, 7, 4, 5, 6, 7 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11,
                                              8, 9, 10, 11, 8, 9, 10, 11 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15,
                                              12, 13, 14, 15, 12, 13, 14, 15 };

// Apply the 8-tap horizontal filter to the 16 source bytes in 'src',
// producing 8 filtered pixels for intermediate row 'k'. 'coeff' holds the
// 8-bit coefficients in the order produced by
// prepare_horizontal_filter_coeff(_alpha0). The result is offset by
// offset_bits_horiz, rounded, shifted right by reduce_bits_horiz and stored
// in tmp[k + 7].
static inline void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz, int k) {
  const __m128i src_even =
      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask));
  const __m128i src_odd =
      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask));
  // The pixel order we need for 'src' is:
  // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
  const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
  const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);
  // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
  const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
                                            _mm_srli_si128(src_odd, 4));
  const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
  // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
  const __m128i src_13 =
      _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
  const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
  // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
  const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
                                            _mm_srli_si128(src_even, 6));
  const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);

  const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
                                             ((1 << reduce_bits_horiz) >> 1));

  // Note: The values res_02 + res_46 and res_13 + res_57 both
  // fit into int16s at this point, but their sum may be too wide to fit
  // into an int16. However, once we also add round_const, the sum of
  // all of these fits into a uint16.
  //
  // The wrapping behaviour of _mm_add_* is used here to make sure we
  // get the correct result despite converting between different
  // (implicit) types.
  const __m128i res_even = _mm_add_epi16(res_02, res_46);
  const __m128i res_odd = _mm_add_epi16(res_13, res_57);
  const __m128i res =
      _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
  // Logical (not arithmetic) shift: see the uint16 note above.
  tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
}

// Load the 8 horizontal filters selected by sx + i * alpha (i = 0..7) and
// transpose their 8-bit coefficients into the four registers expected by
// filter_src_pixels: coeff[0] = taps 0/2, coeff[1] = taps 4/6,
// coeff[2] = taps 1/3, coeff[3] = taps 5/7, each for pixels
// 0 2 4 6 1 3 5 7.
static inline void prepare_horizontal_filter_coeff(int alpha, int sx,
                                                   __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_1 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_2 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_3 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_4 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_5 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_6 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_7 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);

  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

// Specialization of prepare_horizontal_filter_coeff for alpha == 0: all 8
// pixels share the filter at index sx, so each coefficient pair is simply
// broadcast with a byte shuffle instead of the full 8-load transpose.
static inline void prepare_horizontal_filter_coeff_alpha0(int sx,
                                                          __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 =
      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01));
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23));
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45));
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67));
}

// Convenience wrapper: build the per-row coefficients for (sx, alpha) and
// filter one row of source pixels into tmp[k + 7].
static inline void horizontal_filter(__m128i src, __m128i *tmp, int sx,
                                     int alpha, int k,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz) {
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx, coeff);
  filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
}
// Horizontal pass for one 8x8 output block: filter the 15 intermediate rows
// (k = -7 .. 7, clipped to the block height) into 'tmp'. 'iy' is clamped to
// [0, height - 1] so rows above/below the frame replicate the edge row; the
// 16-byte load starts at ix4 - 7 (callers guarantee the frame border is
// extended). sx advances by beta per row.
static inline void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
                                          int stride, int32_t ix4, int32_t iy4,
                                          int32_t sx4, int alpha, int beta,
                                          int p_height, int height, int i,
                                          const int offset_bits_horiz,
                                          const int reduce_bits_horiz) {
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                      reduce_bits_horiz);
  }
}

// As warp_horizontal_filter, but alpha == 0: the per-row coefficients are
// built with the cheaper broadcast path (still rebuilt each row, since sx
// changes with beta).
static inline void warp_horizontal_filter_alpha0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)alpha;
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));

    __m128i coeff[4];
    prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

// As warp_horizontal_filter, but beta == 0: sx is constant across rows, so
// the full coefficient transpose is hoisted out of the row loop.
static inline void warp_horizontal_filter_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  int k;
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

// As warp_horizontal_filter, but alpha == 0 and beta == 0: one broadcast
// coefficient set is built once and reused for every row.
static inline void warp_horizontal_filter_alpha0_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  (void)alpha;
  int k;

  __m128i coeff[4];
  prepare_horizontal_filter_coeff_alpha0(sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

// Build the constants used by the compound-average output path:
// res_sub_const undoes the intermediate offset, round_bits_const is the
// rounding bias for the final shift, and wt interleaves the forward/backward
// distance weights for _mm_madd_epi16.
static inline void unpack_weights_and_set_round_const(
    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
    __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
  *res_sub_const =
      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi16((int16_t)w0);
  const __m128i wt1 = _mm_set1_epi16((int16_t)w1);
  *wt = _mm_unpacklo_epi16(wt0, wt1);
}

// Load the 8 vertical filters selected by sy + i * gamma (i = 0..7) from the
// 16-bit av1_warped_filter table and transpose them so that
// coeffs[0..3] serve the even-indexed pixels (taps 0/1, 2/3, 4/5, 6/7) and
// coeffs[4..7] serve the odd-indexed pixels, matching the layout consumed by
// filter_src_pixels_vertical.
static inline void prepare_vertical_filter_coeffs(int gamma, int sy,
                                                  __m128i *coeffs) {
  const __m128i tmp_0 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // even coeffs
  coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  const __m128i tmp_1 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  // odd coeffs
  coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}
// Specialization of prepare_vertical_filter_coeffs for gamma == 0: every
// pixel shares the filter at index sy, so each 32-bit coefficient pair is
// broadcast with a byte shuffle and the odd set equals the even set.
static inline void prepare_vertical_filter_coeffs_gamma0(int sy,
                                                         __m128i *coeffs) {
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));

  // even coeffs
  coeffs[0] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0));
  coeffs[1] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1));
  coeffs[2] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2));
  coeffs[3] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3));

  // odd coeffs
  coeffs[4] = coeffs[0];
  coeffs[5] = coeffs[1];
  coeffs[6] = coeffs[2];
  coeffs[7] = coeffs[3];
}

// Apply the 8-tap vertical filter for output row 'k' to the intermediate
// rows in 'tmp' (rows k+4 .. k+11), producing 32-bit sums for pixels 0..3 in
// *res_lo and pixels 4..7 in *res_hi.
static inline void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
                                              __m128i *res_lo, __m128i *res_hi,
                                              int k) {
  // Load from tmp and rearrange pairs of consecutive rows into the
  // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
  const __m128i *src = tmp + (k + 4);
  const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
  const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
  const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
  const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

  const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
  const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
  const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
  const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);

  const __m128i res_even =
      _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));

  // Filter odd-index pixels
  const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
  const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
  const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
  const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

  const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
  const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
  const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
  const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);

  const __m128i res_odd =
      _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));

  // Rearrange pixels back into the order 0 ... 7
  *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
  *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
}
// Round the 32-bit vertical-filter sums for output row 'k' and write them
// out. Compound path: store (or distance-weighted/simple average into) the
// 16-bit conv_params->dst buffer, optionally also emitting averaged 8-bit
// pixels into 'pred'. Non-compound path: round, saturate and store 4 or 8
// 8-bit pixels directly into 'pred'.
static inline void store_vertical_filter_output(
    __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
    const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
    uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
    const int reduce_bits_vert, int p_stride, int p_width,
    const int round_bits) {
  __m128i res_lo_1 = *res_lo;
  __m128i res_hi_1 = *res_hi;

  if (conv_params->is_compound) {
    __m128i *const p =
        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
    res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
                              reduce_bits_vert);
    const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
    __m128i res_lo_16;
    if (conv_params->do_average) {
      __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
      const __m128i p_16 = _mm_loadl_epi64(p);

      if (conv_params->use_dist_wtd_comp_avg) {
        // Distance-weighted average: (p * w0 + cur * w1) >> DIST_PRECISION_BITS
        const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
        const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
        const __m128i shifted_32 =
            _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
        res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
      } else {
        // Plain average of the two predictions.
        res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
      }

      res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);

      res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
                                 round_bits);
      __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
      *(int *)dst8 = _mm_cvtsi128_si32(res_8_lo);
    } else {
      _mm_storel_epi64(p, temp_lo_16);
    }
    if (p_width > 4) {
      // Same as above for pixels 4..7.
      __m128i *const p4 =
          (__m128i *)&conv_params
              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
      res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
                                reduce_bits_vert);
      const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
      __m128i res_hi_16;

      if (conv_params->do_average) {
        __m128i *const dst8_4 =
            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
        const __m128i p4_16 = _mm_loadl_epi64(p4);

        if (conv_params->use_dist_wtd_comp_avg) {
          const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
          const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
          const __m128i shifted_32 =
              _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
          res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
        } else {
          res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
        }
        res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);

        res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
                                   round_bits);
        __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
        *(int *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);

      } else {
        _mm_storel_epi64(p4, temp_hi_16);
      }
    }
  } else {
    const __m128i res_lo_round = _mm_srai_epi32(
        _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
    const __m128i res_hi_round = _mm_srai_epi32(
        _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);

    const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
    __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);

    // Store, blending with 'pred' if needed
    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

    // Note: If we're outputting a 4x4 block, we need to be very careful
    // to only output 4 pixels at this point, to avoid encode/decode
    // mismatches when encoding with multiple threads.
    if (p_width == 4) {
      *(int *)p = _mm_cvtsi128_si32(res_8bit);
    } else {
      _mm_storel_epi64(p, res_8bit);
    }
  }
}

// Vertical pass for one 8x8 block (general case): for each output row,
// rebuild the coefficient set (sy advances by delta per row), filter the
// intermediate rows in 'tmp' and store the result.
static inline void warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs(gamma, sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

// As warp_vertical_filter, but gamma == 0: the per-row coefficients use the
// cheaper broadcast path (still rebuilt each row, since sy changes with
// delta).
static inline void warp_vertical_filter_gamma0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  (void)gamma;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs_gamma0(sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}
// As warp_vertical_filter, but delta == 0: sy is constant across rows, so
// the full coefficient transpose is hoisted out of the row loop.
static inline void warp_vertical_filter_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

// As warp_vertical_filter, but gamma == 0 and delta == 0: one broadcast
// coefficient set is built once and reused for every row.
static inline void warp_vertical_filter_gamma0_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  (void)gamma;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

// Dispatch to the vertical-filter specialization matching which of
// gamma/delta are zero (zero parameters allow coefficient broadcasting
// and/or hoisting).
static inline void prepare_warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  if (gamma == 0 && delta == 0)
    warp_vertical_filter_gamma0_delta0(
        pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
        sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
  else if (gamma == 0 && delta != 0)
    warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else if (gamma != 0 && delta == 0)
    warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else
    warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
                         p_stride, p_width, i, j, sy4, reduce_bits_vert,
                         res_add_const, round_bits, offset_bits);
}

// Dispatch to the horizontal-filter specialization matching which of
// alpha/beta are zero (zero parameters allow coefficient broadcasting
// and/or hoisting).
static inline void prepare_warp_horizontal_filter(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  if (alpha == 0 && beta == 0)
    warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                        beta, p_height, height, i,
                                        offset_bits_horiz, reduce_bits_horiz);
  else if (alpha == 0 && beta != 0)
    warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                  p_height, height, i, offset_bits_horiz,
                                  reduce_bits_horiz);
  else if (alpha != 0 && beta == 0)
    warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                 p_height, height, i, offset_bits_horiz,
                                 reduce_bits_horiz);
  else
    warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                           p_height, height, i, offset_bits_horiz,
                           reduce_bits_horiz);
}
sx4, alpha, beta, 763 p_height, height, i, offset_bits_horiz, 764 reduce_bits_horiz); 765 else if (alpha != 0 && beta == 0) 766 warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, 767 p_height, height, i, offset_bits_horiz, 768 reduce_bits_horiz); 769 else 770 warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, 771 p_height, height, i, offset_bits_horiz, 772 reduce_bits_horiz); 773 } 774 775 void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, 776 int height, int stride, uint8_t *pred, int p_col, 777 int p_row, int p_width, int p_height, int p_stride, 778 int subsampling_x, int subsampling_y, 779 ConvolveParams *conv_params, int16_t alpha, 780 int16_t beta, int16_t gamma, int16_t delta) { 781 __m128i tmp[15]; 782 int i, j, k; 783 const int bd = 8; 784 const int reduce_bits_horiz = conv_params->round_0; 785 const int reduce_bits_vert = conv_params->is_compound 786 ? conv_params->round_1 787 : 2 * FILTER_BITS - reduce_bits_horiz; 788 const int offset_bits_horiz = bd + FILTER_BITS - 1; 789 assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); 790 791 const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; 792 const __m128i reduce_bits_vert_const = 793 _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); 794 const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); 795 const int round_bits = 796 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 797 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 798 assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); 799 800 /* Note: For this code to work, the left/right frame borders need to be 801 extended by at least 13 pixels each. By the time we get here, other 802 code will have set up this border, but we allow an explicit check 803 for debugging purposes. 
804 */ 805 /*for (i = 0; i < height; ++i) { 806 for (j = 0; j < 13; ++j) { 807 assert(ref[i * stride - 13 + j] == ref[i * stride]); 808 assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); 809 } 810 }*/ 811 __m128i res_add_const_1; 812 if (conv_params->is_compound == 1) { 813 res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const); 814 } else { 815 res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + 816 ((1 << reduce_bits_vert) >> 1)); 817 } 818 819 for (i = 0; i < p_height; i += 8) { 820 for (j = 0; j < p_width; j += 8) { 821 const int32_t src_x = (p_col + j + 4) << subsampling_x; 822 const int32_t src_y = (p_row + i + 4) << subsampling_y; 823 const int64_t dst_x = 824 (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; 825 const int64_t dst_y = 826 (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; 827 const int64_t x4 = dst_x >> subsampling_x; 828 const int64_t y4 = dst_y >> subsampling_y; 829 830 int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); 831 int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); 832 int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); 833 int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); 834 835 // Add in all the constant terms, including rounding and offset 836 sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + 837 (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); 838 sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + 839 (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); 840 841 sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); 842 sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); 843 844 // Horizontal filter 845 // If the block is aligned such that, after clamping, every sample 846 // would be taken from the leftmost/rightmost column, then we can 847 // skip the expensive horizontal filter. 
848 if (ix4 <= -7) { 849 for (k = -7; k < AOMMIN(8, p_height - i); ++k) { 850 int iy = iy4 + k; 851 if (iy < 0) 852 iy = 0; 853 else if (iy > height - 1) 854 iy = height - 1; 855 tmp[k + 7] = _mm_set1_epi16( 856 (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + 857 ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))); 858 } 859 } else if (ix4 >= width + 6) { 860 for (k = -7; k < AOMMIN(8, p_height - i); ++k) { 861 int iy = iy4 + k; 862 if (iy < 0) 863 iy = 0; 864 else if (iy > height - 1) 865 iy = height - 1; 866 tmp[k + 7] = 867 _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + 868 ref[iy * stride + (width - 1)] * 869 (1 << (FILTER_BITS - reduce_bits_horiz))); 870 } 871 } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { 872 const int out_of_boundary_left = -(ix4 - 6); 873 const int out_of_boundary_right = (ix4 + 8) - width; 874 for (k = -7; k < AOMMIN(8, p_height - i); ++k) { 875 int iy = iy4 + k; 876 if (iy < 0) 877 iy = 0; 878 else if (iy > height - 1) 879 iy = height - 1; 880 int sx = sx4 + beta * (k + 4); 881 882 // Load source pixels 883 __m128i src = 884 _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); 885 if (out_of_boundary_left >= 0) { 886 const __m128i shuffle_reg_left = 887 _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); 888 src = _mm_shuffle_epi8(src, shuffle_reg_left); 889 } 890 if (out_of_boundary_right >= 0) { 891 const __m128i shuffle_reg_right = _mm_loadu_si128( 892 (__m128i *)warp_pad_right[out_of_boundary_right]); 893 src = _mm_shuffle_epi8(src, shuffle_reg_right); 894 } 895 horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, 896 reduce_bits_horiz); 897 } 898 } else { 899 prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, 900 beta, p_height, height, i, 901 offset_bits_horiz, reduce_bits_horiz); 902 } 903 904 // Vertical filter 905 prepare_warp_vertical_filter( 906 pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, 907 j, sy4, reduce_bits_vert, 
&res_add_const_1, round_bits, offset_bits); 908 } 909 } 910 } 911 912 #endif // !CONFIG_HIGHWAY