/* warp_plane_avx2.c (53875B) */
1 /* 2 * Copyright (c) 2019, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <immintrin.h> 13 #include "config/av1_rtcd.h" 14 #include "av1/common/warped_motion.h" 15 #include "aom_dsp/x86/synonyms.h" 16 17 #if !CONFIG_HIGHWAY 18 19 DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = { 20 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 21 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 22 }; 23 24 DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = { 25 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 26 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3 27 }; 28 29 DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = { 30 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 31 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5 32 }; 33 34 DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = { 35 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 36 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7 37 }; 38 39 DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = { 40 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 41 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 42 }; 43 44 DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = { 45 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 46 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 47 }; 48 49 DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = { 50 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 51 8, 
9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 52 }; 53 54 DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = { 55 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 56 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 57 }; 58 59 DECLARE_ALIGNED(32, static const uint8_t, 60 shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3, 61 5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6, 62 6, 8, 1, 3, 3, 5, 5, 7, 7, 9 }; 63 64 DECLARE_ALIGNED(32, static const uint8_t, 65 shuffle_src1[32]) = { 4, 6, 6, 8, 8, 10, 10, 12, 5, 7, 7, 66 9, 9, 11, 11, 13, 4, 6, 6, 8, 8, 10, 67 10, 12, 5, 7, 7, 9, 9, 11, 11, 13 }; 68 69 DECLARE_ALIGNED(32, static const uint8_t, 70 shuffle_src2[32]) = { 1, 3, 3, 5, 5, 7, 7, 9, 2, 4, 4, 71 6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7, 72 7, 9, 2, 4, 4, 6, 6, 8, 8, 10 }; 73 74 DECLARE_ALIGNED(32, static const uint8_t, 75 shuffle_src3[32]) = { 5, 7, 7, 9, 9, 11, 11, 13, 6, 8, 8, 76 10, 10, 12, 12, 14, 5, 7, 7, 9, 9, 11, 77 11, 13, 6, 8, 8, 10, 10, 12, 12, 14 }; 78 79 static inline void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out, 80 __m256i *coeff, 81 const __m256i *shuffle_src, 82 const __m256i *round_const, 83 const __m128i *shift, int row) { 84 const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]); 85 const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]); 86 const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]); 87 const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]); 88 89 const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]); 90 const __m256i res_46 = _mm256_maddubs_epi16(src_1, coeff[1]); 91 const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]); 92 const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]); 93 94 const __m256i res_even = _mm256_add_epi16(res_02, res_46); 95 const __m256i res_odd = _mm256_add_epi16(res_13, res_57); 96 const __m256i res = 97 _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const); 98 horz_out[row] = 
_mm256_srl_epi16(res, *shift); 99 } 100 101 static inline void prepare_horizontal_filter_coeff_avx2(int alpha, int beta, 102 int sx, 103 __m256i *coeff) { 104 __m128i tmp_0 = _mm_loadl_epi64( 105 (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >> 106 WARPEDDIFF_PREC_BITS]); 107 __m128i tmp_1 = _mm_loadl_epi64( 108 (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >> 109 WARPEDDIFF_PREC_BITS]); 110 __m128i tmp_2 = _mm_loadl_epi64( 111 (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >> 112 WARPEDDIFF_PREC_BITS]); 113 __m128i tmp_3 = _mm_loadl_epi64( 114 (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >> 115 WARPEDDIFF_PREC_BITS]); 116 117 __m128i tmp_4 = _mm_loadl_epi64( 118 (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >> 119 WARPEDDIFF_PREC_BITS]); 120 __m128i tmp_5 = _mm_loadl_epi64( 121 (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >> 122 WARPEDDIFF_PREC_BITS]); 123 __m128i tmp_6 = _mm_loadl_epi64( 124 (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >> 125 WARPEDDIFF_PREC_BITS]); 126 __m128i tmp_7 = _mm_loadl_epi64( 127 (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >> 128 WARPEDDIFF_PREC_BITS]); 129 130 __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0); 131 __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2); 132 __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1); 133 __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3); 134 135 __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4); 136 __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6); 137 __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5); 138 __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7); 139 140 __m128i tmp_8 = _mm_loadl_epi64( 141 (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >> 142 WARPEDDIFF_PREC_BITS]); 143 tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1); 144 145 __m128i tmp_9 = _mm_loadl_epi64( 146 (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * alpha) >> 147 WARPEDDIFF_PREC_BITS]); 148 tmp1_256 = 
_mm256_inserti128_si256(tmp1_256, tmp_9, 1); 149 150 __m128i tmp_10 = _mm_loadl_epi64( 151 (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >> 152 WARPEDDIFF_PREC_BITS]); 153 tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1); 154 155 __m128i tmp_11 = _mm_loadl_epi64( 156 (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >> 157 WARPEDDIFF_PREC_BITS]); 158 tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1); 159 160 tmp_2 = _mm_loadl_epi64( 161 (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >> 162 WARPEDDIFF_PREC_BITS]); 163 tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1); 164 165 tmp_3 = _mm_loadl_epi64( 166 (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >> 167 WARPEDDIFF_PREC_BITS]); 168 tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1); 169 170 tmp_6 = _mm_loadl_epi64( 171 (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >> 172 WARPEDDIFF_PREC_BITS]); 173 tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1); 174 175 tmp_7 = _mm_loadl_epi64( 176 (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >> 177 WARPEDDIFF_PREC_BITS]); 178 tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1); 179 180 const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256); 181 const __m256i tmp_13 = _mm256_unpacklo_epi16(tmp1_256, tmp3_256); 182 const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256); 183 const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256); 184 185 const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14); 186 const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14); 187 const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15); 188 const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15); 189 190 coeff[0] = _mm256_unpacklo_epi64(res_0, res_2); 191 coeff[1] = _mm256_unpackhi_epi64(res_0, res_2); 192 coeff[2] = _mm256_unpacklo_epi64(res_1, res_3); 193 coeff[3] = _mm256_unpackhi_epi64(res_1, res_3); 194 } 195 196 static 
inline void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx, 197 __m256i *coeff) { 198 __m128i tmp_0 = _mm_loadl_epi64( 199 (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); 200 __m128i tmp_1 = _mm_loadl_epi64( 201 (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); 202 __m128i tmp_2 = _mm_loadl_epi64( 203 (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); 204 __m128i tmp_3 = _mm_loadl_epi64( 205 (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); 206 __m128i tmp_4 = _mm_loadl_epi64( 207 (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); 208 __m128i tmp_5 = _mm_loadl_epi64( 209 (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); 210 __m128i tmp_6 = _mm_loadl_epi64( 211 (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); 212 __m128i tmp_7 = _mm_loadl_epi64( 213 (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); 214 215 tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2); 216 tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3); 217 tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6); 218 tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7); 219 220 const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0); 221 const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1); 222 const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4); 223 const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5); 224 225 const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14); 226 const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14); 227 const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15); 228 const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15); 229 230 coeff[0] = _mm256_unpacklo_epi64(res_0, res_2); 231 coeff[1] = _mm256_unpackhi_epi64(res_0, res_2); 232 coeff[2] = _mm256_unpacklo_epi64(res_1, res_3); 233 coeff[3] = _mm256_unpackhi_epi64(res_1, res_3); 234 } 235 236 static inline void 
prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx, 237 __m256i *coeff) { 238 const __m128i tmp_0 = 239 _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); 240 const __m128i tmp_1 = _mm_loadl_epi64( 241 (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]); 242 243 const __m256i res_0 = 244 _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1); 245 246 coeff[0] = _mm256_shuffle_epi8( 247 res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2)); 248 coeff[1] = _mm256_shuffle_epi8( 249 res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2)); 250 coeff[2] = _mm256_shuffle_epi8( 251 res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2)); 252 coeff[3] = _mm256_shuffle_epi8( 253 res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2)); 254 } 255 256 static inline void horizontal_filter_avx2(const __m256i src, __m256i *horz_out, 257 int sx, int alpha, int beta, int row, 258 const __m256i *shuffle_src, 259 const __m256i *round_const, 260 const __m128i *shift) { 261 __m256i coeff[4]; 262 prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff); 263 filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift, 264 row); 265 } 266 static inline void prepare_horizontal_filter_coeff(int alpha, int sx, 267 __m256i *coeff) { 268 const __m128i tmp_0 = _mm_loadl_epi64( 269 (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); 270 const __m128i tmp_1 = _mm_loadl_epi64( 271 (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); 272 const __m128i tmp_2 = _mm_loadl_epi64( 273 (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); 274 const __m128i tmp_3 = _mm_loadl_epi64( 275 (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); 276 const __m128i tmp_4 = _mm_loadl_epi64( 277 (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); 278 const __m128i tmp_5 = _mm_loadl_epi64( 279 (__m128i 
*)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); 280 const __m128i tmp_6 = _mm_loadl_epi64( 281 (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); 282 const __m128i tmp_7 = _mm_loadl_epi64( 283 (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); 284 285 const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2); 286 const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3); 287 const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6); 288 const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7); 289 290 const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10); 291 const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10); 292 const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11); 293 const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11); 294 295 coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14)); 296 coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14)); 297 coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15)); 298 coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15)); 299 } 300 301 static inline void warp_horizontal_filter_avx2( 302 const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, 303 int32_t sx4, int alpha, int beta, int p_height, int height, int i, 304 const __m256i *round_const, const __m128i *shift, 305 const __m256i *shuffle_src) { 306 int k, iy, sx, row = 0; 307 __m256i coeff[4]; 308 for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { 309 iy = iy4 + k; 310 iy = clamp(iy, 0, height - 1); 311 const __m128i src_0 = 312 _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); 313 iy = iy4 + k + 1; 314 iy = clamp(iy, 0, height - 1); 315 const __m128i src_1 = 316 _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); 317 const __m256i src_01 = 318 _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1); 319 sx = sx4 + beta * (k + 4); 320 horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, 
row, shuffle_src, 321 round_const, shift); 322 row += 1; 323 } 324 iy = iy4 + k; 325 iy = clamp(iy, 0, height - 1); 326 const __m256i src_01 = _mm256_castsi128_si256( 327 _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); 328 sx = sx4 + beta * (k + 4); 329 prepare_horizontal_filter_coeff(alpha, sx, coeff); 330 filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, 331 shift, row); 332 } 333 334 static inline void warp_horizontal_filter_alpha0_avx2( 335 const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, 336 int32_t sx4, int alpha, int beta, int p_height, int height, int i, 337 const __m256i *round_const, const __m128i *shift, 338 const __m256i *shuffle_src) { 339 (void)alpha; 340 int k, iy, sx, row = 0; 341 __m256i coeff[4]; 342 for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { 343 iy = iy4 + k; 344 iy = clamp(iy, 0, height - 1); 345 const __m128i src_0 = 346 _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); 347 iy = iy4 + k + 1; 348 iy = clamp(iy, 0, height - 1); 349 const __m128i src_1 = 350 _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); 351 const __m256i src_01 = 352 _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1); 353 sx = sx4 + beta * (k + 4); 354 prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff); 355 filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, 356 shift, row); 357 row += 1; 358 } 359 iy = iy4 + k; 360 iy = clamp(iy, 0, height - 1); 361 const __m256i src_01 = _mm256_castsi128_si256( 362 _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); 363 sx = sx4 + beta * (k + 4); 364 prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff); 365 filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, 366 shift, row); 367 } 368 369 static inline void warp_horizontal_filter_beta0_avx2( 370 const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, 371 int32_t sx4, int alpha, int beta, 
int p_height, int height, int i, 372 const __m256i *round_const, const __m128i *shift, 373 const __m256i *shuffle_src) { 374 (void)beta; 375 int k, iy, row = 0; 376 __m256i coeff[4]; 377 prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff); 378 for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { 379 iy = iy4 + k; 380 iy = clamp(iy, 0, height - 1); 381 const __m128i src_0 = 382 _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); 383 iy = iy4 + k + 1; 384 iy = clamp(iy, 0, height - 1); 385 const __m128i src_1 = 386 _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); 387 const __m256i src_01 = 388 _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1); 389 filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, 390 shift, row); 391 row += 1; 392 } 393 iy = iy4 + k; 394 iy = clamp(iy, 0, height - 1); 395 const __m256i src_01 = _mm256_castsi128_si256( 396 _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); 397 filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, 398 shift, row); 399 } 400 401 static inline void warp_horizontal_filter_alpha0_beta0_avx2( 402 const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, 403 int32_t sx4, int alpha, int beta, int p_height, int height, int i, 404 const __m256i *round_const, const __m128i *shift, 405 const __m256i *shuffle_src) { 406 (void)alpha; 407 int k, iy, row = 0; 408 __m256i coeff[4]; 409 prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff); 410 for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { 411 iy = iy4 + k; 412 iy = clamp(iy, 0, height - 1); 413 const __m128i src0 = 414 _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); 415 iy = iy4 + k + 1; 416 iy = clamp(iy, 0, height - 1); 417 const __m128i src1 = 418 _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); 419 const __m256i src_01 = 420 _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1); 421 
filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, 422 shift, row); 423 row += 1; 424 } 425 iy = iy4 + k; 426 iy = clamp(iy, 0, height - 1); 427 const __m256i src_01 = _mm256_castsi128_si256( 428 _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); 429 filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, 430 shift, row); 431 } 432 433 static inline void unpack_weights_and_set_round_const_avx2( 434 ConvolveParams *conv_params, const int round_bits, const int offset_bits, 435 __m256i *res_sub_const, __m256i *round_bits_const, __m256i *wt) { 436 *res_sub_const = 437 _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) - 438 (1 << (offset_bits - conv_params->round_1 - 1))); 439 *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1)); 440 441 const int w0 = conv_params->fwd_offset; 442 const int w1 = conv_params->bck_offset; 443 const __m256i wt0 = _mm256_set1_epi16((short)w0); 444 const __m256i wt1 = _mm256_set1_epi16((short)w1); 445 *wt = _mm256_unpacklo_epi16(wt0, wt1); 446 } 447 448 static inline void prepare_vertical_filter_coeffs_avx2(int gamma, int delta, 449 int sy, 450 __m256i *coeffs) { 451 __m128i filt_00 = 452 _mm_loadu_si128((__m128i *)(av1_warped_filter + 453 ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); 454 __m128i filt_01 = 455 _mm_loadu_si128((__m128i *)(av1_warped_filter + 456 ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); 457 __m128i filt_02 = 458 _mm_loadu_si128((__m128i *)(av1_warped_filter + 459 ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); 460 __m128i filt_03 = 461 _mm_loadu_si128((__m128i *)(av1_warped_filter + 462 ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); 463 464 __m128i filt_10 = _mm_loadu_si128( 465 (__m128i *)(av1_warped_filter + 466 (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); 467 __m128i filt_11 = _mm_loadu_si128( 468 (__m128i *)(av1_warped_filter + 469 (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); 470 __m128i filt_12 = _mm_loadu_si128( 471 (__m128i 
*)(av1_warped_filter + 472 (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); 473 __m128i filt_13 = _mm_loadu_si128( 474 (__m128i *)(av1_warped_filter + 475 (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); 476 477 __m256i filt_0 = 478 _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1); 479 __m256i filt_1 = 480 _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1); 481 __m256i filt_2 = 482 _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1); 483 __m256i filt_3 = 484 _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1); 485 486 __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); 487 __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); 488 __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); 489 __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); 490 491 coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1); 492 coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1); 493 coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3); 494 coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3); 495 496 filt_00 = 497 _mm_loadu_si128((__m128i *)(av1_warped_filter + 498 ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); 499 filt_01 = 500 _mm_loadu_si128((__m128i *)(av1_warped_filter + 501 ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); 502 filt_02 = 503 _mm_loadu_si128((__m128i *)(av1_warped_filter + 504 ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); 505 filt_03 = 506 _mm_loadu_si128((__m128i *)(av1_warped_filter + 507 ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); 508 509 filt_10 = _mm_loadu_si128( 510 (__m128i *)(av1_warped_filter + 511 (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); 512 filt_11 = _mm_loadu_si128( 513 (__m128i *)(av1_warped_filter + 514 (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); 515 filt_12 = _mm_loadu_si128( 516 (__m128i *)(av1_warped_filter + 517 (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); 518 filt_13 = _mm_loadu_si128( 519 (__m128i *)(av1_warped_filter + 520 (((sy + 
delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); 521 522 filt_0 = 523 _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1); 524 filt_1 = 525 _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1); 526 filt_2 = 527 _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1); 528 filt_3 = 529 _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1); 530 531 res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); 532 res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); 533 res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); 534 res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); 535 536 coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1); 537 coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1); 538 coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3); 539 coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3); 540 } 541 542 static inline void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy, 543 __m256i *coeffs) { 544 __m128i filt_00 = 545 _mm_loadu_si128((__m128i *)(av1_warped_filter + 546 ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); 547 __m128i filt_01 = 548 _mm_loadu_si128((__m128i *)(av1_warped_filter + 549 ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); 550 __m128i filt_02 = 551 _mm_loadu_si128((__m128i *)(av1_warped_filter + 552 ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); 553 __m128i filt_03 = 554 _mm_loadu_si128((__m128i *)(av1_warped_filter + 555 ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); 556 557 __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00); 558 __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01); 559 __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02); 560 __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03); 561 562 __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); 563 __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); 564 __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); 565 __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); 566 567 coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1); 568 coeffs[1] = 
_mm256_unpackhi_epi64(res_0, res_1); 569 coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3); 570 coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3); 571 572 filt_00 = 573 _mm_loadu_si128((__m128i *)(av1_warped_filter + 574 ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); 575 filt_01 = 576 _mm_loadu_si128((__m128i *)(av1_warped_filter + 577 ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); 578 filt_02 = 579 _mm_loadu_si128((__m128i *)(av1_warped_filter + 580 ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); 581 filt_03 = 582 _mm_loadu_si128((__m128i *)(av1_warped_filter + 583 ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); 584 585 filt_0 = _mm256_broadcastsi128_si256(filt_00); 586 filt_1 = _mm256_broadcastsi128_si256(filt_01); 587 filt_2 = _mm256_broadcastsi128_si256(filt_02); 588 filt_3 = _mm256_broadcastsi128_si256(filt_03); 589 590 res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); 591 res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); 592 res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); 593 res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); 594 595 coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1); 596 coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1); 597 coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3); 598 coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3); 599 } 600 601 static inline void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy, 602 __m256i *coeffs) { 603 const __m128i filt_0 = _mm_loadu_si128( 604 (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); 605 const __m128i filt_1 = _mm_loadu_si128( 606 (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS))); 607 608 __m256i res_0 = 609 _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1); 610 611 coeffs[0] = _mm256_shuffle_epi8( 612 res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2)); 613 coeffs[1] = _mm256_shuffle_epi8( 614 res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2)); 615 coeffs[2] = _mm256_shuffle_epi8( 616 res_0, _mm256_load_si256((__m256i 
*)shuffle_gamma0_mask2_avx2)); 617 coeffs[3] = _mm256_shuffle_epi8( 618 res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2)); 619 620 coeffs[4] = coeffs[0]; 621 coeffs[5] = coeffs[1]; 622 coeffs[6] = coeffs[2]; 623 coeffs[7] = coeffs[3]; 624 } 625 626 static inline void filter_src_pixels_vertical_avx2(__m256i *horz_out, 627 __m256i *src, 628 __m256i *coeffs, 629 __m256i *res_lo, 630 __m256i *res_hi, int row) { 631 const __m256i src_6 = horz_out[row + 3]; 632 const __m256i src_7 = 633 _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21); 634 635 src[6] = _mm256_unpacklo_epi16(src_6, src_7); 636 637 const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]); 638 const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]); 639 const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]); 640 const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]); 641 642 const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_2), 643 _mm256_add_epi32(res_4, res_6)); 644 645 src[7] = _mm256_unpackhi_epi16(src_6, src_7); 646 647 const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]); 648 const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]); 649 const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]); 650 const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]); 651 652 const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3), 653 _mm256_add_epi32(res_5, res_7)); 654 655 // Rearrange pixels back into the order 0 ... 
7 656 *res_lo = _mm256_unpacklo_epi32(res_even, res_odd); 657 *res_hi = _mm256_unpackhi_epi32(res_even, res_odd); 658 } 659 660 static inline void store_vertical_filter_output_avx2( 661 const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const, 662 const __m256i *wt, const __m256i *res_sub_const, 663 const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params, 664 int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width, 665 const int round_bits) { 666 __m256i res_lo_1 = *res_lo; 667 __m256i res_hi_1 = *res_hi; 668 669 if (conv_params->is_compound) { 670 __m128i *const p_0 = 671 (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j]; 672 __m128i *const p_1 = 673 (__m128i *)&conv_params 674 ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j]; 675 676 res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const), 677 reduce_bits_vert); 678 679 const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1); 680 __m256i res_lo_16; 681 if (conv_params->do_average) { 682 __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; 683 __m128i *const dst8_1 = 684 (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j]; 685 const __m128i p_16_0 = _mm_loadl_epi64(p_0); 686 const __m128i p_16_1 = _mm_loadl_epi64(p_1); 687 const __m256i p_16 = 688 _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1); 689 if (conv_params->use_dist_wtd_comp_avg) { 690 const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16); 691 const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt); 692 const __m256i shifted_32 = 693 _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); 694 res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32); 695 } else { 696 res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1); 697 } 698 res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const); 699 res_lo_16 = _mm256_srai_epi16( 700 _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits); 701 
const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16); 702 const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo); 703 const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1); 704 *(int *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0); 705 *(int *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1); 706 } else { 707 const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16); 708 const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1); 709 _mm_storel_epi64(p_0, temp_lo_16_0); 710 _mm_storel_epi64(p_1, temp_lo_16_1); 711 } 712 if (p_width > 4) { 713 __m128i *const p4_0 = 714 (__m128i *)&conv_params 715 ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; 716 __m128i *const p4_1 = 717 (__m128i *)&conv_params 718 ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4]; 719 res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const), 720 reduce_bits_vert); 721 const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1); 722 __m256i res_hi_16; 723 if (conv_params->do_average) { 724 __m128i *const dst8_4_0 = 725 (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; 726 __m128i *const dst8_4_1 = 727 (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4]; 728 const __m128i p4_16_0 = _mm_loadl_epi64(p4_0); 729 const __m128i p4_16_1 = _mm_loadl_epi64(p4_1); 730 const __m256i p4_16 = _mm256_inserti128_si256( 731 _mm256_castsi128_si256(p4_16_0), p4_16_1, 1); 732 if (conv_params->use_dist_wtd_comp_avg) { 733 const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16); 734 const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt); 735 const __m256i shifted_32 = 736 _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); 737 res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32); 738 } else { 739 res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1); 740 } 741 res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const); 742 res_hi_16 = _mm256_srai_epi16( 743 _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits); 744 
__m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16); 745 const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi); 746 const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1); 747 *(int *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0); 748 *(int *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1); 749 } else { 750 const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16); 751 const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1); 752 _mm_storel_epi64(p4_0, temp_hi_16_0); 753 _mm_storel_epi64(p4_1, temp_hi_16_1); 754 } 755 } 756 } else { 757 const __m256i res_lo_round = _mm256_srai_epi32( 758 _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert); 759 const __m256i res_hi_round = _mm256_srai_epi32( 760 _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert); 761 762 const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round); 763 const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit); 764 const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit); 765 const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1); 766 767 // Store, blending with 'pred' if needed 768 __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; 769 __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j]; 770 771 if (p_width == 4) { 772 *(int *)p = _mm_cvtsi128_si32(res_8bit0); 773 *(int *)p1 = _mm_cvtsi128_si32(res_8bit1); 774 } else { 775 _mm_storel_epi64(p, res_8bit0); 776 _mm_storel_epi64(p1, res_8bit1); 777 } 778 } 779 } 780 781 static inline void warp_vertical_filter_avx2( 782 uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, 783 int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, 784 int i, int j, int sy4, const int reduce_bits_vert, 785 const __m256i *res_add_const, const int round_bits, 786 const __m256i *res_sub_const, const __m256i *round_bits_const, 787 const __m256i *wt) { 788 int k, row = 0; 789 __m256i src[8]; 790 const __m256i src_0 = horz_out[0]; 791 
const __m256i src_1 = 792 _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); 793 const __m256i src_2 = horz_out[1]; 794 const __m256i src_3 = 795 _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); 796 const __m256i src_4 = horz_out[2]; 797 const __m256i src_5 = 798 _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); 799 800 src[0] = _mm256_unpacklo_epi16(src_0, src_1); 801 src[2] = _mm256_unpacklo_epi16(src_2, src_3); 802 src[4] = _mm256_unpacklo_epi16(src_4, src_5); 803 804 src[1] = _mm256_unpackhi_epi16(src_0, src_1); 805 src[3] = _mm256_unpackhi_epi16(src_2, src_3); 806 src[5] = _mm256_unpackhi_epi16(src_4, src_5); 807 808 for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { 809 int sy = sy4 + delta * (k + 4); 810 __m256i coeffs[8]; 811 prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs); 812 __m256i res_lo, res_hi; 813 filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, 814 row); 815 store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, 816 res_sub_const, round_bits_const, pred, 817 conv_params, i, j, k, reduce_bits_vert, 818 p_stride, p_width, round_bits); 819 src[0] = src[2]; 820 src[2] = src[4]; 821 src[4] = src[6]; 822 src[1] = src[3]; 823 src[3] = src[5]; 824 src[5] = src[7]; 825 826 row += 1; 827 } 828 } 829 830 static inline void warp_vertical_filter_gamma0_avx2( 831 uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, 832 int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, 833 int i, int j, int sy4, const int reduce_bits_vert, 834 const __m256i *res_add_const, const int round_bits, 835 const __m256i *res_sub_const, const __m256i *round_bits_const, 836 const __m256i *wt) { 837 (void)gamma; 838 int k, row = 0; 839 __m256i src[8]; 840 const __m256i src_0 = horz_out[0]; 841 const __m256i src_1 = 842 _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); 843 const __m256i src_2 = horz_out[1]; 844 const __m256i src_3 = 845 
_mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); 846 const __m256i src_4 = horz_out[2]; 847 const __m256i src_5 = 848 _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); 849 850 src[0] = _mm256_unpacklo_epi16(src_0, src_1); 851 src[2] = _mm256_unpacklo_epi16(src_2, src_3); 852 src[4] = _mm256_unpacklo_epi16(src_4, src_5); 853 854 src[1] = _mm256_unpackhi_epi16(src_0, src_1); 855 src[3] = _mm256_unpackhi_epi16(src_2, src_3); 856 src[5] = _mm256_unpackhi_epi16(src_4, src_5); 857 858 for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { 859 int sy = sy4 + delta * (k + 4); 860 __m256i coeffs[8]; 861 prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs); 862 __m256i res_lo, res_hi; 863 filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, 864 row); 865 store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, 866 res_sub_const, round_bits_const, pred, 867 conv_params, i, j, k, reduce_bits_vert, 868 p_stride, p_width, round_bits); 869 src[0] = src[2]; 870 src[2] = src[4]; 871 src[4] = src[6]; 872 src[1] = src[3]; 873 src[3] = src[5]; 874 src[5] = src[7]; 875 row += 1; 876 } 877 } 878 879 static inline void warp_vertical_filter_delta0_avx2( 880 uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, 881 int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, 882 int i, int j, int sy4, const int reduce_bits_vert, 883 const __m256i *res_add_const, const int round_bits, 884 const __m256i *res_sub_const, const __m256i *round_bits_const, 885 const __m256i *wt) { 886 (void)delta; 887 int k, row = 0; 888 __m256i src[8], coeffs[8]; 889 const __m256i src_0 = horz_out[0]; 890 const __m256i src_1 = 891 _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); 892 const __m256i src_2 = horz_out[1]; 893 const __m256i src_3 = 894 _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); 895 const __m256i src_4 = horz_out[2]; 896 const __m256i src_5 = 897 _mm256_permute2x128_si256(horz_out[2], 
horz_out[3], 0x21); 898 899 src[0] = _mm256_unpacklo_epi16(src_0, src_1); 900 src[2] = _mm256_unpacklo_epi16(src_2, src_3); 901 src[4] = _mm256_unpacklo_epi16(src_4, src_5); 902 903 src[1] = _mm256_unpackhi_epi16(src_0, src_1); 904 src[3] = _mm256_unpackhi_epi16(src_2, src_3); 905 src[5] = _mm256_unpackhi_epi16(src_4, src_5); 906 907 prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs); 908 909 for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { 910 __m256i res_lo, res_hi; 911 filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, 912 row); 913 store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, 914 res_sub_const, round_bits_const, pred, 915 conv_params, i, j, k, reduce_bits_vert, 916 p_stride, p_width, round_bits); 917 src[0] = src[2]; 918 src[2] = src[4]; 919 src[4] = src[6]; 920 src[1] = src[3]; 921 src[3] = src[5]; 922 src[5] = src[7]; 923 row += 1; 924 } 925 } 926 927 static inline void warp_vertical_filter_gamma0_delta0_avx2( 928 uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, 929 int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, 930 int i, int j, int sy4, const int reduce_bits_vert, 931 const __m256i *res_add_const, const int round_bits, 932 const __m256i *res_sub_const, const __m256i *round_bits_const, 933 const __m256i *wt) { 934 (void)gamma; 935 int k, row = 0; 936 __m256i src[8], coeffs[8]; 937 const __m256i src_0 = horz_out[0]; 938 const __m256i src_1 = 939 _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); 940 const __m256i src_2 = horz_out[1]; 941 const __m256i src_3 = 942 _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); 943 const __m256i src_4 = horz_out[2]; 944 const __m256i src_5 = 945 _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); 946 947 src[0] = _mm256_unpacklo_epi16(src_0, src_1); 948 src[2] = _mm256_unpacklo_epi16(src_2, src_3); 949 src[4] = _mm256_unpacklo_epi16(src_4, src_5); 950 951 src[1] = _mm256_unpackhi_epi16(src_0, 
src_1); 952 src[3] = _mm256_unpackhi_epi16(src_2, src_3); 953 src[5] = _mm256_unpackhi_epi16(src_4, src_5); 954 955 prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs); 956 957 for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { 958 __m256i res_lo, res_hi; 959 filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, 960 row); 961 store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, 962 res_sub_const, round_bits_const, pred, 963 conv_params, i, j, k, reduce_bits_vert, 964 p_stride, p_width, round_bits); 965 src[0] = src[2]; 966 src[2] = src[4]; 967 src[4] = src[6]; 968 src[1] = src[3]; 969 src[3] = src[5]; 970 src[5] = src[7]; 971 row += 1; 972 } 973 } 974 975 static inline void prepare_warp_vertical_filter_avx2( 976 uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, 977 int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, 978 int i, int j, int sy4, const int reduce_bits_vert, 979 const __m256i *res_add_const, const int round_bits, 980 const __m256i *res_sub_const, const __m256i *round_bits_const, 981 const __m256i *wt) { 982 if (gamma == 0 && delta == 0) 983 warp_vertical_filter_gamma0_delta0_avx2( 984 pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, 985 i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const, 986 round_bits_const, wt); 987 else if (gamma == 0 && delta != 0) 988 warp_vertical_filter_gamma0_avx2( 989 pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, 990 i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const, 991 round_bits_const, wt); 992 else if (gamma != 0 && delta == 0) 993 warp_vertical_filter_delta0_avx2( 994 pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, 995 i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const, 996 round_bits_const, wt); 997 else 998 warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta, 999 p_height, p_stride, p_width, 
i, j, sy4, 1000 reduce_bits_vert, res_add_const, round_bits, 1001 res_sub_const, round_bits_const, wt); 1002 } 1003 1004 static inline void prepare_warp_horizontal_filter_avx2( 1005 const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, 1006 int32_t sx4, int alpha, int beta, int p_height, int height, int i, 1007 const __m256i *round_const, const __m128i *shift, 1008 const __m256i *shuffle_src) { 1009 if (alpha == 0 && beta == 0) 1010 warp_horizontal_filter_alpha0_beta0_avx2( 1011 ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, 1012 round_const, shift, shuffle_src); 1013 else if (alpha == 0 && beta != 0) 1014 warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4, 1015 alpha, beta, p_height, height, i, 1016 round_const, shift, shuffle_src); 1017 else if (alpha != 0 && beta == 0) 1018 warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4, 1019 alpha, beta, p_height, height, i, 1020 round_const, shift, shuffle_src); 1021 else 1022 warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha, 1023 beta, p_height, height, i, round_const, shift, 1024 shuffle_src); 1025 } 1026 1027 void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width, 1028 int height, int stride, uint8_t *pred, int p_col, 1029 int p_row, int p_width, int p_height, int p_stride, 1030 int subsampling_x, int subsampling_y, 1031 ConvolveParams *conv_params, int16_t alpha, 1032 int16_t beta, int16_t gamma, int16_t delta) { 1033 __m256i horz_out[8]; 1034 int i, j, k; 1035 const int bd = 8; 1036 const int reduce_bits_horiz = conv_params->round_0; 1037 const int reduce_bits_vert = conv_params->is_compound 1038 ? 
conv_params->round_1 1039 : 2 * FILTER_BITS - reduce_bits_horiz; 1040 const int offset_bits_horiz = bd + FILTER_BITS - 1; 1041 assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); 1042 1043 const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; 1044 const __m256i reduce_bits_vert_const = 1045 _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1)); 1046 const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert); 1047 const int round_bits = 1048 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 1049 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 1050 assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); 1051 1052 const __m256i round_const = _mm256_set1_epi16( 1053 (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1)); 1054 const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz); 1055 1056 __m256i res_sub_const, round_bits_const, wt; 1057 unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits, 1058 &res_sub_const, &round_bits_const, 1059 &wt); 1060 1061 __m256i res_add_const_1; 1062 if (conv_params->is_compound == 1) { 1063 res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const); 1064 } else { 1065 res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + 1066 ((1 << reduce_bits_vert) >> 1)); 1067 } 1068 const int32_t const1 = alpha * (-4) + beta * (-4) + 1069 (1 << (WARPEDDIFF_PREC_BITS - 1)) + 1070 (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); 1071 const int32_t const2 = gamma * (-4) + delta * (-4) + 1072 (1 << (WARPEDDIFF_PREC_BITS - 1)) + 1073 (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); 1074 const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1); 1075 const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)); 1076 const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz)); 1077 1078 __m256i shuffle_src[4]; 1079 shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0); 1080 
shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1); 1081 shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2); 1082 shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3); 1083 1084 for (i = 0; i < p_height; i += 8) { 1085 for (j = 0; j < p_width; j += 8) { 1086 const int32_t src_x = (p_col + j + 4) << subsampling_x; 1087 const int32_t src_y = (p_row + i + 4) << subsampling_y; 1088 const int64_t dst_x = 1089 (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; 1090 const int64_t dst_y = 1091 (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; 1092 const int64_t x4 = dst_x >> subsampling_x; 1093 const int64_t y4 = dst_y >> subsampling_y; 1094 1095 int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); 1096 int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); 1097 int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); 1098 int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); 1099 1100 // Add in all the constant terms, including rounding and offset 1101 sx4 += const1; 1102 sy4 += const2; 1103 1104 sx4 &= ~const3; 1105 sy4 &= ~const3; 1106 1107 // Horizontal filter 1108 // If the block is aligned such that, after clamping, every sample 1109 // would be taken from the leftmost/rightmost column, then we can 1110 // skip the expensive horizontal filter. 
1111 1112 if (ix4 <= -7) { 1113 int iy, row = 0; 1114 for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { 1115 iy = iy4 + k; 1116 iy = clamp(iy, 0, height - 1); 1117 const __m256i temp_0 = 1118 _mm256_set1_epi16(const4 + ref[iy * stride] * const5); 1119 iy = iy4 + k + 1; 1120 iy = clamp(iy, 0, height - 1); 1121 const __m256i temp_1 = 1122 _mm256_set1_epi16(const4 + ref[iy * stride] * const5); 1123 horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0); 1124 row += 1; 1125 } 1126 iy = iy4 + k; 1127 iy = clamp(iy, 0, height - 1); 1128 horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5); 1129 } else if (ix4 >= width + 6) { 1130 int iy, row = 0; 1131 for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { 1132 iy = iy4 + k; 1133 iy = clamp(iy, 0, height - 1); 1134 const __m256i temp_0 = _mm256_set1_epi16( 1135 const4 + ref[iy * stride + (width - 1)] * const5); 1136 iy = iy4 + k + 1; 1137 iy = clamp(iy, 0, height - 1); 1138 const __m256i temp_1 = _mm256_set1_epi16( 1139 const4 + ref[iy * stride + (width - 1)] * const5); 1140 horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0); 1141 row += 1; 1142 } 1143 iy = iy4 + k; 1144 iy = clamp(iy, 0, height - 1); 1145 horz_out[row] = 1146 _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5); 1147 } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { 1148 const int out_of_boundary_left = -(ix4 - 6); 1149 const int out_of_boundary_right = (ix4 + 8) - width; 1150 int iy, sx, row = 0; 1151 for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { 1152 iy = iy4 + k; 1153 iy = clamp(iy, 0, height - 1); 1154 __m128i src0 = 1155 _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); 1156 iy = iy4 + k + 1; 1157 iy = clamp(iy, 0, height - 1); 1158 __m128i src1 = 1159 _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); 1160 1161 if (out_of_boundary_left >= 0) { 1162 const __m128i shuffle_reg_left = 1163 _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); 1164 src0 = 
_mm_shuffle_epi8(src0, shuffle_reg_left); 1165 src1 = _mm_shuffle_epi8(src1, shuffle_reg_left); 1166 } 1167 if (out_of_boundary_right >= 0) { 1168 const __m128i shuffle_reg_right = _mm_loadu_si128( 1169 (__m128i *)warp_pad_right[out_of_boundary_right]); 1170 src0 = _mm_shuffle_epi8(src0, shuffle_reg_right); 1171 src1 = _mm_shuffle_epi8(src1, shuffle_reg_right); 1172 } 1173 sx = sx4 + beta * (k + 4); 1174 const __m256i src_01 = 1175 _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1); 1176 horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, 1177 shuffle_src, &round_const, &shift); 1178 row += 1; 1179 } 1180 iy = iy4 + k; 1181 iy = clamp(iy, 0, height - 1); 1182 __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); 1183 if (out_of_boundary_left >= 0) { 1184 const __m128i shuffle_reg_left = 1185 _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); 1186 src = _mm_shuffle_epi8(src, shuffle_reg_left); 1187 } 1188 if (out_of_boundary_right >= 0) { 1189 const __m128i shuffle_reg_right = 1190 _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]); 1191 src = _mm_shuffle_epi8(src, shuffle_reg_right); 1192 } 1193 sx = sx4 + beta * (k + 4); 1194 const __m256i src_01 = _mm256_castsi128_si256(src); 1195 __m256i coeff[4]; 1196 prepare_horizontal_filter_coeff(alpha, sx, coeff); 1197 filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, 1198 &round_const, &shift, row); 1199 } else { 1200 prepare_warp_horizontal_filter_avx2( 1201 ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, 1202 i, &round_const, &shift, shuffle_src); 1203 } 1204 1205 // Vertical filter 1206 prepare_warp_vertical_filter_avx2( 1207 pred, horz_out, conv_params, gamma, delta, p_height, p_stride, 1208 p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, 1209 &res_sub_const, &round_bits_const, &wt); 1210 } 1211 } 1212 } 1213 1214 #endif // !CONFIG_HIGHWAY