highbd_warp_affine_avx2.c
/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <immintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

void av1_highbd_warp_affine_avx2(const int32_t *mat, const uint16_t *ref,
                                 int width, int height, int stride,
                                 uint16_t *pred, int p_col, int p_row,
                                 int p_width, int p_height, int p_stride,
                                 int subsampling_x, int subsampling_y, int bd,
                                 ConvolveParams *conv_params, int16_t alpha,
                                 int16_t beta, int16_t gamma, int16_t delta) {
  __m256i tmp[15];
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

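  // Vector and scalar constants used by the filter passes below: the pixel
  // clamp value for this bit depth, the vertical rounding shift and offsets,
  // the weights for distance-weighted compound averaging, and the horizontal
  // rounding/offset values.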
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
  const __m256i reduce_bits_vert_const =
      _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
  const __m256i res_sub_const =
      _mm256_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
                        (1 << (offset_bits - conv_params->round_1 - 1)));
  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
  __m256i round_bits_const = _mm256_set1_epi32(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m256i wt0 = _mm256_set1_epi32(w0);
  const __m256i wt1 = _mm256_set1_epi32(w1);

  __m256i v_rbhoriz = _mm256_set1_epi32(1 << (reduce_bits_horiz - 1));
  __m256i v_zeros = _mm256_setzero_si256();
  int ohoriz = 1 << offset_bits_horiz;
  int mhoriz = 1 << max_bits_horiz;
  (void)mhoriz;
  int sx;

  for (int i = 0; i < p_height; i += 8) {
    for (int j = 0; j < p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      const int16_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      const int16_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
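      // Four cases, depending on where the block's input window falls
      // relative to the frame:
      //   1) entirely off the left edge:  every tap clamps to column 0, so
      //      each row reduces to offset + pixel * (1 << (FILTER_BITS -
      //      reduce_bits_horiz))
      //   2) entirely off the right edge: same, using the last column
      //   3) straddling a frame edge:     scalar fallback, clamping each
      //      sample position into [0, width - 1]
      //   4) fully inside the frame:      SIMD paths below, specialized by
      //      whether alpha and/or beta are zero (filter constant across
      //      columns and/or rows)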
      if (ix4 <= -7) {
        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm256_cvtepi16_epi32(_mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))));
        }
      } else if (ix4 >= width + 6) {
        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm256_cvtepi16_epi32(
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz))));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        int32_t tmp1[8];
        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
          const int iy = clamp(iy4 + k, 0, height - 1);

          sx = sx4 + beta * (k + 4);
          for (int l = -4; l < 4; ++l) {
            int ix = ix4 + l - 3;
            const int offs = sx >> WARPEDDIFF_PREC_BITS;
            const int16_t *coeffs = av1_warped_filter[offs];

            int32_t sum = 1 << offset_bits_horiz;
            for (int m = 0; m < 8; ++m) {
              const int sample_x = clamp(ix + m, 0, width - 1);
              sum += ref[iy * stride + sample_x] * coeffs[m];
            }
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
            tmp1[(l + 4) / 2 + ((l + 4) % 2) * 4] = sum;
            sx += alpha;
          }
          tmp[k + 7] = _mm256_loadu_si256((__m256i *)tmp1);
        }
      } else {
        if (beta == 0 && alpha == 0) {
          sx = sx4;
          __m128i v_01 = _mm_loadu_si128(
              (__m128i *)
                  av1_warped_filter[sx >>
                                    WARPEDDIFF_PREC_BITS]);  // A7A6A5A4A3A2A1A0
          __m256i v_c01 = _mm256_broadcastd_epi32(v_01);  // A1A0A1A0A1A0A1A0
          __m256i v_c23 = _mm256_broadcastd_epi32(
              _mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
          __m256i v_c45 = _mm256_broadcastd_epi32(
              _mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
          __m256i v_c67 = _mm256_broadcastd_epi32(
              _mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6
          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
            int iy = iy4 + k;
            if (iy < 0)
              iy = 0;
            else if (iy > height - 1)
              iy = height - 1;
            iy = iy * stride;

            __m256i v_refl = _mm256_inserti128_si256(
                _mm256_setzero_si256(),
                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
            v_refl = _mm256_inserti128_si256(
                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
                1);  // R15 .. R0

            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);

            __m256i v_refu =
                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
            v_refl = _mm256_inserti128_si256(
                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
            v_refu = _mm256_inserti128_si256(
                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);

            __m256i v_sum = _mm256_set1_epi32(ohoriz);
            __m256i parsum = _mm256_madd_epi16(
                v_c01, _mm256_alignr_epi8(v_refu, v_refl,
                                          0));  // R8R7R6..R1R7R6R5..R1R0
            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);

            parsum = _mm256_madd_epi16(
                v_c23,
                _mm256_alignr_epi8(v_refu, v_refl, 4));  // R10R9..R3R9R8..R3R2
            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
            parsum = _mm256_madd_epi16(
                v_c45, _mm256_alignr_epi8(v_refu, v_refl,
                                          8));  // R12R11..R5R11R10..R5R4
            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
            parsum = _mm256_madd_epi16(
                v_c67, _mm256_alignr_epi8(v_refu, v_refl,
                                          12));  // R14R13..R7R13R12..R7R6
            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);

            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
                                           reduce_bits_horiz);
          }
        } else if (alpha == 0) {
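          // alpha == 0: the filter is the same for every column, but beta != 0
          // means it still changes from row to row, so reload the broadcast
          // coefficients once per row before the four madd accumulations.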
          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
            int iy = iy4 + k;
            if (iy < 0)
              iy = 0;
            else if (iy > height - 1)
              iy = height - 1;
            iy = iy * stride;

            sx = sx4 + beta * (k + 4);

            __m128i v_01 = _mm_loadu_si128(
                (__m128i *)av1_warped_filter
                    [sx >> WARPEDDIFF_PREC_BITS]);  // A7A6A5A4A3A2A1A0
            __m256i v_c01 = _mm256_broadcastd_epi32(v_01);  // A1A0A1A0A1A0A1A0
            __m256i v_c23 = _mm256_broadcastd_epi32(
                _mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
            __m256i v_c45 = _mm256_broadcastd_epi32(
                _mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
            __m256i v_c67 = _mm256_broadcastd_epi32(
                _mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6

            __m256i v_refl = _mm256_inserti128_si256(
                _mm256_setzero_si256(),
                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
            v_refl = _mm256_inserti128_si256(
                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
                1);  // R15 .. R0

            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);

            __m256i v_refu =
                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1

            v_refl = _mm256_inserti128_si256(
                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
            v_refu = _mm256_inserti128_si256(
                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);

            __m256i v_sum = _mm256_set1_epi32(ohoriz);
            __m256i parsum =
                _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);

            parsum =
                _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
            parsum =
                _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
            parsum = _mm256_madd_epi16(v_c67,
                                       _mm256_alignr_epi8(v_refu, v_refl, 12));
            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);

            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
                                           reduce_bits_horiz);
          }
        } else if (beta == 0) {
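          // beta == 0: the coefficients change across columns (alpha != 0) but
          // not across rows, so the eight per-column filters can be loaded and
          // transposed once, outside the row loop.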
          sx = sx4;
          __m256i v_coeff01 = _mm256_inserti128_si256(
              v_zeros,
              _mm_loadu_si128(
                  (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
              0);
          v_coeff01 = _mm256_inserti128_si256(
              v_coeff01,
              _mm_loadu_si128(
                  (__m128i *)
                      av1_warped_filter[(sx + alpha) >> WARPEDDIFF_PREC_BITS]),
              1);  // B7B6..B1B0A7A6..A1A0
          __m256i v_coeff23 = _mm256_inserti128_si256(
              v_zeros,
              _mm_loadu_si128(
                  (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
                                               WARPEDDIFF_PREC_BITS]),
              0);
          v_coeff23 = _mm256_inserti128_si256(
              v_coeff23,
              _mm_loadu_si128(
                  (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
                                               WARPEDDIFF_PREC_BITS]),
              1);  // D7D6..D1D0C7C6..C1C0
          __m256i v_coeff45 = _mm256_inserti128_si256(
              v_zeros,
              _mm_loadu_si128(
                  (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
                                               WARPEDDIFF_PREC_BITS]),
              0);
          v_coeff45 = _mm256_inserti128_si256(
              v_coeff45,
              _mm_loadu_si128(
                  (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
                                               WARPEDDIFF_PREC_BITS]),
              1);  // F7F6..F1F0E7E6..E1E0
          __m256i v_coeff67 = _mm256_inserti128_si256(
              v_zeros,
              _mm_loadu_si128(
                  (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
                                               WARPEDDIFF_PREC_BITS]),
              0);
          v_coeff67 = _mm256_inserti128_si256(
              v_coeff67,
              _mm_loadu_si128(
                  (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
                                               WARPEDDIFF_PREC_BITS]),
              1);  // H7H6..H1H0G7G6..G1G0

          __m256i v_c0123 = _mm256_unpacklo_epi32(
              v_coeff01,
              v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
          __m256i v_c0123u = _mm256_unpackhi_epi32(
              v_coeff01,
              v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
          __m256i v_c4567 = _mm256_unpacklo_epi32(
              v_coeff45,
              v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
          __m256i v_c4567u = _mm256_unpackhi_epi32(
              v_coeff45,
              v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4

          __m256i v_c01 = _mm256_unpacklo_epi64(
              v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
          __m256i v_c23 =
              _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
          __m256i v_c45 =
              _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
          __m256i v_c67 =
              _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6

          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
            int iy = iy4 + k;
            if (iy < 0)
              iy = 0;
            else if (iy > height - 1)
              iy = height - 1;
            iy = iy * stride;

            __m256i v_refl = _mm256_inserti128_si256(
                _mm256_setzero_si256(),
                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
            v_refl = _mm256_inserti128_si256(
                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
                1);  // R15 .. R0

            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);

            __m256i v_refu =
                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1

            v_refl = _mm256_inserti128_si256(
                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
            v_refu = _mm256_inserti128_si256(
                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);

            __m256i v_sum = _mm256_set1_epi32(ohoriz);
            __m256i parsum = _mm256_madd_epi16(
                v_c01, _mm256_alignr_epi8(v_refu, v_refl,
                                          0));  // R8R7R6..R1R7R6R5..R1R0
            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);

            parsum = _mm256_madd_epi16(
                v_c23,
                _mm256_alignr_epi8(v_refu, v_refl, 4));  // R10R9..R3R9R8..R3R2
            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
            parsum = _mm256_madd_epi16(
                v_c45, _mm256_alignr_epi8(v_refu, v_refl,
                                          8));  // R12R11..R5R11R10..R5R4
            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
            parsum = _mm256_madd_epi16(
                v_c67, _mm256_alignr_epi8(v_refu, v_refl,
                                          12));  // R14R13..R7R13R12..R7R6
            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);

            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
                                           reduce_bits_horiz);
          }

        } else {
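          // General case: both alpha and beta are nonzero, so a fresh set of
          // eight column filters has to be loaded and transposed for every
          // row of the block.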
          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
            int iy = iy4 + k;
            if (iy < 0)
              iy = 0;
            else if (iy > height - 1)
              iy = height - 1;
            iy = iy * stride;

            sx = sx4 + beta * (k + 4);

            __m256i v_coeff01 = _mm256_inserti128_si256(
                v_zeros,
                _mm_loadu_si128(
                    (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
                0);
            v_coeff01 = _mm256_inserti128_si256(
                v_coeff01,
                _mm_loadu_si128(
                    (__m128i *)av1_warped_filter[(sx + alpha) >>
                                                 WARPEDDIFF_PREC_BITS]),
                1);  // B7B6..B1B0A7A6..A1A0
            __m256i v_coeff23 = _mm256_inserti128_si256(
                v_zeros,
                _mm_loadu_si128(
                    (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
                                                 WARPEDDIFF_PREC_BITS]),
                0);
            v_coeff23 = _mm256_inserti128_si256(
                v_coeff23,
                _mm_loadu_si128(
                    (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
                                                 WARPEDDIFF_PREC_BITS]),
                1);  // D7D6..D1D0C7C6..C1C0
            __m256i v_coeff45 = _mm256_inserti128_si256(
                v_zeros,
                _mm_loadu_si128(
                    (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
                                                 WARPEDDIFF_PREC_BITS]),
                0);
            v_coeff45 = _mm256_inserti128_si256(
                v_coeff45,
                _mm_loadu_si128(
                    (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
                                                 WARPEDDIFF_PREC_BITS]),
                1);  // F7F6..F1F0E7E6..E1E0
            __m256i v_coeff67 = _mm256_inserti128_si256(
                v_zeros,
                _mm_loadu_si128(
                    (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
                                                 WARPEDDIFF_PREC_BITS]),
                0);
            v_coeff67 = _mm256_inserti128_si256(
                v_coeff67,
                _mm_loadu_si128(
                    (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
                                                 WARPEDDIFF_PREC_BITS]),
                1);  // H7H6..H1H0G7G6..G1G0

            __m256i v_c0123 = _mm256_unpacklo_epi32(
                v_coeff01,
                v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
            __m256i v_c0123u = _mm256_unpackhi_epi32(
                v_coeff01,
                v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
            __m256i v_c4567 = _mm256_unpacklo_epi32(
                v_coeff45,
                v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
            __m256i v_c4567u = _mm256_unpackhi_epi32(
                v_coeff45,
                v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4

            __m256i v_c01 = _mm256_unpacklo_epi64(
                v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
            __m256i v_c23 =
                _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
            __m256i v_c45 =
                _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
            __m256i v_c67 =
                _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6

            __m256i v_refl = _mm256_inserti128_si256(
                _mm256_setzero_si256(),
                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
            v_refl = _mm256_inserti128_si256(
                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
                1);  // R15 .. R0

            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);

            __m256i v_refu =
                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1

            v_refl = _mm256_inserti128_si256(
                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
            v_refu = _mm256_inserti128_si256(
                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);

            __m256i v_sum = _mm256_set1_epi32(ohoriz);
            __m256i parsum =
                _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);

            parsum =
                _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
            parsum =
                _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
            parsum = _mm256_madd_epi16(v_c67,
                                       _mm256_alignr_epi8(v_refu, v_refl, 12));
            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);

            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
                                           reduce_bits_horiz);
          }
        }
      }

      // Vertical filter
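      // Each output row k combines eight rows of the 32-bit horizontal
      // results in tmp[]. The eight per-column vertical filters (stepped by
      // gamma) are loaded and transposed as in the horizontal pass, and delta
      // advances the filter offset from one output row to the next. The
      // intermediates are narrowed back to unsigned 16 bits with
      // _mm256_packus_epi32 (the assert at the top of the function guarantees
      // they fit) so that _mm256_madd_epi16 can be used again.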
      for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        const __m256i *src = tmp + (k + 4);

        __m256i v_coeff01 = _mm256_inserti128_si256(
            v_zeros,
            _mm_loadu_si128(
                (__m128i *)av1_warped_filter[(sy) >> WARPEDDIFF_PREC_BITS]),
            0);
        v_coeff01 = _mm256_inserti128_si256(
            v_coeff01,
            _mm_loadu_si128(
                (__m128i *)
                    av1_warped_filter[(sy + gamma) >> WARPEDDIFF_PREC_BITS]),
            1);
        __m256i v_coeff23 = _mm256_inserti128_si256(
            v_zeros,
            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 2 * gamma) >>
                                                         WARPEDDIFF_PREC_BITS]),
            0);
        v_coeff23 = _mm256_inserti128_si256(
            v_coeff23,
            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 3 * gamma) >>
                                                         WARPEDDIFF_PREC_BITS]),
            1);
        __m256i v_coeff45 = _mm256_inserti128_si256(
            v_zeros,
            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 4 * gamma) >>
                                                         WARPEDDIFF_PREC_BITS]),
            0);
        v_coeff45 = _mm256_inserti128_si256(
            v_coeff45,
            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 5 * gamma) >>
                                                         WARPEDDIFF_PREC_BITS]),
            1);
        __m256i v_coeff67 = _mm256_inserti128_si256(
            v_zeros,
            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 6 * gamma) >>
                                                         WARPEDDIFF_PREC_BITS]),
            0);
        v_coeff67 = _mm256_inserti128_si256(
            v_coeff67,
            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 7 * gamma) >>
                                                         WARPEDDIFF_PREC_BITS]),
            1);

        __m256i v_c0123 = _mm256_unpacklo_epi32(
            v_coeff01,
            v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
        __m256i v_c0123u = _mm256_unpackhi_epi32(
            v_coeff01,
            v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
        __m256i v_c4567 = _mm256_unpacklo_epi32(
            v_coeff45,
            v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
        __m256i v_c4567u = _mm256_unpackhi_epi32(
            v_coeff45,
            v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4

        __m256i v_c01 = _mm256_unpacklo_epi64(
            v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
        __m256i v_c23 =
            _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
        __m256i v_c45 =
            _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
        __m256i v_c67 =
            _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6

        __m256i v_src01l =
            _mm256_unpacklo_epi32(src[0], src[1]);  // T13T03T11T01T12T02T10T00
        __m256i v_src01u =
            _mm256_unpackhi_epi32(src[0], src[1]);  // T17T07T15T05T16T06T14T04
        __m256i v_sum =
            _mm256_madd_epi16(_mm256_packus_epi32(v_src01l, v_src01u),
                              v_c01);  // S7S5S3S1S6S4S2S0

        __m256i v_src23l = _mm256_unpacklo_epi32(src[2], src[3]);
        __m256i v_src23u = _mm256_unpackhi_epi32(src[2], src[3]);
        v_sum = _mm256_add_epi32(
            v_sum,
            _mm256_madd_epi16(_mm256_packus_epi32(v_src23l, v_src23u), v_c23));

        __m256i v_src45l = _mm256_unpacklo_epi32(src[4], src[5]);
        __m256i v_src45u = _mm256_unpackhi_epi32(src[4], src[5]);
        v_sum = _mm256_add_epi32(
            v_sum,
            _mm256_madd_epi16(_mm256_packus_epi32(v_src45l, v_src45u), v_c45));

        __m256i v_src67l = _mm256_unpacklo_epi32(src[6], src[7]);
        __m256i v_src67u = _mm256_unpackhi_epi32(src[6], src[7]);
        v_sum = _mm256_add_epi32(
            v_sum,
            _mm256_madd_epi16(_mm256_packus_epi32(v_src67l, v_src67u), v_c67));

        // unpack S7S5S3S1S6S4S2S0 to S7S6S5S4S3S2S1S0

        __m256i v_suml =
            _mm256_permute4x64_epi64(v_sum, 0xD8);  // S7S5S6S4S3S1S2S0
        __m256i v_sumh =
            _mm256_permute4x64_epi64(v_sum, 0x32);  // S2S0S7S5S2S0S3S1
        v_sum = _mm256_unpacklo_epi32(v_suml, v_sumh);  // S7S6S5S4S3S2S1S0

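        // Store the eight results for this row. In compound mode the rounded
        // sums go to conv_params->dst; when do_average is set they are first
        // blended with the values already there (optionally with distance
        // weights wt0/wt1), rounded and clamped, and written to pred. In the
        // non-compound case the sums are rounded, clamped to [0, 2^bd - 1],
        // and written straight to pred.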
        if (conv_params->is_compound) {
          __m128i *const p =
              (__m128i *)&conv_params
                  ->dst[(i + k + 4) * conv_params->dst_stride + j];

          v_sum = _mm256_add_epi32(v_sum, res_add_const);
          v_sum =
              _mm256_sra_epi32(_mm256_add_epi32(v_sum, reduce_bits_vert_const),
                               reduce_bits_vert_shift);
          if (conv_params->do_average) {
            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
            __m256i p_32 = _mm256_cvtepu16_epi32(_mm_loadu_si128(p));

            if (conv_params->use_dist_wtd_comp_avg) {
              v_sum = _mm256_add_epi32(_mm256_mullo_epi32(p_32, wt0),
                                       _mm256_mullo_epi32(v_sum, wt1));
              v_sum = _mm256_srai_epi32(v_sum, DIST_PRECISION_BITS);
            } else {
              v_sum = _mm256_srai_epi32(_mm256_add_epi32(p_32, v_sum), 1);
            }

            __m256i v_sum1 = _mm256_add_epi32(v_sum, res_sub_const);
            v_sum1 = _mm256_sra_epi32(
                _mm256_add_epi32(v_sum1, round_bits_const), round_bits_shift);

            __m256i v_sum16 = _mm256_packus_epi32(v_sum1, v_sum1);
            v_sum16 = _mm256_permute4x64_epi64(v_sum16, 0xD8);
            v_sum16 = _mm256_min_epi16(v_sum16, clip_pixel);
            _mm_storeu_si128(dst16, _mm256_extracti128_si256(v_sum16, 0));
          } else {
            v_sum = _mm256_packus_epi32(v_sum, v_sum);
            __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum, 0xD8);
            _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
          }
        } else {
          // Round and pack into 16 bits
          const __m256i round_const =
              _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                                ((1 << reduce_bits_vert) >> 1));

          __m256i v_sum1 = _mm256_srai_epi32(
              _mm256_add_epi32(v_sum, round_const), reduce_bits_vert);

          v_sum1 = _mm256_packus_epi32(v_sum1, v_sum1);
          __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum1, 0xD8);
          // Clamp res_16bit to the range [0, 2^bd - 1]
          const __m256i max_val = _mm256_set1_epi16((1 << bd) - 1);
          const __m256i zero = _mm256_setzero_si256();
          v_sum16 = _mm256_max_epi16(_mm256_min_epi16(v_sum16, max_val), zero);

          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

          _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
        }
      }
    }
  }
}