highbd_convolve_avx2.c (47756B)
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <immintrin.h>
#include <string.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/x86/convolve.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/synonyms.h"

// -----------------------------------------------------------------------------
// Copy and average

// Byte-shuffle masks used by the 4-tap horizontal kernels below.  Each
// 128-bit lane repeats overlapping 16-bit pixel pairs so that one
// _mm256_madd_epi16 against a duplicated coefficient pair produces four
// adjacent filtered outputs.  ip_shuffle_f2f3 gathers the pixels multiplied
// by filter taps 2/3; ip_shuffle_f4f5 the pixels multiplied by taps 4/5.
static const uint8_t ip_shuffle_f2f3[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                             7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                             4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
static const uint8_t ip_shuffle_f4f5[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
                                             8, 9, 10, 11, 10, 11, 12, 13,
                                             4, 5, 6,  7,  6,  7,  8,  9,
                                             8, 9, 10, 11, 10, 11, 12, 13 };

// SSSE3 implementations used as fallbacks for 12-tap filters, which the
// AVX2 paths in this file do not handle (they support at most 8 taps).
void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const int subpel_x_qn,
                                    ConvolveParams *conv_params, int bd);
void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_y_qn, int bd);

// High-bitdepth vertical (y-direction) single-reference convolution.
// Walks the block in 8-pixel-wide columns, producing two output rows per
// iteration.  A sliding window of interleaved source rows is kept in s[]:
// s[0..3] cover the low 4 pixels of each consecutive row pair, s[4..7] the
// high 4 pixels.  The filter sum is rounded by FILTER_BITS and clamped to
// the maximum pixel value for the given bitdepth `bd`.
void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_y_qn, int bd) {
  // 12-tap filters are delegated to the SSSE3 implementation.
  if (filter_params_y->taps == 12) {
    av1_highbd_convolve_y_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_qn, bd);
    return;
  }
  int i, j;
  // Back the source pointer up so the filter window is centered on the
  // output row.
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_vert * src_stride;

  __m256i s[8], coeffs_y[4];

  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  // Maximum legal pixel value for 8-, 10- or 12-bit content.
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);

  for (j = 0; j < w; j += 8) {
    const uint16_t *data = &src_ptr[j];
    /* Vertical filter */
    {
      // sNM below holds source row N in the low 128-bit lane and row M in
      // the high lane; the unpacklo/unpackhi pairs then interleave two
      // consecutive rows element-wise for _mm256_madd_epi16 in convolve().
      __m256i src6;
      __m256i s01 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
          0x20);
      __m256i s12 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
          0x20);
      __m256i s23 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
          0x20);
      __m256i s34 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
          0x20);
      __m256i s45 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
          0x20);
      // Row 6 is kept separately so it can be re-used as the low lane of
      // s67 in the first loop iteration.
      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      __m256i s56 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
          src6, 0x20);

      s[0] = _mm256_unpacklo_epi16(s01, s12);
      s[1] = _mm256_unpacklo_epi16(s23, s34);
      s[2] = _mm256_unpacklo_epi16(s45, s56);

      s[4] = _mm256_unpackhi_epi16(s01, s12);
      s[5] = _mm256_unpackhi_epi16(s23, s34);
      s[6] = _mm256_unpackhi_epi16(s45, s56);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];

        // Bring rows 7 and 8 (relative to the current output row) into the
        // sliding window.
        const __m256i s67 = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));

        const __m256i s78 = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi16(s67, s78);
        s[7] = _mm256_unpackhi_epi16(s67, s78);

        // Filter the low 4 output pixels of both rows.
        const __m256i res_a = convolve(s, coeffs_y);

        __m256i res_a_round = _mm256_sra_epi32(
            _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

        if (w - j > 4) {
          // Full 8-wide column: also filter the high 4 pixels, pack to
          // 16-bit, clamp, and store 8 pixels per row.
          const __m256i res_b = convolve(s + 4, coeffs_y);
          __m256i res_b_round = _mm256_sra_epi32(
              _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);

          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
          res_16bit = _mm256_max_epi16(res_16bit, zero);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(res_16bit));
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           _mm256_extracti128_si256(res_16bit, 1));
        } else if (w == 4) {
          // 4-wide block: store 4 pixels (64 bits) per row.
          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
          res_a_round = _mm256_max_epi16(res_a_round, zero);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(res_a_round));
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           _mm256_extracti128_si256(res_a_round, 1));
        } else {
          // 2-wide block: store 2 pixels (32 bits) per row.
          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
          res_a_round = _mm256_max_epi16(res_a_round, zero);

          xx_storel_32(&dst[i * dst_stride + j],
                       _mm256_castsi256_si128(res_a_round));
          xx_storel_32(&dst[i * dst_stride + j + dst_stride],
                       _mm256_extracti128_si256(res_a_round, 1));
        }

        // Shift the row window down by one row pair.
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}

// High-bitdepth horizontal (x-direction) single-reference convolution.
// Processes two rows at a time in 8-pixel-wide columns.  Even and odd
// output pixels are filtered separately (via byte-aligned shifts of the row)
// and re-interleaved.  The filter sum is rounded first by round_0 and then
// by (FILTER_BITS - round_0), then clamped to the bitdepth range.
void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const int subpel_x_qn,
                                   ConvolveParams *conv_params, int bd) {
  // 12-tap filters are delegated to the SSSE3 implementation.
  if (filter_params_x->taps == 12) {
    av1_highbd_convolve_x_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, subpel_x_qn, conv_params,
                                   bd);
    return;
  }
  int i, j;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_horiz;

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  __m256i s[4], coeffs_x[4];

  const __m256i round_const_x =
      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);

  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  // Maximum legal pixel value for 8-, 10- or 12-bit content.
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);

  for (j = 0; j < w; j += 8) {
    /* Horizontal filter */
    for (i = 0; i < h; i += 2) {
      const __m256i row0 =
          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
      __m256i row1 =
          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);

      // r0 = low halves of both rows, r1 = high halves, so alignr can slide
      // an 8-tap window across each row within one 128-bit lane.
      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

      // even pixels
      s[0] = _mm256_alignr_epi8(r1, r0, 0);
      s[1] = _mm256_alignr_epi8(r1, r0, 4);
      s[2] = _mm256_alignr_epi8(r1, r0, 8);
      s[3] = _mm256_alignr_epi8(r1, r0, 12);

      __m256i res_even = convolve(s, coeffs_x);
      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
                                  round_shift_x);

      // odd pixels
      s[0] = _mm256_alignr_epi8(r1, r0, 2);
      s[1] = _mm256_alignr_epi8(r1, r0, 6);
      s[2] = _mm256_alignr_epi8(r1, r0, 10);
      s[3] = _mm256_alignr_epi8(r1, r0, 14);

      __m256i res_odd = convolve(s, coeffs_x);
      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
                                 round_shift_x);

      // Second rounding stage (FILTER_BITS - round_0 bits).
      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),
                                  round_shift_bits);
      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
                                 round_shift_bits);

      // Pack to 16-bit and re-interleave even/odd outputs into pixel order.
      __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
      __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);

      __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
      res = _mm256_min_epi16(res, clip_pixel);
      res = _mm256_max_epi16(res, zero);

      if (w - j > 4) {
        // 8 output pixels per row.
        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
                         _mm256_castsi256_si128(res));
        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                         _mm256_extracti128_si256(res, 1));
      } else if (w == 4) {
        // 4 output pixels per row.
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
                         _mm256_castsi256_si128(res));
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                         _mm256_extracti128_si256(res, 1));
      } else {
        // 2 output pixels per row.
        xx_storel_32(&dst[i * dst_stride + j], _mm256_castsi256_si128(res));
        xx_storel_32(&dst[i * dst_stride + j + dst_stride],
                     _mm256_extracti128_si256(res, 1));
      }
    }
  }
}

// Number of fractional bits removed (with rounding) after an 8-tap filter
// sum in the aom_highbd_filter_block1d* kernels below.
#define CONV8_ROUNDING_BITS (7)

// -----------------------------------------------------------------------------
// Horizontal and vertical filtering

// Byte-shuffle patterns for the 8-tap horizontal kernels: each selects
// overlapping 16-bit pixel pairs within a 128-bit lane so that
// _mm256_madd_epi16 against a duplicated coefficient pair yields four
// adjacent outputs.
static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

static const uint8_t signal_pattern_1[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13,
                                              4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13 };

static const uint8_t signal_pattern_2[32] = { 6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15,
                                              6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15 };

// 32-bit-element permute used to rotate pixels 2..5 into position before
// applying the byte patterns above.
static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };

// -----------------------------------------------------------------------------
// Horizontal Filtering

// Repack one vector of 16-bit pixels (*s) into the four pair-interleaved
// forms p[0..3] consumed by filter_8x1_pixels, using the signal_pattern
// masks above.  The trailing comments name which source pixels each output
// lane pairs up.
static inline void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);

  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
}

// Note:
// Shared by 8x2 and 16x1 block
// Packs two pixel vectors (two rows, or two 8-pixel halves of one row) into
// the eight operand vectors x[0..7] used by two filter_8x1_pixels calls.
static inline void pack_16_pixels(const __m256i *s0, const __m256i *s1,
                                  __m256i *x /*x[8]*/) {
  __m256i pp[8];
  pack_pixels(s0, pp);
  pack_pixels(s1, &pp[4]);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
  x[4] = x[2];
  x[5] = x[3];
  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
}

// Pack a single 8-pixel row for one filter_8x1_pixels call.
static inline void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i pp[8];
  __m256i s0;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  pack_pixels(&s0, pp);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
}

// Pack two consecutive rows of 8 pixels each (8x2 block).
static inline void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
                                   __m256i *x) {
  __m256i s0, s1;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
  pack_16_pixels(&s0, &s1, x);
}

// Pack one row of 16 pixels as two overlapping 8-pixel halves (16x1 block).
static inline void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i s0, s1;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_pixels(&s0, &s1, x);
}

// Note:
// Shared by horizontal and vertical filtering
// Broadcast the 8 filter taps into four vectors of duplicated tap pairs:
// f[0] = taps 0/1, f[1] = taps 2/3, f[2] = taps 4/5, f[3] = taps 6/7.
static inline void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  const __m256i p0 = _mm256_set1_epi32(0x03020100);
  const __m256i p1 = _mm256_set1_epi32(0x07060504);
  const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
  const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
  f[0] = _mm256_shuffle_epi8(hh, p0);
  f[1] = _mm256_shuffle_epi8(hh, p1);
  f[2] = _mm256_shuffle_epi8(hh, p2);
  f[3] = _mm256_shuffle_epi8(hh, p3);
}

// Broadcast the two middle tap pairs of an 8-tap-aligned coefficient array;
// for a 4-tap filter the nonzero taps occupy indices 2..5.
static inline void pack_filters_4tap(const int16_t *filter,
                                     __m256i *f /*f[4]*/) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i coeff = _mm256_broadcastsi128_si256(h);

  // coeffs 2 3 2 3 2 3 2 3
  f[0] = _mm256_shuffle_epi32(coeff, 0x55);
  // coeffs 4 5 4 5 4 5 4 5
  f[1] = _mm256_shuffle_epi32(coeff, 0xaa);
}

// Apply an 8-tap filter to 8 packed pixels: four madd partial sums are
// accumulated, rounded, and shifted down by CONV8_ROUNDING_BITS.
static inline void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
                                     const __m256i *fil /*fil[4]*/,
                                     __m256i *y) {
  __m256i a, a0, a1;

  a0 = _mm256_madd_epi16(fil[0], sig[0]);
  a1 = _mm256_madd_epi16(fil[3], sig[3]);
  a = _mm256_add_epi32(a0, a1);

  a0 = _mm256_madd_epi16(fil[1], sig[1]);
  a1 = _mm256_madd_epi16(fil[2], sig[2]);

  // min(a0,a1) + max(a0,a1) == a0 + a1, so the two blocks below add both
  // middle partial sums exactly; presumably ordered this way to control
  // the accumulation order of the largest terms — the result is unchanged.
  {
    const __m256i min = _mm256_min_epi32(a0, a1);
    a = _mm256_add_epi32(a, min);
  }
  {
    const __m256i max = _mm256_max_epi32(a0, a1);
    a = _mm256_add_epi32(a, max);
  }
  {
    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    a = _mm256_add_epi32(a, rounding);
    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
  }
}

// Pack one 8x32-bit result to 16-bit (unsigned saturation), clamp to the
// bitdepth maximum in *mask, and store 8 pixels.
static inline void store_8x1_pixels(const __m256i *y, const __m256i *mask,
                                    uint16_t *dst) {
  const __m128i a0 = _mm256_castsi256_si128(*y);
  const __m128i a1 = _mm256_extractf128_si256(*y, 1);
  __m128i res = _mm_packus_epi32(a0, a1);
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
  _mm_storeu_si128((__m128i *)dst, res);
}

// Pack, clamp and store two rows of 8 pixels each.
static inline void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
                                    const __m256i *mask, uint16_t *dst,
                                    ptrdiff_t pitch) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  a = _mm256_min_epi16(a, *mask);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
}

// Pack, clamp and store one row of 16 pixels.
static inline void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  a = _mm256_min_epi16(a, *mask);
  _mm256_storeu_si256((__m256i *)dst, a);
}

// 8-tap horizontal filter, 8-pixel-wide column, two rows per iteration.
// src_ptr -= 3 centers the 8-tap window on the output pixel.  An odd
// trailing row is handled by the single-row path after the loop.
static void aom_highbd_filter_block1d8_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}

// 8-tap horizontal filter, 16-pixel-wide column, one row per iteration.
static void aom_highbd_filter_block1d16_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

// 4-tap horizontal filter, 4-pixel-wide column, two rows per iteration.
// src_ptr - 3 plus the +2 load offset gives a net -1, aligning the first
// loaded pixel with tap index 2 of the 8-tap-aligned coefficient array
// (see pack_filters_4tap).
static void aom_highbd_filter_block1d4_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i ff[2], s[2];
  uint32_t i;
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  static const uint8_t shuffle_mask[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                            7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                            4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

  __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
  __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
  __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);

  pack_filters_4tap(filter, ff);
  src_ptr -= 3;
  // NOTE(review): height is unsigned, so (height - 2) wraps for height < 2;
  // callers presumably always pass height >= 2 — confirm against the
  // dispatch tables.
  for (i = 0; i <= (height - 2); i += 2) {
    __m256i row0 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
    __m256i row1 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[(i + 1) * src_pitch + 2]));

    s[0] = _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);
    s[1] = _mm256_alignr_epi8(s[0], s[0], 4);

    s[0] = _mm256_shuffle_epi8(s[0], mask);
    s[1] = _mm256_shuffle_epi8(s[1], mask);

    __m256i res = convolve_4tap(s, ff);
    res =
        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);

    res = _mm256_packs_epi32(res, res);
    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
    _mm_storel_epi64((__m128i *)&dst_ptr[(i + 1) * dst_pitch],
                     _mm256_extracti128_si256(res, 1));
  }
  // Odd trailing row: filter a single row using the ip_shuffle masks.
  if (height % 2 != 0) {
    i = height - 1;
    const __m256i row0_0 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
    const __m256i row0_1 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 6]));

    const __m256i r0 =
        _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);

    s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
    s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);

    __m256i res = convolve_4tap(s, ff);
    res =
        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);

    res = _mm256_packs_epi32(res, res);
    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
  }
}

// 4-tap horizontal filter, 8-pixel-wide column, two rows per iteration.
// Even and odd output pixels are filtered separately and re-interleaved
// with shuffle_mask.
static void aom_highbd_filter_block1d8_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i ff[2], s[2];
  uint32_t i = 0;
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  // Interleaves the packed even/odd results back into pixel order.
  static const uint8_t shuffle_mask[32] = { 0, 1, 8,  9,  2, 3, 10, 11,
                                            4, 5, 12, 13, 6, 7, 14, 15,
                                            0, 1, 8,  9,  2, 3, 10, 11,
                                            4, 5, 12, 13, 6, 7, 14, 15 };

  __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
  __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
  __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);

  pack_filters_4tap(filter, ff);
  src_ptr -= 3;

  /* Horizontal filter */

  // NOTE(review): as in the 4-wide kernel, (height - 2) is an unsigned
  // subtraction — callers presumably always pass height >= 2.
  for (i = 0; i <= (height - 2); i += 2) {
    const __m256i row0 =
        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
    __m256i row1 =
        _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_pitch + 2]);

    const __m256i r0 =
        _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);
    const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

    // even pixels
    s[0] = r0;
    s[1] = _mm256_alignr_epi8(r1, r0, 4);

    __m256i res_even = convolve_4tap(s, ff);
    res_even = _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding),
                                 CONV8_ROUNDING_BITS);

    // odd pixels
    s[0] = _mm256_alignr_epi8(r1, r0, 2);
    s[1] = _mm256_alignr_epi8(r1, r0, 6);

    __m256i res_odd = convolve_4tap(s, ff);
    res_odd = _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding),
                                CONV8_ROUNDING_BITS);

    __m256i res = _mm256_packs_epi32(res_even, res_odd);
    res = _mm256_shuffle_epi8(res, mask);

    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
    _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                     _mm256_extracti128_si256(res, 1));
  }

  // Odd trailing row: one row of 8 pixels, stored as two 4-pixel halves.
  if (height % 2 != 0) {
    i = height - 1;
    const __m256i row0_0 =
        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
    const __m256i row0_1 =
        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 6]);

    const __m256i r0 =
        _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);

    s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
    s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);

    __m256i res = convolve_4tap(s, ff);
    res =
        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);

    res = _mm256_packs_epi32(res, res);
    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + 4],
                     _mm256_extracti128_si256(res, 1));
  }
}

// 4-tap horizontal filter, 16-pixel-wide column: two 8-wide passes.
static void aom_highbd_filter_block1d16_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  aom_highbd_filter_block1d8_h4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch,
                                     height, filter, bd);
  aom_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8,
                                     dst_pitch, height, filter, bd);
}

// -----------------------------------------------------------------------------
// 2-tap horizontal filtering

// Broadcast the two center taps (filter[3], filter[4] — bytes 6..9) of the
// 8-tap-aligned coefficient array into every 32-bit lane.
static inline void pack_2t_filter(const int16_t *filter, __m256i *f) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  const __m256i p = _mm256_set1_epi32(0x09080706);
  f[0] = _mm256_shuffle_epi8(hh, p);
}

// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
// the difference is s0/s1 specifies first and second rows or,
// first 16 samples and 8-sample shifted 16 samples
static inline void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
                                     __m256i *sig) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  r1 = _mm256_shuffle_epi8(r1, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
}

// Pack two rows of 8 pixels for the 2-tap filter.
static inline void pack_8x2_2t_pixels(const uint16_t *src,
                                      const ptrdiff_t pitch, __m256i *sig) {
  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  pack_16_2t_pixels(&r0, &r1, sig);
}

// Pack one row of 16 pixels (two 8-pixel halves) for the 2-tap filter.
static inline void pack_16x1_2t_pixels(const uint16_t *src,
                                       __m256i *sig /*sig[2]*/) {
  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_2t_pixels(&r0, &r1, sig);
}

// Pack a single row of 8 pixels for the 2-tap filter.
static inline void pack_8x1_2t_pixels(const uint16_t *src,
                                      __m256i *sig /*sig[2]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
  r0 = _mm256_permutevar8x32_epi32(r0, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
}

// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
// Applies the duplicated 2-tap pair to both packed vectors, rounding and
// shifting by CONV8_ROUNDING_BITS.
static inline void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
                                       __m256i *y0, __m256i *y1) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
  __m256i x1 = _mm256_madd_epi16(sig[1], *f);
  x0 = _mm256_add_epi32(x0, rounding);
  x1 = _mm256_add_epi32(x1, rounding);
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
  *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
}

// Single-vector variant of filter_16_2t_pixels for the odd-row tail.
static inline void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
                                        __m256i *y0) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
  x0 = _mm256_add_epi32(x0, rounding);
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
}

// 2-tap (bilinear) horizontal filter, 8-pixel-wide column, two rows per
// iteration with a single-row tail for odd heights.
static void aom_highbd_filter_block1d8_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}

// 2-tap (bilinear) horizontal filter, 16-pixel-wide column, one row per
// iteration.
static void aom_highbd_filter_block1d16_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

// -----------------------------------------------------------------------------
// Vertical Filtering

// Pack the first 7 rows (8 pixels wide) of a vertical filter window into
// `sig`. Each row is duplicated across the two 128-bit lanes so that two
// output rows can be produced per loop iteration: after the inserts, lane 0
// of sN holds row N and lane 1 holds row N+1. The unpacks then interleave
// adjacent rows 16-bit-wise, giving sig[0..2] (low 4 pixels) and sig[4..6]
// (high 4 pixels) as row-pair operands for _mm256_madd_epi16. sig[8] keeps
// the last loaded row (row 6) so pack_8x9_pixels can extend the window.
static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
  __m256i s1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
  __m256i s2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
  __m256i s3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
  __m256i s4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
  __m256i s5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
  __m256i s6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));

  // Place the next row into the high lane of each register: sN = row N | N+1.
  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);

  sig[0] = _mm256_unpacklo_epi16(s0, s1);
  sig[4] = _mm256_unpackhi_epi16(s0, s1);
  sig[1] = _mm256_unpacklo_epi16(s2, s3);
  sig[5] = _mm256_unpackhi_epi16(s2, s3);
  sig[2] = _mm256_unpacklo_epi16(s4, s5);
  sig[6] = _mm256_unpackhi_epi16(s4, s5);
  sig[8] = s6;
}

// Extend the 8-wide window with rows 7 and 8, completing the operands
// sig[3]/sig[7] needed for an 8-tap filter pass; sig[8] is advanced to the
// newest row for the next call.
static inline void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                                   __m256i *sig) {
  // base + 7th row
  __m256i s0 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
  // base + 8th row
  __m256i s1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  sig[3] = _mm256_unpacklo_epi16(s2, s3);
  sig[7] = _mm256_unpackhi_epi16(s2, s3);
  sig[8] = s1;
}

// Run the 8-tap filter on both pixel halves: sig[0..3] -> y0 (low 4 pixels
// of two rows), sig[4..7] -> y1 (high 4 pixels of two rows).
static inline void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
                                     __m256i *y0, __m256i *y1) {
  filter_8x1_pixels(sig, f, y0);
  filter_8x1_pixels(&sig[4], f, y1);
}

// Slide the vertical window down by two rows: drop the two oldest row-pair
// operands and shift the remaining three up in both halves of sig[].
static inline void update_pixels(__m256i *sig) {
  int i;
  for (i = 0; i < 3; ++i) {
    sig[i] = sig[i + 1];
    sig[i + 4] = sig[i + 5];
  }
}

// 8-wide, 8-tap vertical convolution producing two rows per iteration.
// Results are rounded/shifted inside filter_8x9_pixels and clamped to
// (1 << bd) - 1 in store_8x2_pixels. `height` is assumed even since the
// loop steps by two rows.
static void aom_highbd_filter_block1d8_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[9], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_8x9_init(src_ptr, src_pitch, signal);

  do {
    pack_8x9_pixels(src_ptr, src_pitch, signal);

    filter_8x9_pixels(signal, ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

// 16-wide variant of pack_8x9_init. Each 32-byte load covers a full 16-pixel
// row; permute2x128 pairs the low halves (uN with selector 0x20) and high
// halves (0x31) of adjacent rows before the 16-bit interleave. sig[0..7]
// hold the low-8-pixel operands, sig[8..15] the high-8-pixel operands, and
// sig[16] carries the newest full row.
static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i u0, u1, u2, u3;
  // load 0-6 rows
  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));

  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high

  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high

  sig[0] = _mm256_unpacklo_epi16(u0, u2);
  sig[4] = _mm256_unpackhi_epi16(u0, u2);

  sig[8] = _mm256_unpacklo_epi16(u1, u3);
  sig[12] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);

  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);

  sig[1] = _mm256_unpacklo_epi16(u0, u2);
  sig[5] = _mm256_unpackhi_epi16(u0, u2);

  sig[9] = _mm256_unpacklo_epi16(u1, u3);
  sig[13] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);

  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);

  sig[2] = _mm256_unpacklo_epi16(u0, u2);
  sig[6] = _mm256_unpackhi_epi16(u0, u2);

  sig[10] = _mm256_unpacklo_epi16(u1, u3);
  sig[14] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s6;
}

// Extend the 16-wide window with rows 7 and 8, completing sig[3]/sig[7]
// (low pixels) and sig[11]/sig[15] (high pixels); sig[16] advances to row 8.
static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                             __m256i *sig) {
  // base + 7th row
  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
  // base + 8th row
  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));

  __m256i u0, u1, u2, u3;
  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);

  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);

  sig[3] = _mm256_unpacklo_epi16(u0, u2);
  sig[7] = _mm256_unpackhi_epi16(u0, u2);

  sig[11] = _mm256_unpacklo_epi16(u1, u3);
  sig[15] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s8;
}

// Filter all four 4-pixel groups of the 16-wide window, then repack the
// 32-bit results into two 16-pixel output rows (y0 = first row, y1 = second)
// via packus + cross-lane permutes.
static inline void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
                                      __m256i *y0, __m256i *y1) {
  __m256i res[4];
  int i;
  for (i = 0; i < 4; ++i) {
    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
  }

  {
    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
  }
}

// Clamp two 16-pixel rows to *mask (the bit-depth maximum) and store them to
// consecutive destination rows.
static inline void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst,
                                     ptrdiff_t pitch) {
  __m256i p = _mm256_min_epi16(*y0, *mask);
  _mm256_storeu_si256((__m256i *)dst, p);
  p = _mm256_min_epi16(*y1, *mask);
  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
}

// Slide the 16-wide window down by two rows (both the low-pixel half
// sig[0..7] and the high-pixel half sig[8..15]).
static void update_16x9_pixels(__m256i *sig) {
  update_pixels(&sig[0]);
  update_pixels(&sig[8]);
}

// 16-wide, 8-tap vertical convolution producing two rows per iteration;
// same rounding/clamping scheme as the 8-wide version. `height` is assumed
// even.
static void aom_highbd_filter_block1d16_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[17], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_16x9_init(src_ptr, src_pitch, signal);

  do {
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

// 4-wide, 4-tap vertical convolution, two rows per iteration. The source
// offsets start at 2 * src_pitch because the caller points src_ptr at the
// start of the 8-tap footprint and a 4-tap filter uses only its middle rows.
// Results are rounded by FILTER_BITS, clamped to the bit-depth range in
// 32-bit precision, then narrowed with packs_epi32.
static void aom_highbd_filter_block1d4_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  const __m256i clip_pixel =
      _mm256_set1_epi32(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();
  uint32_t i;
  __m256i s[2], ff[2];

  pack_filters_4tap(filter, ff);

  const uint16_t *data = src_ptr;
  /* Vertical filter */
  {
    __m128i s2 = _mm_loadl_epi64((__m128i *)(data + 2 * src_pitch));
    __m128i s3 = _mm_loadl_epi64((__m128i *)(data + 3 * src_pitch));

    __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);

    __m128i s4 = _mm_loadl_epi64((__m128i *)(data + 4 * src_pitch));

    __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);

    s[0] = _mm256_unpacklo_epi16(s23, s34);

    for (i = 0; i < height; i += 2) {
      data = &src_ptr[i * src_pitch];

      __m128i s5 = _mm_loadl_epi64((__m128i *)(data + 5 * src_pitch));
      __m128i s6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_pitch));

      __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
      __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);

      s[1] = _mm256_unpacklo_epi16(s45, s56);

      const __m256i res_a = convolve_4tap(s, ff);

      __m256i res_a_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

      // Clamp in 32 bits before narrowing; packs duplicates the result into
      // both halves of each lane and only the low 64 bits are stored.
      __m256i res_16bit = _mm256_min_epi32(res_a_round, clip_pixel);
      res_16bit = _mm256_max_epi32(res_16bit, zero);
      res_16bit = _mm256_packs_epi32(res_16bit, res_16bit);

      _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                       _mm256_castsi256_si128(res_16bit));
      _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                       _mm256_extracti128_si256(res_16bit, 1));

      // Reuse the newest row pair for the next iteration.
      s[0] = s[1];
      s4 = s6;
    }
  }
}

// 8-wide, 4-tap vertical convolution, two rows per iteration. Same window
// scheme as the 4-wide version, but both the low (s[0]/s[1]) and high
// (s[2]/s[3]) pixel halves are filtered, and clamping happens after the
// 16-bit pack.
static void aom_highbd_filter_block1d8_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();
  __m256i s[4], ff[2];
  uint32_t i;
  pack_filters_4tap(filter, ff);

  const uint16_t *data = src_ptr;
  /* Vertical filter */
  {
    __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_pitch));
    __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_pitch));

    __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);

    __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_pitch));

    __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);

    s[0] = _mm256_unpacklo_epi16(s23, s34);
    s[2] = _mm256_unpackhi_epi16(s23, s34);

    for (i = 0; i < height; i += 2) {
      data = &src_ptr[i * src_pitch];

      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_pitch));
      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_pitch));

      __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
      __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);

      s[1] = _mm256_unpacklo_epi16(s45, s56);
      s[3] = _mm256_unpackhi_epi16(s45, s56);

      const __m256i res_a = convolve_4tap(s, ff);

      __m256i res_a_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

      const __m256i res_b = convolve_4tap(s + 2, ff);
      __m256i res_b_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);

      __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
      res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
      res_16bit = _mm256_max_epi16(res_16bit, zero);

      _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
                       _mm256_castsi256_si128(res_16bit));
      _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                       _mm256_extracti128_si256(res_16bit, 1));

      // Carry the newest row pair over to the next iteration.
      s[0] = s[1];
      s[2] = s[3];
      s4 = s6;
    }
  }
}

// 16-wide, 4-tap vertical convolution implemented as two 8-wide passes over
// the left and right halves of the block.
static void aom_highbd_filter_block1d16_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  aom_highbd_filter_block1d8_v4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch,
                                     height, filter, bd);

  aom_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8,
                                     dst_pitch, height, filter, bd);
}

// -----------------------------------------------------------------------------
//  2-tap vertical filtering

// Seed the 2-tap window: sig[2] always holds the most recent row.
static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
  sig[2] = _mm256_loadu_si256((const __m256i *)src);
}

// Interleave the previous row (sig[2]) with the next one into the low/high
// madd operands sig[0]/sig[1], then advance sig[2].
static inline void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
                                       __m256i *sig) {
  // load the next row
  const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
  sig[0] = _mm256_unpacklo_epi16(sig[2], u);
  sig[1] = _mm256_unpackhi_epi16(sig[2], u);
  sig[2] = u;
}

// Thin wrapper over the shared 16-pixel 2-tap filter helper.
static inline void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
                                         __m256i *y0, __m256i *y1) {
  filter_16_2t_pixels(sig, f, y0, y1);
}

// 16-wide, 2-tap (bilinear) vertical convolution, one row per iteration.
static void aom_highbd_filter_block1d16_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[3], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;

  pack_2t_filter(filter, &ff);
  pack_16x2_init(src_ptr, signal);

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

// Broadcast the two middle filter taps (16-bit values at byte offsets 6-9,
// i.e. taps 3 and 4 of the 8-tap array) into every 32-bit lane for
// _mm_madd_epi16.
static inline void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m128i p = _mm_set1_epi32(0x09080706);
  f[0] = _mm_shuffle_epi8(h, p);
}

// Seed the 8-wide 2-tap window; sig[2] holds the most recent row.
static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
  sig[2] = _mm_loadu_si128((const __m128i *)src);
}

// Interleave the previous row with the next one into sig[0]/sig[1] and
// advance sig[2] (8-wide, SSE registers).
static inline void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
                                          __m128i *sig) {
  // load the next row
  const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
  sig[0] = _mm_unpacklo_epi16(sig[2], u);
  sig[1] = _mm_unpackhi_epi16(sig[2], u);
  sig[2] = u;
}

// Apply the 2-tap filter to both pixel halves with round-to-nearest by
// CONV8_ROUNDING_BITS.
static inline void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
                                      __m128i *y0, __m128i *y1) {
  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m128i x0 = _mm_madd_epi16(sig[0], *f);
  __m128i x1 = _mm_madd_epi16(sig[1], *f);
  x0 = _mm_add_epi32(x0, rounding);
  x1 = _mm_add_epi32(x1, rounding);
  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
}

// Pack the two 32-bit halves to unsigned 16-bit, clamp to the bit-depth
// maximum, and store one 8-pixel row.
static inline void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
                                           const __m128i *mask, uint16_t *dst) {
  __m128i res = _mm_packus_epi32(*y0, *y1);
  res = _mm_min_epi16(res, *mask);
  _mm_storeu_si128((__m128i *)dst, res);
}

// 8-wide, 2-tap (bilinear) vertical convolution, one row per iteration.
static void aom_highbd_filter_block1d8_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i signal[3], res0, res1;
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  __m128i ff;

  pack_8x1_2t_filter(filter, &ff);
  pack_8x2_init(src_ptr, signal);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
    store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

// The 4-wide kernels have no dedicated AVX2 implementation; alias them to
// the SSE2 versions (prototypes below, definitions elsewhere in the build).
void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2
#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2
#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2
#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2

// Instantiate the public aom_highbd_convolve8_{horiz,vert}_avx2 dispatchers
// from the block1d kernels above (macro defined in aom_dsp/x86/convolve.h).
HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)

#undef HIGHBD_FUNC