highbd_sad_avx2.c (27169B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <immintrin.h> 13 14 #include "config/aom_config.h" 15 #include "config/aom_dsp_rtcd.h" 16 17 #include "aom/aom_integer.h" 18 #include "aom_dsp/x86/synonyms_avx2.h" 19 #include "aom_ports/mem.h" 20 21 // SAD 22 static inline unsigned int get_sad_from_mm256_epi32(const __m256i *v) { 23 // input 8 32-bit summation 24 __m128i lo128, hi128; 25 __m256i u = _mm256_srli_si256(*v, 8); 26 u = _mm256_add_epi32(u, *v); 27 28 // 4 32-bit summation 29 hi128 = _mm256_extracti128_si256(u, 1); 30 lo128 = _mm256_castsi256_si128(u); 31 lo128 = _mm_add_epi32(hi128, lo128); 32 33 // 2 32-bit summation 34 hi128 = _mm_srli_si128(lo128, 4); 35 lo128 = _mm_add_epi32(lo128, hi128); 36 37 return (unsigned int)_mm_cvtsi128_si32(lo128); 38 } 39 40 static inline void highbd_sad16x4_core_avx2(__m256i *s, __m256i *r, 41 __m256i *sad_acc) { 42 const __m256i zero = _mm256_setzero_si256(); 43 int i; 44 for (i = 0; i < 4; i++) { 45 s[i] = _mm256_sub_epi16(s[i], r[i]); 46 s[i] = _mm256_abs_epi16(s[i]); 47 } 48 49 s[0] = _mm256_add_epi16(s[0], s[1]); 50 s[0] = _mm256_add_epi16(s[0], s[2]); 51 s[0] = _mm256_add_epi16(s[0], s[3]); 52 53 r[0] = _mm256_unpacklo_epi16(s[0], zero); 54 r[1] = _mm256_unpackhi_epi16(s[0], zero); 55 56 r[0] = _mm256_add_epi32(r[0], r[1]); 57 *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); 58 } 59 60 // If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD. 
61 static inline void sad16x4(const uint16_t *src_ptr, int src_stride, 62 const uint16_t *ref_ptr, int ref_stride, 63 const uint16_t *sec_ptr, __m256i *sad_acc) { 64 __m256i s[4], r[4]; 65 s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); 66 s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); 67 s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); 68 s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); 69 70 r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); 71 r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); 72 r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); 73 r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); 74 75 if (sec_ptr) { 76 r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); 77 r[1] = _mm256_avg_epu16( 78 r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); 79 r[2] = _mm256_avg_epu16( 80 r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); 81 r[3] = _mm256_avg_epu16( 82 r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); 83 } 84 highbd_sad16x4_core_avx2(s, r, sad_acc); 85 } 86 87 static AOM_FORCE_INLINE unsigned int aom_highbd_sad16xN_avx2(int N, 88 const uint8_t *src, 89 int src_stride, 90 const uint8_t *ref, 91 int ref_stride) { 92 const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); 93 const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); 94 int i; 95 __m256i sad = _mm256_setzero_si256(); 96 for (i = 0; i < N; i += 4) { 97 sad16x4(src_ptr, src_stride, ref_ptr, ref_stride, NULL, &sad); 98 src_ptr += src_stride << 2; 99 ref_ptr += ref_stride << 2; 100 } 101 return (unsigned int)get_sad_from_mm256_epi32(&sad); 102 } 103 104 static void sad32x4(const uint16_t *src_ptr, int src_stride, 105 const uint16_t *ref_ptr, int ref_stride, 106 const uint16_t *sec_ptr, __m256i *sad_acc) { 107 __m256i s[4], r[4]; 108 int row_sections = 0; 109 110 while (row_sections < 2) { 111 s[0] = 
_mm256_loadu_si256((const __m256i *)src_ptr); 112 s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); 113 s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); 114 s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16)); 115 116 r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); 117 r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); 118 r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); 119 r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); 120 121 if (sec_ptr) { 122 r[0] = 123 _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); 124 r[1] = _mm256_avg_epu16( 125 r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); 126 r[2] = _mm256_avg_epu16( 127 r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); 128 r[3] = _mm256_avg_epu16( 129 r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); 130 sec_ptr += 32 << 1; 131 } 132 highbd_sad16x4_core_avx2(s, r, sad_acc); 133 134 row_sections += 1; 135 src_ptr += src_stride << 1; 136 ref_ptr += ref_stride << 1; 137 } 138 } 139 140 static AOM_FORCE_INLINE unsigned int aom_highbd_sad32xN_avx2(int N, 141 const uint8_t *src, 142 int src_stride, 143 const uint8_t *ref, 144 int ref_stride) { 145 __m256i sad = _mm256_setzero_si256(); 146 uint16_t *srcp = CONVERT_TO_SHORTPTR(src); 147 uint16_t *refp = CONVERT_TO_SHORTPTR(ref); 148 const int left_shift = 2; 149 int i; 150 151 for (i = 0; i < N; i += 4) { 152 sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad); 153 srcp += src_stride << left_shift; 154 refp += ref_stride << left_shift; 155 } 156 return get_sad_from_mm256_epi32(&sad); 157 } 158 159 static void sad64x2(const uint16_t *src_ptr, int src_stride, 160 const uint16_t *ref_ptr, int ref_stride, 161 const uint16_t *sec_ptr, __m256i *sad_acc) { 162 __m256i s[4], r[4]; 163 int i; 164 for (i = 0; i < 2; i++) { 165 s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); 166 s[1] = _mm256_loadu_si256((const __m256i 
*)(src_ptr + 16)); 167 s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); 168 s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); 169 170 r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); 171 r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); 172 r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); 173 r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); 174 if (sec_ptr) { 175 r[0] = 176 _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); 177 r[1] = _mm256_avg_epu16( 178 r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); 179 r[2] = _mm256_avg_epu16( 180 r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); 181 r[3] = _mm256_avg_epu16( 182 r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); 183 sec_ptr += 64; 184 } 185 highbd_sad16x4_core_avx2(s, r, sad_acc); 186 src_ptr += src_stride; 187 ref_ptr += ref_stride; 188 } 189 } 190 191 static AOM_FORCE_INLINE unsigned int aom_highbd_sad64xN_avx2(int N, 192 const uint8_t *src, 193 int src_stride, 194 const uint8_t *ref, 195 int ref_stride) { 196 __m256i sad = _mm256_setzero_si256(); 197 uint16_t *srcp = CONVERT_TO_SHORTPTR(src); 198 uint16_t *refp = CONVERT_TO_SHORTPTR(ref); 199 const int left_shift = 1; 200 int i; 201 for (i = 0; i < N; i += 2) { 202 sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad); 203 srcp += src_stride << left_shift; 204 refp += ref_stride << left_shift; 205 } 206 return get_sad_from_mm256_epi32(&sad); 207 } 208 209 static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr, 210 const uint16_t *sec_ptr, __m256i *sad_acc) { 211 __m256i s[4], r[4]; 212 int i; 213 for (i = 0; i < 2; i++) { 214 s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); 215 s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); 216 s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); 217 s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); 218 r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); 219 r[1] = 
_mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); 220 r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); 221 r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); 222 if (sec_ptr) { 223 r[0] = 224 _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); 225 r[1] = _mm256_avg_epu16( 226 r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); 227 r[2] = _mm256_avg_epu16( 228 r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); 229 r[3] = _mm256_avg_epu16( 230 r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); 231 sec_ptr += 64; 232 } 233 highbd_sad16x4_core_avx2(s, r, sad_acc); 234 src_ptr += 64; 235 ref_ptr += 64; 236 } 237 } 238 239 static AOM_FORCE_INLINE unsigned int aom_highbd_sad128xN_avx2( 240 int N, const uint8_t *src, int src_stride, const uint8_t *ref, 241 int ref_stride) { 242 __m256i sad = _mm256_setzero_si256(); 243 uint16_t *srcp = CONVERT_TO_SHORTPTR(src); 244 uint16_t *refp = CONVERT_TO_SHORTPTR(ref); 245 int row = 0; 246 while (row < N) { 247 sad128x1(srcp, refp, NULL, &sad); 248 srcp += src_stride; 249 refp += ref_stride; 250 row++; 251 } 252 return get_sad_from_mm256_epi32(&sad); 253 } 254 255 #define HIGHBD_SADMXN_AVX2(m, n) \ 256 unsigned int aom_highbd_sad##m##x##n##_avx2( \ 257 const uint8_t *src, int src_stride, const uint8_t *ref, \ 258 int ref_stride) { \ 259 return aom_highbd_sad##m##xN_avx2(n, src, src_stride, ref, ref_stride); \ 260 } 261 262 #define HIGHBD_SAD_SKIP_MXN_AVX2(m, n) \ 263 unsigned int aom_highbd_sad_skip_##m##x##n##_avx2( \ 264 const uint8_t *src, int src_stride, const uint8_t *ref, \ 265 int ref_stride) { \ 266 return 2 * aom_highbd_sad##m##xN_avx2((n / 2), src, 2 * src_stride, ref, \ 267 2 * ref_stride); \ 268 } 269 270 HIGHBD_SADMXN_AVX2(16, 8) 271 HIGHBD_SADMXN_AVX2(16, 16) 272 HIGHBD_SADMXN_AVX2(16, 32) 273 274 HIGHBD_SADMXN_AVX2(32, 16) 275 HIGHBD_SADMXN_AVX2(32, 32) 276 HIGHBD_SADMXN_AVX2(32, 64) 277 278 HIGHBD_SADMXN_AVX2(64, 32) 279 HIGHBD_SADMXN_AVX2(64, 64) 280 
HIGHBD_SADMXN_AVX2(64, 128) 281 282 HIGHBD_SADMXN_AVX2(128, 64) 283 HIGHBD_SADMXN_AVX2(128, 128) 284 285 #if !CONFIG_REALTIME_ONLY 286 HIGHBD_SADMXN_AVX2(16, 4) 287 HIGHBD_SADMXN_AVX2(16, 64) 288 HIGHBD_SADMXN_AVX2(32, 8) 289 HIGHBD_SADMXN_AVX2(64, 16) 290 #endif // !CONFIG_REALTIME_ONLY 291 292 HIGHBD_SAD_SKIP_MXN_AVX2(16, 16) 293 HIGHBD_SAD_SKIP_MXN_AVX2(16, 32) 294 295 HIGHBD_SAD_SKIP_MXN_AVX2(32, 16) 296 HIGHBD_SAD_SKIP_MXN_AVX2(32, 32) 297 HIGHBD_SAD_SKIP_MXN_AVX2(32, 64) 298 299 HIGHBD_SAD_SKIP_MXN_AVX2(64, 32) 300 HIGHBD_SAD_SKIP_MXN_AVX2(64, 64) 301 HIGHBD_SAD_SKIP_MXN_AVX2(64, 128) 302 303 HIGHBD_SAD_SKIP_MXN_AVX2(128, 64) 304 HIGHBD_SAD_SKIP_MXN_AVX2(128, 128) 305 306 #if !CONFIG_REALTIME_ONLY 307 HIGHBD_SAD_SKIP_MXN_AVX2(16, 64) 308 HIGHBD_SAD_SKIP_MXN_AVX2(64, 16) 309 #endif // !CONFIG_REALTIME_ONLY 310 311 unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride, 312 const uint8_t *ref, int ref_stride, 313 const uint8_t *second_pred) { 314 __m256i sad = _mm256_setzero_si256(); 315 uint16_t *srcp = CONVERT_TO_SHORTPTR(src); 316 uint16_t *refp = CONVERT_TO_SHORTPTR(ref); 317 uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); 318 319 sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); 320 321 // Next 4 rows 322 srcp += src_stride << 2; 323 refp += ref_stride << 2; 324 secp += 64; 325 sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); 326 return get_sad_from_mm256_epi32(&sad); 327 } 328 329 unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride, 330 const uint8_t *ref, int ref_stride, 331 const uint8_t *second_pred) { 332 const int left_shift = 3; 333 uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, 334 second_pred); 335 src += src_stride << left_shift; 336 ref += ref_stride << left_shift; 337 second_pred += 16 << left_shift; 338 sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, 339 second_pred); 340 return sum; 341 } 342 343 unsigned int 
aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride, 344 const uint8_t *ref, int ref_stride, 345 const uint8_t *second_pred) { 346 const int left_shift = 4; 347 uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, 348 second_pred); 349 src += src_stride << left_shift; 350 ref += ref_stride << left_shift; 351 second_pred += 16 << left_shift; 352 sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, 353 second_pred); 354 return sum; 355 } 356 357 #if !CONFIG_REALTIME_ONLY 358 unsigned int aom_highbd_sad16x64_avg_avx2(const uint8_t *src, int src_stride, 359 const uint8_t *ref, int ref_stride, 360 const uint8_t *second_pred) { 361 const int left_shift = 5; 362 uint32_t sum = aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride, 363 second_pred); 364 src += src_stride << left_shift; 365 ref += ref_stride << left_shift; 366 second_pred += 16 << left_shift; 367 sum += aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride, 368 second_pred); 369 return sum; 370 } 371 372 unsigned int aom_highbd_sad32x8_avg_avx2(const uint8_t *src, int src_stride, 373 const uint8_t *ref, int ref_stride, 374 const uint8_t *second_pred) { 375 __m256i sad = _mm256_setzero_si256(); 376 uint16_t *srcp = CONVERT_TO_SHORTPTR(src); 377 uint16_t *refp = CONVERT_TO_SHORTPTR(ref); 378 uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); 379 const int left_shift = 2; 380 int row_section = 0; 381 382 while (row_section < 2) { 383 sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad); 384 srcp += src_stride << left_shift; 385 refp += ref_stride << left_shift; 386 secp += 32 << left_shift; 387 row_section += 1; 388 } 389 return get_sad_from_mm256_epi32(&sad); 390 } 391 #endif // !CONFIG_REALTIME_ONLY 392 393 unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride, 394 const uint8_t *ref, int ref_stride, 395 const uint8_t *second_pred) { 396 __m256i sad = _mm256_setzero_si256(); 397 uint16_t *srcp = 
CONVERT_TO_SHORTPTR(src); 398 uint16_t *refp = CONVERT_TO_SHORTPTR(ref); 399 uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); 400 const int left_shift = 2; 401 int row_section = 0; 402 403 while (row_section < 4) { 404 sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad); 405 srcp += src_stride << left_shift; 406 refp += ref_stride << left_shift; 407 secp += 32 << left_shift; 408 row_section += 1; 409 } 410 return get_sad_from_mm256_epi32(&sad); 411 } 412 413 unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride, 414 const uint8_t *ref, int ref_stride, 415 const uint8_t *second_pred) { 416 const int left_shift = 4; 417 uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, 418 second_pred); 419 src += src_stride << left_shift; 420 ref += ref_stride << left_shift; 421 second_pred += 32 << left_shift; 422 sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, 423 second_pred); 424 return sum; 425 } 426 427 unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride, 428 const uint8_t *ref, int ref_stride, 429 const uint8_t *second_pred) { 430 const int left_shift = 5; 431 uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, 432 second_pred); 433 src += src_stride << left_shift; 434 ref += ref_stride << left_shift; 435 second_pred += 32 << left_shift; 436 sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, 437 second_pred); 438 return sum; 439 } 440 441 #if !CONFIG_REALTIME_ONLY 442 unsigned int aom_highbd_sad64x16_avg_avx2(const uint8_t *src, int src_stride, 443 const uint8_t *ref, int ref_stride, 444 const uint8_t *second_pred) { 445 __m256i sad = _mm256_setzero_si256(); 446 uint16_t *srcp = CONVERT_TO_SHORTPTR(src); 447 uint16_t *refp = CONVERT_TO_SHORTPTR(ref); 448 uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); 449 const int left_shift = 1; 450 int row_section = 0; 451 452 while (row_section < 8) { 453 sad64x2(srcp, src_stride, refp, 
ref_stride, secp, &sad); 454 srcp += src_stride << left_shift; 455 refp += ref_stride << left_shift; 456 secp += 64 << left_shift; 457 row_section += 1; 458 } 459 return get_sad_from_mm256_epi32(&sad); 460 } 461 #endif // !CONFIG_REALTIME_ONLY 462 463 unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride, 464 const uint8_t *ref, int ref_stride, 465 const uint8_t *second_pred) { 466 __m256i sad = _mm256_setzero_si256(); 467 uint16_t *srcp = CONVERT_TO_SHORTPTR(src); 468 uint16_t *refp = CONVERT_TO_SHORTPTR(ref); 469 uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); 470 const int left_shift = 1; 471 int row_section = 0; 472 473 while (row_section < 16) { 474 sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad); 475 srcp += src_stride << left_shift; 476 refp += ref_stride << left_shift; 477 secp += 64 << left_shift; 478 row_section += 1; 479 } 480 return get_sad_from_mm256_epi32(&sad); 481 } 482 483 unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride, 484 const uint8_t *ref, int ref_stride, 485 const uint8_t *second_pred) { 486 const int left_shift = 5; 487 uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, 488 second_pred); 489 src += src_stride << left_shift; 490 ref += ref_stride << left_shift; 491 second_pred += 64 << left_shift; 492 sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, 493 second_pred); 494 return sum; 495 } 496 497 unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride, 498 const uint8_t *ref, int ref_stride, 499 const uint8_t *second_pred) { 500 const int left_shift = 6; 501 uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, 502 second_pred); 503 src += src_stride << left_shift; 504 ref += ref_stride << left_shift; 505 second_pred += 64 << left_shift; 506 sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, 507 second_pred); 508 return sum; 509 } 510 511 unsigned int 
aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride, 512 const uint8_t *ref, int ref_stride, 513 const uint8_t *second_pred) { 514 __m256i sad = _mm256_setzero_si256(); 515 uint16_t *srcp = CONVERT_TO_SHORTPTR(src); 516 uint16_t *refp = CONVERT_TO_SHORTPTR(ref); 517 uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); 518 int row = 0; 519 while (row < 64) { 520 sad128x1(srcp, refp, secp, &sad); 521 srcp += src_stride; 522 refp += ref_stride; 523 secp += 16 << 3; 524 row += 1; 525 } 526 return get_sad_from_mm256_epi32(&sad); 527 } 528 529 unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride, 530 const uint8_t *ref, int ref_stride, 531 const uint8_t *second_pred) { 532 unsigned int sum; 533 const int left_shift = 6; 534 535 sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, 536 second_pred); 537 src += src_stride << left_shift; 538 ref += ref_stride << left_shift; 539 second_pred += 128 << left_shift; 540 sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, 541 second_pred); 542 return sum; 543 } 544 545 // SAD 4D 546 // Combine 4 __m256i input vectors v to uint32_t result[4] 547 static inline void get_4d_sad_from_mm256_epi32(const __m256i *v, 548 uint32_t *res) { 549 __m256i u0, u1, u2, u3; 550 const __m256i mask = _mm256_set1_epi64x(~0u); 551 __m128i sad; 552 553 // 8 32-bit summation 554 u0 = _mm256_srli_si256(v[0], 4); 555 u1 = _mm256_srli_si256(v[1], 4); 556 u2 = _mm256_srli_si256(v[2], 4); 557 u3 = _mm256_srli_si256(v[3], 4); 558 559 u0 = _mm256_add_epi32(u0, v[0]); 560 u1 = _mm256_add_epi32(u1, v[1]); 561 u2 = _mm256_add_epi32(u2, v[2]); 562 u3 = _mm256_add_epi32(u3, v[3]); 563 564 u0 = _mm256_and_si256(u0, mask); 565 u1 = _mm256_and_si256(u1, mask); 566 u2 = _mm256_and_si256(u2, mask); 567 u3 = _mm256_and_si256(u3, mask); 568 // 4 32-bit summation, evenly positioned 569 570 u1 = _mm256_slli_si256(u1, 4); 571 u3 = _mm256_slli_si256(u3, 4); 572 573 u0 = _mm256_or_si256(u0, u1); 574 u2 = 
_mm256_or_si256(u2, u3); 575 // 8 32-bit summation, interleaved 576 577 u1 = _mm256_unpacklo_epi64(u0, u2); 578 u3 = _mm256_unpackhi_epi64(u0, u2); 579 580 u0 = _mm256_add_epi32(u1, u3); 581 sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1), 582 _mm256_castsi256_si128(u0)); 583 _mm_storeu_si128((__m128i *)res, sad); 584 } 585 586 static void convert_pointers(const uint8_t *const ref8[], 587 const uint16_t *ref[]) { 588 ref[0] = CONVERT_TO_SHORTPTR(ref8[0]); 589 ref[1] = CONVERT_TO_SHORTPTR(ref8[1]); 590 ref[2] = CONVERT_TO_SHORTPTR(ref8[2]); 591 ref[3] = CONVERT_TO_SHORTPTR(ref8[3]); 592 } 593 594 static void init_sad(__m256i *s) { 595 s[0] = _mm256_setzero_si256(); 596 s[1] = _mm256_setzero_si256(); 597 s[2] = _mm256_setzero_si256(); 598 s[3] = _mm256_setzero_si256(); 599 } 600 601 static AOM_FORCE_INLINE void aom_highbd_sadMxNxD_avx2( 602 int M, int N, int D, const uint8_t *src, int src_stride, 603 const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) { 604 __m256i sad_vec[4]; 605 const uint16_t *refp[4]; 606 const uint16_t *keep = CONVERT_TO_SHORTPTR(src); 607 const uint16_t *srcp; 608 const int shift_for_rows = (M < 128) + (M < 64); 609 const int row_units = 1 << shift_for_rows; 610 int i, r; 611 612 init_sad(sad_vec); 613 convert_pointers(ref_array, refp); 614 615 for (i = 0; i < D; ++i) { 616 srcp = keep; 617 for (r = 0; r < N; r += row_units) { 618 if (M == 128) { 619 sad128x1(srcp, refp[i], NULL, &sad_vec[i]); 620 } else if (M == 64) { 621 sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]); 622 } else if (M == 32) { 623 sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); 624 } else if (M == 16) { 625 sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); 626 } else { 627 assert(0); 628 } 629 srcp += src_stride << shift_for_rows; 630 refp[i] += ref_stride << shift_for_rows; 631 } 632 } 633 get_4d_sad_from_mm256_epi32(sad_vec, sad_array); 634 } 635 636 #define HIGHBD_SAD_MXNX4D_AVX2(m, n) \ 637 void 
aom_highbd_sad##m##x##n##x4d_avx2( \ 638 const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ 639 int ref_stride, uint32_t sad_array[4]) { \ 640 aom_highbd_sadMxNxD_avx2(m, n, 4, src, src_stride, ref_array, ref_stride, \ 641 sad_array); \ 642 } 643 #define HIGHBD_SAD_SKIP_MXNX4D_AVX2(m, n) \ 644 void aom_highbd_sad_skip_##m##x##n##x4d_avx2( \ 645 const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ 646 int ref_stride, uint32_t sad_array[4]) { \ 647 aom_highbd_sadMxNxD_avx2(m, (n / 2), 4, src, 2 * src_stride, ref_array, \ 648 2 * ref_stride, sad_array); \ 649 sad_array[0] <<= 1; \ 650 sad_array[1] <<= 1; \ 651 sad_array[2] <<= 1; \ 652 sad_array[3] <<= 1; \ 653 } 654 #define HIGHBD_SAD_MXNX3D_AVX2(m, n) \ 655 void aom_highbd_sad##m##x##n##x3d_avx2( \ 656 const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ 657 int ref_stride, uint32_t sad_array[4]) { \ 658 aom_highbd_sadMxNxD_avx2(m, n, 3, src, src_stride, ref_array, ref_stride, \ 659 sad_array); \ 660 } 661 662 HIGHBD_SAD_MXNX4D_AVX2(16, 8) 663 HIGHBD_SAD_MXNX4D_AVX2(16, 16) 664 HIGHBD_SAD_MXNX4D_AVX2(16, 32) 665 666 HIGHBD_SAD_MXNX4D_AVX2(32, 16) 667 HIGHBD_SAD_MXNX4D_AVX2(32, 32) 668 HIGHBD_SAD_MXNX4D_AVX2(32, 64) 669 670 HIGHBD_SAD_MXNX4D_AVX2(64, 32) 671 HIGHBD_SAD_MXNX4D_AVX2(64, 64) 672 HIGHBD_SAD_MXNX4D_AVX2(64, 128) 673 674 HIGHBD_SAD_MXNX4D_AVX2(128, 64) 675 HIGHBD_SAD_MXNX4D_AVX2(128, 128) 676 677 #if !CONFIG_REALTIME_ONLY 678 HIGHBD_SAD_MXNX4D_AVX2(16, 4) 679 HIGHBD_SAD_MXNX4D_AVX2(16, 64) 680 HIGHBD_SAD_MXNX4D_AVX2(32, 8) 681 HIGHBD_SAD_MXNX4D_AVX2(64, 16) 682 #endif // !CONFIG_REALTIME_ONLY 683 684 HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 16) 685 HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 32) 686 687 HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 16) 688 HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 32) 689 HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 64) 690 691 HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 32) 692 HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 64) 693 HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 128) 694 695 
// Remaining row-skipping 4-reference entry points.
HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 64)
HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 128)

#if !CONFIG_REALTIME_ONLY
HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 64)
HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// 3-reference entry points.
HIGHBD_SAD_MXNX3D_AVX2(16, 8)
HIGHBD_SAD_MXNX3D_AVX2(16, 16)
HIGHBD_SAD_MXNX3D_AVX2(16, 32)

HIGHBD_SAD_MXNX3D_AVX2(32, 16)
HIGHBD_SAD_MXNX3D_AVX2(32, 32)
HIGHBD_SAD_MXNX3D_AVX2(32, 64)

HIGHBD_SAD_MXNX3D_AVX2(64, 32)
HIGHBD_SAD_MXNX3D_AVX2(64, 64)
HIGHBD_SAD_MXNX3D_AVX2(64, 128)

HIGHBD_SAD_MXNX3D_AVX2(128, 64)
HIGHBD_SAD_MXNX3D_AVX2(128, 128)

#if !CONFIG_REALTIME_ONLY
HIGHBD_SAD_MXNX3D_AVX2(16, 4)
HIGHBD_SAD_MXNX3D_AVX2(16, 64)
HIGHBD_SAD_MXNX3D_AVX2(32, 8)
HIGHBD_SAD_MXNX3D_AVX2(64, 16)
#endif  // !CONFIG_REALTIME_ONLY