intrapred_sse2.c (56011B)
1 /* 2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <emmintrin.h> 13 #include "aom_dsp/x86/intrapred_x86.h" 14 #include "config/aom_dsp_rtcd.h" 15 16 static inline void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, 17 ptrdiff_t stride) { 18 for (int i = 0; i < height; i += 2) { 19 *(uint32_t *)dst = dc; 20 dst += stride; 21 *(uint32_t *)dst = dc; 22 dst += stride; 23 } 24 } 25 26 static inline void dc_store_8xh(const __m128i *row, int height, uint8_t *dst, 27 ptrdiff_t stride) { 28 int i; 29 for (i = 0; i < height; ++i) { 30 _mm_storel_epi64((__m128i *)dst, *row); 31 dst += stride; 32 } 33 } 34 35 static inline void dc_store_16xh(const __m128i *row, int height, uint8_t *dst, 36 ptrdiff_t stride) { 37 int i; 38 for (i = 0; i < height; ++i) { 39 _mm_store_si128((__m128i *)dst, *row); 40 dst += stride; 41 } 42 } 43 44 static inline void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, 45 ptrdiff_t stride) { 46 int i; 47 for (i = 0; i < height; ++i) { 48 _mm_store_si128((__m128i *)dst, *row); 49 _mm_store_si128((__m128i *)(dst + 16), *row); 50 dst += stride; 51 } 52 } 53 54 static inline void dc_store_64xh(const __m128i *row, int height, uint8_t *dst, 55 ptrdiff_t stride) { 56 for (int i = 0; i < height; ++i) { 57 _mm_store_si128((__m128i *)dst, *row); 58 _mm_store_si128((__m128i *)(dst + 16), *row); 59 _mm_store_si128((__m128i *)(dst + 32), *row); 60 _mm_store_si128((__m128i *)(dst + 48), *row); 61 dst += stride; 62 } 63 } 64 65 static inline __m128i dc_sum_4(const uint8_t *ref) { 66 __m128i x = _mm_loadl_epi64((__m128i const *)ref); 67 const __m128i zero = _mm_setzero_si128(); 68 x = _mm_unpacklo_epi8(x, zero); 69 return _mm_sad_epu8(x, zero); 70 } 71 72 static inline __m128i dc_sum_8(const uint8_t *ref) { 73 __m128i x = _mm_loadl_epi64((__m128i const *)ref); 74 const __m128i zero = _mm_setzero_si128(); 75 return _mm_sad_epu8(x, zero); 76 } 77 78 static inline __m128i dc_sum_64(const uint8_t *ref) { 79 __m128i x0 = _mm_load_si128((__m128i const *)ref); 80 __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); 81 __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32)); 82 __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48)); 83 const __m128i zero = _mm_setzero_si128(); 84 x0 = _mm_sad_epu8(x0, zero); 85 x1 = _mm_sad_epu8(x1, zero); 86 x2 = _mm_sad_epu8(x2, zero); 87 x3 = _mm_sad_epu8(x3, zero); 88 x0 = _mm_add_epi16(x0, x1); 89 x2 = _mm_add_epi16(x2, x3); 90 x0 = _mm_add_epi16(x0, x2); 91 const __m128i high = _mm_unpackhi_epi64(x0, x0); 92 return _mm_add_epi16(x0, high); 93 } 94 95 #define DC_MULTIPLIER_1X2 0x5556 96 #define DC_MULTIPLIER_1X4 0x3334 97 98 #define DC_SHIFT2 16 99 100 static inline int divide_using_multiply_shift(int num, int shift1, 101 int multiplier) { 102 const int interm = num >> shift1; 103 return interm * multiplier >> DC_SHIFT2; 104 } 105 106 // ----------------------------------------------------------------------------- 107 // DC_PRED 108 109 void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, 110 const uint8_t *above, const uint8_t *left) { 111 const __m128i sum_left = dc_sum_8(left); 112 __m128i sum_above = dc_sum_4(above); 113 sum_above = _mm_add_epi16(sum_left, sum_above); 114 115 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); 116 sum += 6; 117 sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); 118 119 const __m128i row = _mm_set1_epi8((int8_t)sum); 120 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row); 121 dc_store_4xh(pred, 8, dst, stride); 122 } 123 124 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 125 void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, 126 const uint8_t *above, const uint8_t *left) { 127 const __m128i sum_left = dc_sum_16_sse2(left); 128 __m128i sum_above = dc_sum_4(above); 129 sum_above = _mm_add_epi16(sum_left, sum_above); 130 131 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); 132 sum += 10; 133 sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); 134 135 const __m128i row = _mm_set1_epi8((int8_t)sum); 136 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row); 137 dc_store_4xh(pred, 16, dst, stride); 138 } 139 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 140 141 void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, 142 const uint8_t *above, const uint8_t *left) { 143 const __m128i sum_left = dc_sum_4(left); 144 __m128i sum_above = dc_sum_8(above); 145 sum_above = _mm_add_epi16(sum_above, sum_left); 146 147 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); 148 sum += 6; 149 sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); 150 151 const __m128i row = _mm_set1_epi8((int8_t)sum); 152 dc_store_8xh(&row, 4, dst, stride); 153 } 154 155 void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, 156 const uint8_t *above, const uint8_t *left) { 157 const __m128i sum_left = dc_sum_16_sse2(left); 158 __m128i sum_above = dc_sum_8(above); 159 sum_above = _mm_add_epi16(sum_above, sum_left); 160 161 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); 162 sum += 12; 163 sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); 164 const __m128i row = _mm_set1_epi8((int8_t)sum); 165 dc_store_8xh(&row, 16, dst, stride); 166 } 167 168 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 169 void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, 170 const uint8_t *above, const uint8_t *left) { 171 const __m128i sum_left = dc_sum_32_sse2(left); 172 __m128i sum_above = dc_sum_8(above); 173 sum_above = _mm_add_epi16(sum_above, sum_left); 174 175 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); 176 sum += 20; 177 sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); 178 const __m128i row = _mm_set1_epi8((int8_t)sum); 179 dc_store_8xh(&row, 32, dst, stride); 180 } 181 182 void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, 183 const uint8_t *above, const uint8_t *left) { 184 const __m128i sum_left = dc_sum_4(left); 185 __m128i sum_above = dc_sum_16_sse2(above); 186 sum_above = _mm_add_epi16(sum_above, sum_left); 187 188 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); 189 sum += 10; 190 sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); 191 const __m128i row = _mm_set1_epi8((int8_t)sum); 192 dc_store_16xh(&row, 4, dst, stride); 193 } 194 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 195 196 void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, 197 const uint8_t *above, const uint8_t *left) { 198 const __m128i sum_left = dc_sum_8(left); 199 __m128i sum_above = dc_sum_16_sse2(above); 200 sum_above = _mm_add_epi16(sum_above, sum_left); 201 202 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); 203 sum += 12; 204 sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); 205 const __m128i row = _mm_set1_epi8((int8_t)sum); 206 dc_store_16xh(&row, 8, dst, stride); 207 } 208 209 void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, 210 const uint8_t *above, const uint8_t *left) { 211 const __m128i sum_left = dc_sum_32_sse2(left); 212 __m128i sum_above = dc_sum_16_sse2(above); 213 sum_above = _mm_add_epi16(sum_left, sum_above); 214 215 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); 216 sum += 24; 217 sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); 218 const __m128i row = _mm_set1_epi8((int8_t)sum); 219 dc_store_16xh(&row, 32, dst, stride); 220 } 221 222 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 223 void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, 224 const uint8_t *above, const uint8_t *left) { 225 const __m128i sum_left = dc_sum_64(left); 226 __m128i sum_above = dc_sum_16_sse2(above); 227 sum_above = _mm_add_epi16(sum_left, sum_above); 228 229 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); 230 sum += 40; 231 sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); 232 const __m128i row = _mm_set1_epi8((int8_t)sum); 233 dc_store_16xh(&row, 64, dst, stride); 234 } 235 236 void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, 237 const uint8_t *above, const uint8_t *left) { 238 __m128i sum_above = dc_sum_32_sse2(above); 239 const __m128i sum_left = dc_sum_8(left); 240 sum_above = _mm_add_epi16(sum_above, sum_left); 241 242 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); 243 sum += 20; 244 sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); 245 const __m128i row = _mm_set1_epi8((int8_t)sum); 246 dc_store_32xh(&row, 8, dst, stride); 247 } 248 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 249 250 void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, 251 const uint8_t *above, const uint8_t *left) { 252 __m128i sum_above = dc_sum_32_sse2(above); 253 const __m128i sum_left = dc_sum_16_sse2(left); 254 sum_above = _mm_add_epi16(sum_above, sum_left); 255 256 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); 257 sum += 24; 258 sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); 259 const __m128i row = _mm_set1_epi8((int8_t)sum); 260 dc_store_32xh(&row, 16, dst, stride); 261 } 262 263 void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, 264 const uint8_t *above, const uint8_t *left) { 265 __m128i sum_above = dc_sum_32_sse2(above); 266 const __m128i sum_left = dc_sum_64(left); 267 sum_above = _mm_add_epi16(sum_above, sum_left); 268 269 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); 270 sum += 48; 271 sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); 272 const __m128i row = _mm_set1_epi8((int8_t)sum); 273 dc_store_32xh(&row, 64, dst, stride); 274 } 275 276 void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, 277 const uint8_t *above, const uint8_t *left) { 278 __m128i sum_above = dc_sum_64(above); 279 const __m128i sum_left = dc_sum_64(left); 280 sum_above = _mm_add_epi16(sum_above, sum_left); 281 282 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); 283 sum += 64; 284 sum /= 128; 285 const __m128i row = _mm_set1_epi8((int8_t)sum); 286 dc_store_64xh(&row, 64, dst, stride); 287 } 288 289 void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, 290 const uint8_t *above, const uint8_t *left) { 291 __m128i sum_above = dc_sum_64(above); 292 const __m128i sum_left = dc_sum_32_sse2(left); 293 sum_above = _mm_add_epi16(sum_above, sum_left); 294 295 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); 296 sum += 48; 297 sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); 298 const __m128i row = _mm_set1_epi8((int8_t)sum); 299 dc_store_64xh(&row, 32, dst, stride); 300 } 301 302 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 303 void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, 304 const uint8_t *above, const uint8_t *left) { 305 __m128i sum_above = dc_sum_64(above); 306 const __m128i sum_left = dc_sum_16_sse2(left); 307 sum_above = _mm_add_epi16(sum_above, sum_left); 308 309 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); 310 sum += 40; 311 sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); 312 const __m128i row = _mm_set1_epi8((int8_t)sum); 313 dc_store_64xh(&row, 16, dst, stride); 314 } 315 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 316 317 // ----------------------------------------------------------------------------- 318 // DC_TOP 319 320 void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, 321 const uint8_t *above, const uint8_t *left) { 322 (void)left; 323 __m128i sum_above = dc_sum_4(above); 324 const __m128i two = _mm_set1_epi16(2); 325 sum_above = _mm_add_epi16(sum_above, two); 326 sum_above = _mm_srai_epi16(sum_above, 2); 327 sum_above = _mm_shufflelo_epi16(sum_above, 0); 328 sum_above = _mm_packus_epi16(sum_above, sum_above); 329 330 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above); 331 dc_store_4xh(pred, 8, dst, stride); 332 } 333 334 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 335 void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, 336 const uint8_t *above, const uint8_t *left) { 337 (void)left; 338 __m128i sum_above = dc_sum_4(above); 339 const __m128i two = _mm_set1_epi16(2); 340 sum_above = _mm_add_epi16(sum_above, two); 341 sum_above = _mm_srai_epi16(sum_above, 2); 342 sum_above = _mm_shufflelo_epi16(sum_above, 0); 343 sum_above = _mm_packus_epi16(sum_above, sum_above); 344 345 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above); 346 dc_store_4xh(pred, 16, dst, stride); 347 } 348 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 349 350 void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, 351 const uint8_t *above, const uint8_t *left) { 352 (void)left; 353 __m128i sum_above = dc_sum_8(above); 354 const __m128i four = _mm_set1_epi16(4); 355 sum_above = _mm_add_epi16(sum_above, four); 356 sum_above = _mm_srai_epi16(sum_above, 3); 357 sum_above = _mm_unpacklo_epi8(sum_above, sum_above); 358 const __m128i row = _mm_shufflelo_epi16(sum_above, 0); 359 dc_store_8xh(&row, 4, dst, stride); 360 } 361 362 void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, 363 const uint8_t *above, const uint8_t *left) { 364 (void)left; 365 __m128i sum_above = dc_sum_8(above); 366 const __m128i four = _mm_set1_epi16(4); 367 sum_above = _mm_add_epi16(sum_above, four); 368 sum_above = _mm_srai_epi16(sum_above, 3); 369 sum_above = _mm_unpacklo_epi8(sum_above, sum_above); 370 const __m128i row = _mm_shufflelo_epi16(sum_above, 0); 371 dc_store_8xh(&row, 16, dst, stride); 372 } 373 374 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 375 void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, 376 const uint8_t *above, const uint8_t *left) { 377 (void)left; 378 __m128i sum_above = dc_sum_8(above); 379 const __m128i four = _mm_set1_epi16(4); 380 sum_above = _mm_add_epi16(sum_above, four); 381 sum_above = _mm_srai_epi16(sum_above, 3); 382 sum_above = _mm_unpacklo_epi8(sum_above, sum_above); 383 const __m128i row = _mm_shufflelo_epi16(sum_above, 0); 384 dc_store_8xh(&row, 32, dst, stride); 385 } 386 387 void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, 388 const uint8_t *above, const uint8_t *left) { 389 (void)left; 390 __m128i sum_above = dc_sum_16_sse2(above); 391 const __m128i eight = _mm_set1_epi16(8); 392 sum_above = _mm_add_epi16(sum_above, eight); 393 sum_above = _mm_srai_epi16(sum_above, 4); 394 sum_above = _mm_unpacklo_epi8(sum_above, sum_above); 395 sum_above = _mm_shufflelo_epi16(sum_above, 0); 396 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); 397 dc_store_16xh(&row, 4, dst, stride); 398 } 399 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 400 401 void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, 402 const uint8_t *above, const uint8_t *left) { 403 (void)left; 404 __m128i sum_above = dc_sum_16_sse2(above); 405 const __m128i eight = _mm_set1_epi16(8); 406 sum_above = _mm_add_epi16(sum_above, eight); 407 sum_above = _mm_srai_epi16(sum_above, 4); 408 sum_above = _mm_unpacklo_epi8(sum_above, sum_above); 409 sum_above = _mm_shufflelo_epi16(sum_above, 0); 410 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); 411 dc_store_16xh(&row, 8, dst, stride); 412 } 413 414 void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, 415 const uint8_t *above, 416 const uint8_t *left) { 417 (void)left; 418 __m128i sum_above = dc_sum_16_sse2(above); 419 const __m128i eight = _mm_set1_epi16(8); 420 sum_above = _mm_add_epi16(sum_above, eight); 421 sum_above = _mm_srai_epi16(sum_above, 4); 422 sum_above = _mm_unpacklo_epi8(sum_above, sum_above); 423 sum_above = _mm_shufflelo_epi16(sum_above, 0); 424 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); 425 dc_store_16xh(&row, 32, dst, stride); 426 } 427 428 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 429 void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, 430 const uint8_t *above, 431 const uint8_t *left) { 432 (void)left; 433 __m128i sum_above = dc_sum_16_sse2(above); 434 const __m128i eight = _mm_set1_epi16(8); 435 sum_above = _mm_add_epi16(sum_above, eight); 436 sum_above = _mm_srai_epi16(sum_above, 4); 437 sum_above = _mm_unpacklo_epi8(sum_above, sum_above); 438 sum_above = _mm_shufflelo_epi16(sum_above, 0); 439 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); 440 dc_store_16xh(&row, 64, dst, stride); 441 } 442 443 void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, 444 const uint8_t *above, const uint8_t *left) { 445 (void)left; 446 __m128i sum_above = dc_sum_32_sse2(above); 447 const __m128i sixteen = _mm_set1_epi16(16); 448 sum_above = _mm_add_epi16(sum_above, sixteen); 449 sum_above = _mm_srai_epi16(sum_above, 5); 450 sum_above = _mm_unpacklo_epi8(sum_above, sum_above); 451 sum_above = _mm_shufflelo_epi16(sum_above, 0); 452 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); 453 dc_store_32xh(&row, 8, dst, stride); 454 } 455 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 456 457 void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, 458 const uint8_t *above, 459 const uint8_t *left) { 460 (void)left; 461 __m128i sum_above = dc_sum_32_sse2(above); 462 const __m128i sixteen = _mm_set1_epi16(16); 463 sum_above = _mm_add_epi16(sum_above, sixteen); 464 sum_above = _mm_srai_epi16(sum_above, 5); 465 sum_above = _mm_unpacklo_epi8(sum_above, sum_above); 466 sum_above = _mm_shufflelo_epi16(sum_above, 0); 467 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); 468 dc_store_32xh(&row, 16, dst, stride); 469 } 470 471 void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, 472 const uint8_t *above, 473 const uint8_t *left) { 474 (void)left; 475 __m128i sum_above = dc_sum_32_sse2(above); 476 const __m128i sixteen = _mm_set1_epi16(16); 477 sum_above = _mm_add_epi16(sum_above, sixteen); 478 sum_above = _mm_srai_epi16(sum_above, 5); 479 sum_above = _mm_unpacklo_epi8(sum_above, sum_above); 480 sum_above = _mm_shufflelo_epi16(sum_above, 0); 481 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); 482 dc_store_32xh(&row, 64, dst, stride); 483 } 484 485 void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, 486 const uint8_t *above, 487 const uint8_t *left) { 488 (void)left; 489 __m128i sum_above = dc_sum_64(above); 490 const __m128i thirtytwo = _mm_set1_epi16(32); 491 sum_above = _mm_add_epi16(sum_above, thirtytwo); 492 sum_above = _mm_srai_epi16(sum_above, 6); 493 sum_above = _mm_unpacklo_epi8(sum_above, sum_above); 494 sum_above = _mm_shufflelo_epi16(sum_above, 0); 495 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); 496 dc_store_64xh(&row, 64, dst, stride); 497 } 498 499 void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, 500 const uint8_t *above, 501 const uint8_t *left) { 502 (void)left; 503 __m128i sum_above = dc_sum_64(above); 504 const __m128i thirtytwo = _mm_set1_epi16(32); 505 sum_above = _mm_add_epi16(sum_above, thirtytwo); 506 sum_above = _mm_srai_epi16(sum_above, 6); 507 sum_above = _mm_unpacklo_epi8(sum_above, sum_above); 508 sum_above = _mm_shufflelo_epi16(sum_above, 0); 509 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); 510 dc_store_64xh(&row, 32, dst, stride); 511 } 512 513 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 514 void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, 515 const uint8_t *above, 516 const uint8_t *left) { 517 (void)left; 518 __m128i sum_above = dc_sum_64(above); 519 const __m128i thirtytwo = _mm_set1_epi16(32); 520 sum_above = _mm_add_epi16(sum_above, thirtytwo); 521 sum_above = _mm_srai_epi16(sum_above, 6); 522 sum_above = _mm_unpacklo_epi8(sum_above, sum_above); 523 sum_above = _mm_shufflelo_epi16(sum_above, 0); 524 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); 525 dc_store_64xh(&row, 16, dst, stride); 526 } 527 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 528 529 // ----------------------------------------------------------------------------- 530 // DC_LEFT 531 532 void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, 533 const uint8_t *above, const uint8_t *left) { 534 (void)above; 535 __m128i sum_left = dc_sum_8(left); 536 const __m128i four = _mm_set1_epi16(4); 537 sum_left = _mm_add_epi16(sum_left, four); 538 sum_left = _mm_srai_epi16(sum_left, 3); 539 sum_left = _mm_shufflelo_epi16(sum_left, 0); 540 sum_left = _mm_packus_epi16(sum_left, sum_left); 541 542 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left); 543 dc_store_4xh(pred, 8, dst, stride); 544 } 545 546 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 547 void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, 548 const uint8_t *above, 549 const uint8_t *left) { 550 (void)above; 551 __m128i sum_left = dc_sum_16_sse2(left); 552 const __m128i eight = _mm_set1_epi16(8); 553 sum_left = _mm_add_epi16(sum_left, eight); 554 sum_left = _mm_srai_epi16(sum_left, 4); 555 sum_left = _mm_shufflelo_epi16(sum_left, 0); 556 sum_left = _mm_packus_epi16(sum_left, sum_left); 557 558 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left); 559 dc_store_4xh(pred, 16, dst, stride); 560 } 561 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 562 563 void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, 564 const uint8_t *above, const uint8_t *left) { 565 (void)above; 566 __m128i sum_left = dc_sum_4(left); 567 const __m128i two = _mm_set1_epi16(2); 568 sum_left = _mm_add_epi16(sum_left, two); 569 sum_left = _mm_srai_epi16(sum_left, 2); 570 sum_left = _mm_unpacklo_epi8(sum_left, sum_left); 571 const __m128i row = _mm_shufflelo_epi16(sum_left, 0); 572 dc_store_8xh(&row, 4, dst, stride); 573 } 574 575 void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, 576 const uint8_t *above, 577 const uint8_t *left) { 578 (void)above; 579 __m128i sum_left = dc_sum_16_sse2(left); 580 const __m128i eight = _mm_set1_epi16(8); 581 sum_left = _mm_add_epi16(sum_left, eight); 582 sum_left = _mm_srai_epi16(sum_left, 4); 583 sum_left = _mm_unpacklo_epi8(sum_left, sum_left); 584 const __m128i row = _mm_shufflelo_epi16(sum_left, 0); 585 dc_store_8xh(&row, 16, dst, stride); 586 } 587 588 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 589 void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, 590 const uint8_t *above, 591 const uint8_t *left) { 592 (void)above; 593 __m128i sum_left = dc_sum_32_sse2(left); 594 const __m128i sixteen = _mm_set1_epi16(16); 595 sum_left = _mm_add_epi16(sum_left, sixteen); 596 sum_left = _mm_srai_epi16(sum_left, 5); 597 sum_left = _mm_unpacklo_epi8(sum_left, sum_left); 598 const __m128i row = _mm_shufflelo_epi16(sum_left, 0); 599 dc_store_8xh(&row, 32, dst, stride); 600 } 601 602 void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, 603 const uint8_t *above, 604 const uint8_t *left) { 605 (void)above; 606 __m128i sum_left = dc_sum_4(left); 607 const __m128i two = _mm_set1_epi16(2); 608 sum_left = _mm_add_epi16(sum_left, two); 609 sum_left = _mm_srai_epi16(sum_left, 2); 610 sum_left = _mm_unpacklo_epi8(sum_left, sum_left); 611 sum_left = _mm_shufflelo_epi16(sum_left, 0); 612 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); 613 dc_store_16xh(&row, 4, dst, stride); 614 } 615 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 616 617 void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, 618 const uint8_t *above, 619 const uint8_t *left) { 620 (void)above; 621 __m128i sum_left = dc_sum_8(left); 622 const __m128i four = _mm_set1_epi16(4); 623 sum_left = _mm_add_epi16(sum_left, four); 624 sum_left = _mm_srai_epi16(sum_left, 3); 625 sum_left = _mm_unpacklo_epi8(sum_left, sum_left); 626 sum_left = _mm_shufflelo_epi16(sum_left, 0); 627 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); 628 dc_store_16xh(&row, 8, dst, stride); 629 } 630 631 void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, 632 const uint8_t *above, 633 const uint8_t *left) { 634 (void)above; 635 __m128i sum_left = dc_sum_32_sse2(left); 636 const __m128i sixteen = _mm_set1_epi16(16); 637 sum_left = _mm_add_epi16(sum_left, sixteen); 638 sum_left = _mm_srai_epi16(sum_left, 5); 639 sum_left = _mm_unpacklo_epi8(sum_left, sum_left); 640 sum_left = _mm_shufflelo_epi16(sum_left, 0); 641 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); 642 dc_store_16xh(&row, 32, dst, stride); 643 } 644 645 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 646 void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, 647 const uint8_t *above, 648 const uint8_t *left) { 649 (void)above; 650 __m128i sum_left = dc_sum_64(left); 651 const __m128i thirtytwo = _mm_set1_epi16(32); 652 sum_left = _mm_add_epi16(sum_left, thirtytwo); 653 sum_left = _mm_srai_epi16(sum_left, 6); 654 sum_left = _mm_unpacklo_epi8(sum_left, sum_left); 655 sum_left = _mm_shufflelo_epi16(sum_left, 0); 656 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); 657 dc_store_16xh(&row, 64, dst, stride); 658 } 659 660 void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, 661 const uint8_t *above, 662 const uint8_t *left) { 663 (void)above; 664 __m128i sum_left = dc_sum_8(left); 665 const __m128i four = _mm_set1_epi16(4); 666 sum_left = _mm_add_epi16(sum_left, four); 667 sum_left = _mm_srai_epi16(sum_left, 3); 668 sum_left = _mm_unpacklo_epi8(sum_left, sum_left); 669 sum_left = _mm_shufflelo_epi16(sum_left, 0); 670 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); 671 dc_store_32xh(&row, 8, dst, stride); 672 } 673 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 674 675 void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, 676 const uint8_t *above, 677 const uint8_t *left) { 678 (void)above; 679 __m128i sum_left = dc_sum_16_sse2(left); 680 const __m128i eight = _mm_set1_epi16(8); 681 sum_left = _mm_add_epi16(sum_left, eight); 682 sum_left = _mm_srai_epi16(sum_left, 4); 683 sum_left = _mm_unpacklo_epi8(sum_left, sum_left); 684 sum_left = _mm_shufflelo_epi16(sum_left, 0); 685 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); 686 dc_store_32xh(&row, 16, dst, stride); 687 } 688 689 void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, 690 const uint8_t *above, 691 const uint8_t *left) { 692 (void)above; 693 __m128i sum_left = dc_sum_64(left); 694 const __m128i thirtytwo = _mm_set1_epi16(32); 695 sum_left = _mm_add_epi16(sum_left, thirtytwo); 696 sum_left = _mm_srai_epi16(sum_left, 6); 697 sum_left = _mm_unpacklo_epi8(sum_left, sum_left); 698 sum_left = _mm_shufflelo_epi16(sum_left, 0); 699 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); 700 dc_store_32xh(&row, 64, dst, stride); 701 } 702 703 void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, 704 const uint8_t *above, 705 const uint8_t *left) { 706 (void)above; 707 __m128i sum_left = dc_sum_64(left); 708 const __m128i thirtytwo = _mm_set1_epi16(32); 709 sum_left = _mm_add_epi16(sum_left, thirtytwo); 710 sum_left = _mm_srai_epi16(sum_left, 6); 711 sum_left = _mm_unpacklo_epi8(sum_left, sum_left); 712 sum_left = _mm_shufflelo_epi16(sum_left, 0); 713 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); 714 dc_store_64xh(&row, 64, dst, stride); 715 } 716 717 void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, 718 const uint8_t *above, 719 const uint8_t *left) { 720 (void)above; 721 __m128i sum_left = dc_sum_32_sse2(left); 722 const __m128i sixteen = _mm_set1_epi16(16); 723 sum_left = _mm_add_epi16(sum_left, sixteen); 724 sum_left = _mm_srai_epi16(sum_left, 5); 725 sum_left = _mm_unpacklo_epi8(sum_left, sum_left); 726 sum_left = _mm_shufflelo_epi16(sum_left, 0); 727 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); 728 dc_store_64xh(&row, 32, dst, stride); 729 } 730 731 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 732 void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, 733 const uint8_t *above, 734 const uint8_t *left) { 735 (void)above; 736 __m128i sum_left = dc_sum_16_sse2(left); 737 const __m128i eight = _mm_set1_epi16(8); 738 sum_left = _mm_add_epi16(sum_left, eight); 739 sum_left = _mm_srai_epi16(sum_left, 4); 740 sum_left = _mm_unpacklo_epi8(sum_left, sum_left); 741 sum_left = _mm_shufflelo_epi16(sum_left, 0); 742 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); 743 dc_store_64xh(&row, 16, dst, stride); 744 } 745 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 746 747 // ----------------------------------------------------------------------------- 748 // DC_128 749 750 void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, 751 const uint8_t *above, const uint8_t *left) { 752 (void)above; 753 (void)left; 754 const uint32_t pred = 0x80808080; 755 dc_store_4xh(pred, 8, dst, stride); 756 } 757 758 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 759 void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, 760 const uint8_t *above, const uint8_t *left) { 761 (void)above; 762 (void)left; 763 const uint32_t pred = 0x80808080; 764 dc_store_4xh(pred, 16, dst, stride); 765 } 766 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 767 768 void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, 769 const uint8_t *above, const uint8_t *left) { 770 (void)above; 771 (void)left; 772 const __m128i row = _mm_set1_epi8((int8_t)128); 773 dc_store_8xh(&row, 4, dst, stride); 774 } 775 776 void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, 777 const uint8_t *above, const uint8_t *left) { 778 (void)above; 779 (void)left; 780 const __m128i row = _mm_set1_epi8((int8_t)128); 781 dc_store_8xh(&row, 16, dst, stride); 782 } 783 784 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 785 void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, 786 const uint8_t *above, const uint8_t *left) { 787 (void)above; 788 (void)left; 789 const __m128i row = _mm_set1_epi8((int8_t)128); 790 dc_store_8xh(&row, 32, dst, stride); 791 } 792 793 void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, 794 const uint8_t *above, const uint8_t *left) { 795 (void)above; 796 (void)left; 797 const __m128i row = _mm_set1_epi8((int8_t)128); 798 dc_store_16xh(&row, 4, dst, stride); 799 } 800 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 801 802 void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, 803 const uint8_t *above, const uint8_t *left) { 804 (void)above; 805 (void)left; 806 const __m128i row = _mm_set1_epi8((int8_t)128); 807 dc_store_16xh(&row, 8, dst, stride); 808 } 809 810 void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, 811 const uint8_t *above, 812 const uint8_t *left) { 813 (void)above; 814 (void)left; 815 const __m128i row = _mm_set1_epi8((int8_t)128); 816 dc_store_16xh(&row, 32, dst, stride); 817 } 818 819 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 820 void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, 821 const uint8_t *above, 822 const uint8_t *left) { 823 (void)above; 824 (void)left; 825 const __m128i row = _mm_set1_epi8((int8_t)128); 826 dc_store_16xh(&row, 64, dst, stride); 827 } 828 829 void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, 830 const uint8_t *above, const uint8_t *left) { 831 (void)above; 832 (void)left; 833 const __m128i row = _mm_set1_epi8((int8_t)128); 834 dc_store_32xh(&row, 8, dst, stride); 835 } 836 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 837 838 void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, 839 const uint8_t *above, 840 const uint8_t *left) { 841 (void)above; 842 (void)left; 843 const __m128i row = _mm_set1_epi8((int8_t)128); 844 dc_store_32xh(&row, 16, dst, stride); 845 } 846 847 void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, 848 const uint8_t *above, 849 const uint8_t *left) { 850 (void)above; 851 (void)left; 852 const __m128i row = _mm_set1_epi8((int8_t)128); 853 dc_store_32xh(&row, 64, dst, stride); 854 } 855 856 void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, 857 const uint8_t *above, 858 const uint8_t *left) { 859 (void)above; 860 (void)left; 861 const __m128i row = _mm_set1_epi8((int8_t)128); 862 dc_store_64xh(&row, 64, dst, stride); 863 } 864 865 void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, 866 const uint8_t *above, 867 const uint8_t *left) { 868 (void)above; 869 (void)left; 870 const __m128i row = _mm_set1_epi8((int8_t)128); 871 dc_store_64xh(&row, 32, dst, stride); 872 } 873 874 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 875 void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, 876 const uint8_t *above, 877 const uint8_t *left) { 878 (void)above; 879 (void)left; 880 const __m128i row = _mm_set1_epi8((int8_t)128); 881 dc_store_64xh(&row, 16, dst, stride); 882 } 883 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 884 885 // ----------------------------------------------------------------------------- 886 // V_PRED 887 888 void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, 889 const uint8_t *above, const uint8_t *left) { 890 const uint32_t pred = *(uint32_t *)above; 891 (void)left; 892 dc_store_4xh(pred, 8, dst, stride); 893 } 894 895 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 896 void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, 897 const uint8_t *above, const uint8_t *left) { 898 const uint32_t pred = *(uint32_t *)above; 899 (void)left; 900 dc_store_4xh(pred, 16, dst, stride); 901 } 902 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 903 904 void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, 905 const uint8_t *above, const uint8_t *left) { 906 const __m128i row = _mm_loadl_epi64((__m128i const *)above); 907 (void)left; 908 dc_store_8xh(&row, 4, dst, stride); 909 } 910 911 void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, 912 const uint8_t *above, const uint8_t *left) { 913 const __m128i row = _mm_loadl_epi64((__m128i const *)above); 914 (void)left; 915 dc_store_8xh(&row, 16, dst, stride); 916 } 917 918 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 919 void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, 920 const uint8_t *above, const uint8_t *left) { 921 const __m128i row = _mm_loadl_epi64((__m128i const *)above); 922 (void)left; 923 dc_store_8xh(&row, 32, dst, stride); 924 } 925 926 void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, 927 const uint8_t *above, const uint8_t *left) { 928 const __m128i row = _mm_load_si128((__m128i const *)above); 929 (void)left; 930 dc_store_16xh(&row, 4, dst, stride); 931 } 932 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 933 934 void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, 935 const uint8_t *above, const uint8_t *left) { 936 const __m128i row = _mm_load_si128((__m128i const *)above); 937 (void)left; 938 dc_store_16xh(&row, 8, dst, stride); 939 } 940 941 void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, 942 const uint8_t *above, const uint8_t *left) { 943 const __m128i row = _mm_load_si128((__m128i const *)above); 944 (void)left; 945 dc_store_16xh(&row, 32, dst, stride); 946 } 947 948 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 949 void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, 950 const uint8_t *above, const uint8_t *left) { 951 const __m128i row = _mm_load_si128((__m128i const *)above); 952 (void)left; 953 dc_store_16xh(&row, 64, dst, stride); 954 } 955 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 956 957 static inline void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride, 958 const uint8_t *above, int height) { 959 const __m128i row0 = _mm_load_si128((__m128i const *)above); 960 const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); 961 for (int i = 0; i < height; ++i) { 962 _mm_store_si128((__m128i *)dst, row0); 963 _mm_store_si128((__m128i *)(dst + 16), row1); 964 dst += stride; 965 } 966 } 967 968 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 969 void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, 970 const uint8_t *above, const uint8_t *left) { 971 (void)left; 972 v_predictor_32xh(dst, stride, above, 8); 973 } 974 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 975 976 void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, 977 const uint8_t *above, const uint8_t *left) { 978 (void)left; 979 v_predictor_32xh(dst, stride, above, 16); 980 } 981 982 void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, 983 const uint8_t *above, const uint8_t *left) { 984 (void)left; 985 v_predictor_32xh(dst, stride, above, 64); 986 } 987 988 static inline void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride, 989 const uint8_t *above, int height) { 990 const __m128i row0 = _mm_load_si128((__m128i const *)above); 991 const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); 992 const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32)); 993 const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48)); 994 for (int i = 0; i < height; ++i) { 995 _mm_store_si128((__m128i *)dst, row0); 996 _mm_store_si128((__m128i *)(dst + 16), row1); 997 _mm_store_si128((__m128i *)(dst + 32), row2); 998 _mm_store_si128((__m128i *)(dst + 48), row3); 999 dst += stride; 1000 } 1001 } 1002 1003 void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, 1004 const uint8_t *above, const uint8_t *left) { 1005 (void)left; 1006 v_predictor_64xh(dst, stride, above, 64); 1007 } 1008 1009 void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, 1010 const uint8_t *above, const uint8_t *left) { 1011 (void)left; 1012 v_predictor_64xh(dst, stride, above, 32); 1013 } 1014 1015 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1016 void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, 1017 const uint8_t *above, const uint8_t *left) { 1018 (void)left; 1019 v_predictor_64xh(dst, stride, above, 16); 1020 } 1021 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1022 1023 // ----------------------------------------------------------------------------- 1024 // H_PRED 1025 1026 void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, 1027 const uint8_t *above, const uint8_t *left) { 1028 (void)above; 1029 __m128i left_col = _mm_loadl_epi64((__m128i const *)left); 1030 left_col = _mm_unpacklo_epi8(left_col, left_col); 1031 __m128i row0 = _mm_shufflelo_epi16(left_col, 0); 1032 __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); 1033 __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); 1034 __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); 1035 *(int *)dst = _mm_cvtsi128_si32(row0); 1036 dst += stride; 1037 *(int *)dst = _mm_cvtsi128_si32(row1); 1038 dst += stride; 1039 *(int *)dst = _mm_cvtsi128_si32(row2); 1040 dst += stride; 1041 *(int *)dst = _mm_cvtsi128_si32(row3); 1042 dst += stride; 1043 left_col = _mm_unpackhi_epi64(left_col, left_col); 1044 row0 = _mm_shufflelo_epi16(left_col, 0); 1045 row1 = _mm_shufflelo_epi16(left_col, 0x55); 1046 row2 = _mm_shufflelo_epi16(left_col, 0xaa); 1047 row3 = _mm_shufflelo_epi16(left_col, 0xff); 1048 *(int *)dst = _mm_cvtsi128_si32(row0); 1049 dst += stride; 1050 *(int *)dst = _mm_cvtsi128_si32(row1); 1051 dst += stride; 1052 *(int *)dst = _mm_cvtsi128_si32(row2); 1053 dst += stride; 1054 *(int *)dst = _mm_cvtsi128_si32(row3); 1055 } 1056 1057 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1058 void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, 1059 const uint8_t *above, const uint8_t *left) { 1060 (void)above; 1061 const __m128i left_col = _mm_load_si128((__m128i const *)left); 1062 __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); 1063 __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); 1064 1065 __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); 1066 __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); 1067 __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); 1068 __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); 1069 *(int *)dst = _mm_cvtsi128_si32(row0); 1070 dst += stride; 1071 *(int *)dst = _mm_cvtsi128_si32(row1); 1072 dst += stride; 1073 *(int *)dst = _mm_cvtsi128_si32(row2); 1074 dst += stride; 1075 *(int *)dst = _mm_cvtsi128_si32(row3); 1076 dst += stride; 1077 1078 left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); 1079 row0 = _mm_shufflelo_epi16(left_col_low, 0); 1080 row1 = _mm_shufflelo_epi16(left_col_low, 0x55); 1081 row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); 1082 row3 = _mm_shufflelo_epi16(left_col_low, 0xff); 1083 *(int *)dst = _mm_cvtsi128_si32(row0); 1084 dst += stride; 1085 *(int *)dst = _mm_cvtsi128_si32(row1); 1086 dst += stride; 1087 *(int *)dst = _mm_cvtsi128_si32(row2); 1088 dst += stride; 1089 *(int *)dst = _mm_cvtsi128_si32(row3); 1090 dst += stride; 1091 1092 row0 = _mm_shufflelo_epi16(left_col_high, 0); 1093 row1 = _mm_shufflelo_epi16(left_col_high, 0x55); 1094 row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); 1095 row3 = _mm_shufflelo_epi16(left_col_high, 0xff); 1096 *(int *)dst = _mm_cvtsi128_si32(row0); 1097 dst += stride; 1098 *(int *)dst = _mm_cvtsi128_si32(row1); 1099 dst += stride; 1100 *(int *)dst = _mm_cvtsi128_si32(row2); 1101 dst += stride; 1102 *(int *)dst = _mm_cvtsi128_si32(row3); 1103 dst += stride; 1104 1105 left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); 1106 row0 = _mm_shufflelo_epi16(left_col_high, 0); 1107 row1 = _mm_shufflelo_epi16(left_col_high, 0x55); 1108 row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); 1109 row3 = _mm_shufflelo_epi16(left_col_high, 0xff); 1110 *(int *)dst = _mm_cvtsi128_si32(row0); 1111 dst += stride; 1112 *(int *)dst = _mm_cvtsi128_si32(row1); 1113 dst += stride; 1114 *(int *)dst = _mm_cvtsi128_si32(row2); 1115 dst += stride; 1116 *(int *)dst = _mm_cvtsi128_si32(row3); 1117 } 1118 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1119 1120 void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, 1121 const uint8_t *above, const uint8_t *left) { 1122 (void)above; 1123 __m128i left_col = _mm_loadl_epi64((__m128i const *)left); 1124 left_col = _mm_unpacklo_epi8(left_col, left_col); 1125 __m128i row0 = _mm_shufflelo_epi16(left_col, 0); 1126 __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); 1127 __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); 1128 __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); 1129 _mm_storel_epi64((__m128i *)dst, row0); 1130 dst += stride; 1131 _mm_storel_epi64((__m128i *)dst, row1); 1132 dst += stride; 1133 _mm_storel_epi64((__m128i *)dst, row2); 1134 dst += stride; 1135 _mm_storel_epi64((__m128i *)dst, row3); 1136 } 1137 1138 static inline void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride, 1139 const uint8_t *above, const uint8_t *left, 1140 int count) { 1141 (void)above; 1142 for (int i = 0; i < count; ++i) { 1143 const __m128i left_col = _mm_load_si128((__m128i const *)left); 1144 __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); 1145 __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); 1146 1147 __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); 1148 __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); 1149 __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); 1150 __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); 1151 _mm_storel_epi64((__m128i *)dst, row0); 1152 dst += stride; 1153 _mm_storel_epi64((__m128i *)dst, row1); 1154 dst += stride; 1155 _mm_storel_epi64((__m128i *)dst, row2); 1156 dst += stride; 1157 _mm_storel_epi64((__m128i *)dst, row3); 1158 dst += stride; 1159 1160 left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); 1161 row0 = _mm_shufflelo_epi16(left_col_low, 0); 1162 row1 = _mm_shufflelo_epi16(left_col_low, 0x55); 1163 row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); 1164 row3 = _mm_shufflelo_epi16(left_col_low, 0xff); 1165 _mm_storel_epi64((__m128i *)dst, row0); 1166 dst += stride; 1167 _mm_storel_epi64((__m128i *)dst, row1); 1168 dst += stride; 1169 _mm_storel_epi64((__m128i *)dst, row2); 1170 dst += stride; 1171 _mm_storel_epi64((__m128i *)dst, row3); 1172 dst += stride; 1173 1174 row0 = _mm_shufflelo_epi16(left_col_high, 0); 1175 row1 = _mm_shufflelo_epi16(left_col_high, 0x55); 1176 row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); 1177 row3 = _mm_shufflelo_epi16(left_col_high, 0xff); 1178 _mm_storel_epi64((__m128i *)dst, row0); 1179 dst += stride; 1180 _mm_storel_epi64((__m128i *)dst, row1); 1181 dst += stride; 1182 _mm_storel_epi64((__m128i *)dst, row2); 1183 dst += stride; 1184 _mm_storel_epi64((__m128i *)dst, row3); 1185 dst += stride; 1186 1187 left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); 1188 row0 = _mm_shufflelo_epi16(left_col_high, 0); 1189 row1 = _mm_shufflelo_epi16(left_col_high, 0x55); 1190 row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); 1191 row3 = _mm_shufflelo_epi16(left_col_high, 0xff); 1192 _mm_storel_epi64((__m128i *)dst, row0); 1193 dst += stride; 1194 _mm_storel_epi64((__m128i *)dst, row1); 1195 dst += stride; 1196 _mm_storel_epi64((__m128i *)dst, row2); 1197 dst += stride; 1198 _mm_storel_epi64((__m128i *)dst, row3); 1199 dst += stride; 1200 left += 16; 1201 } 1202 } 1203 1204 void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, 1205 const uint8_t *above, const uint8_t *left) { 1206 h_predictor_8x16xc(dst, stride, above, left, 1); 1207 } 1208 1209 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1210 void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, 1211 const uint8_t *above, const uint8_t *left) { 1212 h_predictor_8x16xc(dst, stride, above, left, 2); 1213 } 1214 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1215 1216 static inline void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, 1217 ptrdiff_t stride) { 1218 int i; 1219 for (i = 0; i < h; ++i) { 1220 _mm_store_si128((__m128i *)dst, row[i]); 1221 dst += stride; 1222 } 1223 } 1224 1225 static inline void repeat_low_4pixels(const __m128i *x, __m128i *row) { 1226 const __m128i u0 = _mm_shufflelo_epi16(*x, 0); 1227 const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55); 1228 const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa); 1229 const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff); 1230 1231 row[0] = _mm_unpacklo_epi64(u0, u0); 1232 row[1] = _mm_unpacklo_epi64(u1, u1); 1233 row[2] = _mm_unpacklo_epi64(u2, u2); 1234 row[3] = _mm_unpacklo_epi64(u3, u3); 1235 } 1236 1237 static inline void repeat_high_4pixels(const __m128i *x, __m128i *row) { 1238 const __m128i u0 = _mm_shufflehi_epi16(*x, 0); 1239 const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55); 1240 const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa); 1241 const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff); 1242 1243 row[0] = _mm_unpackhi_epi64(u0, u0); 1244 row[1] = _mm_unpackhi_epi64(u1, u1); 1245 row[2] = _mm_unpackhi_epi64(u2, u2); 1246 row[3] = _mm_unpackhi_epi64(u3, u3); 1247 } 1248 1249 // Process 16x8, first 4 rows 1250 // Use first 8 bytes of left register: xxxxxxxx33221100 1251 static inline void h_prediction_16x8_1(const __m128i *left, uint8_t *dst, 1252 ptrdiff_t stride) { 1253 __m128i row[4]; 1254 repeat_low_4pixels(left, row); 1255 h_pred_store_16xh(row, 4, dst, stride); 1256 } 1257 1258 // Process 16x8, second 4 rows 1259 // Use second 8 bytes of left register: 77665544xxxxxxxx 1260 static inline void h_prediction_16x8_2(const __m128i *left, uint8_t *dst, 1261 ptrdiff_t stride) { 1262 __m128i row[4]; 1263 repeat_high_4pixels(left, row); 1264 h_pred_store_16xh(row, 4, dst, stride); 1265 } 1266 1267 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1268 void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, 1269 const uint8_t *above, const uint8_t *left) { 1270 (void)above; 1271 const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); 1272 const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); 1273 h_prediction_16x8_1(&left_col_8p, dst, stride); 1274 } 1275 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1276 1277 void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, 1278 const uint8_t *above, const uint8_t *left) { 1279 (void)above; 1280 const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); 1281 const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); 1282 h_prediction_16x8_1(&left_col_8p, dst, stride); 1283 dst += stride << 2; 1284 h_prediction_16x8_2(&left_col_8p, dst, stride); 1285 } 1286 1287 static inline void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride, 1288 const uint8_t *left, int count) { 1289 int i = 0; 1290 do { 1291 const __m128i left_col = _mm_load_si128((const __m128i *)left); 1292 const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col); 1293 h_prediction_16x8_1(&left_col_8p_lo, dst, stride); 1294 dst += stride << 2; 1295 h_prediction_16x8_2(&left_col_8p_lo, dst, stride); 1296 dst += stride << 2; 1297 1298 const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col); 1299 h_prediction_16x8_1(&left_col_8p_hi, dst, stride); 1300 dst += stride << 2; 1301 h_prediction_16x8_2(&left_col_8p_hi, dst, stride); 1302 dst += stride << 2; 1303 1304 left += 16; 1305 i++; 1306 } while (i < count); 1307 } 1308 1309 void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, 1310 const uint8_t *above, const uint8_t *left) { 1311 (void)above; 1312 h_predictor_16xh(dst, stride, left, 2); 1313 } 1314 1315 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1316 void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, 1317 const uint8_t *above, const uint8_t *left) { 1318 (void)above; 1319 h_predictor_16xh(dst, stride, left, 4); 1320 } 1321 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1322 1323 static inline void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, 1324 ptrdiff_t stride) { 1325 int i; 1326 for (i = 0; i < h; ++i) { 1327 _mm_store_si128((__m128i *)dst, row[i]); 1328 _mm_store_si128((__m128i *)(dst + 16), row[i]); 1329 dst += stride; 1330 } 1331 } 1332 1333 // Process 32x8, first 4 rows 1334 // Use first 8 bytes of left register: xxxxxxxx33221100 1335 static inline void h_prediction_32x8_1(const __m128i *left, uint8_t *dst, 1336 ptrdiff_t stride) { 1337 __m128i row[4]; 1338 repeat_low_4pixels(left, row); 1339 h_pred_store_32xh(row, 4, dst, stride); 1340 } 1341 1342 // Process 32x8, second 4 rows 1343 // Use second 8 bytes of left register: 77665544xxxxxxxx 1344 static inline void h_prediction_32x8_2(const __m128i *left, uint8_t *dst, 1345 ptrdiff_t stride) { 1346 __m128i row[4]; 1347 repeat_high_4pixels(left, row); 1348 h_pred_store_32xh(row, 4, dst, stride); 1349 } 1350 1351 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1352 void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, 1353 const uint8_t *above, const uint8_t *left) { 1354 __m128i left_col, left_col_8p; 1355 (void)above; 1356 1357 left_col = _mm_load_si128((const __m128i *)left); 1358 1359 left_col_8p = _mm_unpacklo_epi8(left_col, left_col); 1360 h_prediction_32x8_1(&left_col_8p, dst, stride); 1361 dst += stride << 2; 1362 h_prediction_32x8_2(&left_col_8p, dst, stride); 1363 } 1364 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1365 1366 void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, 1367 const uint8_t *above, const uint8_t *left) { 1368 __m128i left_col, left_col_8p; 1369 (void)above; 1370 1371 left_col = _mm_load_si128((const __m128i *)left); 1372 1373 left_col_8p = _mm_unpacklo_epi8(left_col, left_col); 1374 h_prediction_32x8_1(&left_col_8p, dst, stride); 1375 dst += stride << 2; 1376 h_prediction_32x8_2(&left_col_8p, dst, stride); 1377 dst += stride << 2; 1378 1379 left_col_8p = _mm_unpackhi_epi8(left_col, left_col); 1380 h_prediction_32x8_1(&left_col_8p, dst, stride); 1381 dst += stride << 2; 1382 h_prediction_32x8_2(&left_col_8p, dst, stride); 1383 } 1384 1385 static inline void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride, 1386 const uint8_t *left, int height) { 1387 int i = height >> 2; 1388 do { 1389 __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]); 1390 left4 = _mm_unpacklo_epi8(left4, left4); 1391 left4 = _mm_unpacklo_epi8(left4, left4); 1392 const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); 1393 const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); 1394 _mm_store_si128((__m128i *)dst, r0); 1395 _mm_store_si128((__m128i *)(dst + 16), r0); 1396 _mm_store_si128((__m128i *)(dst + stride), r1); 1397 _mm_store_si128((__m128i *)(dst + stride + 16), r1); 1398 const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); 1399 const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); 1400 _mm_store_si128((__m128i *)(dst + stride * 2), r2); 1401 _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); 1402 _mm_store_si128((__m128i *)(dst + stride * 3), r3); 1403 _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); 1404 left += 4; 1405 dst += stride * 4; 1406 } while (--i); 1407 } 1408 1409 void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, 1410 const uint8_t *above, const uint8_t *left) { 1411 (void)above; 1412 h_predictor_32xh(dst, stride, left, 64); 1413 } 1414 1415 static inline void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride, 1416 const uint8_t *left, int height) { 1417 int i = height >> 2; 1418 do { 1419 __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]); 1420 left4 = _mm_unpacklo_epi8(left4, left4); 1421 left4 = _mm_unpacklo_epi8(left4, left4); 1422 const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); 1423 const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); 1424 _mm_store_si128((__m128i *)dst, r0); 1425 _mm_store_si128((__m128i *)(dst + 16), r0); 1426 _mm_store_si128((__m128i *)(dst + 32), r0); 1427 _mm_store_si128((__m128i *)(dst + 48), r0); 1428 _mm_store_si128((__m128i *)(dst + stride), r1); 1429 _mm_store_si128((__m128i *)(dst + stride + 16), r1); 1430 _mm_store_si128((__m128i *)(dst + stride + 32), r1); 1431 _mm_store_si128((__m128i *)(dst + stride + 48), r1); 1432 const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); 1433 const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); 1434 _mm_store_si128((__m128i *)(dst + stride * 2), r2); 1435 _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); 1436 _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2); 1437 _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2); 1438 _mm_store_si128((__m128i *)(dst + stride * 3), r3); 1439 _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); 1440 _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3); 1441 _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3); 1442 left += 4; 1443 dst += stride * 4; 1444 } while (--i); 1445 } 1446 1447 void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, 1448 const uint8_t *above, const uint8_t *left) { 1449 (void)above; 1450 h_predictor_64xh(dst, stride, left, 64); 1451 } 1452 1453 void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, 1454 const uint8_t *above, const uint8_t *left) { 1455 (void)above; 1456 h_predictor_64xh(dst, stride, left, 32); 1457 } 1458 1459 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1460 void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, 1461 const uint8_t *above, const uint8_t *left) { 1462 (void)above; 1463 h_predictor_64xh(dst, stride, left, 16); 1464 } 1465 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER