/* highbd_intrapred_sse2.c (37593 bytes) */
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>

#include "config/aom_dsp_rtcd.h"

// -----------------------------------------------------------------------------
// H_PRED
//
// Horizontal prediction: every pixel of output row r is a copy of the
// left-column sample left[r].  Each row is produced by broadcasting one
// 16-bit lane of the loaded `left` vector with shufflelo/shufflehi.
// NOTE(review): the aligned forms _mm_load_si128/_mm_store_si128 are used
// throughout, so dst/above/left are assumed 16-byte aligned — confirm with
// the RTCD callers.

void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
  // Broadcast left[0..3] across the low 4 lanes (only 4 lanes are stored).
  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
  (void)above;
  (void)bd;
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
}

void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // 4x8 = two stacked 4x4 predictions using left[0..3] then left[4..7].
  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
  dst += stride << 2;
  left += 4;
  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
}

void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // Loads 8 left samples although only left[0..3] are used; assumes the
  // left buffer is at least 16 bytes readable — TODO(review) confirm.
  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
  (void)above;
  (void)bd;
  // unpacklo_epi64(x, x) copies the low 4 broadcast lanes to all 8 lanes.
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
  dst += stride;
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
  dst += stride;
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
  dst += stride;
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
}

void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  // Rows 0-3 broadcast left[0..3] into the low half, rows 4-7 broadcast
  // left[4..7] into the high half; the unpack below widens to 8 lanes.
  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
  (void)above;
  (void)bd;
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
  dst += stride;
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
  dst += stride;
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
  dst += stride;
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
  dst += stride;
  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
  dst += stride;
  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
  dst += stride;
  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
  dst += stride;
  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
}

void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  // 8x16 = two stacked 8x8 predictions.
  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
  dst += stride << 3;
  left += 8;
  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
}

// Store one 16-wide row built from the low 64 bits of *row, then advance.
static inline void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  const __m128i val = _mm_unpacklo_epi64(*row, *row);
  _mm_store_si128((__m128i *)*dst, val);
  _mm_store_si128((__m128i *)(*dst + 8), val);
  *dst += stride;
}

// Store one 16-wide row built from the high 64 bits of *row, then advance.
static inline void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  const __m128i val = _mm_unpackhi_epi64(*row, *row);
  _mm_store_si128((__m128i *)(*dst), val);
  _mm_store_si128((__m128i *)(*dst + 8), val);
  *dst += stride;
}

// Emit 8 rows of a 16-wide horizontal prediction from left[0..7].
static inline void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *left) {
  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
  h_store_16_unpacklo(&dst, stride, &row0);
  h_store_16_unpacklo(&dst, stride, &row1);
  h_store_16_unpacklo(&dst, stride, &row2);
  h_store_16_unpacklo(&dst, stride, &row3);
  h_store_16_unpackhi(&dst, stride, &row4);
  h_store_16_unpackhi(&dst, stride, &row5);
  h_store_16_unpackhi(&dst, stride, &row6);
  h_store_16_unpackhi(&dst, stride, &row7);
}

void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  h_predictor_16x8(dst, stride, left);
}

void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  int i;
  (void)above;
  (void)bd;

  // Two 8-row stripes consuming left[] 8 samples at a time.
  for (i = 0; i < 2; i++, left += 8) {
    h_predictor_16x8(dst, stride, left);
    dst += stride << 3;
  }
}

void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  int i;
  (void)above;
  (void)bd;

  // Four 8-row stripes.
  for (i = 0; i < 4; i++, left += 8) {
    h_predictor_16x8(dst, stride, left);
    dst += stride << 3;
  }
}

// 32-wide variants of the row-store helpers above.
static inline void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  const __m128i val = _mm_unpacklo_epi64(*row, *row);
  _mm_store_si128((__m128i *)(*dst), val);
  _mm_store_si128((__m128i *)(*dst + 8), val);
  _mm_store_si128((__m128i *)(*dst + 16), val);
  _mm_store_si128((__m128i *)(*dst + 24), val);
  *dst += stride;
}

static inline void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  const __m128i val = _mm_unpackhi_epi64(*row, *row);
  _mm_store_si128((__m128i *)(*dst), val);
  _mm_store_si128((__m128i *)(*dst + 8), val);
  _mm_store_si128((__m128i *)(*dst + 16), val);
  _mm_store_si128((__m128i *)(*dst + 24), val);
  *dst += stride;
}

// Emit 8 rows of a 32-wide horizontal prediction from left[0..7].
static inline void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *left) {
  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
  h_store_32_unpacklo(&dst, stride, &row0);
  h_store_32_unpacklo(&dst, stride, &row1);
  h_store_32_unpacklo(&dst, stride, &row2);
  h_store_32_unpacklo(&dst, stride, &row3);
  h_store_32_unpackhi(&dst, stride, &row4);
  h_store_32_unpackhi(&dst, stride, &row5);
  h_store_32_unpackhi(&dst, stride, &row6);
  h_store_32_unpackhi(&dst, stride, &row7);
}

void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  int i;
  (void)above;
  (void)bd;

  for (i = 0; i < 2; i++, left += 8) {
    h_predictor_32x8(dst, stride, left);
    dst += stride << 3;
  }
}

void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  int i;
  (void)above;
  (void)bd;

  for (i = 0; i < 4; i++, left += 8) {
    h_predictor_32x8(dst, stride, left);
    dst += stride << 3;
  }
}

// -----------------------------------------------------------------------------
// DC_TOP, DC_LEFT, DC_128

// 4x4

// Sum of ref[0..3].  After the two shuffle+add steps, BOTH word 0 and
// word 1 of the result hold the total (word0 = (a+c)+(b+d),
// word1 = (b+d)+(a+c)).  The rectangular DC predictors below rely on this
// when they extract the sum with `sum32 >>= 16`.
static inline __m128i dc_sum_4(const uint16_t *ref) {
  const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
  const __m128i a = _mm_add_epi16(_dcba, _xxdc);
  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
}

// Fill a 4x4 block with the DC value held in the low word of *dc.
static inline void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
                                const __m128i *dc) {
  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
    _mm_storel_epi64((__m128i *)dst, dc_dup);
  }
}

void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  // DC = (sum(left[0..3]) + 2) >> 2, i.e. rounded average of 4 samples.
  const __m128i two = _mm_cvtsi32_si128(2);
  const __m128i sum = dc_sum_4(left);
  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
  (void)above;
  (void)bd;
  dc_store_4x4(dst, stride, &dc);
}

void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i two = _mm_cvtsi32_si128(2);
  const __m128i sum = dc_sum_4(above);
  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
  (void)left;
  (void)bd;
  dc_store_4x4(dst, stride, &dc);
}

void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // "128" predictor: mid-gray for the bit depth, i.e. 1 << (bd - 1).
  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  (void)above;
  (void)left;
  dc_store_4x4(dst, stride, &dc_dup);
}

// -----------------------------------------------------------------------------
// 4x8

static inline void dc_store_4x8(uint16_t *dst, ptrdiff_t stride,
                                const __m128i *dc) {
  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
  int i;
  for (i = 0; i < 8; ++i, dst += stride) {
    _mm_storel_epi64((__m128i *)dst, dc_dup);
  }
}

// Shared with DC 8xh
// Sum of ref[0..7]; like dc_sum_4, the total lands in both word 0 and
// word 1 of the result.
static inline __m128i dc_sum_8(const uint16_t *ref) {
  const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
  const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
  const __m128i a = _mm_add_epi16(_dcba, _xxdc);

  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
}

void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  // 8 left samples: DC = (sum + 4) >> 3.
  const __m128i sum = dc_sum_8(left);
  const __m128i four = _mm_cvtsi32_si128(4);
  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
  (void)above;
  (void)bd;
  dc_store_4x8(dst, stride, &dc);
}

void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // 4 above samples: DC = (sum + 2) >> 2.
  const __m128i two = _mm_cvtsi32_si128(2);
  const __m128i sum = dc_sum_4(above);
  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
  (void)left;
  (void)bd;
  dc_store_4x8(dst, stride, &dc);
}

void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  (void)above;
  (void)left;
  dc_store_4x8(dst, stride, &dc_dup);
}

// -----------------------------------------------------------------------------
// 8xh

// Fill an 8-wide, `height`-tall block with the DC value in *dc's low word.
static inline void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height,
                                const __m128i *dc) {
  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
  int i;
  for (i = 0; i < height; ++i, dst += stride) {
    _mm_store_si128((__m128i *)dst, dc_dup);
  }
}

// -----------------------------------------------------------------------------
// DC_TOP

static inline void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
                                        int height, const uint16_t *above) {
  const __m128i four = _mm_cvtsi32_si128(4);
  const __m128i sum = dc_sum_8(above);
  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
  dc_store_8xh(dst, stride, height, &dc);
}

void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  dc_top_predictor_8xh(dst, stride, 4, above);
}

void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  dc_top_predictor_8xh(dst, stride, 8, above);
}

void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  dc_top_predictor_8xh(dst, stride, 16, above);
}

// -----------------------------------------------------------------------------
// DC_LEFT

void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i two = _mm_cvtsi32_si128(2);
  const __m128i sum = dc_sum_4(left);
  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
  (void)above;
  (void)bd;
  dc_store_8xh(dst, stride, 4, &dc);
}

void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i four = _mm_cvtsi32_si128(4);
  const __m128i sum = dc_sum_8(left);
  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
  (void)above;
  (void)bd;
  dc_store_8xh(dst, stride, 8, &dc);
}

// Shared with DC 16xh
// Sum of ref[0..15] as a 16-bit value; with 12-bit samples the maximum
// (4095 * 16 = 65520) still fits in uint16.
static inline __m128i dc_sum_16(const uint16_t *ref) {
  const __m128i sum_lo = dc_sum_8(ref);
  const __m128i sum_hi = dc_sum_8(ref + 8);
  return _mm_add_epi16(sum_lo, sum_hi);
}

void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const __m128i eight = _mm_cvtsi32_si128(8);
  const __m128i sum = dc_sum_16(left);
  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  (void)above;
  (void)bd;
  dc_store_8xh(dst, stride, 16, &dc);
}

// -----------------------------------------------------------------------------
// DC_128

static inline void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
                                        int height, int bd) {
  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  dc_store_8xh(dst, stride, height, &dc_dup);
}

void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  dc_128_predictor_8xh(dst, stride, 4, bd);
}

void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  dc_128_predictor_8xh(dst, stride, 8, bd);
}

void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  dc_128_predictor_8xh(dst, stride, 16, bd);
}

// -----------------------------------------------------------------------------
// 16xh

static inline void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height,
                                 const __m128i *dc) {
  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
  int i;
  for (i = 0; i < height; ++i, dst += stride) {
    _mm_store_si128((__m128i *)dst, dc_dup);
    _mm_store_si128((__m128i *)(dst + 8), dc_dup);
  }
}

// -----------------------------------------------------------------------------
// DC_LEFT

void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const __m128i four = _mm_cvtsi32_si128(4);
  const __m128i sum = dc_sum_8(left);
  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
  (void)above;
  (void)bd;
  dc_store_16xh(dst, stride, 8, &dc);
}

void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
  const __m128i eight = _mm_cvtsi32_si128(8);
  const __m128i sum = dc_sum_16(left);
  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  (void)above;
  (void)bd;
  dc_store_16xh(dst, stride, 16, &dc);
}

// Shared with 32xh
static inline __m128i dc_sum_32(const uint16_t *ref) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i sum_a = dc_sum_16(ref);
  const __m128i sum_b = dc_sum_16(ref + 16);
  // 12 bit bd will outrange, so expand to 32 bit before adding final total
  return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
                       _mm_unpacklo_epi16(sum_b, zero));
}

void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
  // 32-bit lanes here (see dc_sum_32), hence the epi32 add/shift.  The
  // quotient fits in 16 bits, so dc_store_16xh can still broadcast word 0.
  const __m128i sixteen = _mm_cvtsi32_si128(16);
  const __m128i sum = dc_sum_32(left);
  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
  (void)above;
  (void)bd;
  dc_store_16xh(dst, stride, 32, &dc);
}

// -----------------------------------------------------------------------------
// DC_TOP

void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i eight = _mm_cvtsi32_si128(8);
  const __m128i sum = dc_sum_16(above);
  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  (void)left;
  (void)bd;
  dc_store_16xh(dst, stride, 8, &dc);
}

void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const __m128i eight = _mm_cvtsi32_si128(8);
  const __m128i sum = dc_sum_16(above);
  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  (void)left;
  (void)bd;
  dc_store_16xh(dst, stride, 16, &dc);
}

void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const __m128i eight = _mm_cvtsi32_si128(8);
  const __m128i sum = dc_sum_16(above);
  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  (void)left;
  (void)bd;
  dc_store_16xh(dst, stride, 32, &dc);
}

// -----------------------------------------------------------------------------
// DC_128

void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  (void)above;
  (void)left;
  dc_store_16xh(dst, stride, 8, &dc_dup);
}

void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  (void)above;
  (void)left;
  dc_store_16xh(dst, stride, 16, &dc_dup);
}

void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  (void)above;
  (void)left;
  dc_store_16xh(dst, stride, 32, &dc_dup);
}

// -----------------------------------------------------------------------------
// 32xh

static inline void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height,
                                 const __m128i *dc) {
  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
  int i;
  for (i = 0; i < height; ++i, dst += stride) {
    _mm_store_si128((__m128i *)dst, dc_dup);
    _mm_store_si128((__m128i *)(dst + 8), dc_dup);
    _mm_store_si128((__m128i *)(dst + 16), dc_dup);
    _mm_store_si128((__m128i *)(dst + 24), dc_dup);
  }
}

void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
  const __m128i eight = _mm_cvtsi32_si128(8);
  const __m128i sum = dc_sum_16(left);
  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  (void)above;
  (void)bd;
  dc_store_32xh(dst, stride, 16, &dc);
}

void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
  const __m128i sixteen = _mm_cvtsi32_si128(16);
  const __m128i sum = dc_sum_32(left);
  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
  (void)above;
  (void)bd;
  dc_store_32xh(dst, stride, 32, &dc);
}

void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const __m128i sixteen = _mm_cvtsi32_si128(16);
  const __m128i sum = dc_sum_32(above);
  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
  (void)left;
  (void)bd;
  dc_store_32xh(dst, stride, 16, &dc);
}

void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  (void)above;
  (void)left;
  dc_store_32xh(dst, stride, 16, &dc_dup);
}

void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const __m128i sixteen = _mm_cvtsi32_si128(16);
  const __m128i sum = dc_sum_32(above);
  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
  (void)left;
  (void)bd;
  dc_store_32xh(dst, stride, 32, &dc);
}

void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  (void)above;
  (void)left;
  dc_store_32xh(dst, stride, 32, &dc_dup);
}

// -----------------------------------------------------------------------------
// V_PRED
//
// Vertical prediction: every output row is a copy of the `above` row.

void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above);
  int i;
  for (i = 0; i < 2; ++i) {
    _mm_storel_epi64((__m128i *)dst, above_u16);
    _mm_storel_epi64((__m128i *)(dst + stride), above_u16);
    _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16);
    _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16);
    dst += stride << 2;
  }
}

void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
  _mm_store_si128((__m128i *)dst, above_u16);
  _mm_store_si128((__m128i *)(dst + stride), above_u16);
  _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
  _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
}

void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
  int i;
  for (i = 0; i < 4; ++i) {
    _mm_store_si128((__m128i *)dst, above_u16);
    _mm_store_si128((__m128i *)(dst + stride), above_u16);
    _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
    _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
    dst += stride << 2;
  }
}

void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
  int i;
  // 4 rows per iteration, unrolled.
  for (i = 0; i < 2; ++i) {
    _mm_store_si128((__m128i *)dst, above0_u16);
    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    dst += stride;
    _mm_store_si128((__m128i *)dst, above0_u16);
    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    dst += stride;
    _mm_store_si128((__m128i *)dst, above0_u16);
    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    dst += stride;
    _mm_store_si128((__m128i *)dst, above0_u16);
    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    dst += stride;
  }
}

void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
  int i;
  for (i = 0; i < 8; ++i) {
    _mm_store_si128((__m128i *)dst, above0_u16);
    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    dst += stride;
    _mm_store_si128((__m128i *)dst, above0_u16);
    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    dst += stride;
    _mm_store_si128((__m128i *)dst, above0_u16);
    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    dst += stride;
    _mm_store_si128((__m128i *)dst, above0_u16);
    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    dst += stride;
  }
}

void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24));
  int i;
  for (i = 0; i < 4; ++i) {
    _mm_store_si128((__m128i *)dst, above0_u16);
    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
    dst += stride;
    _mm_store_si128((__m128i *)dst, above0_u16);
    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
    dst += stride;
    _mm_store_si128((__m128i *)dst, above0_u16);
    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
    dst += stride;
    _mm_store_si128((__m128i *)dst, above0_u16);
    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
    dst += stride;
  }
}

// -----------------------------------------------------------------------------
// DC_PRED
//
// Rectangular DC: average of above + left samples with round-to-nearest
// division by (width + height).  For the 16-bit sums, dc_sum_4/dc_sum_8
// leave the total in both word 0 and word 1, so `sum32 >>= 16` extracts
// the combined total from the packed 32-bit extraction.

void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)bd;
  const __m128i sum_above = dc_sum_4(above);
  const __m128i sum_left = dc_sum_8(left);
  const __m128i sum = _mm_add_epi16(sum_above, sum_left);
  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  sum32 >>= 16;  // total is duplicated in word 1; see note above
  sum32 += 6;
  sum32 /= 12;  // round-to-nearest divide by 4 + 8 samples
  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  int i;
  for (i = 0; i < 4; ++i) {
    _mm_storel_epi64((__m128i *)dst, row);
    dst += stride;
    _mm_storel_epi64((__m128i *)dst, row);
    dst += stride;
  }
}

void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)bd;
  const __m128i sum_left = dc_sum_4(left);
  const __m128i sum_above = dc_sum_8(above);
  const __m128i sum = _mm_add_epi16(sum_above, sum_left);
  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  sum32 >>= 16;
  sum32 += 6;
  sum32 /= 12;
  const __m128i row = _mm_set1_epi16((int16_t)sum32);

  _mm_store_si128((__m128i *)dst, row);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row);
}

void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)bd;
  __m128i sum_left = dc_sum_16(left);
  __m128i sum_above = dc_sum_8(above);
  const __m128i zero = _mm_setzero_si128();
  // Widen both 16-bit partial sums to 32 bits before combining.
  sum_left = _mm_unpacklo_epi16(sum_left, zero);
  sum_above = _mm_unpacklo_epi16(sum_above, zero);
  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  sum32 += 12;
  sum32 /= 24;  // 8 + 16 samples
  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  int i;
  for (i = 0; i < 4; ++i) {
    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
  }
}

void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)bd;
  __m128i sum_left = dc_sum_8(left);
  __m128i sum_above = dc_sum_16(above);
  const __m128i zero = _mm_setzero_si128();
  sum_left = _mm_unpacklo_epi16(sum_left, zero);
  sum_above = _mm_unpacklo_epi16(sum_above, zero);
  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  sum32 += 12;
  sum32 /= 24;
  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  int i;
  for (i = 0; i < 2; ++i) {
    _mm_store_si128((__m128i *)dst, row);
    _mm_store_si128((__m128i *)(dst + 8), row);
    dst += stride;
    _mm_store_si128((__m128i *)dst, row);
    _mm_store_si128((__m128i *)(dst + 8), row);
    dst += stride;
    _mm_store_si128((__m128i *)dst, row);
    _mm_store_si128((__m128i *)(dst + 8), row);
    dst += stride;
    _mm_store_si128((__m128i *)dst, row);
    _mm_store_si128((__m128i *)(dst + 8), row);
    dst += stride;
  }
}

void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  (void)bd;
  __m128i sum_left = dc_sum_32(left);  // already widened to 32-bit lanes
  __m128i sum_above = dc_sum_16(above);
  const __m128i zero = _mm_setzero_si128();
  sum_above = _mm_unpacklo_epi16(sum_above, zero);
  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  sum32 += 24;
  sum32 /= 48;  // 16 + 32 samples
  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  int i;
  for (i = 0; i < 8; ++i) {
    _mm_store_si128((__m128i *)dst, row);
    _mm_store_si128((__m128i *)(dst + 8), row);
    dst += stride;
    _mm_store_si128((__m128i *)dst, row);
    _mm_store_si128((__m128i *)(dst + 8), row);
    dst += stride;
    _mm_store_si128((__m128i *)dst, row);
    _mm_store_si128((__m128i *)(dst + 8), row);
    dst += stride;
    _mm_store_si128((__m128i *)dst, row);
    _mm_store_si128((__m128i *)(dst + 8), row);
    dst += stride;
  }
}

void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  (void)bd;
  __m128i sum_left = dc_sum_16(left);
  __m128i sum_above = dc_sum_32(above);  // already widened to 32-bit lanes
  const __m128i zero = _mm_setzero_si128();
  sum_left = _mm_unpacklo_epi16(sum_left, zero);
  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  sum32 += 24;
  sum32 /= 48;
  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  int i;
  for (i = 0; i < 4; ++i) {
    _mm_store_si128((__m128i *)dst, row);
    _mm_store_si128((__m128i *)(dst + 8), row);
    _mm_store_si128((__m128i *)(dst + 16), row);
    _mm_store_si128((__m128i *)(dst + 24), row);
    dst += stride;
    _mm_store_si128((__m128i *)dst, row);
    _mm_store_si128((__m128i *)(dst + 8), row);
    _mm_store_si128((__m128i *)(dst + 16), row);
    _mm_store_si128((__m128i *)(dst + 24), row);
    dst += stride;
    _mm_store_si128((__m128i *)dst, row);
    _mm_store_si128((__m128i *)(dst + 8), row);
    _mm_store_si128((__m128i *)(dst + 16), row);
    _mm_store_si128((__m128i *)(dst + 24), row);
    dst += stride;
    _mm_store_si128((__m128i *)dst, row);
    _mm_store_si128((__m128i *)(dst + 8), row);
    _mm_store_si128((__m128i *)(dst + 16), row);
    _mm_store_si128((__m128i *)(dst + 24), row);
    dst += stride;
  }
}