intrapred_ssse3.c (142986B)
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <tmmintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/intrapred_common.h"

// -----------------------------------------------------------------------------
// PAETH_PRED

// Return 8 16-bit pixels in one row.
// Paeth select: base = top + left - topleft; each lane returns whichever of
// left/top/topleft is closest to base, with ties broken in the order
// left, top, topleft.
static inline __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
                                     const __m128i *topleft) {
  const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);

  // Absolute distances of each candidate from base.
  __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
  __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
  __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));

  // mask1: lanes where left is NOT the closest candidate.
  __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
  mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
  // mask2: lanes where topleft is strictly closer than top.
  __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);

  // Keep left where it wins...
  pl = _mm_andnot_si128(mask1, *left);

  // ...otherwise pick top or topleft according to mask2.
  ptl = _mm_and_si128(mask2, *topleft);
  pt = _mm_andnot_si128(mask2, *top);
  pt = _mm_or_si128(pt, ptl);
  pt = _mm_and_si128(mask1, pt);

  return _mm_or_si128(pl, pt);
}

// NOTE on the `rep` trick used by all PAETH predictors below: `rep` starts at
// 0x8000 in every 16-bit lane. _mm_shuffle_epi8 zeroes bytes whose selector
// has the high bit set, so shuffling the left column with `rep` broadcasts
// left[i] zero-extended to 16 bits; adding 1 per row advances to left[i + 1].

void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);  // broadcast left[i]
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    // NOTE(review): type-punned 4-byte store; presumably dst is sufficiently
    // aligned here as in the rest of libaom — the later Store4() helper uses
    // memcpy for the same operation.
    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);  // all 16 left pixels
  const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  for (int i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  // Process the 32 rows as two 16-row halves; reload `l` and reset `rep`
  // for each half since pshufb can only index 16 bytes.
  for (int j = 0; j < 2; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m128i l16 = _mm_shuffle_epi8(l, rep);
      const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// Return 16 8-bit pixels in one row: two 8-lane Paeth selects packed with
// unsigned saturation (values already fit in 8 bits).
static inline __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
                                      const __m128i *top1,
                                      const __m128i *topleft) {
  const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
  const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
  return _mm_packus_epi16(p0, p1);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  for (int i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  // First 16 rows use left[0..15].
  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }

  // Second 16 rows use left[16..31]; restart the broadcast selector.
  l = _mm_load_si128((const __m128i *)(left + 16));
  rep = _mm_set1_epi16((short)0x8000);
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  // Four 16-row halves of the 64-row column.
  for (int j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m128i l16 = _mm_shuffle_epi8(l, rep);
      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
      _mm_store_si128((__m128i *)dst, row);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);
  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
  __m128i l16;

  for (int i = 0; i < 8; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i l16;

  // Rows 0..15 with left[0..15].
  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }

  // Rows 16..31 with left[16..31].
  rep = _mm_set1_epi16((short)0x8000);
  l = _mm_load_si128((const __m128i *)(left + 16));
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

      _mm_store_si128((__m128i *)dst, r32l);
      _mm_store_si128((__m128i *)(dst + 16), r32h);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  // 64-wide: four 16-pixel column groups of the above row.
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 2; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i;
  const __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);
    _mm_store_si128((__m128i *)(dst + 32), r2);
    _mm_store_si128((__m128i *)(dst + 48), r3);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// -----------------------------------------------------------------------------
// SMOOTH_PRED

// pixels[0]: above and below_pred interleave vector
// pixels[1]: left vector
// pixels[2]: right_pred vector
static inline void load_pixel_w4(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
  if (height == 4)
    pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]);
  else if (height == 8)
    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
  else
    pixels[1] = _mm_loadu_si128(((const __m128i *)left));

  // right_pred is the top-right pixel, repeated.
  pixels[2] = _mm_set1_epi16((int16_t)above[3]);

  // below_pred is the bottom-left pixel, repeated; interleave it with the
  // above row for the vertical madd.
  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
  const __m128i zero = _mm_setzero_si128();
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], second half for height = 16 only
// weight_h[3]: same as [1], second half for height = 16 only
// weight_w[0]: weights_w and scale - weights_w interleave vector
static inline void load_weight_w4(int height, __m128i *weight_h,
                                  __m128i *weight_w) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
  // smooth_weights is indexed by (block dimension - 4); offset 0 is the
  // 4-tap table used for both width and (height == 4) weights.
  const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
  weight_h[0] = _mm_unpacklo_epi8(t, zero);
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);

  if (height == 8) {
    const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  } else if (height == 16) {
    const __m128i weight =
        _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  }
}

// Emit h rows of a 4-wide SMOOTH prediction. `second_half` selects
// left[8..15] instead of left[0..7] via the pshufb selector base.
static inline void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i inc = _mm_set1_epi16(0x202);  // advance the 16-bit weight pair
  const __m128i gat = _mm_set1_epi32(0xc080400);  // gather bytes 0,4,8,12
  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
                            : _mm_set1_epi16((short)0x8000);
  __m128i d = _mm_set1_epi16(0x100);

  for (int i = 0; i < h; ++i) {
    // Vertical term: weight_h[i]*above + (scale-weight_h[i])*below_pred.
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s = _mm_madd_epi16(pixel[0], wh_sc);

    // Horizontal term: weight_w*left[i] + (scale-weight_w)*right_pred.
    __m128i b = _mm_shuffle_epi8(pixel[1], rep);
    b = _mm_unpacklo_epi16(b, pixel[2]);
    __m128i sum = _mm_madd_epi16(b, ww[0]);

    sum = _mm_add_epi32(s, sum);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srai_epi32(sum, 1 + SMOOTH_WEIGHT_LOG2_SCALE);

    sum = _mm_shuffle_epi8(sum, gat);
    *(int *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 4, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(4, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
}

void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 8, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(8, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 16, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(16, wh, ww);

  // Two 8-row passes: rows 0-7 use wh[0..1]/left[0..7], rows 8-15 use
  // wh[2..3]/left[8..15].
  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// pixels[0]:
above and below_pred interleave vector, first half 716 // pixels[1]: above and below_pred interleave vector, second half 717 // pixels[2]: left vector 718 // pixels[3]: right_pred vector 719 // pixels[4]: above and below_pred interleave vector, first half 720 // pixels[5]: above and below_pred interleave vector, second half 721 // pixels[6]: left vector + 16 722 // pixels[7]: right_pred vector 723 static inline void load_pixel_w8(const uint8_t *above, const uint8_t *left, 724 int height, __m128i *pixels) { 725 const __m128i zero = _mm_setzero_si128(); 726 const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]); 727 __m128i d = _mm_loadl_epi64((const __m128i *)above); 728 d = _mm_unpacklo_epi8(d, zero); 729 pixels[0] = _mm_unpacklo_epi16(d, bp); 730 pixels[1] = _mm_unpackhi_epi16(d, bp); 731 732 pixels[3] = _mm_set1_epi16((int16_t)above[7]); 733 734 if (height == 4) { 735 pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]); 736 } else if (height == 8) { 737 pixels[2] = _mm_loadl_epi64((const __m128i *)left); 738 } else if (height == 16) { 739 pixels[2] = _mm_load_si128((const __m128i *)left); 740 } else { 741 pixels[2] = _mm_load_si128((const __m128i *)left); 742 pixels[4] = pixels[0]; 743 pixels[5] = pixels[1]; 744 pixels[6] = _mm_load_si128((const __m128i *)(left + 16)); 745 pixels[7] = pixels[3]; 746 } 747 } 748 749 // weight_h[0]: weight_h vector 750 // weight_h[1]: scale - weight_h vector 751 // weight_h[2]: same as [0], offset 8 752 // weight_h[3]: same as [1], offset 8 753 // weight_h[4]: same as [0], offset 16 754 // weight_h[5]: same as [1], offset 16 755 // weight_h[6]: same as [0], offset 24 756 // weight_h[7]: same as [1], offset 24 757 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half 758 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half 759 static inline void load_weight_w8(int height, __m128i *weight_h, 760 __m128i *weight_w) { 761 const __m128i zero = _mm_setzero_si128(); 762 const int 
we_offset = height < 8 ? 0 : 4;
  __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]);
  weight_h[0] = _mm_unpacklo_epi8(we, zero);
  // d is the scale (1 << SMOOTH_WEIGHT_LOG2_SCALE == 256); each weight is
  // paired with its complement (d - w) so a single madd blends two pixels.
  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);

  if (height == 4) {
    // The width-8 column weights live 4 bytes into the vector loaded above.
    we = _mm_srli_si128(we, 4);
    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
  } else {
    // Row and column weights coincide for height >= 8; interleave w with
    // (256 - w) so they line up with the interleaved pixel pairs for madd.
    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
  }

  if (height == 16) {
    we = _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
    weight_h[0] = _mm_unpacklo_epi8(we, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(we, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  } else if (height == 32) {
    // 32 row weights need two 16-byte loads; each expands to four 16-bit
    // weight/complement vector pairs.
    const __m128i weight_lo =
        _mm_loadu_si128((const __m128i *)&smooth_weights[28]);
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    const __m128i weight_hi =
        _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]);
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  }
}

// Emits an 8-wide by |h|-tall SMOOTH_PRED block.
// |pixels| layout (built by load_pixel_w8): [0]/[1] hold top-row pixels
// interleaved with the bottom-left pixel (16-bit), [2] holds the left column
// as bytes, [3] holds the replicated top-right pixel.  |wh| is the
// weight/(256-weight) row pair, |ww| the interleaved column weights.
// |second_half| selects left-column rows 8..15 via the shuffle base 0x8008.
static inline void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i inc = _mm_set1_epi16(0x202);
  // Gather mask: picks bytes 0,2,...,14, i.e. the low byte of each 16-bit
  // lane, to finish the 16->8 bit pack.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);

  // rep is a pshufb control that replicates the y-th left pixel; the 0x80
  // high byte produces the zero byte needed for 16-bit zero-extension.
  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
                            : _mm_set1_epi16((short)0x8000);
  // d broadcasts the y-th 16-bit row weight; advanced by 2 bytes per row.
  __m128i d = _mm_set1_epi16(0x100);

  int i;
  for (i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    // Vertical blend: top * w_y + bottom_left * (256 - w_y).
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    // Horizontal blend: left[y] * w_x + top_right * (256 - w_x).
    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
    b = _mm_unpacklo_epi16(b, pixels[3]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    s0 = _mm_add_epi32(s0, sum0);
    s0 = _mm_add_epi32(s0, round);
    s0 = _mm_srai_epi32(s0, 1 + SMOOTH_WEIGHT_LOG2_SCALE);

    s1 = _mm_add_epi32(s1, sum1);
    s1 = _mm_add_epi32(s1, round);
    s1 = _mm_srai_epi32(s1, 1 + SMOOTH_WEIGHT_LOG2_SCALE);

    // Results fit in 8 bits, so packus followed by the even-byte gather
    // collapses the four 32-bit lanes per register into 8 output bytes.
    sum0 = _mm_packus_epi16(s0, s1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}

// SMOOTH_PRED 8x4.
void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 4, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(4, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
}

// SMOOTH_PRED 8x8.
void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 8, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(8, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
}

// SMOOTH_PRED 8x16: two 8x8 passes; the second uses the upper row weights and
// left pixels 8..15 (second_half = 1).
void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 16, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(16, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// SMOOTH_PRED 8x32: four 8x8 passes over two pixel banks.
void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[8];
  load_pixel_w8(above, left, 32, pixels);

  __m128i wh[8], ww[2];
  load_weight_w8(32, wh, ww);

  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// TODO(slavarnway): Visual Studio only supports restrict when /std:c11
// (available in 2019+) or greater is specified; __restrict can be used in that
// case. This should be moved to rtcd and used consistently between the
// function declarations and definitions to avoid warnings in Visual Studio
// when defining LIBAOM_RESTRICT to restrict or __restrict.
#if defined(_MSC_VER)
#define LIBAOM_RESTRICT
#else
#define LIBAOM_RESTRICT restrict
#endif

// Loads 4 bytes into the low 32 bits of an xmm register (memcpy avoids any
// alignment/strict-aliasing issue).
static AOM_FORCE_INLINE __m128i Load4(const void *src) {
  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
  // movss instruction.
  //
  // Until compiler support of _mm_loadu_si32 is widespread, use of
  // _mm_loadu_si32 is banned.
  int val;
  memcpy(&val, src, sizeof(val));
  return _mm_cvtsi32_si128(val);
}

// Loads 8 bytes into the low half of an xmm register.
static AOM_FORCE_INLINE __m128i LoadLo8(const void *a) {
  return _mm_loadl_epi64((const __m128i *)(a));
}

// Unaligned 16-byte load.
static AOM_FORCE_INLINE __m128i LoadUnaligned16(const void *a) {
  return _mm_loadu_si128((const __m128i *)(a));
}

// Stores the low 4 bytes of |x| (memcpy form for alignment safety).
static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) {
  const int val = _mm_cvtsi128_si32(x);
  memcpy(dst, &val, sizeof(val));
}

// Stores the low 8 bytes of |v|.
static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) {
  _mm_storel_epi64((__m128i *)(a), v);
}

// Unaligned 16-byte store.
static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) {
  _mm_storeu_si128((__m128i *)(a), v);
}

// Zero-extends the low 8 bytes of |x| to eight 16-bit lanes.
static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) {
  return _mm_unpacklo_epi8((x), _mm_setzero_si128());
}

// Zero-extends the low 4 bytes of |x| to four 32-bit lanes.
static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) {
  const __m128i tmp = _mm_unpacklo_epi8((x), _mm_setzero_si128());
  return _mm_unpacklo_epi16(tmp, _mm_setzero_si128());
}

// Zero-extends the low four 16-bit lanes of |x| to 32-bit lanes.
static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
  return _mm_unpacklo_epi16((x), _mm_setzero_si128());
}

// Generic SMOOTH_PRED for any width that is a multiple of 8:
//   pred = (w_y*top[x] + (256-w_y)*bottom_left
//           + w_x*left[y] + (256-w_x)*top_right + 256) >> 9
// computed 8 output pixels at a time with madd on interleaved pairs.
static void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
                                 const uint8_t *LIBAOM_RESTRICT top_row,
                                 const uint8_t *LIBAOM_RESTRICT left_column,
                                 int width, int height) {
  // The weights table is concatenated per block size; size N starts at N - 4.
  const uint8_t *const sm_weights_h = smooth_weights + height - 4;
  const uint8_t *const sm_weights_w = smooth_weights + width - 4;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i bottom_left = _mm_cvtsi32_si128(left_column[height - 1]);
  const __m128i top_right = _mm_set1_epi16(top_row[width - 1]);
  const __m128i round = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  for (int y = 0; y < height; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
    const __m128i left_y = _mm_cvtsi32_si128(left_column[y]);
    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
    // Products stay below 2^16, so the 16-bit multiply result in lane 0 can
    // be treated as a 32-bit value after the broadcast below.
    __m128i scaled_bottom_left =
        _mm_mullo_epi16(scale_m_weights_y, bottom_left);
    // Broadcast the (w_y, left[y]) pair to all four madd positions.
    const __m128i weight_left_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
    scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
    scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
    for (int x = 0; x < width; x += 8) {
      const __m128i top_x = LoadLo8(top_row + x);
      const __m128i weights_x = LoadLo8(sm_weights_w + x);
      const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
      const __m128i top_weights_x_lo = cvtepu8_epi16(top_weights_x);
      const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);

      // Here opposite weights and pixels are multiplied, where the order of
      // interleaving is indicated in the names:
      //   madd((top, w_x), (w_y, left)) = top*w_y + w_x*left.
      __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
      __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);

      // |scaled_bottom_left| is always scaled by the same weight each row, so
      // we only derive |scaled_top_right| values here.
      const __m128i inverted_weights_x =
          _mm_sub_epi16(scale_value, cvtepu8_epi16(weights_x));
      const __m128i scaled_top_right =
          _mm_mullo_epi16(inverted_weights_x, top_right);
      const __m128i scaled_top_right_lo = cvtepu16_epi32(scaled_top_right);
      const __m128i scaled_top_right_hi =
          _mm_unpackhi_epi16(scaled_top_right, zero);
      pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
      pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
      pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
      pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);

      // The round value for RightShiftWithRounding was added with
      // |scaled_bottom_left|.
      pred_lo = _mm_srli_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
      pred_hi = _mm_srli_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
      // Two packs: 32->16 then 16->8 (values already fit in 8 bits).
      const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
    }
    dst += stride;
  }
}

// Thin size-specific wrappers around smooth_predictor_wxh().

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
}

void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
}

// -----------------------------------------------------------------------------
// Smooth horizontal/vertical helper functions.

// For Horizontal, pixels1 and pixels2 are the same repeated value. For
// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
// scaled_corner2 are the same.
// Writes 16 output pixels: pred = (pixels*weights + scaled_corner + round)>>8,
// where scaled_corner already holds corner * (256 - weight).
static AOM_FORCE_INLINE void write_smooth_directional_sum16(
    uint8_t *LIBAOM_RESTRICT dst, const __m128i pixels1, const __m128i pixels2,
    const __m128i weights1, const __m128i weights2,
    const __m128i scaled_corner1, const __m128i scaled_corner2,
    const __m128i round) {
  const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
  const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
  const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
  const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
  const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
  const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
  StoreUnaligned16(dst, _mm_packus_epi16(pred1, pred2));
}

// Returns pixels*weights + scaled_corner (eight 16-bit lanes).
static AOM_FORCE_INLINE __m128i smooth_directional_sum8(
    const __m128i pixels, const __m128i weights, const __m128i scaled_corner) {
  const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
  return _mm_add_epi16(scaled_corner, weighted_px);
}

// 8-wide variant of write_smooth_directional_sum16().
static AOM_FORCE_INLINE void write_smooth_directional_sum8(
    uint8_t *LIBAOM_RESTRICT dst, const __m128i *pixels, const __m128i *weights,
    const __m128i *scaled_corner, const __m128i *round) {
  const __m128i pred_sum =
      smooth_directional_sum8(*pixels, *weights, *scaled_corner);
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
  const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, *round), 8);
  StoreLo8(dst, _mm_packus_epi16(pred, pred));
}

// -----------------------------------------------------------------------------
// SMOOTH_V_PRED

// Builds pixels[0] = top_row[x] interleaved with the bottom-left pixel, ready
// for madd against (weight, inverted_weight) pairs.
static AOM_FORCE_INLINE void load_smooth_vertical_pixels4(
    const uint8_t *LIBAOM_RESTRICT above, const uint8_t *LIBAOM_RESTRICT left,
    const int height, __m128i *pixels) {
  __m128i top = Load4(above);
  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
  top = cvtepu8_epi16(top);
  pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
}

// |weight_array| alternates weight vectors from the table with their inverted
// (256-w) counterparts. This is precomputed by the compiler when the weights
// table is visible to this module. Removing this visibility can cut speed by up
// to half in both 4xH and 8xH transforms.
static AOM_FORCE_INLINE void load_smooth_vertical_weights4(
    const uint8_t *LIBAOM_RESTRICT weight_array, const int height,
    __m128i *weights) {
  const __m128i inverter = _mm_set1_epi16(256);

  if (height == 4) {
    const __m128i weight = Load4(weight_array);
    weights[0] = cvtepu8_epi16(weight);
    weights[1] = _mm_sub_epi16(inverter, weights[0]);
  } else if (height == 8) {
    const __m128i weight = LoadLo8(weight_array + 4);
    weights[0] = cvtepu8_epi16(weight);
    weights[1] = _mm_sub_epi16(inverter, weights[0]);
  } else {
    // height == 16: two weight/inverse pairs.
    const __m128i weight = LoadUnaligned16(weight_array + 12);
    const __m128i zero = _mm_setzero_si128();
    weights[0] = cvtepu8_epi16(weight);
    weights[1] = _mm_sub_epi16(inverter, weights[0]);
    weights[2] = _mm_unpackhi_epi8(weight, zero);
    weights[3] = _mm_sub_epi16(inverter, weights[2]);
  }
}

// Writes a 4-wide by |height|-tall SMOOTH_V block from the interleaved
// pixel vector and the weight/inverse pair.
static AOM_FORCE_INLINE void write_smooth_vertical4xh(
    const __m128i *pixel, const __m128i *weight, const int height,
    uint8_t
        *LIBAOM_RESTRICT dst, const ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32(128);
  const __m128i mask_increment = _mm_set1_epi16(0x0202);
  // NOTE(review): this local shuffle mask (gathers bytes 0,4,8,12 to pack
  // 32-bit lanes down to bytes) shadows the cvtepu8_epi32() helper above;
  // renaming it would aid readability.
  const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
  __m128i y_select = _mm_set1_epi16(0x0100);

  for (int y = 0; y < height; ++y) {
    const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
    const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
    const __m128i alternate_weights =
        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
    // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
    // The madd instruction yields four results of the form:
    // (top_row[x] * weight[y] + corner * inverted_weight[y])
    __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, 8);
    sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
    Store4(dst, sum);
    dst += stride;
    y_select = _mm_add_epi16(y_select, mask_increment);
  }
}

// SMOOTH_V_PRED 4x4.
void aom_smooth_v_predictor_4x4_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  __m128i pixels;
  load_smooth_vertical_pixels4(top_row, left_column, 4, &pixels);

  __m128i weights[2];
  load_smooth_vertical_weights4(smooth_weights, 4, weights);

  write_smooth_vertical4xh(&pixels, weights, 4, dst, stride);
}

// SMOOTH_V_PRED 4x8.
void aom_smooth_v_predictor_4x8_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  __m128i pixels;
  load_smooth_vertical_pixels4(top_row, left_column, 8, &pixels);

  __m128i weights[2];
  load_smooth_vertical_weights4(smooth_weights, 8, weights);

  write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// SMOOTH_V_PRED 4x16: two 4x8 passes using the two weight pairs.
void aom_smooth_v_predictor_4x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  __m128i pixels;
  load_smooth_vertical_pixels4(top_row, left_column, 16, &pixels);

  __m128i weights[4];
  load_smooth_vertical_weights4(smooth_weights, 16, weights);

  write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
  dst += stride << 3;
  write_smooth_vertical4xh(&pixels, &weights[2], 8, dst, stride);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// SMOOTH_V_PRED 8x4: four fully unrolled rows; the 0xZZYYZZYY masks broadcast
// the y-th 16-bit weight lane.
void aom_smooth_v_predictor_8x4_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
  const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  __m128i y_select = _mm_set1_epi32(0x01000100);
  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                &round);
  dst += stride;
  y_select = _mm_set1_epi32(0x03020302);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                &round);
  dst += stride;
  y_select =
      _mm_set1_epi32(0x05040504);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                &round);
  dst += stride;
  y_select = _mm_set1_epi32(0x07060706);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                &round);
}

// SMOOTH_V_PRED 8x8. The y_mask loop steps the broadcast mask through the
// eight 16-bit weight lanes (0x0100, 0x0302, ..., 0x0F0E).
void aom_smooth_v_predictor_8x8_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left, y_select);
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                  &round);
    dst += stride;
  }
}

// SMOOTH_V_PRED 8x16: two 8-row passes over the two weight vectors.
void aom_smooth_v_predictor_8x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
  const __m128i weights =
      LoadUnaligned16(smooth_weights + 12);

  const __m128i weights1 = cvtepu8_epi16(weights);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
  // Rows 0..7 use weight vector 1.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                  &round);
    dst += stride;
  }
  // Rows 8..15 use weight vector 2.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                  &round);
    dst += stride;
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// SMOOTH_V_PRED 8x32: four 8-row passes over four weight vectors.
void aom_smooth_v_predictor_8x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i scaled_bottom_left3 =
      _mm_mullo_epi16(inverted_weights3, bottom_left);
  const __m128i scaled_bottom_left4 =
      _mm_mullo_epi16(inverted_weights4, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                  &round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                  &round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                  &round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                  &round);
    dst += stride;
  }
}

// SMOOTH_V_PRED 16x4: four fully unrolled rows, 16 pixels per row.
void aom_smooth_v_predictor_16x4_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
  const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  const __m128i round = _mm_set1_epi16(128);
  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = cvtepu8_epi16(top);
  const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));

  __m128i y_select = _mm_set1_epi32(0x01000100);
  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                 scaled_bottom_left_y, scaled_bottom_left_y,
                                 round);
  dst += stride;
  y_select = _mm_set1_epi32(0x03020302);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                 scaled_bottom_left_y, scaled_bottom_left_y,
                                 round);
  dst += stride;
  y_select = _mm_set1_epi32(0x05040504);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                 scaled_bottom_left_y, scaled_bottom_left_y,
                                 round);
  dst += stride;
  y_select = _mm_set1_epi32(0x07060706);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                 scaled_bottom_left_y, scaled_bottom_left_y,
                                 round);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// SMOOTH_V_PRED 16x8.
void aom_smooth_v_predictor_16x8_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  const __m128i round = _mm_set1_epi16(128);
  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = cvtepu8_epi16(top);
  const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}

// SMOOTH_V_PRED 16x16: two 8-row passes over the lo/hi weight vectors.
void aom_smooth_v_predictor_16x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
  const __m128i weights_lo = cvtepu8_epi16(weights);
  const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
  const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
  const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
  const __m128i scaled_bottom_left_lo =
      _mm_mullo_epi16(inverted_weights_lo, bottom_left);
  const __m128i scaled_bottom_left_hi =
      _mm_mullo_epi16(inverted_weights_hi, bottom_left);
  const __m128i round = _mm_set1_epi16(128);

  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = cvtepu8_epi16(top);
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}

// SMOOTH_V_PRED 16x32: four 8-row passes over four weight vectors.
void aom_smooth_v_predictor_16x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i zero = _mm_setzero_si128();
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i scaled_bottom_left3 =
      _mm_mullo_epi16(inverted_weights3, bottom_left);
  const __m128i scaled_bottom_left4 =
      _mm_mullo_epi16(inverted_weights4, bottom_left);
  const __m128i round = _mm_set1_epi16(128);

  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = cvtepu8_epi16(top);
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// SMOOTH_V_PRED 16x64: derives the weight vectors inside the loop, 16 rows
// per weight load.
void aom_smooth_v_predictor_16x64_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i round = _mm_set1_epi16(128);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = cvtepu8_epi16(top);
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
  const uint8_t *weights_base_ptr = smooth_weights + 60;
  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
    const __m128i weights_lo = cvtepu8_epi16(weights);
    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
    const __m128i scaled_bottom_left_lo =
        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
    const __m128i scaled_bottom_left_hi =
        _mm_mullo_epi16(inverted_weights_hi, bottom_left);

    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
      write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      dst += stride;
    }
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
      write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      dst += stride;
    }
  }
}

// SMOOTH_V_PRED 32x8: two 16-wide stores per row.
void aom_smooth_v_predictor_32x8_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
  const __m128i top_lo = LoadUnaligned16(top_row);
  const __m128i top_hi = LoadUnaligned16(top_row + 16);
  const __m128i top1 = cvtepu8_epi16(top_lo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
  const __m128i top3 = cvtepu8_epi16(top_hi);
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
  // NOTE(review): 256 == 1 << SMOOTH_WEIGHT_LOG2_SCALE; the literal (and the
  // missing const) is inconsistent with the sibling predictors but equivalent.
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_smooth_v_predictor_32x16_ssse3(
    uint8_t
const __m128i scaled_bottom_left_y = 1626 _mm_shuffle_epi8(scaled_bottom_left_hi, y_select); 1627 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, 1628 scaled_bottom_left_y, scaled_bottom_left_y, 1629 round); 1630 dst += stride; 1631 } 1632 } 1633 } 1634 1635 void aom_smooth_v_predictor_32x8_ssse3( 1636 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, 1637 const uint8_t *LIBAOM_RESTRICT top_row, 1638 const uint8_t *LIBAOM_RESTRICT left_column) { 1639 const __m128i zero = _mm_setzero_si128(); 1640 const __m128i bottom_left = _mm_set1_epi16(left_column[7]); 1641 const __m128i top_lo = LoadUnaligned16(top_row); 1642 const __m128i top_hi = LoadUnaligned16(top_row + 16); 1643 const __m128i top1 = cvtepu8_epi16(top_lo); 1644 const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero); 1645 const __m128i top3 = cvtepu8_epi16(top_hi); 1646 const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero); 1647 __m128i scale = _mm_set1_epi16(256); 1648 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); 1649 const __m128i inverted_weights = _mm_sub_epi16(scale, weights); 1650 const __m128i scaled_bottom_left = 1651 _mm_mullo_epi16(inverted_weights, bottom_left); 1652 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); 1653 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 1654 __m128i y_select = _mm_set1_epi32(y_mask); 1655 const __m128i weights_y = _mm_shuffle_epi8(weights, y_select); 1656 const __m128i scaled_bottom_left_y = 1657 _mm_shuffle_epi8(scaled_bottom_left, y_select); 1658 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 1659 scaled_bottom_left_y, scaled_bottom_left_y, 1660 round); 1661 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 1662 scaled_bottom_left_y, scaled_bottom_left_y, 1663 round); 1664 dst += stride; 1665 } 1666 } 1667 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1668 1669 void aom_smooth_v_predictor_32x16_ssse3( 1670 uint8_t 
*LIBAOM_RESTRICT dst, ptrdiff_t stride, 1671 const uint8_t *LIBAOM_RESTRICT top_row, 1672 const uint8_t *LIBAOM_RESTRICT left_column) { 1673 const __m128i zero = _mm_setzero_si128(); 1674 const __m128i bottom_left = _mm_set1_epi16(left_column[15]); 1675 const __m128i top_lo = LoadUnaligned16(top_row); 1676 const __m128i top_hi = LoadUnaligned16(top_row + 16); 1677 const __m128i top1 = cvtepu8_epi16(top_lo); 1678 const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero); 1679 const __m128i top3 = cvtepu8_epi16(top_hi); 1680 const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero); 1681 const __m128i weights = LoadUnaligned16(smooth_weights + 12); 1682 const __m128i weights1 = cvtepu8_epi16(weights); 1683 const __m128i weights2 = _mm_unpackhi_epi8(weights, zero); 1684 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); 1685 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); 1686 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); 1687 const __m128i scaled_bottom_left1 = 1688 _mm_mullo_epi16(inverted_weights1, bottom_left); 1689 const __m128i scaled_bottom_left2 = 1690 _mm_mullo_epi16(inverted_weights2, bottom_left); 1691 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); 1692 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 1693 __m128i y_select = _mm_set1_epi32(y_mask); 1694 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); 1695 const __m128i scaled_bottom_left_y = 1696 _mm_shuffle_epi8(scaled_bottom_left1, y_select); 1697 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 1698 scaled_bottom_left_y, scaled_bottom_left_y, 1699 round); 1700 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 1701 scaled_bottom_left_y, scaled_bottom_left_y, 1702 round); 1703 dst += stride; 1704 } 1705 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 1706 __m128i y_select = _mm_set1_epi32(y_mask); 1707 const __m128i 
weights_y = _mm_shuffle_epi8(weights2, y_select); 1708 const __m128i scaled_bottom_left_y = 1709 _mm_shuffle_epi8(scaled_bottom_left2, y_select); 1710 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 1711 scaled_bottom_left_y, scaled_bottom_left_y, 1712 round); 1713 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 1714 scaled_bottom_left_y, scaled_bottom_left_y, 1715 round); 1716 dst += stride; 1717 } 1718 } 1719 1720 void aom_smooth_v_predictor_32x32_ssse3( 1721 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, 1722 const uint8_t *LIBAOM_RESTRICT top_row, 1723 const uint8_t *LIBAOM_RESTRICT left_column) { 1724 const __m128i bottom_left = _mm_set1_epi16(left_column[31]); 1725 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); 1726 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); 1727 const __m128i zero = _mm_setzero_si128(); 1728 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); 1729 const __m128i top_lo = LoadUnaligned16(top_row); 1730 const __m128i top_hi = LoadUnaligned16(top_row + 16); 1731 const __m128i top1 = cvtepu8_epi16(top_lo); 1732 const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero); 1733 const __m128i top3 = cvtepu8_epi16(top_hi); 1734 const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero); 1735 const __m128i weights1 = cvtepu8_epi16(weights_lo); 1736 const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero); 1737 const __m128i weights3 = cvtepu8_epi16(weights_hi); 1738 const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero); 1739 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); 1740 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); 1741 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); 1742 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); 1743 const __m128i scaled_bottom_left1 = 1744 _mm_mullo_epi16(inverted_weights1, bottom_left); 1745 const __m128i scaled_bottom_left2 = 1746 
_mm_mullo_epi16(inverted_weights2, bottom_left); 1747 const __m128i scaled_bottom_left3 = 1748 _mm_mullo_epi16(inverted_weights3, bottom_left); 1749 const __m128i scaled_bottom_left4 = 1750 _mm_mullo_epi16(inverted_weights4, bottom_left); 1751 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); 1752 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 1753 const __m128i y_select = _mm_set1_epi32(y_mask); 1754 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); 1755 const __m128i scaled_bottom_left_y = 1756 _mm_shuffle_epi8(scaled_bottom_left1, y_select); 1757 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 1758 scaled_bottom_left_y, scaled_bottom_left_y, 1759 round); 1760 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 1761 scaled_bottom_left_y, scaled_bottom_left_y, 1762 round); 1763 dst += stride; 1764 } 1765 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 1766 const __m128i y_select = _mm_set1_epi32(y_mask); 1767 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); 1768 const __m128i scaled_bottom_left_y = 1769 _mm_shuffle_epi8(scaled_bottom_left2, y_select); 1770 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 1771 scaled_bottom_left_y, scaled_bottom_left_y, 1772 round); 1773 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 1774 scaled_bottom_left_y, scaled_bottom_left_y, 1775 round); 1776 dst += stride; 1777 } 1778 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 1779 const __m128i y_select = _mm_set1_epi32(y_mask); 1780 const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select); 1781 const __m128i scaled_bottom_left_y = 1782 _mm_shuffle_epi8(scaled_bottom_left3, y_select); 1783 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 1784 scaled_bottom_left_y, scaled_bottom_left_y, 1785 round); 1786 
write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 1787 scaled_bottom_left_y, scaled_bottom_left_y, 1788 round); 1789 dst += stride; 1790 } 1791 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 1792 const __m128i y_select = _mm_set1_epi32(y_mask); 1793 const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select); 1794 const __m128i scaled_bottom_left_y = 1795 _mm_shuffle_epi8(scaled_bottom_left4, y_select); 1796 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 1797 scaled_bottom_left_y, scaled_bottom_left_y, 1798 round); 1799 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 1800 scaled_bottom_left_y, scaled_bottom_left_y, 1801 round); 1802 dst += stride; 1803 } 1804 } 1805 1806 void aom_smooth_v_predictor_32x64_ssse3( 1807 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, 1808 const uint8_t *LIBAOM_RESTRICT top_row, 1809 const uint8_t *LIBAOM_RESTRICT left_column) { 1810 const __m128i zero = _mm_setzero_si128(); 1811 const __m128i bottom_left = _mm_set1_epi16(left_column[63]); 1812 const __m128i top_lo = LoadUnaligned16(top_row); 1813 const __m128i top_hi = LoadUnaligned16(top_row + 16); 1814 const __m128i top1 = cvtepu8_epi16(top_lo); 1815 const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero); 1816 const __m128i top3 = cvtepu8_epi16(top_hi); 1817 const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero); 1818 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); 1819 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); 1820 const uint8_t *weights_base_ptr = smooth_weights + 60; 1821 for (int left_offset = 0; left_offset < 64; left_offset += 16) { 1822 const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset); 1823 const __m128i weights_lo = cvtepu8_epi16(weights); 1824 const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero); 1825 const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo); 1826 const __m128i 
inverted_weights_hi = _mm_sub_epi16(scale, weights_hi); 1827 const __m128i scaled_bottom_left_lo = 1828 _mm_mullo_epi16(inverted_weights_lo, bottom_left); 1829 const __m128i scaled_bottom_left_hi = 1830 _mm_mullo_epi16(inverted_weights_hi, bottom_left); 1831 1832 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 1833 const __m128i y_select = _mm_set1_epi32(y_mask); 1834 const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select); 1835 const __m128i scaled_bottom_left_y = 1836 _mm_shuffle_epi8(scaled_bottom_left_lo, y_select); 1837 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 1838 scaled_bottom_left_y, scaled_bottom_left_y, 1839 round); 1840 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 1841 scaled_bottom_left_y, scaled_bottom_left_y, 1842 round); 1843 dst += stride; 1844 } 1845 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 1846 const __m128i y_select = _mm_set1_epi32(y_mask); 1847 const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select); 1848 const __m128i scaled_bottom_left_y = 1849 _mm_shuffle_epi8(scaled_bottom_left_hi, y_select); 1850 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 1851 scaled_bottom_left_y, scaled_bottom_left_y, 1852 round); 1853 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 1854 scaled_bottom_left_y, scaled_bottom_left_y, 1855 round); 1856 dst += stride; 1857 } 1858 } 1859 } 1860 1861 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1862 void aom_smooth_v_predictor_64x16_ssse3( 1863 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, 1864 const uint8_t *LIBAOM_RESTRICT top_row, 1865 const uint8_t *LIBAOM_RESTRICT left_column) { 1866 const __m128i bottom_left = _mm_set1_epi16(left_column[15]); 1867 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); 1868 const __m128i zero = _mm_setzero_si128(); 1869 const __m128i top_lolo = LoadUnaligned16(top_row); 1870 const 
__m128i top_lohi = LoadUnaligned16(top_row + 16); 1871 const __m128i top1 = cvtepu8_epi16(top_lolo); 1872 const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero); 1873 const __m128i top3 = cvtepu8_epi16(top_lohi); 1874 const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero); 1875 1876 const __m128i weights = LoadUnaligned16(smooth_weights + 12); 1877 const __m128i weights1 = cvtepu8_epi16(weights); 1878 const __m128i weights2 = _mm_unpackhi_epi8(weights, zero); 1879 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); 1880 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); 1881 const __m128i top_hilo = LoadUnaligned16(top_row + 32); 1882 const __m128i top_hihi = LoadUnaligned16(top_row + 48); 1883 const __m128i top5 = cvtepu8_epi16(top_hilo); 1884 const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero); 1885 const __m128i top7 = cvtepu8_epi16(top_hihi); 1886 const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero); 1887 const __m128i scaled_bottom_left1 = 1888 _mm_mullo_epi16(inverted_weights1, bottom_left); 1889 const __m128i scaled_bottom_left2 = 1890 _mm_mullo_epi16(inverted_weights2, bottom_left); 1891 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); 1892 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 1893 const __m128i y_select = _mm_set1_epi32(y_mask); 1894 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); 1895 const __m128i scaled_bottom_left_y = 1896 _mm_shuffle_epi8(scaled_bottom_left1, y_select); 1897 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 1898 scaled_bottom_left_y, scaled_bottom_left_y, 1899 round); 1900 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 1901 scaled_bottom_left_y, scaled_bottom_left_y, 1902 round); 1903 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, 1904 scaled_bottom_left_y, scaled_bottom_left_y, 1905 round); 1906 write_smooth_directional_sum16(dst + 48, top7, top8, 
weights_y, weights_y, 1907 scaled_bottom_left_y, scaled_bottom_left_y, 1908 round); 1909 dst += stride; 1910 } 1911 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 1912 const __m128i y_select = _mm_set1_epi32(y_mask); 1913 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); 1914 const __m128i scaled_bottom_left_y = 1915 _mm_shuffle_epi8(scaled_bottom_left2, y_select); 1916 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 1917 scaled_bottom_left_y, scaled_bottom_left_y, 1918 round); 1919 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 1920 scaled_bottom_left_y, scaled_bottom_left_y, 1921 round); 1922 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, 1923 scaled_bottom_left_y, scaled_bottom_left_y, 1924 round); 1925 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, 1926 scaled_bottom_left_y, scaled_bottom_left_y, 1927 round); 1928 dst += stride; 1929 } 1930 } 1931 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1932 1933 void aom_smooth_v_predictor_64x32_ssse3( 1934 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, 1935 const uint8_t *LIBAOM_RESTRICT top_row, 1936 const uint8_t *LIBAOM_RESTRICT left_column) { 1937 const __m128i zero = _mm_setzero_si128(); 1938 const __m128i bottom_left = _mm_set1_epi16(left_column[31]); 1939 const __m128i top_lolo = LoadUnaligned16(top_row); 1940 const __m128i top_lohi = LoadUnaligned16(top_row + 16); 1941 const __m128i top1 = cvtepu8_epi16(top_lolo); 1942 const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero); 1943 const __m128i top3 = cvtepu8_epi16(top_lohi); 1944 const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero); 1945 const __m128i top_hilo = LoadUnaligned16(top_row + 32); 1946 const __m128i top_hihi = LoadUnaligned16(top_row + 48); 1947 const __m128i top5 = cvtepu8_epi16(top_hilo); 1948 const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero); 1949 const __m128i top7 = 
cvtepu8_epi16(top_hihi); 1950 const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero); 1951 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); 1952 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); 1953 const __m128i weights1 = cvtepu8_epi16(weights_lo); 1954 const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero); 1955 const __m128i weights3 = cvtepu8_epi16(weights_hi); 1956 const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero); 1957 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); 1958 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); 1959 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); 1960 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); 1961 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); 1962 const __m128i scaled_bottom_left1 = 1963 _mm_mullo_epi16(inverted_weights1, bottom_left); 1964 const __m128i scaled_bottom_left2 = 1965 _mm_mullo_epi16(inverted_weights2, bottom_left); 1966 const __m128i scaled_bottom_left3 = 1967 _mm_mullo_epi16(inverted_weights3, bottom_left); 1968 const __m128i scaled_bottom_left4 = 1969 _mm_mullo_epi16(inverted_weights4, bottom_left); 1970 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); 1971 1972 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 1973 const __m128i y_select = _mm_set1_epi32(y_mask); 1974 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); 1975 const __m128i scaled_bottom_left_y = 1976 _mm_shuffle_epi8(scaled_bottom_left1, y_select); 1977 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 1978 scaled_bottom_left_y, scaled_bottom_left_y, 1979 round); 1980 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 1981 scaled_bottom_left_y, scaled_bottom_left_y, 1982 round); 1983 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, 1984 scaled_bottom_left_y, 
scaled_bottom_left_y, 1985 round); 1986 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, 1987 scaled_bottom_left_y, scaled_bottom_left_y, 1988 round); 1989 dst += stride; 1990 } 1991 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 1992 const __m128i y_select = _mm_set1_epi32(y_mask); 1993 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); 1994 const __m128i scaled_bottom_left_y = 1995 _mm_shuffle_epi8(scaled_bottom_left2, y_select); 1996 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 1997 scaled_bottom_left_y, scaled_bottom_left_y, 1998 round); 1999 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 2000 scaled_bottom_left_y, scaled_bottom_left_y, 2001 round); 2002 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, 2003 scaled_bottom_left_y, scaled_bottom_left_y, 2004 round); 2005 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, 2006 scaled_bottom_left_y, scaled_bottom_left_y, 2007 round); 2008 dst += stride; 2009 } 2010 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 2011 const __m128i y_select = _mm_set1_epi32(y_mask); 2012 const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select); 2013 const __m128i scaled_bottom_left_y = 2014 _mm_shuffle_epi8(scaled_bottom_left3, y_select); 2015 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 2016 scaled_bottom_left_y, scaled_bottom_left_y, 2017 round); 2018 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 2019 scaled_bottom_left_y, scaled_bottom_left_y, 2020 round); 2021 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, 2022 scaled_bottom_left_y, scaled_bottom_left_y, 2023 round); 2024 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, 2025 scaled_bottom_left_y, scaled_bottom_left_y, 2026 round); 2027 dst += stride; 2028 } 2029 for (int 
y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 2030 const __m128i y_select = _mm_set1_epi32(y_mask); 2031 const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select); 2032 const __m128i scaled_bottom_left_y = 2033 _mm_shuffle_epi8(scaled_bottom_left4, y_select); 2034 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 2035 scaled_bottom_left_y, scaled_bottom_left_y, 2036 round); 2037 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 2038 scaled_bottom_left_y, scaled_bottom_left_y, 2039 round); 2040 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, 2041 scaled_bottom_left_y, scaled_bottom_left_y, 2042 round); 2043 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, 2044 scaled_bottom_left_y, scaled_bottom_left_y, 2045 round); 2046 dst += stride; 2047 } 2048 } 2049 2050 void aom_smooth_v_predictor_64x64_ssse3( 2051 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, 2052 const uint8_t *LIBAOM_RESTRICT top_row, 2053 const uint8_t *LIBAOM_RESTRICT left_column) { 2054 const __m128i zero = _mm_setzero_si128(); 2055 const __m128i bottom_left = _mm_set1_epi16(left_column[63]); 2056 const __m128i top_lolo = LoadUnaligned16(top_row); 2057 const __m128i top_lohi = LoadUnaligned16(top_row + 16); 2058 const __m128i top1 = cvtepu8_epi16(top_lolo); 2059 const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero); 2060 const __m128i top3 = cvtepu8_epi16(top_lohi); 2061 const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero); 2062 const __m128i top_hilo = LoadUnaligned16(top_row + 32); 2063 const __m128i top_hihi = LoadUnaligned16(top_row + 48); 2064 const __m128i top5 = cvtepu8_epi16(top_hilo); 2065 const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero); 2066 const __m128i top7 = cvtepu8_epi16(top_hihi); 2067 const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero); 2068 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); 2069 const __m128i round = 
_mm_set1_epi16(128); 2070 const uint8_t *weights_base_ptr = smooth_weights + 60; 2071 for (int left_offset = 0; left_offset < 64; left_offset += 16) { 2072 const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset); 2073 const __m128i weights_lo = cvtepu8_epi16(weights); 2074 const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero); 2075 const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo); 2076 const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi); 2077 const __m128i scaled_bottom_left_lo = 2078 _mm_mullo_epi16(inverted_weights_lo, bottom_left); 2079 const __m128i scaled_bottom_left_hi = 2080 _mm_mullo_epi16(inverted_weights_hi, bottom_left); 2081 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 2082 const __m128i y_select = _mm_set1_epi32(y_mask); 2083 const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select); 2084 const __m128i scaled_bottom_left_y = 2085 _mm_shuffle_epi8(scaled_bottom_left_lo, y_select); 2086 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 2087 scaled_bottom_left_y, scaled_bottom_left_y, 2088 round); 2089 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 2090 scaled_bottom_left_y, scaled_bottom_left_y, 2091 round); 2092 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, 2093 scaled_bottom_left_y, scaled_bottom_left_y, 2094 round); 2095 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, 2096 scaled_bottom_left_y, scaled_bottom_left_y, 2097 round); 2098 dst += stride; 2099 } 2100 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 2101 const __m128i y_select = _mm_set1_epi32(y_mask); 2102 const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select); 2103 const __m128i scaled_bottom_left_y = 2104 _mm_shuffle_epi8(scaled_bottom_left_hi, y_select); 2105 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, 2106 scaled_bottom_left_y, 
scaled_bottom_left_y, 2107 round); 2108 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, 2109 scaled_bottom_left_y, scaled_bottom_left_y, 2110 round); 2111 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, 2112 scaled_bottom_left_y, scaled_bottom_left_y, 2113 round); 2114 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, 2115 scaled_bottom_left_y, scaled_bottom_left_y, 2116 round); 2117 dst += stride; 2118 } 2119 } 2120 } 2121 2122 // ----------------------------------------------------------------------------- 2123 // SMOOTH_H_PRED 2124 static AOM_FORCE_INLINE void write_smooth_horizontal_sum4( 2125 uint8_t *LIBAOM_RESTRICT dst, const __m128i *left_y, const __m128i *weights, 2126 const __m128i *scaled_top_right, const __m128i *round) { 2127 const __m128i weighted_left_y = _mm_mullo_epi16(*left_y, *weights); 2128 const __m128i pred_sum = _mm_add_epi32(*scaled_top_right, weighted_left_y); 2129 // Equivalent to RightShiftWithRounding(pred[x][y], 8). 
2130 const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, *round), 8); 2131 const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400); 2132 Store4(dst, _mm_shuffle_epi8(pred, cvtepi32_epi8)); 2133 } 2134 2135 void aom_smooth_h_predictor_4x4_ssse3( 2136 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, 2137 const uint8_t *LIBAOM_RESTRICT top_row, 2138 const uint8_t *LIBAOM_RESTRICT left_column) { 2139 const __m128i top_right = _mm_set1_epi32(top_row[3]); 2140 const __m128i left = cvtepu8_epi32(Load4(left_column)); 2141 const __m128i weights = cvtepu8_epi32(Load4(smooth_weights)); 2142 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); 2143 const __m128i inverted_weights = _mm_sub_epi32(scale, weights); 2144 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); 2145 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); 2146 __m128i left_y = _mm_shuffle_epi32(left, 0); 2147 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2148 &round); 2149 dst += stride; 2150 left_y = _mm_shuffle_epi32(left, 0x55); 2151 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2152 &round); 2153 dst += stride; 2154 left_y = _mm_shuffle_epi32(left, 0xaa); 2155 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2156 &round); 2157 dst += stride; 2158 left_y = _mm_shuffle_epi32(left, 0xff); 2159 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2160 &round); 2161 } 2162 2163 void aom_smooth_h_predictor_4x8_ssse3( 2164 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, 2165 const uint8_t *LIBAOM_RESTRICT top_row, 2166 const uint8_t *LIBAOM_RESTRICT left_column) { 2167 const __m128i top_right = _mm_set1_epi32(top_row[3]); 2168 const __m128i weights = cvtepu8_epi32(Load4(smooth_weights)); 2169 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); 2170 const __m128i inverted_weights = _mm_sub_epi32(scale, weights); 2171 const __m128i 
scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); 2172 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); 2173 __m128i left = cvtepu8_epi32(Load4(left_column)); 2174 __m128i left_y = _mm_shuffle_epi32(left, 0); 2175 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2176 &round); 2177 dst += stride; 2178 left_y = _mm_shuffle_epi32(left, 0x55); 2179 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2180 &round); 2181 dst += stride; 2182 left_y = _mm_shuffle_epi32(left, 0xaa); 2183 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2184 &round); 2185 dst += stride; 2186 left_y = _mm_shuffle_epi32(left, 0xff); 2187 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2188 &round); 2189 dst += stride; 2190 2191 left = cvtepu8_epi32(Load4(left_column + 4)); 2192 left_y = _mm_shuffle_epi32(left, 0); 2193 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2194 &round); 2195 dst += stride; 2196 left_y = _mm_shuffle_epi32(left, 0x55); 2197 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2198 &round); 2199 dst += stride; 2200 left_y = _mm_shuffle_epi32(left, 0xaa); 2201 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2202 &round); 2203 dst += stride; 2204 left_y = _mm_shuffle_epi32(left, 0xff); 2205 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2206 &round); 2207 } 2208 2209 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 2210 void aom_smooth_h_predictor_4x16_ssse3( 2211 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, 2212 const uint8_t *LIBAOM_RESTRICT top_row, 2213 const uint8_t *LIBAOM_RESTRICT left_column) { 2214 const __m128i top_right = _mm_set1_epi32(top_row[3]); 2215 const __m128i weights = cvtepu8_epi32(Load4(smooth_weights)); 2216 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); 2217 const __m128i inverted_weights = 
_mm_sub_epi32(scale, weights); 2218 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); 2219 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); 2220 __m128i left = cvtepu8_epi32(Load4(left_column)); 2221 __m128i left_y = _mm_shuffle_epi32(left, 0); 2222 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2223 &round); 2224 dst += stride; 2225 left_y = _mm_shuffle_epi32(left, 0x55); 2226 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2227 &round); 2228 dst += stride; 2229 left_y = _mm_shuffle_epi32(left, 0xaa); 2230 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2231 &round); 2232 dst += stride; 2233 left_y = _mm_shuffle_epi32(left, 0xff); 2234 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2235 &round); 2236 dst += stride; 2237 2238 left = cvtepu8_epi32(Load4(left_column + 4)); 2239 left_y = _mm_shuffle_epi32(left, 0); 2240 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2241 &round); 2242 dst += stride; 2243 left_y = _mm_shuffle_epi32(left, 0x55); 2244 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2245 &round); 2246 dst += stride; 2247 left_y = _mm_shuffle_epi32(left, 0xaa); 2248 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2249 &round); 2250 dst += stride; 2251 left_y = _mm_shuffle_epi32(left, 0xff); 2252 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2253 &round); 2254 dst += stride; 2255 2256 left = cvtepu8_epi32(Load4(left_column + 8)); 2257 left_y = _mm_shuffle_epi32(left, 0); 2258 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2259 &round); 2260 dst += stride; 2261 left_y = _mm_shuffle_epi32(left, 0x55); 2262 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2263 &round); 2264 dst += stride; 2265 left_y = _mm_shuffle_epi32(left, 0xaa); 2266 
write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2267 &round); 2268 dst += stride; 2269 left_y = _mm_shuffle_epi32(left, 0xff); 2270 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2271 &round); 2272 dst += stride; 2273 2274 left = cvtepu8_epi32(Load4(left_column + 12)); 2275 left_y = _mm_shuffle_epi32(left, 0); 2276 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2277 &round); 2278 dst += stride; 2279 left_y = _mm_shuffle_epi32(left, 0x55); 2280 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2281 &round); 2282 dst += stride; 2283 left_y = _mm_shuffle_epi32(left, 0xaa); 2284 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2285 &round); 2286 dst += stride; 2287 left_y = _mm_shuffle_epi32(left, 0xff); 2288 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, 2289 &round); 2290 } 2291 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 2292 2293 // For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V, 2294 // |pixels| is a segment of the top row or the whole top row, and |weights| is 2295 // repeated. 
2296 void aom_smooth_h_predictor_8x4_ssse3( 2297 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, 2298 const uint8_t *LIBAOM_RESTRICT top_row, 2299 const uint8_t *LIBAOM_RESTRICT left_column) { 2300 const __m128i top_right = _mm_set1_epi16(top_row[7]); 2301 const __m128i left = cvtepu8_epi16(Load4(left_column)); 2302 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); 2303 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); 2304 const __m128i inverted_weights = _mm_sub_epi16(scale, weights); 2305 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); 2306 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); 2307 __m128i y_select = _mm_set1_epi32(0x01000100); 2308 __m128i left_y = _mm_shuffle_epi8(left, y_select); 2309 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, 2310 &round); 2311 dst += stride; 2312 y_select = _mm_set1_epi32(0x03020302); 2313 left_y = _mm_shuffle_epi8(left, y_select); 2314 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, 2315 &round); 2316 dst += stride; 2317 y_select = _mm_set1_epi32(0x05040504); 2318 left_y = _mm_shuffle_epi8(left, y_select); 2319 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, 2320 &round); 2321 dst += stride; 2322 y_select = _mm_set1_epi32(0x07060706); 2323 left_y = _mm_shuffle_epi8(left, y_select); 2324 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, 2325 &round); 2326 } 2327 2328 void aom_smooth_h_predictor_8x8_ssse3( 2329 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, 2330 const uint8_t *LIBAOM_RESTRICT top_row, 2331 const uint8_t *LIBAOM_RESTRICT left_column) { 2332 const __m128i top_right = _mm_set1_epi16(top_row[7]); 2333 const __m128i left = cvtepu8_epi16(LoadLo8(left_column)); 2334 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); 2335 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); 2336 const __m128i 
inverted_weights = _mm_sub_epi16(scale, weights); 2337 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); 2338 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); 2339 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 2340 const __m128i y_select = _mm_set1_epi32(y_mask); 2341 const __m128i left_y = _mm_shuffle_epi8(left, y_select); 2342 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, 2343 &round); 2344 dst += stride; 2345 } 2346 } 2347 2348 void aom_smooth_h_predictor_8x16_ssse3( 2349 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, 2350 const uint8_t *LIBAOM_RESTRICT top_row, 2351 const uint8_t *LIBAOM_RESTRICT left_column) { 2352 const __m128i top_right = _mm_set1_epi16(top_row[7]); 2353 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); 2354 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); 2355 const __m128i inverted_weights = _mm_sub_epi16(scale, weights); 2356 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); 2357 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); 2358 __m128i left = cvtepu8_epi16(LoadLo8(left_column)); 2359 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 2360 const __m128i y_select = _mm_set1_epi32(y_mask); 2361 const __m128i left_y = _mm_shuffle_epi8(left, y_select); 2362 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, 2363 &round); 2364 dst += stride; 2365 } 2366 left = cvtepu8_epi16(LoadLo8(left_column + 8)); 2367 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 2368 const __m128i y_select = _mm_set1_epi32(y_mask); 2369 const __m128i left_y = _mm_shuffle_epi8(left, y_select); 2370 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, 2371 &round); 2372 dst += stride; 2373 } 2374 } 2375 2376 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 2377 void 
// SMOOTH_H 8x32: each row blends its left-column pixel toward top_row[7]
// with x-only weights (smooth_weights + 4); the multiply/round/pack is
// inside write_smooth_directional_sum8. 32 rows are done as four batches
// of 8 left pixels.
aom_smooth_h_predictor_8x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[7]);
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  // Each y_mask is a repeated byte-index pair (1,0),(3,2),... that broadcasts
  // one 16-bit left pixel across the row; the 0x0F0E0F0F bound gives exactly
  // 8 iterations per batch.
  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
                                  &round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
                                  &round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 16));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
                                  &round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 24));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
                                  &round);
    dst += stride;
  }
}

// SMOOTH_H 16x4: a 16-wide row needs two 8-lane halves (weights1/weights2
// from smooth_weights + 12), written together by
// write_smooth_directional_sum16. Only four rows, so the row-broadcast
// masks are unrolled.
void aom_smooth_h_predictor_16x4_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[15]);
  const __m128i left = cvtepu8_epi16(Load4(left_column));
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  __m128i y_mask = _mm_set1_epi32(0x01000100);
  __m128i left_y = _mm_shuffle_epi8(left, y_mask);
  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                 scaled_top_right1, scaled_top_right2, round);
  dst += stride;
  y_mask = _mm_set1_epi32(0x03020302);
  left_y = _mm_shuffle_epi8(left, y_mask);
  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                 scaled_top_right1, scaled_top_right2, round);
  dst += stride;
  y_mask = _mm_set1_epi32(0x05040504);
  left_y = _mm_shuffle_epi8(left, y_mask);
  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                 scaled_top_right1, scaled_top_right2, round);
  dst += stride;
  y_mask = _mm_set1_epi32(0x07060706);
  left_y = _mm_shuffle_epi8(left, y_mask);
  write_smooth_directional_sum16(dst, left_y, left_y, weights1,
                                 weights2,
                                 scaled_top_right1, scaled_top_right2, round);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// SMOOTH_H 16x8: blend each row's left pixel toward top_row[15] using the
// 16-entry weight row at smooth_weights + 12, split into two 8-lane halves
// for write_smooth_directional_sum16.
void aom_smooth_h_predictor_16x8_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[15]);
  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  // 8 pshufb masks broadcast left_column[0..7] in turn; see the batch loops
  // in the other predictors for the mask stepping idiom.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    dst += stride;
  }
}

// SMOOTH_H 16x16: as 16x8, with the 16 rows processed in two batches of
// 8 left pixels.
void aom_smooth_h_predictor_16x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[15]);
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    dst += stride;
  }
}

// SMOOTH_H 16x32: as 16x16 but four batches of 8 left pixels.
void aom_smooth_h_predictor_16x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[15]);
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  // Four batches of 8 rows; each batch reloads the next 8 left pixels.
  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 16));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 24));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    dst += stride;
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// SMOOTH_H 16x64: same per-row blend as 16x16; the 64 rows use an explicit
// left_offset loop over eight batches of 8 left pixels.
void aom_smooth_h_predictor_16x64_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[15]);
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                     scaled_top_right1, scaled_top_right2,
                                     round);
      dst += stride;
    }
  }
}

// SMOOTH_H 32x8: a 32-wide row takes four 8-lane weight halves
// (smooth_weights + 28 and + 44) and two sum16 stores per row
// (dst and dst + 16).
void aom_smooth_h_predictor_32x8_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[31]);
  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 =
      cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    dst += stride;
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// SMOOTH_H 32x16: 32-wide rows (two sum16 stores per row), 16 rows in two
// batches of 8 left pixels.
void aom_smooth_h_predictor_32x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[31]);
  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    dst += stride;
  }
  // NOTE(review): the (const uint8_t *) cast is redundant — left_column
  // already has this type.
  const __m128i left2 =
      cvtepu8_epi16(LoadLo8((const uint8_t *)left_column + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    dst += stride;
  }
}

// SMOOTH_H 32x32: 32-wide rows (two sum16 stores per row), 32 rows in four
// batches of 8 left pixels.
void aom_smooth_h_predictor_32x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[31]);
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 16));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 24));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    dst += stride;
  }
}

// SMOOTH_H 32x64: 32-wide rows; 64 rows via a left_offset loop over eight
// batches of 8 left pixels.
void aom_smooth_h_predictor_32x64_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[31]);
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi =
      LoadUnaligned16(smooth_weights + 44);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                     scaled_top_right1, scaled_top_right2,
                                     round);
      write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
                                     weights4, scaled_top_right3,
                                     scaled_top_right4, round);
      dst += stride;
    }
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// SMOOTH_H 64x16: a 64-wide row needs eight 8-lane weight halves
// (smooth_weights + 60 .. + 108) and four sum16 stores per row
// (dst, +16, +32, +48). 16 rows in two batches of 8 left pixels.
void aom_smooth_h_predictor_64x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[63]);
  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
  const __m128i scaled_top_right5 =
      _mm_mullo_epi16(inverted_weights5, top_right);
  const __m128i scaled_top_right6 =
      _mm_mullo_epi16(inverted_weights6, top_right);
  const __m128i scaled_top_right7 =
      _mm_mullo_epi16(inverted_weights7, top_right);
  const __m128i scaled_top_right8 =
      _mm_mullo_epi16(inverted_weights8, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
                                   scaled_top_right5, scaled_top_right6, round);
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
                                   scaled_top_right7, scaled_top_right8, round);
    dst += stride;
  }
  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
                                   scaled_top_right5, scaled_top_right6, round);
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
                                   scaled_top_right7, scaled_top_right8, round);
    dst += stride;
  }
}
#endif
// !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// SMOOTH_H 64x32: 64-wide rows (four sum16 stores per row using eight
// weight halves from smooth_weights + 60 .. + 108); 32 rows in four batches
// of 8 left pixels (left1..left4).
void aom_smooth_h_predictor_64x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[63]);
  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
  const __m128i scaled_top_right5 =
      _mm_mullo_epi16(inverted_weights5, top_right);
  const __m128i scaled_top_right6 =
      _mm_mullo_epi16(inverted_weights6, top_right);
  const __m128i scaled_top_right7 =
      _mm_mullo_epi16(inverted_weights7, top_right);
  const __m128i scaled_top_right8 =
      _mm_mullo_epi16(inverted_weights8, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
                                   scaled_top_right5, scaled_top_right6, round);
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
                                   scaled_top_right7, scaled_top_right8, round);
    dst += stride;
  }
  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
                                   scaled_top_right5, scaled_top_right6, round);
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
                                   scaled_top_right7, scaled_top_right8, round);
    dst += stride;
  }
  const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
                                   scaled_top_right5, scaled_top_right6, round);
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
                                   scaled_top_right7, scaled_top_right8, round);
    dst += stride;
  }
  const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
                                   scaled_top_right5, scaled_top_right6, round);
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
                                   scaled_top_right7, scaled_top_right8, round);
    dst += stride;
  }
}

void aom_smooth_h_predictor_64x64_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
const uint8_t *LIBAOM_RESTRICT top_row, 2973 const uint8_t *LIBAOM_RESTRICT left_column) { 2974 const __m128i top_right = _mm_set1_epi16(top_row[63]); 2975 const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60); 2976 const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76); 2977 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); 2978 const __m128i weights1 = cvtepu8_epi16(weights_lolo); 2979 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8)); 2980 const __m128i weights3 = cvtepu8_epi16(weights_lohi); 2981 const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8)); 2982 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); 2983 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); 2984 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); 2985 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); 2986 const __m128i scaled_top_right1 = 2987 _mm_mullo_epi16(inverted_weights1, top_right); 2988 const __m128i scaled_top_right2 = 2989 _mm_mullo_epi16(inverted_weights2, top_right); 2990 const __m128i scaled_top_right3 = 2991 _mm_mullo_epi16(inverted_weights3, top_right); 2992 const __m128i scaled_top_right4 = 2993 _mm_mullo_epi16(inverted_weights4, top_right); 2994 const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92); 2995 const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108); 2996 const __m128i weights5 = cvtepu8_epi16(weights_hilo); 2997 const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8)); 2998 const __m128i weights7 = cvtepu8_epi16(weights_hihi); 2999 const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8)); 3000 const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5); 3001 const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6); 3002 const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7); 3003 const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8); 3004 
const __m128i scaled_top_right5 = 3005 _mm_mullo_epi16(inverted_weights5, top_right); 3006 const __m128i scaled_top_right6 = 3007 _mm_mullo_epi16(inverted_weights6, top_right); 3008 const __m128i scaled_top_right7 = 3009 _mm_mullo_epi16(inverted_weights7, top_right); 3010 const __m128i scaled_top_right8 = 3011 _mm_mullo_epi16(inverted_weights8, top_right); 3012 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); 3013 for (int left_offset = 0; left_offset < 64; left_offset += 8) { 3014 const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset)); 3015 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { 3016 const __m128i y_select = _mm_set1_epi32(y_mask); 3017 const __m128i left_y = _mm_shuffle_epi8(left, y_select); 3018 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, 3019 scaled_top_right1, scaled_top_right2, 3020 round); 3021 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, 3022 weights4, scaled_top_right3, 3023 scaled_top_right4, round); 3024 write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, 3025 weights6, scaled_top_right5, 3026 scaled_top_right6, round); 3027 write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, 3028 weights8, scaled_top_right7, 3029 scaled_top_right8, round); 3030 dst += stride; 3031 } 3032 } 3033 }