/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_

#include <stdint.h>
#include "aom_dsp/simd/v64_intrinsics_x86.h"

typedef __m128i v128;

SIMD_INLINE uint32_t v128_low_u32(v128 a) {
  return (uint32_t)_mm_cvtsi128_si32(a);
}

SIMD_INLINE v64 v128_low_v64(v128 a) {
  return _mm_unpacklo_epi64(a, v64_zero());
}

SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); }

SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
  return _mm_unpacklo_epi64(b, a);
}

SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
  return v128_from_v64(v64_from_64(a), v64_from_64(b));
}

SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  return _mm_set_epi32((int)a, (int)b, (int)c, (int)d);
}

SIMD_INLINE v128 v128_load_aligned(const void *p) {
  return _mm_load_si128((const __m128i *)p);
}

SIMD_INLINE v128 v128_load_unaligned(const void *p) {
#if defined(__SSSE3__)
  return _mm_lddqu_si128((const __m128i *)p);
#else
  return _mm_loadu_si128((const __m128i *)p);
#endif
}

SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
  _mm_store_si128((__m128i *)p, a);
}

SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
  _mm_storeu_si128((__m128i *)p, a);
}

// The following function requires an immediate.
// Some compilers will check this during optimisation, others won't.
#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
#if defined(__SSSE3__)
SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
  return c ? _mm_alignr_epi8(a, b, c) : b;
}
#else
#define v128_align(a, b, c) \
  ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
#endif
#else
#if defined(__SSSE3__)
#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
#else
#define v128_align(a, b, c) \
  ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
#endif
#endif
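/* A worked sketch of the non-SSSE3 fallback above, assuming c == 4:
   _mm_srli_si128(b, 4) keeps bytes b[4..15] in the low 12 lanes, and
   _mm_slli_si128(a, 12) places a[0..3] in the top 4 lanes, so the OR
   yields { b[4..15], a[0..3] } -- the same 16 bytes PALIGNR(a, b, 4)
   would produce. */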
SIMD_INLINE v128 v128_zero(void) { return _mm_setzero_si128(); }

SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); }

SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); }

SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); }

SIMD_INLINE v128 v128_dup_64(uint64_t x) {
  // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers
  return _mm_set_epi32((int32_t)(x >> 32), (int32_t)x, (int32_t)(x >> 32),
                       (int32_t)x);
}

SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); }

SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }

SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); }

SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); }

SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); }

SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }

SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); }

SIMD_INLINE v128 v128_padd_s16(v128 a) {
  return _mm_madd_epi16(a, _mm_set1_epi16(1));
}

SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); }

SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); }

SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); }

SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); }

SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); }

SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); }

SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); }

SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); }

SIMD_INLINE v128 v128_abs_s16(v128 a) {
#if defined(__SSSE3__)
  return _mm_abs_epi16(a);
#else
  return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
#endif
}

SIMD_INLINE v128 v128_abs_s8(v128 a) {
#if defined(__SSSE3__)
  return _mm_abs_epi8(a);
#else
  v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
  return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
#endif
}

SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) {
  return _mm_unpacklo_epi8(b, a);
}

SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) {
  return _mm_unpackhi_epi8(b, a);
}

SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) {
  return _mm_unpacklo_epi16(b, a);
}

SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) {
  return _mm_unpackhi_epi16(b, a);
}

SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) {
  return _mm_unpacklo_epi32(b, a);
}

SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) {
  return _mm_unpackhi_epi32(b, a);
}

SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
  return _mm_unpacklo_epi64(b, a);
}

SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
  return _mm_unpackhi_epi64(b, a);
}

SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }

SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }

SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }
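/* Usage sketch (hypothetical values): with byte lanes numbered from the
   least significant end, a = { a0 .. a15 } and b = { b0 .. b15 },
   v128_ziplo_8(a, b) interleaves the low halves as
   { b0, a0, b1, a1, ..., b7, a7 }, e.g.

     v128 a = v128_dup_8(0xaa), b = v128_dup_8(0xbb);
     v128 lo = v128_ziplo_8(a, b);  // 0xbb and 0xaa alternating lanes
*/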
SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
  return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8));
}

SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
#if defined(__SSSE3__)
#ifdef __x86_64__
  v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL);
#else
  v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200);
#endif
  return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
                            _mm_shuffle_epi8(a, order));
#else
  return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
#endif
}

SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
  return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16));
}

SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
#if defined(__SSSE3__)
#ifdef __x86_64__
  v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL);
#else
  v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100);
#endif
  return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
                            _mm_shuffle_epi8(a, order));
#else
  return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
#endif
}

SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
  return _mm_castps_si128(_mm_shuffle_ps(
      _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1)));
}

SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
  return _mm_castps_si128(_mm_shuffle_ps(
      _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0)));
}

SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
  return _mm_unpackhi_epi8(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
  return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
}

SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
  return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
}

SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
  return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8);
}

SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
  return _mm_packs_epi32(b, a);
}

SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
#if defined(__SSE4_1__)
  return _mm_packus_epi32(b, a);
#else
  return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)),
                       v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b)));
#endif
}

SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
  return _mm_packus_epi16(b, a);
}

SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
  return _mm_packs_epi16(b, a);
}

SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
  return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
}

SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
  return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
}

SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
  return _mm_unpackhi_epi16(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
  return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
}
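/* Sketch of the usual widen/compute/narrow round trip built from the
   helpers above (`pixels` and `bias` are hypothetical):

     v128 lo = v128_unpacklo_u8_s16(pixels);  // low 8 u8 lanes -> s16
     v128 hi = v128_unpackhi_u8_s16(pixels);  // high 8 u8 lanes -> s16
     lo = v128_add_16(lo, bias);
     hi = v128_add_16(hi, bias);
     v128 out = v128_pack_s16_u8(hi, lo);     // saturating narrow; note the
                                              // high half is passed first
*/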
SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(x, pattern);
#else
  v128 output;
  unsigned char *input = (unsigned char *)&x;
  unsigned char *index = (unsigned char *)&pattern;
  unsigned char *selected = (unsigned char *)&output;
  int counter;

  for (counter = 0; counter < 16; counter++) {
    selected[counter] = input[index[counter] & 15];
  }

  return output;
#endif
}

SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
  v128 t1 = _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b));
  v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b));
  v128 t = v128_add_32(t1, t2);
  t = v128_add_32(t, _mm_srli_si128(t, 8));
  t = v128_add_32(t, _mm_srli_si128(t, 4));
  return (int32_t)v128_low_u32(t);
}

SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
  v128 r = _mm_madd_epi16(a, b);
#if defined(__SSE4_1__) && defined(__x86_64__)
  v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r),
                         _mm_cvtepi32_epi64(_mm_srli_si128(r, 8)));
  return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8)));
#else
  return (int64_t)_mm_cvtsi128_si32(r) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
#endif
}

SIMD_INLINE uint64_t v128_hadd_u8(v128 a) {
  v128 t = _mm_sad_epu8(a, _mm_setzero_si128());
  return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t));
}

typedef v128 sad128_internal;

SIMD_INLINE sad128_internal v128_sad_u8_init(void) {
  return _mm_setzero_si128();
}

/* Implementation dependent return value. Result must be finalised with
   v128_sad_u8_sum().
   The result for more than 32 v128_sad_u8() calls is undefined. */
SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
  return _mm_add_epi64(s, _mm_sad_epu8(a, b));
}

SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
  return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
}
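/* Usage sketch for the SAD accumulator above (`src`, `ref` and `stride`
   are hypothetical 8-bit buffers and a row pitch):

     sad128_internal acc = v128_sad_u8_init();
     for (int row = 0; row < 16; row++)
       acc = v128_sad_u8(acc, v128_load_unaligned(src + row * stride),
                         v128_load_unaligned(ref + row * stride));
     uint32_t sad = v128_sad_u8_sum(acc);
*/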
typedef int32_t ssd128_internal;

SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) { return 0; }

/* Implementation dependent return value. Result must be finalised with
 * v128_ssd_u8_sum(). */
SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
  v128 z = _mm_setzero_si128();
  v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z));
  v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z));
  v128 rl = _mm_madd_epi16(l, l);
  v128 rh = _mm_madd_epi16(h, h);
  v128 r = _mm_add_epi32(rl, rh);
  r = _mm_add_epi32(r, _mm_srli_si128(r, 8));
  r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
  return s + _mm_cvtsi128_si32(r);
}

SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; }

SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); }

SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); }

SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); }

SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); }

SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
  v64 lo_bits = v64_mullo_s16(a, b);
  v64 hi_bits = v64_mulhi_s16(a, b);
  return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits),
                       v64_ziplo_16(hi_bits, lo_bits));
}

SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
  return _mm_mullo_epi16(a, b);
}

SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
  return _mm_mulhi_epi16(a, b);
}

SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
#if defined(__SSE4_1__)
  return _mm_mullo_epi32(a, b);
#else
  return _mm_unpacklo_epi32(
      _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8),
      _mm_shuffle_epi32(
          _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8));
#endif
}

SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
  v128 r = v128_mullo_s32(a, b);
  return (int64_t)_mm_cvtsi128_si32(r) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
}

SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }

SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
#if defined(__SSSE3__)
  return _mm_maddubs_epi16(a, b);
#else
  return _mm_packs_epi32(
      _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
                     _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)),
      _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
                     _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8)));
#endif
}

SIMD_INLINE v128 v128_padd_u8(v128 a) {
  return v128_madd_us8(a, _mm_set1_epi8(1));
}

SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); }

SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
  return _mm_sub_epi8(_mm_avg_epu8(a, b),
                      _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1)));
}

SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
  return _mm_sub_epi16(_mm_avg_epu16(a, b),
                       _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1)));
}

SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); }

SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); }

SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); }

SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) {
#if defined(__SSE4_1__)
  return _mm_min_epi8(a, b);
#else
  v128 mask = _mm_cmplt_epi8(a, b);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}
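/* The pre-SSE4.1 fallback above (and in v128_max_s8, v128_min_s32 and
   v128_max_s32 below) uses the standard SSE2 select idiom: a compare
   yields an all-ones or all-zeros lane mask, and (mask & a) | (~mask & b)
   then picks a where the mask is set and b elsewhere:

     v128 mask = _mm_cmplt_epi8(a, b);                   // a < b ? 0xff : 0
     v128 min = _mm_or_si128(_mm_and_si128(mask, a),     // a where a < b
                             _mm_andnot_si128(mask, b)); // b otherwise
*/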
SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); }

SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
#if defined(__SSE4_1__)
  return _mm_blendv_epi8(a, b, c);
#else
  c = _mm_cmplt_epi8(c, v128_zero());
  return v128_or(v128_and(b, c), v128_andn(a, c));
#endif
}

SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) {
#if defined(__SSE4_1__)
  return _mm_max_epi8(a, b);
#else
  v128 mask = _mm_cmplt_epi8(b, a);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); }

SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); }

SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) {
#if defined(__SSE4_1__)
  return _mm_min_epi32(a, b);
#else
  v128 mask = _mm_cmplt_epi32(a, b);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) {
#if defined(__SSE4_1__)
  return _mm_max_epi32(a, b);
#else
  v128 mask = _mm_cmplt_epi32(b, a);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); }

SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); }

SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); }

SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
  return _mm_cmpgt_epi16(a, b);
}

SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
  return _mm_cmplt_epi16(a, b);
}

SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); }

SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
  return _mm_cmpgt_epi32(a, b);
}

SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
  return _mm_cmplt_epi32(a, b);
}

SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }

SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
  return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)),
                       _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
}

SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
  return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)),
                       _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
}

SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
  __m128i x = _mm_cvtsi32_si128((int)(c + 8));
  return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
                         _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
}

SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
  return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
  return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
  return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
  return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
  return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
  return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
  return _mm_sll_epi64(a, _mm_cvtsi32_si128((int)c));
}
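/* Note on the variable shifts above: the SSE2 shift instructions take the
   count in the low 64 bits of an XMM register, hence the
   _mm_cvtsi32_si128((int)c) wrapping.  SSE2 also has no 8-bit shifts, so
   v128_shl_8/v128_shr_u8 shift 16-bit lanes and mask off the bits that
   cross byte boundaries, and v128_shr_s8 widens each byte to 16 bits,
   shifts arithmetically by c + 8, and re-packs with saturation. */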
SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
  return _mm_srl_epi64(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
  // _mm_sra_epi64 requires AVX-512, so emulate it with two scalar shifts.
  return v128_from_64((uint64_t)((int64_t)v64_u64(v128_high_v64(a)) >> c),
                      (uint64_t)((int64_t)v64_u64(v128_low_v64(a)) >> c));
  // return _mm_sra_epi64(a, _mm_cvtsi32_si128((int)c));
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
#define v128_shl_n_8(a, c) \
  _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c))
#define v128_shr_n_u8(a, c) \
  _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c))
#define v128_shr_n_s8(a, c)                                         \
  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
                  _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c)
#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c)
#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c)
#define v128_shl_n_32(a, c) _mm_slli_epi32(a, c)
#define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c)
#define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c)
#define v128_shl_n_64(a, c) _mm_slli_epi64(a, c)
#define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c)
#define v128_shr_n_s64(a, c) \
  v128_shr_s64(a, c)  // _mm_srai_epi64 also requires AVX-512

typedef v128 sad128_internal_u16;

SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { return v128_zero(); }

/* Implementation dependent return value. Result must be finalised with
 * v128_sad_u16_sum(). */
SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
                                             v128 b) {
#if defined(__SSE4_1__)
  v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b));
#else
  v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)),
                          v128_xor(b, v128_dup_16(32768)));
  t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)),
                  v128_or(v128_and(a, t), v128_andn(b, t)));
#endif
  return v128_add_32(
      s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t)));
}

SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
  return v128_low_u32(s) + v128_low_u32(v128_shr_n_byte(s, 4)) +
         v128_low_u32(v128_shr_n_byte(s, 8)) +
         v128_low_u32(v128_shr_n_byte(s, 12));
}

typedef v128 ssd128_internal_s16;

SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); }

/* Implementation dependent return value. Result must be finalised with
 * v128_ssd_s16_sum(). */
SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
                                             v128 b) {
  v128 d = v128_sub_16(a, b);
  d = v128_madd_s16(d, d);
  return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()),
                                    _mm_unpacklo_epi32(d, v128_zero())));
}

SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
  return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
}

#endif  // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_