v256_intrinsics_x86.h (26504B)
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_

#if !defined(__AVX2__)

#include "aom_dsp/simd/v256_intrinsics_v128.h"

#else

// The _m256i type seems to cause problems for g++'s mangling prior to
// version 5, but adding -fabi-version=0 fixes this.
#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 && \
    defined(__AVX2__) && defined(__cplusplus)
#pragma GCC optimize "-fabi-version=0"
#endif

#include <immintrin.h>

#include "aom_dsp/simd/v128_intrinsics_x86.h"

typedef __m256i v256;

SIMD_INLINE uint32_t v256_low_u32(v256 a) {
  return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0));
}

SIMD_INLINE v64 v256_low_v64(v256 a) {
  return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
}

SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }

SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); }

SIMD_INLINE v128 v256_high_v128(v256 a) {
  return _mm256_extracti128_si256(a, 1);
}

SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
  // gcc seems to be missing _mm256_set_m128i()
  return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1);
}

SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
}

SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
  return _mm256_set_epi64x((int64_t)a, (int64_t)b, (int64_t)c, (int64_t)d);
}

SIMD_INLINE v256 v256_load_aligned(const void *p) {
  return _mm256_load_si256((const __m256i *)p);
}

SIMD_INLINE v256 v256_load_unaligned(const void *p) {
  return _mm256_loadu_si256((const __m256i *)p);
}

SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
  _mm256_store_si256((__m256i *)p, a);
}

SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
  _mm256_storeu_si256((__m256i *)p, a);
}

SIMD_INLINE v256 v256_zero(void) { return _mm256_setzero_si256(); }

SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8((char)x); }

SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16((short)x); }

SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32((int)x); }

SIMD_INLINE v256 v256_dup_64(uint64_t x) {
  return _mm256_set1_epi64x((int64_t)x);
}

SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }

SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }

SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return _mm256_adds_epu8(a, b); }

SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return _mm256_adds_epi8(a, b); }

SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
  return _mm256_adds_epi16(a, b);
}

SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); }
SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return _mm256_add_epi64(a, b); }

SIMD_INLINE v256 v256_padd_u8(v256 a) {
  return _mm256_maddubs_epi16(a, _mm256_set1_epi8(1));
}

SIMD_INLINE v256 v256_padd_s16(v256 a) {
  return _mm256_madd_epi16(a, _mm256_set1_epi16(1));
}

SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); }

SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); }

SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); }

SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); }

SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
  return _mm256_subs_epi16(a, b);
}

SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
  return _mm256_subs_epu16(a, b);
}

SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }

SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return _mm256_sub_epi64(a, b); }

SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }

SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); }

// AVX doesn't have the direct intrinsics to zip/unzip 8, 16, 32 bit
// lanes of lower or upper halves of a 256bit vector because the
// unpack/pack intrinsics operate on the 256 bit input vector as 2
// independent 128 bit vectors.
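// As an illustrative sketch (not from the original source): with 32-byte
// vectors a = { a0, ..., a31 } and b = { b0, ..., b31 } (byte 0 least
// significant), v256_ziplo_8() below is expected to return the interleave of
// the low halves,
//   { b0, a0, b1, a1, ..., b15, a15 }.
// Each input is therefore first run through
//   _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0)),
// which reorders the 64-bit quads { q0, q1, q2, q3 } to { q0, q2, q1, q3 },
// so that the per-128-bit-lane _mm256_unpacklo_epi8() / _mm256_unpackhi_epi8()
// produces the desired cross-lane interleave.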
SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
  return _mm256_unpacklo_epi8(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
  return _mm256_unpackhi_epi8(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
  return _mm256_unpacklo_epi16(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
  return _mm256_unpackhi_epi16(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
  return _mm256_unpacklo_epi32(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
  return _mm256_unpackhi_epi32(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
  return _mm256_unpacklo_epi64(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
  return _mm256_unpackhi_epi64(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
  return _mm256_permute2x128_si256(a, b, 0x02);
}

SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
  return _mm256_permute2x128_si256(a, b, 0x13);
}

SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
}

SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
}

SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
}

SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_packs_epi16(_mm256_srai_epi16(b, 8), _mm256_srai_epi16(a, 8)),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
  return v256_unziphi_8(_mm256_slli_si256(a, 1), _mm256_slli_si256(b, 1));
}

SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_packs_epi32(_mm256_srai_epi32(b, 16), _mm256_srai_epi32(a, 16)),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
  return v256_unziphi_16(_mm256_slli_si256(a, 2), _mm256_slli_si256(b, 2));
}

SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
                                            _mm256_castsi256_ps(a),
                                            _MM_SHUFFLE(3, 1, 3, 1))),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
                                            _mm256_castsi256_ps(a),
                                            _MM_SHUFFLE(2, 0, 2, 0))),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b),
                                            _mm256_castsi256_pd(a), 15)),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_castpd_si256(
          _mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 0)),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return _mm256_cvtepu8_epi16(a); }

SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
  return _mm256_unpacklo_epi8(
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_setzero_si256());
}

SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
  return _mm256_unpackhi_epi8(
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_setzero_si256());
}

SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
}

SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
  return _mm256_srai_epi16(
      _mm256_unpacklo_epi8(
          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
      8);
}

SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
  return _mm256_srai_epi16(
      _mm256_unpackhi_epi8(
          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
      8);
}

SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(_mm256_packs_epi32(b, a),
                                  _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(_mm256_packus_epi32(b, a),
                                  _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a),
                                  _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(_mm256_packs_epi16(b, a),
                                  _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
  return _mm256_cvtepu16_epi32(a);
}
SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
  return _mm256_cvtepi16_epi32(a);
}

SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
  return _mm256_unpacklo_epi16(
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_setzero_si256());
}

SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
  return _mm256_srai_epi32(
      _mm256_unpacklo_epi16(
          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
      16);
}

SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
  return _mm256_unpackhi_epi16(
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_setzero_si256());
}

SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
  return _mm256_srai_epi32(
      _mm256_unpackhi_epi16(
          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
      16);
}

SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
  return _mm256_blendv_epi8(
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1)), pattern),
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0)), pattern),
      _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
}

SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
  v256 c32 = v256_dup_8(32);
  v256 p32 = v256_sub_8(pattern, c32);
  v256 r1 = _mm256_blendv_epi8(
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1)), p32),
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0)), p32),
      _mm256_cmpgt_epi8(v256_dup_8(48), pattern));
  v256 r2 = _mm256_blendv_epi8(
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3)), pattern),
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2)), pattern),
      _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
  return _mm256_blendv_epi8(r1, r2, _mm256_cmpgt_epi8(c32, pattern));
}

SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
  return _mm256_shuffle_epi8(a, pattern);
}

SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
  v256 t1 = _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b));
  v256 t2 = _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b));
  t1 = _mm256_add_epi32(t1, t2);
  v128 t = _mm_add_epi32(_mm256_extracti128_si256(t1, 0),
                         _mm256_extracti128_si256(t1, 1));
  t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
  t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
  return (int32_t)v128_low_u32(t);
}

SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
  v256 r = _mm256_madd_epi16(a, b);
#if defined(__x86_64__)
  v128 t;
  r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
                       _mm256_cvtepi32_epi64(v256_low_v128(r)));
  t = v256_low_v128(_mm256_add_epi64(
      r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
  return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
#else
  v128 l = v256_low_v128(r);
  v128 h = v256_high_v128(r);
  return (int64_t)_mm_cvtsi128_si32(l) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
         (int64_t)_mm_cvtsi128_si32(h) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
#endif
}
SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
  v256 r = _mm256_mullo_epi32(a, b);
#if defined(__x86_64__)
  v128 t;
  r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
                       _mm256_cvtepi32_epi64(v256_low_v128(r)));
  t = v256_low_v128(_mm256_add_epi64(
      r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
  return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
#else
  v128 l = v256_low_v128(r);
  v128 h = v256_high_v128(r);
  return (int64_t)_mm_cvtsi128_si32(l) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
         (int64_t)_mm_cvtsi128_si32(h) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
#endif
}

SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
  v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
  v128 lo = v256_low_v128(t);
  v128 hi = v256_high_v128(t);
  lo = v128_add_32(lo, hi);
  return v64_low_u32(v128_low_v64(lo)) + v128_low_u32(v128_high_v64(lo));
}

typedef v256 sad256_internal;

SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
  return _mm256_setzero_si256();
}

/* Implementation dependent return value. Result must be finalised with
   v256_sad_u8_sum().
   The result for more than 32 v256_sad_u8() calls is undefined. */
SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
  return _mm256_add_epi64(s, _mm256_sad_epu8(a, b));
}

SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
  v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
  return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
}
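/* A minimal usage sketch for the accumulator API above (illustrative only;
   src, ref, stride and rows are hypothetical names, and rows is assumed to
   stay within the 32-call limit noted above):

     sad256_internal s = v256_sad_u8_init();
     for (int i = 0; i < rows; i++)
       s = v256_sad_u8(s, v256_load_unaligned(src + i * stride),
                       v256_load_unaligned(ref + i * stride));
     uint32_t sad = v256_sad_u8_sum(s);
*/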
typedef v256 ssd256_internal;

SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
  return _mm256_setzero_si256();
}

/* Implementation dependent return value. Result must be finalised with
 * v256_ssd_u8_sum(). */
SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
  v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()),
                            _mm256_unpacklo_epi8(b, _mm256_setzero_si256()));
  v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()),
                            _mm256_unpackhi_epi8(b, _mm256_setzero_si256()));
  v256 rl = _mm256_madd_epi16(l, l);
  v256 rh = _mm256_madd_epi16(h, h);
  v128 c = _mm_cvtsi32_si128(32);
  rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8));
  rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4));
  rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8));
  rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4));
  return _mm256_add_epi64(
      s,
      _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c));
}

SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
  v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
  return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
}

SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); }

SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); }

SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); }

SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); }

SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) {
  v128 lo_bits = v128_mullo_s16(a, b);
  v128 hi_bits = v128_mulhi_s16(a, b);
  return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
                        v128_ziplo_16(hi_bits, lo_bits));
}

SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
  return _mm256_mullo_epi16(a, b);
}

SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
  return _mm256_mulhi_epi16(a, b);
}

SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
  return _mm256_mullo_epi32(a, b);
}

SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
  return _mm256_madd_epi16(a, b);
}

SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
  return _mm256_maddubs_epi16(a, b);
}

SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); }

SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
  return _mm256_sub_epi8(
      _mm256_avg_epu8(a, b),
      _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1)));
}

SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
  return _mm256_sub_epi16(
      _mm256_avg_epu16(a, b),
      _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1)));
}

SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); }

SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); }

SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }

SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); }

SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
  return (uint32_t)_mm256_movemask_epi8(a);
}

SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
  return _mm256_blendv_epi8(a, b, c);
}

SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); }

SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }

SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }

SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return _mm256_min_epi32(a, b); }

SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return _mm256_max_epi32(a, b); }
SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
  return _mm256_cmpgt_epi8(a, b);
}

SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
  return _mm256_cmpgt_epi8(b, a);
}

SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
  return _mm256_cmpeq_epi8(a, b);
}

SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
  return _mm256_cmpgt_epi16(a, b);
}

SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
  return _mm256_cmpgt_epi16(b, a);
}

SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
  return _mm256_cmpeq_epi16(a, b);
}

SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
  return _mm256_cmpgt_epi32(a, b);
}

SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
  return _mm256_cmpgt_epi32(b, a);
}

SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
  return _mm256_cmpeq_epi32(a, b);
}

SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
  return _mm256_and_si256(_mm256_set1_epi8((char)(0xff << c)),
                          _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
}

SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
  return _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> c)),
                          _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
}

SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
  __m128i x = _mm_cvtsi32_si128((int)(c + 8));
  return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x),
                            _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x));
}

SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
  return _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
  return _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
  return _mm256_sra_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
  return _mm256_sll_epi32(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
  return _mm256_srl_epi32(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
  return _mm256_sra_epi32(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
  return _mm256_sll_epi64(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
  return _mm256_srl_epi64(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
#if defined(__AVX512VL__)
  return _mm256_sra_epi64(a, _mm_cvtsi32_si128((int)c));
#else
  return v256_from_v128(v128_shr_s64(v256_high_v128(a), c),
                        v128_shr_s64(v256_low_v128(a), c));
#endif
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
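/* An illustrative note (assumption of this sketch, not from the original
   source): _mm256_slli_si256() and _mm256_srli_si256() shift each 128-bit
   lane independently, so no byte ever crosses the lane boundary. For
   example, with byte values a = { 0, 1, 2, ..., 31 } (byte 0 least
   significant), _mm256_slli_si256(a, 1) would give
     { 0, 0, 1, ..., 14,   0, 16, 17, ..., 30 }
   whereas a whole-register shift by one byte, v256_shl_n_byte(a, 1), should
   give
     { 0, 0, 1, ..., 14,  15, 16, 17, ..., 30 }.
   The macros below therefore stitch the result together from v128_align(),
   v128_shl_n_byte() and v128_shr_n_byte() on the two halves, or from
   cross-lane permutes. */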
// _mm256_slli_si256 works on 128 bit lanes and can't be used
#define v256_shl_n_byte(a, n)                                                \
  ((n) < 16 ? v256_from_v128(                                                \
                  v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \
                  v128_shl_n_byte(v256_low_v128(a), n))                      \
            : _mm256_inserti128_si256(                                       \
                  _mm256_setzero_si256(),                                    \
                  v128_shl_n_byte(v256_low_v128(a), (n) - 16), 1))

// _mm256_srli_si256 works on 128 bit lanes and can't be used
#define v256_shr_n_byte(a, n)                                                 \
  ((n) < 16                                                                   \
       ? _mm256_alignr_epi8(                                                  \
             _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n)  \
       : ((n) == 16 ? _mm256_permute2x128_si256(_mm256_setzero_si256(), a, 3) \
                    : _mm256_inserti128_si256(                                \
                          _mm256_setzero_si256(),                            \
                          v128_shr_n_byte(v256_high_v128(a), (n) - 16), 0)))

// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
#define v256_align(a, b, c) \
  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)

#define v256_shl_n_8(a, c)                                  \
  _mm256_and_si256(_mm256_set1_epi8((char)(0xff << (c))),   \
                   _mm256_slli_epi16(a, c))
#define v256_shr_n_u8(a, c)                                 \
  _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> (c))),   \
                   _mm256_srli_epi16(a, c))
#define v256_shr_n_s8(a, c)                                                   \
  _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8),  \
                     _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8))
#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c)
#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c)
#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c)
#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c)
#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c)
#define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c)
#define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c)
#define v256_shr_n_s64(a, c) \
  v256_shr_s64((a), (c))  // _mm256_srai_epi64 broken in gcc?
#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))

typedef v256 sad256_internal_u16;

SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { return v256_zero(); }

/* Implementation dependent return value. Result must be finalised with
 * v256_sad_u16_sum(). */
SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
                                             v256 b) {
#if defined(__SSE4_1__)
  v256 t = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b));
#else
  v256 t = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)),
                          v256_xor(b, v256_dup_16(32768)));
  t = v256_sub_16(v256_or(v256_and(b, t), v256_andn(a, t)),
                  v256_or(v256_and(a, t), v256_andn(b, t)));
#endif
  return v256_add_32(
      s, v256_add_32(v256_unpackhi_u16_s32(t), v256_unpacklo_u16_s32(t)));
}

SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
  v128 t = v128_add_32(v256_high_v128(s), v256_low_v128(s));
  return v128_low_u32(t) + v128_low_u32(v128_shr_n_byte(t, 4)) +
         v128_low_u32(v128_shr_n_byte(t, 8)) +
         v128_low_u32(v128_shr_n_byte(t, 12));
}

typedef v256 ssd256_internal_s16;

SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) { return v256_zero(); }

/* Implementation dependent return value. Result must be finalised with
 * v256_ssd_s16_sum(). */
SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
                                             v256 b) {
  v256 d = v256_sub_16(a, b);
  d = v256_madd_s16(d, d);
  return v256_add_64(s, v256_add_64(_mm256_unpackhi_epi32(d, v256_zero()),
                                    _mm256_unpacklo_epi32(d, v256_zero())));
}

SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
  v128 t = v128_add_64(v256_high_v128(s), v256_low_v128(s));
  return v64_u64(v128_low_v64(t)) + v64_u64(v128_high_v64(t));
}

#endif

#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_