/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_

#include "config/aom_config.h"

#if HAVE_NEON
#error "Do not use this file for Neon"
#endif

#if HAVE_SSE2
#include "aom_dsp/simd/v128_intrinsics_x86.h"
#else
#include "aom_dsp/simd/v128_intrinsics.h"
#endif

typedef struct {
  v128 val[2];
} v256;

SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); }

SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); }

SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }

SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; }

SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; }

SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
  v256 t;
  t.val[1] = hi;
  t.val[0] = lo;
  return t;
}

SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
  return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
}

SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
}

SIMD_INLINE v256 v256_load_unaligned(const void *p) {
  return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16),
                        v128_load_unaligned(p));
}

SIMD_INLINE v256 v256_load_aligned(const void *p) {
  return v256_from_v128(v128_load_aligned((uint8_t *)p + 16),
                        v128_load_aligned(p));
}

SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
  v128_store_unaligned(p, a.val[0]);
  v128_store_unaligned((uint8_t *)p + 16, a.val[1]);
}

SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
  v128_store_aligned(p, a.val[0]);
  v128_store_aligned((uint8_t *)p + 16, a.val[1]);
}

SIMD_INLINE v256 v256_zero(void) {
  return v256_from_v128(v128_zero(), v128_zero());
}

SIMD_INLINE v256 v256_dup_8(uint8_t x) {
  v128 t = v128_dup_8(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_16(uint16_t x) {
  v128 t = v128_dup_16(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_32(uint32_t x) {
  v128 t = v128_dup_32(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_64(uint64_t x) {
  v128 t = v128_dup_64(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
  return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]);
}

SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
  return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]);
}

SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
  return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]);
}

SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
  return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]);
}
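/* A v256 is emulated as a pair of v128 lanes: val[0] holds the low 128 bits
   and val[1] the high 128 bits (note that v256_from_v128 takes its arguments
   as (hi, lo)).  Usage sketch (illustrative, not part of the API): a
   lane-wise reduction such as the 16-bit dot product of two 32-byte
   buffers. */
SIMD_INLINE int64_t example_dotp_s16(const int16_t *a, const int16_t *b) {
  /* Each unaligned load reads sixteen int16_t values; v256_dotp_s16 itself
     sums the dot products of the high and low lanes. */
  return v256_dotp_s16(v256_load_unaligned(a), v256_load_unaligned(b));
}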
typedef struct {
  sad128_internal val[2];
} sad256_internal;

SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
  sad256_internal t;
  t.val[1] = v128_sad_u8_init();
  t.val[0] = v128_sad_u8_init();
  return t;
}

/* Implementation dependent return value. Result must be finalised with
   v256_sad_u8_sum().
   The result for more than 16 v256_sad_u8() calls is undefined. */
SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
  sad256_internal t;
  t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
  return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]);
}
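/* Usage sketch (illustrative, not part of the API): the accumulator protocol
   is init, then at most 16 v256_sad_u8() calls, then sum.  The SAD of a
   32x16 pixel block uses exactly the 16 permitted accumulation steps: */
SIMD_INLINE uint32_t example_sad_32x16(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride) {
  sad256_internal s = v256_sad_u8_init();
  for (int i = 0; i < 16; i++)
    s = v256_sad_u8(s, v256_load_unaligned(src + i * src_stride),
                    v256_load_unaligned(ref + i * ref_stride));
  return v256_sad_u8_sum(s);
}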
typedef struct {
  ssd128_internal val[2];
} ssd256_internal;

SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
  ssd256_internal t;
  t.val[1] = v128_ssd_u8_init();
  t.val[0] = v128_ssd_u8_init();
  return t;
}

/* Implementation dependent return value. Result must be finalised with
 * v256_ssd_u8_sum(). */
SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
  ssd256_internal t;
  t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
  return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]);
}
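/* Usage sketch (illustrative, not part of the API): the SSD accumulator
   follows the same init/accumulate/sum protocol as the SAD accumulator
   above, here over a single 32-byte row: */
SIMD_INLINE uint32_t example_ssd_32(const uint8_t *a, const uint8_t *b) {
  ssd256_internal s = v256_ssd_u8_init();
  s = v256_ssd_u8(s, v256_load_unaligned(a), v256_load_unaligned(b));
  return v256_ssd_u8_sum(s);
}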
SIMD_INLINE v256 v256_or(v256 a, v256 b) {
  return v256_from_v128(v128_or(a.val[1], b.val[1]),
                        v128_or(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
  return v256_from_v128(v128_xor(a.val[1], b.val[1]),
                        v128_xor(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_and(v256 a, v256 b) {
  return v256_from_v128(v128_and(a.val[1], b.val[1]),
                        v128_and(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
  return v256_from_v128(v128_andn(a.val[1], b.val[1]),
                        v128_andn(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
  return v256_from_v128(v128_add_8(a.val[1], b.val[1]),
                        v128_add_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
  return v256_from_v128(v128_add_16(a.val[1], b.val[1]),
                        v128_add_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) {
  return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]),
                        v128_sadd_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) {
  return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]),
                        v128_sadd_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
  return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]),
                        v128_sadd_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
  return v256_from_v128(v128_add_32(a.val[1], b.val[1]),
                        v128_add_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_64(v256 a, v256 b) {
  return v256_from_v128(v128_add_64(a.val[1], b.val[1]),
                        v128_add_64(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_padd_u8(v256 a) {
  return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0]));
}

SIMD_INLINE v256 v256_padd_s16(v256 a) {
  return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0]));
}

SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
  return v256_from_v128(v128_sub_8(a.val[1], b.val[1]),
                        v128_sub_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]),
                        v128_ssub_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]),
                        v128_ssub_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
  return v256_from_v128(v128_sub_16(a.val[1], b.val[1]),
                        v128_sub_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]),
                        v128_ssub_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]),
                        v128_ssub_u16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
  return v256_from_v128(v128_sub_32(a.val[1], b.val[1]),
                        v128_sub_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) {
  return v256_from_v128(v128_sub_64(a.val[1], b.val[1]),
                        v128_sub_64(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_abs_s16(v256 a) {
  return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0]));
}

SIMD_INLINE v256 v256_abs_s8(v256 a) {
  return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0]));
}

SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
  v128 lo_bits = v128_mullo_s16(a, b);
  v128 hi_bits = v128_mulhi_s16(a, b);
  return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
                        v128_ziplo_16(hi_bits, lo_bits));
}
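/* Usage sketch (illustrative, not part of the API): v256_mul_s16 widens
   eight s16 lanes into eight full s32 products.  v128_mullo_s16 and
   v128_mulhi_s16 produce the low and high 16 bits of each product, and
   zipping them reassembles the 32-bit results in element order. */
SIMD_INLINE v256 example_square_s16(const int16_t *a) {
  const v128 t = v128_load_unaligned(a); /* eight int16_t values */
  return v256_mul_s16(t, t);             /* eight int32_t squares */
}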
SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
  return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]),
                        v128_mullo_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
  return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]),
                        v128_mulhi_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
  return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]),
                        v128_mullo_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
  return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]),
                        v128_madd_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
  return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]),
                        v128_madd_us8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
  return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]),
                        v128_avg_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
  return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]),
                        v128_rdavg_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
  return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]),
                        v128_rdavg_u16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
  return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]),
                        v128_avg_u16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
  return v256_from_v128(v128_min_u8(a.val[1], b.val[1]),
                        v128_min_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
  return v256_from_v128(v128_max_u8(a.val[1], b.val[1]),
                        v128_max_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
  return v256_from_v128(v128_min_s8(a.val[1], b.val[1]),
                        v128_min_s8(a.val[0], b.val[0]));
}

SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
  return (v128_movemask_8(v256_high_v128(a)) << 16) |
         v128_movemask_8(v256_low_v128(a));
}

SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
  return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]),
                        v128_blend_8(a.val[0], b.val[0], c.val[0]));
}

SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
  return v256_from_v128(v128_max_s8(a.val[1], b.val[1]),
                        v128_max_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
  return v256_from_v128(v128_min_s16(a.val[1], b.val[1]),
                        v128_min_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
  return v256_from_v128(v128_max_s16(a.val[1], b.val[1]),
                        v128_max_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) {
  return v256_from_v128(v128_min_s32(a.val[1], b.val[1]),
                        v128_min_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) {
  return v256_from_v128(v128_max_s32(a.val[1], b.val[1]),
                        v128_max_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]),
                        v128_ziplo_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]),
                        v128_ziplo_8(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]),
                        v128_ziplo_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]),
                        v128_ziplo_16(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]),
                        v128_ziplo_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]),
                        v128_ziplo_32(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]),
                        v128_ziplo_64(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]),
                        v128_ziplo_64(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
  return v256_from_v128(a.val[0], b.val[0]);
}

SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
  return v256_from_v128(a.val[1], b.val[1]);
}

SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
}

SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
}

SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
}

SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]),
                        v128_unziplo_8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]),
                        v128_unziphi_8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]),
                        v128_unziplo_16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]),
                        v128_unziphi_16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]),
                        v128_unziplo_32(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]),
                        v128_unziphi_32(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
#if HAVE_SSE2
  return v256_from_v128(
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
                                      _mm_castsi128_pd(a.val[1]), 0)),
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
                                      _mm_castsi128_pd(b.val[1]), 0)));
#else
  return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]),
                       v128_low_v64(b.val[1]), v128_low_v64(b.val[0]));
#endif
}

SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
#if HAVE_SSE2
  return v256_from_v128(
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
                                      _mm_castsi128_pd(a.val[1]), 3)),
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
                                      _mm_castsi128_pd(b.val[1]), 3)));
#else
  return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]),
                       v128_high_v64(b.val[1]), v128_high_v64(b.val[0]));
#endif
}
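/* Usage sketch (illustrative, not part of the API): the v256_zip_* operations
   interleave two vectors and the v256_unzip* operations reverse that, e.g.
   splitting interleaved u8 pairs back into even and odd streams. */
SIMD_INLINE void example_deinterleave_u8(const uint8_t *ab, v256 *a, v256 *b) {
  const v256 lo = v256_load_unaligned(ab);      /* a0 b0 a1 b1 ...  */
  const v256 hi = v256_load_unaligned(ab + 32); /* ... a31 b31      */
  *a = v256_unziplo_8(hi, lo); /* even bytes: a0 .. a31 */
  *b = v256_unziphi_8(hi, lo); /* odd bytes:  b0 .. b31 */
}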
SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
}

SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]),
                        v128_unpacklo_u8_s16(a.val[0]));
}

SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]),
                        v128_unpacklo_u8_s16(a.val[1]));
}

SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
}

SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]),
                        v128_unpacklo_s8_s16(a.val[0]));
}

SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]),
                        v128_unpacklo_s8_s16(a.val[1]));
}

SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]),
                        v128_pack_s32_s16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]),
                        v128_pack_s32_u16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]),
                        v128_pack_s16_u8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]),
                        v128_pack_s16_s8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
}

SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
}

SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]),
                        v128_unpacklo_u16_s32(a.val[0]));
}

SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]),
                        v128_unpacklo_s16_s32(a.val[0]));
}

SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]),
                        v128_unpacklo_u16_s32(a.val[1]));
}

SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]),
                        v128_unpacklo_s16_s32(a.val[1]));
}

SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]),
                        v128_cmpgt_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]),
                        v128_cmplt_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]),
                        v128_cmpeq_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]),
                        v128_cmpgt_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]),
                        v128_cmplt_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]),
                        v128_cmpeq_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]),
                        v128_cmpgt_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]),
                        v128_cmplt_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]),
                        v128_cmpeq_32(a.val[0], b.val[0]));
}
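/* The full-width shuffles below cannot be expressed as one 128-bit shuffle
   per lane, because a pattern index may select a byte from either lane.
   Each output lane is therefore shuffled against both source halves, and the
   two results are blended on a mask derived from comparing the pattern
   against the lane boundary: 16 for v256_shuffle_8, and 16/32/48 for the
   two-source v256_wideshuffle_8, whose indices address 64 bytes. */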
SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) {
  v128 c16 = v128_dup_8(16);
  v128 maskhi = v128_cmplt_s8(pattern.val[1], c16);
  v128 masklo = v128_cmplt_s8(pattern.val[0], c16);
  return v256_from_v128(
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)),
                   v128_shuffle_8(x.val[0], pattern.val[1]), maskhi),
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)),
                   v128_shuffle_8(x.val[0], pattern.val[0]), masklo));
}

SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) {
  v128 c16 = v128_dup_8(16);
  v128 c32 = v128_dup_8(32);
  v128 c48 = v128_dup_8(48);
  v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]);
  v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]);
  v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]);
  v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]);
  v256 r1 = v256_from_v128(
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)),
                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)),
                   maskhi48),
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)),
                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)),
                   masklo48));
  v256 r2 = v256_from_v128(
      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)),
                   v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16),
      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)),
                   v128_shuffle_8(y.val[0], pattern.val[0]), masklo16));
  return v256_blend_8(r1, r2,
                      v256_cmpgt_s8(v256_from_v128(c32, c32), pattern));
}

SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
  return v256_from_v128(
      v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
      v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
}

SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c));
}

SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c));
}

SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c));
}

SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c));
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define v256_shl_n_byte(a, n)                                               \
  ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], (n)),        \
                                     v128_shr_n_byte(a.val[0], 16 - (n))),  \
                             v128_shl_n_byte(a.val[0], (n)))                \
            : v256_from_v128(                                               \
                  (n) > 16 ? v128_shl_n_byte(a.val[0], (n) - 16) : a.val[0], \
                  v128_zero()))

#define v256_shr_n_byte(a, n)                                                 \
  ((n) == 0                                                                   \
       ? a                                                                    \
       : ((n) < 16                                                            \
              ? v256_from_v128(v128_shr_n_byte(a.val[1], (n)),                \
                               v128_or(v128_shr_n_byte(a.val[0], (n)),        \
                                       v128_shl_n_byte(a.val[1], 16 - (n))))  \
              : v256_from_v128(v128_zero(),                                   \
                               (n) > 16 ? v128_shr_n_byte(a.val[1], (n) - 16) \
                                        : a.val[1])))

#define v256_align(a, b, c) \
  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
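/* Usage sketch (illustrative, not part of the API): the shift/align amounts
   must be compile-time constants, which is why the operations above are
   macros rather than functions.  E.g. extracting 32 bytes at byte offset 4
   from the 64-byte pair (hi:lo): */
SIMD_INLINE v256 example_align_4(v256 hi, v256 lo) {
  return v256_align(hi, lo, 4); /* bytes 4..35 of (hi:lo) */
}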
#define v256_shl_n_8(a, n) \
  v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n))
#define v256_shl_n_16(a, n) \
  v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n))
#define v256_shl_n_32(a, n) \
  v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n))
#define v256_shl_n_64(a, n) \
  v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n))
#define v256_shr_n_u8(a, n) \
  v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n))
#define v256_shr_n_u16(a, n) \
  v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n))
#define v256_shr_n_u32(a, n) \
  v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n))
#define v256_shr_n_u64(a, n) \
  v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n))
#define v256_shr_n_s8(a, n) \
  v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n))
#define v256_shr_n_s16(a, n) \
  v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n))
#define v256_shr_n_s32(a, n) \
  v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n))
#define v256_shr_n_s64(a, n) \
  v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n))

#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))

typedef struct {
  sad128_internal_u16 val[2];
} sad256_internal_u16;

SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) {
  sad256_internal_u16 t;
  t.val[1] = v128_sad_u16_init();
  t.val[0] = v128_sad_u16_init();
  return t;
}

/* Implementation dependent return value. Result must be finalised with
   v256_sad_u16_sum().
   The result for more than 16 v256_sad_u16() calls is undefined. */
SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
                                             v256 b) {
  sad256_internal_u16 t;
  t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
  return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]);
}

typedef struct {
  ssd128_internal_s16 val[2];
} ssd256_internal_s16;

SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) {
  ssd256_internal_s16 t;
  t.val[1] = v128_ssd_s16_init();
  t.val[0] = v128_ssd_s16_init();
  return t;
}

/* Implementation dependent return value. Result must be finalised with
 * v256_ssd_s16_sum(). */
SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
                                             v256 b) {
  ssd256_internal_s16 t;
  t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
  return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]);
}

#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_