/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "config/aom_config.h"

#include "aom_dsp/simd/v128_intrinsics_c.h"

typedef union {
  uint8_t u8[32];
  uint16_t u16[16];
  uint32_t u32[8];
  uint64_t u64[4];
  int8_t s8[32];
  int16_t s16[16];
  int32_t s32[8];
  int64_t s64[4];
  c_v64 v64[4];
  c_v128 v128[2];
} c_v256;

SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }

SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }

SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; }

SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }

SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }

SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) {
  c_v256 t;
  t.v128[1] = hi;
  t.v128[0] = lo;
  return t;
}

/* Arguments are given from the most significant element down: a lands in
   u64[3] and d in u64[0]. */
SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c,
                                  uint64_t d) {
  c_v256 t;
  t.u64[3] = a;
  t.u64[2] = b;
  t.u64[1] = c;
  t.u64[0] = d;
  return t;
}

SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) {
  c_v256 t;
  t.u64[3] = a.u64;
  t.u64[2] = b.u64;
  t.u64[1] = c.u64;
  t.u64[0] = d.u64;
  return t;
}

SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) {
  c_v256 t;
  memcpy(&t, p, 32);
  return t;
}

SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 31) {
    fprintf(stderr, "Error: unaligned v256 load at %p\n", p);
    abort();
  }
  return c_v256_load_unaligned(p);
}

SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) {
  memcpy(p, &a, 32);
}

SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {
  if (SIMD_CHECK && (uintptr_t)p & 31) {
    fprintf(stderr, "Error: unaligned v256 store at %p\n", p);
    abort();
  }
  c_v256_store_unaligned(p, a);
}

SIMD_INLINE c_v256 c_v256_zero(void) {
  c_v256 t;
  t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0;
  return t;
}

SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) {
  c_v256 t;
  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x);
  return t;
}

SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) {
  c_v256 t;
  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x);
  return t;
}

SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
  c_v256 t;
  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x);
  return t;
}

SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) {
  c_v256 t;
  t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = x;
  return t;
}

SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) {
  return c_v128_dotp_su8(a.v128[1], b.v128[1]) +
         c_v128_dotp_su8(a.v128[0], b.v128[0]);
}

SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
  return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
         c_v128_dotp_s16(a.v128[0], b.v128[0]);
}

SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) {
  return c_v128_dotp_s32(a.v128[1], b.v128[1]) +
         c_v128_dotp_s32(a.v128[0], b.v128[0]);
}

SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
  return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
}

typedef struct {
  uint32_t val;
  int count;
} c_sad256_internal;

SIMD_INLINE c_sad256_internal c_v256_sad_u8_init(void) {
  c_sad256_internal t;
  t.val = t.count = 0;
  return t;
}

/* Implementation dependent return value. Result must be finalised with
   v256_sad_u8_sum().
   The result for more than 16 v256_sad_u8() calls is undefined. */
SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
                                            c_v256 b) {
  int c;
  for (c = 0; c < 32; c++)
    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
  s.count++;
  if (SIMD_CHECK && s.count > 16) {
    fprintf(stderr,
            "Error: sad called more than 16 times returning an undefined "
            "result\n");
    abort();
  }
  return s;
}

SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s.val; }
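
/* Illustrative sketch (not part of the intrinsics API): the accumulator
   protocol above is init -> up to 16 accumulate calls -> sum. The helper
   name and the 32-byte row layout here are hypothetical. */
SIMD_INLINE uint32_t c_v256_example_sad_rows(const uint8_t *src,
                                             const uint8_t *ref, int rows) {
  c_sad256_internal acc = c_v256_sad_u8_init();
  for (int r = 0; r < rows; r++) /* rows must not exceed 16 */
    acc = c_v256_sad_u8(acc, c_v256_load_unaligned(src + r * 32),
                        c_v256_load_unaligned(ref + r * 32));
  return c_v256_sad_u8_sum(acc);
}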

typedef uint32_t c_ssd256_internal;

SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init(void) { return 0; }

/* Implementation dependent return value. Result must be finalised with
 * v256_ssd_u8_sum(). */
SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a,
                                            c_v256 b) {
  int c;
  for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
  return s;
}

SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; }

SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]),
                          c_v128_or(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]),
                          c_v128_xor(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]),
                          c_v128_and(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]),
                          c_v128_andn(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]),
                          c_v128_add_8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]),
                          c_v128_add_16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]),
                          c_v128_sadd_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]),
                          c_v128_sadd_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]),
                          c_v128_sadd_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]),
                          c_v128_add_32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]),
                          c_v128_add_64(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]),
                          c_v128_sub_64(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) {
  c_v256 t;
  for (int i = 0; i < 16; i++)
    t.u16[i] = (uint16_t)a.u8[i * 2] + (uint16_t)a.u8[i * 2 + 1];
  return t;
}

SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) {
  c_v256 t;
  t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
  t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
  t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
  t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
  t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9];
  t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11];
  t.s32[6] = (int32_t)a.s16[12] + (int32_t)a.s16[13];
  t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15];
  return t;
}

SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]),
                          c_v128_sub_8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]),
                          c_v128_ssub_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]),
                          c_v128_ssub_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]),
                          c_v128_sub_16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]),
                          c_v128_ssub_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]),
                          c_v128_ssub_u16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]),
                          c_v128_sub_32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) {
  return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0]));
}

/* Widening multiply: returns the eight full 32-bit products of the 16-bit
   lanes of a and b. */
SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) {
  c_v128 lo_bits = c_v128_mullo_s16(a, b);
  c_v128 hi_bits = c_v128_mulhi_s16(a, b);
  return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits),
                          c_v128_ziplo_16(hi_bits, lo_bits));
}
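
/* Hypothetical self-check sketch (not part of the API): assuming
   little-endian lane order, lane i of the widening multiply equals the
   scalar product of the matching 16-bit lanes. */
SIMD_INLINE int c_v256_example_mul_s16_check(c_v128 a, c_v128 b) {
  c_v256 p = c_v256_mul_s16(a, b);
  for (int i = 0; i < 8; i++)
    if (p.s32[i] != (int32_t)a.s16[i] * b.s16[i]) return 0;
  return 1;
}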

SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]),
                          c_v128_mullo_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]),
                          c_v128_mulhi_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]),
                          c_v128_mullo_s32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]),
                          c_v128_madd_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]),
                          c_v128_madd_us8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]),
                          c_v128_avg_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]),
                          c_v128_rdavg_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]),
                          c_v128_rdavg_u16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]),
                          c_v128_avg_u16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]),
                          c_v128_min_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]),
                          c_v128_max_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]),
                          c_v128_min_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) {
  return ((uint32_t)(a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) |
         ((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) |
         ((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) |
         ((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) |
         ((a.s8[23] < 0) << 23) | ((a.s8[22] < 0) << 22) |
         ((a.s8[21] < 0) << 21) | ((a.s8[20] < 0) << 20) |
         ((a.s8[19] < 0) << 19) | ((a.s8[18] < 0) << 18) |
         ((a.s8[17] < 0) << 17) | ((a.s8[16] < 0) << 16) |
         ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
         ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
         ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
         ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
         ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
         ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
         ((a.s8[0] < 0) << 0);
}

SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) {
  c_v256 t;
  for (int i = 0; i < 32; i++) t.u8[i] = c.s8[i] < 0 ?
b.u8[i] : a.u8[i];
  return t;
}

SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]),
                          c_v128_max_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]),
                          c_v128_min_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]),
                          c_v128_max_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]),
                          c_v128_min_s32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]),
                          c_v128_max_s32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]),
                          c_v128_ziplo_8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]),
                          c_v128_ziplo_8(a.v128[1], b.v128[1]));
}

SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]),
                          c_v128_ziplo_16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]),
                          c_v128_ziplo_16(a.v128[1], b.v128[1]));
}

SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]),
                          c_v128_ziplo_32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]),
                          c_v128_ziplo_32(a.v128[1], b.v128[1]));
}

SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]),
                          c_v128_ziplo_64(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]),
                          c_v128_ziplo_64(a.v128[1], b.v128[1]));
}

SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) {
  return c_v256_from_v128(a.v128[0], b.v128[0]);
}

SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) {
  return c_v256_from_v128(a.v128[1], b.v128[1]);
}

SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) {
  return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b));
}

SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) {
  return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b));
}

SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) {
  return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b));
}

SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) {
  c_v256 t;
  int i;
  if (mode) {
    for (i = 0; i < 16; i++) {
      t.u8[i] = a.u8[i * 2 + 1];
      t.u8[i + 16] = b.u8[i * 2 + 1];
    }
  } else {
    for (i = 0; i < 16; i++) {
      t.u8[i] = b.u8[i * 2];
      t.u8[i + 16] = a.u8[i * 2];
    }
  }
  return t;
}

SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1)
                           : _c_v256_unzip_8(a, b, 0);
}

SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0)
                           : _c_v256_unzip_8(b, a, 1);
}
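
/* Hypothetical round-trip sketch (not part of the API): zipping two vectors
   and unzipping the result restores the inputs, assuming little-endian lane
   order. ziplo_8(a, b) interleaves as b[0], a[0], b[1], a[1], ... so the
   even bytes recover b and the odd bytes recover a. */
SIMD_INLINE int c_v256_example_zip_unzip_roundtrip(c_v256 a, c_v256 b) {
  c_v256 lo = c_v256_ziplo_8(a, b);
  c_v256 hi = c_v256_ziphi_8(a, b);
  c_v256 evens = c_v256_unziplo_8(hi, lo);
  c_v256 odds = c_v256_unziphi_8(hi, lo);
  return !memcmp(&evens, &b, 32) && !memcmp(&odds, &a, 32);
}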

SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) {
  c_v256 t;
  int i;
  if (mode) {
    for (i = 0; i < 8; i++) {
      t.u16[i] = a.u16[i * 2 + 1];
      t.u16[i + 8] = b.u16[i * 2 + 1];
    }
  } else {
    for (i = 0; i < 8; i++) {
      t.u16[i] = b.u16[i * 2];
      t.u16[i + 8] = a.u16[i * 2];
    }
  }
  return t;
}

SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1)
                           : _c_v256_unzip_16(a, b, 0);
}

SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0)
                           : _c_v256_unzip_16(b, a, 1);
}

SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) {
  c_v256 t;
  if (mode) {
    t.u32[7] = b.u32[7];
    t.u32[6] = b.u32[5];
    t.u32[5] = b.u32[3];
    t.u32[4] = b.u32[1];
    t.u32[3] = a.u32[7];
    t.u32[2] = a.u32[5];
    t.u32[1] = a.u32[3];
    t.u32[0] = a.u32[1];
  } else {
    t.u32[7] = a.u32[6];
    t.u32[6] = a.u32[4];
    t.u32[5] = a.u32[2];
    t.u32[4] = a.u32[0];
    t.u32[3] = b.u32[6];
    t.u32[2] = b.u32[4];
    t.u32[1] = b.u32[2];
    t.u32[0] = b.u32[0];
  }
  return t;
}

SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1)
                           : _c_v256_unzip_32(a, b, 0);
}

SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0)
                           : _c_v256_unzip_32(b, a, 1);
}

SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) {
  c_v256 t;
  if (mode) {
    t.u64[3] = b.u64[3];
    t.u64[2] = b.u64[1];
    t.u64[1] = a.u64[3];
    t.u64[0] = a.u64[1];
  } else {
    t.u64[3] = a.u64[2];
    t.u64[2] = a.u64[0];
    t.u64[1] = b.u64[2];
    t.u64[0] = b.u64[0];
  }
  return t;
}

SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(a, b, 1)
                           : _c_v256_unzip_64(a, b, 0);
}

SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(b, a, 0)
                           : _c_v256_unzip_64(b, a, 1);
}
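
/* Hypothetical example (not part of the API): a pattern vector of 31 - i
   reverses all 32 bytes via c_v256_shuffle_8 below, assuming little-endian
   lane order. */
SIMD_INLINE c_v256 c_v256_example_reverse_bytes(c_v256 a) {
  c_v256 pattern;
  for (int i = 0; i < 32; i++) pattern.u8[i] = (uint8_t)(31 - i);
  return c_v256_shuffle_8(a, pattern);
}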

SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
}

SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]),
                          c_v128_unpacklo_u8_s16(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]),
                          c_v128_unpacklo_u8_s16(a.v128[1]));
}

SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) {
  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a), c_v128_unpacklo_s8_s16(a));
}

SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[0]),
                          c_v128_unpacklo_s8_s16(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[1]),
                          c_v128_unpacklo_s8_s16(a.v128[1]));
}

SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]),
                          c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]),
                          c_v128_pack_s32_u16(b.v128[1], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
                          c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]),
                          c_v128_pack_s16_s8(b.v128[1], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) {
  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a),
                          c_v128_unpacklo_u16_s32(a));
}

SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) {
  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a),
                          c_v128_unpacklo_s16_s32(a));
}

SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]),
                          c_v128_unpacklo_u16_s32(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]),
                          c_v128_unpacklo_s16_s32(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]),
                          c_v128_unpacklo_u16_s32(a.v128[1]));
}

SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]),
                          c_v128_unpacklo_s16_s32(a.v128[1]));
}

SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
  c_v256 t;
  int c;
  for (c = 0; c < 32; c++)
    t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
                                     : pattern.u8[c] & 31];

  return t;
}

SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) {
  c_v256 t;
  int c;
  for (c = 0; c < 32; c++)
    t.u8[c] = (pattern.u8[c] < 32
                   ? b.u8
                   : a.u8)[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
                                             : pattern.u8[c] & 31];
  return t;
}

// Pairwise / dual-lane shuffle: shuffle two 128 bit lanes.
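/* Hypothetical sketch (not part of the API, placed before the pairwise
   shuffle for the compare/blend primitives defined elsewhere in this file):
   see c_v256_example_select_gt_s16 further below, after the compare
   operations it depends on. */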
SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) {
  return c_v256_from_v128(
      c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)),
      c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern)));
}

SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]),
                          c_v128_cmpgt_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]),
                          c_v128_cmplt_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]),
                          c_v128_cmpeq_8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]),
                          c_v128_cmpgt_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]),
                          c_v128_cmplt_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]),
                          c_v128_cmpeq_16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]),
                          c_v128_cmpgt_s32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]),
                          c_v128_cmplt_s32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]),
                          c_v128_cmpeq_32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) {
  if (n == 0) return a;
  if (n < 16)
    return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
                                      c_v128_shr_n_byte(a.v128[0], 16 - n)),
                            c_v128_shl_n_byte(a.v128[0], n));
  else if (n > 16)
    return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16),
                            c_v128_zero());
  else
    return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero());
}

SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) {
  if (n == 0) return a;
  if (n < 16)
    return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n),
                            c_v128_or(c_v128_shr_n_byte(a.v128[0], n),
                                      c_v128_shl_n_byte(a.v128[1], 16 - n)));
  else if (n > 16)
    return c_v256_from_v128(c_v128_zero(),
                            c_v128_shr_n_byte(a.v128[1], n - 16));
  else
    return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a));
}

SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, unsigned int c) {
  if (SIMD_CHECK && c > 31) {
    fprintf(stderr, "Error: undefined alignment %d\n", c);
    abort();
  }
  return c ?
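/* (Continuing the note above:) a hypothetical per-lane 16-bit select built
   from the compare and blend primitives. Lanes where a > b take a, the rest
   take b; this is equivalent to c_v256_max_s16 and is shown only to
   illustrate how all-ones compare masks drive c_v256_blend_8.

SIMD_INLINE c_v256 c_v256_example_select_gt_s16(c_v256 a, c_v256 b) {
  c_v256 mask = c_v256_cmpgt_s16(a, b);  // 0xffff where a > b, else 0
  // blend_8 picks its second argument where the mask byte is negative.
  return c_v256_blend_8(b, a, mask);
}
*/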
c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c))
           : b;
}

SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shl_8(a.v128[1], c),
                          c_v128_shl_8(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c),
                          c_v128_shr_u8(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c),
                          c_v128_shr_s8(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shl_16(a.v128[1], c),
                          c_v128_shl_16(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c),
                          c_v128_shr_u16(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c),
                          c_v128_shr_s16(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shl_32(a.v128[1], c),
                          c_v128_shl_32(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c),
                          c_v128_shr_u32(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c),
                          c_v128_shr_s32(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) {
  c_v256 t;
  if (SIMD_CHECK && n > 63) {
    fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
    abort();
  }
  t.s64[3] = a.s64[3] >> n;
  t.s64[2] = a.s64[2] >> n;
  t.s64[1] = a.s64[1] >> n;
  t.s64[0] = a.s64[0] >> n;
  return t;
}

SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) {
  c_v256 t;
  if (SIMD_CHECK && n > 63) {
    fprintf(stderr, "Error: undefined u64 shift right %d\n", n);
    abort();
  }
  t.u64[3] = a.u64[3] >> n;
  t.u64[2] = a.u64[2] >> n;
  t.u64[1] = a.u64[1] >> n;
  t.u64[0] = a.u64[0] >> n;
  return t;
}

SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) {
  c_v256 t;
  if (SIMD_CHECK && n > 63) {
    fprintf(stderr, "Error: undefined u64 shift left %d\n", n);
    abort();
  }
  t.u64[3] = a.u64[3] << n;
  t.u64[2] = a.u64[2] << n;
  t.u64[1] = a.u64[1] << n;
  t.u64[0] = a.u64[0] << n;
  return t;
}

SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) {
  return c_v256_shl_8(a, n);
}

SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, unsigned int n) {
  return c_v256_shl_16(a, n);
}

SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) {
  return c_v256_shl_32(a, n);
}

SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) {
  return c_v256_shl_64(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) {
  return c_v256_shr_u8(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, unsigned int n) {
  return c_v256_shr_u16(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) {
  return c_v256_shr_u32(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) {
  return c_v256_shr_u64(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) {
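/* Hypothetical sketch (not part of the API): for 0 <= shift < 32,
   c_v256_align(hi, lo, shift) extracts bytes shift..shift+31 of the 64-byte
   concatenation hi:lo, so two loads plus an align emulate an unaligned load
   at p + shift on a little-endian target.

SIMD_INLINE c_v256 c_v256_example_sliding_window(const uint8_t *p,
                                                 unsigned int shift) {
  c_v256 lo = c_v256_load_unaligned(p);       // bytes 0..31
  c_v256 hi = c_v256_load_unaligned(p + 32);  // bytes 32..63
  return c_v256_align(hi, lo, shift);
}
*/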
  return c_v256_shr_s8(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, unsigned int n) {
  return c_v256_shr_s16(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) {
  return c_v256_shr_s32(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a, unsigned int n) {
  return c_v256_shr_s64(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) {
  return c_v256_shr_n_byte(a, 2 * n);
}
SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned int n) {
  return c_v256_shl_n_byte(a, 2 * n);
}

typedef uint32_t c_sad256_internal_u16;

SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init(void) { return 0; }

/* Implementation dependent return value. Result must be finalised with
   v256_sad_u16_sum(). */
SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s,
                                                 c_v256 a, c_v256 b) {
  int c;
  for (c = 0; c < 16; c++)
    s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
  return s;
}

SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; }

typedef uint64_t c_ssd256_internal_s16;

SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init(void) { return 0; }

/* Implementation dependent return value. Result must be finalised with
 * v256_ssd_s16_sum(). */
SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s,
                                                 c_v256 a, c_v256 b) {
  int c;
  for (c = 0; c < 16; c++)
    s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
         (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
  return s;
}

SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; }

#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_