/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "config/aom_config.h"

#include "aom_dsp/simd/v64_intrinsics_c.h"

typedef union {
  uint8_t u8[16];
  uint16_t u16[8];
  uint32_t u32[4];
  uint64_t u64[2];
  int8_t s8[16];
  int16_t s16[8];
  int32_t s32[4];
  int64_t s64[2];
  c_v64 v64[2];
} c_v128;

SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; }

SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; }

SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; }

SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) {
  c_v128 t;
  t.u64[1] = hi;
  t.u64[0] = lo;
  return t;
}

SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) {
  c_v128 t;
  t.v64[1] = hi;
  t.v64[0] = lo;
  return t;
}

SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c,
                                  uint32_t d) {
  c_v128 t;
  t.u32[3] = a;
  t.u32[2] = b;
  t.u32[1] = c;
  t.u32[0] = d;
  return t;
}

SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) {
  c_v128 t;
  memcpy(&t, p, 16);
  return t;
}

SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 15) {
    fprintf(stderr, "Error: unaligned v128 load at %p\n", p);
    abort();
  }
  return c_v128_load_unaligned(p);
}

SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
  memcpy(p, &a, 16);
}

SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) {
  if (SIMD_CHECK && (uintptr_t)p & 15) {
    fprintf(stderr, "Error: unaligned v128 store at %p\n", p);
    abort();
  }
  c_v128_store_unaligned(p, a);
}

SIMD_INLINE c_v128 c_v128_zero(void) {
  c_v128 t;
  t.u64[1] = t.u64[0] = 0;
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) {
  c_v128 t;
  t.v64[1] = t.v64[0] = c_v64_dup_8(x);
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) {
  c_v128 t;
  t.v64[1] = t.v64[0] = c_v64_dup_16(x);
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) {
  c_v128 t;
  t.v64[1] = t.v64[0] = c_v64_dup_32(x);
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) {
  c_v128 t;
  t.u64[1] = t.u64[0] = x;
  return t;
}
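/* A minimal usage sketch (illustrative only, not part of the API): building
 * a vector with the constructors above and reading it back lane by lane.
 * Lane 0 is the least significant lane, so c_v128_from_32(a, b, c, d) puts
 * d in u32[0]. */
#if 0
static void v128_example_construct(void) {
  c_v128 v = c_v128_from_32(4u, 3u, 2u, 1u); /* u32 lanes: v.u32[0] == 1 */
  c_v128 w = c_v128_dup_16(0x00ffu);         /* eight u16 lanes of 0x00ff */
  uint32_t lo = c_v128_low_u32(v);           /* lo == 1 */
  c_v64 hi = c_v128_high_v64(v);             /* u32 lanes 2..3: {3, 4} */
  (void)w;
  (void)lo;
  (void)hi;
}
#endif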
SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) {
  return c_v64_dotp_su8(a.v64[1], b.v64[1]) +
         c_v64_dotp_su8(a.v64[0], b.v64[0]);
}

SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) {
  return c_v64_dotp_s16(a.v64[1], b.v64[1]) +
         c_v64_dotp_s16(a.v64[0], b.v64[0]);
}

SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) {
  // 32 bit products, 64 bit sum
  return (int64_t)(int32_t)((int64_t)a.s32[3] * b.s32[3]) +
         (int64_t)(int32_t)((int64_t)a.s32[2] * b.s32[2]) +
         (int64_t)(int32_t)((int64_t)a.s32[1] * b.s32[1]) +
         (int64_t)(int32_t)((int64_t)a.s32[0] * b.s32[0]);
}

SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) {
  return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
}

typedef struct {
  uint32_t val;
  int count;
} c_sad128_internal;

SIMD_INLINE c_sad128_internal c_v128_sad_u8_init(void) {
  c_sad128_internal t;
  t.val = t.count = 0;
  return t;
}

/* Implementation dependent return value. Result must be finalised with
 * v128_sad_u8_sum(). The result for more than 32 v128_sad_u8() calls is
 * undefined. */
SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a,
                                            c_v128 b) {
  int c;
  for (c = 0; c < 16; c++)
    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
  s.count++;
  if (SIMD_CHECK && s.count > 32) {
    fprintf(stderr,
            "Error: sad called more than 32 times, result undefined\n");
    abort();
  }
  return s;
}

SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s.val; }

typedef uint32_t c_ssd128_internal;

SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init(void) { return 0; }

/* Implementation dependent return value. Result must be finalised with
 * v128_ssd_u8_sum(). */
SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a,
                                            c_v128 b) {
  int c;
  for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
  return s;
}

SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; }

SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]),
                         c_v64_or(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]),
                         c_v64_xor(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]),
                         c_v64_and(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]),
                         c_v64_andn(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]),
                         c_v64_add_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]),
                         c_v64_add_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]),
                         c_v64_sadd_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]),
                         c_v64_sadd_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]),
                         c_v64_sadd_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]),
                         c_v64_add_32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) {
  // Two's complement overflow (silences sanitizers)
  return c_v128_from_64(
      a.v64[1].u64 > ~b.v64[1].u64 ? a.v64[1].u64 - ~b.v64[1].u64 - 1
                                   : a.v64[1].u64 + b.v64[1].u64,
      a.v64[0].u64 > ~b.v64[0].u64 ? a.v64[0].u64 - ~b.v64[0].u64 - 1
                                   : a.v64[0].u64 + b.v64[0].u64);
}
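/* A minimal usage sketch (illustrative only; the block/row names are
 * hypothetical): the init/accumulate/finalise protocol described in the
 * comments above, here summing the SAD of a 16-wide block row by row. */
#if 0
static uint32_t block_sad_16xh(const uint8_t *src, const uint8_t *ref,
                               int stride, int h) {
  c_sad128_internal acc = c_v128_sad_u8_init();
  for (int row = 0; row < h; row++) /* h must stay <= 32, per the note above */
    acc = c_v128_sad_u8(acc, c_v128_load_unaligned(src + row * stride),
                        c_v128_load_unaligned(ref + row * stride));
  return c_v128_sad_u8_sum(acc);
}
#endif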
SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
  c_v128 t;
  t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
  t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
  t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
  t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
  return t;
}

SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) {
  c_v128 t;
  t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1];
  t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3];
  t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5];
  t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7];
  t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9];
  t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11];
  t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13];
  t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15];
  return t;
}

SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
                         c_v64_sub_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]),
                         c_v64_ssub_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]),
                         c_v64_ssub_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]),
                         c_v64_sub_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]),
                         c_v64_ssub_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]),
                         c_v64_ssub_u16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]),
                         c_v64_sub_32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) {
  // Two's complement underflow (silences sanitizers)
  return c_v128_from_64(
      a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1
                                  : a.v64[1].u64 - b.v64[1].u64,
      a.v64[0].u64 < b.v64[0].u64 ? a.v64[0].u64 + ~b.v64[0].u64 + 1
                                  : a.v64[0].u64 - b.v64[0].u64);
}

SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) {
  return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) {
  c_v64 lo_bits = c_v64_mullo_s16(a, b);
  c_v64 hi_bits = c_v64_mulhi_s16(a, b);
  return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits),
                         c_v64_ziplo_16(hi_bits, lo_bits));
}

SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]),
                         c_v64_mullo_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]),
                         c_v64_mulhi_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]),
                         c_v64_mullo_s32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]),
                         c_v64_madd_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]),
                         c_v64_madd_us8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]),
                         c_v64_avg_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]),
                         c_v64_rdavg_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]),
                         c_v64_rdavg_u16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]),
                         c_v64_avg_u16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]),
                         c_v64_min_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]),
                         c_v64_max_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]),
                         c_v64_min_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) {
  return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
         ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
         ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
         ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
         ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
         ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
         ((a.s8[0] < 0) << 0);
}

SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) {
  c_v128 t;
  for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
  return t;
}
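/* A minimal usage sketch (illustrative only): c_v128_blend_8 selects per
 * byte on the sign bit of the control vector, and c_v128_movemask_8 packs
 * those same sign bits into a 16-bit scalar for branchy code. */
#if 0
static c_v128 v128_example_select(c_v128 a, c_v128 b, c_v128 control) {
  uint32_t bits = c_v128_movemask_8(control); /* one bit per byte lane */
  (void)bits;
  /* Bytes come from b wherever the control byte is negative, else from a. */
  return c_v128_blend_8(a, b, control);
}
#endif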
SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]),
                         c_v64_max_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]),
                         c_v64_min_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]),
                         c_v64_max_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c];
  return t;
}

SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c];
  return t;
}

SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]),
                         c_v64_ziplo_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]),
                         c_v64_ziplo_8(a.v64[1], b.v64[1]));
}

SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]),
                         c_v64_ziplo_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]),
                         c_v64_ziplo_16(a.v64[1], b.v64[1]));
}

SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]),
                         c_v64_ziplo_32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]),
                         c_v64_ziplo_32(a.v64[1], b.v64[1]));
}

SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) {
  return c_v128_from_v64(a.v64[0], b.v64[0]);
}

SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) {
  return c_v128_from_v64(a.v64[1], b.v64[1]);
}

SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) {
  return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b));
}

SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) {
  return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b));
}

SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) {
  return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b));
}

SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) {
  c_v128 t;
  if (mode) {
    t.u8[15] = b.u8[15];
    t.u8[14] = b.u8[13];
    t.u8[13] = b.u8[11];
    t.u8[12] = b.u8[9];
    t.u8[11] = b.u8[7];
    t.u8[10] = b.u8[5];
    t.u8[9] = b.u8[3];
    t.u8[8] = b.u8[1];
    t.u8[7] = a.u8[15];
    t.u8[6] = a.u8[13];
    t.u8[5] = a.u8[11];
    t.u8[4] = a.u8[9];
    t.u8[3] = a.u8[7];
    t.u8[2] = a.u8[5];
    t.u8[1] = a.u8[3];
    t.u8[0] = a.u8[1];
  } else {
    t.u8[15] = a.u8[14];
    t.u8[14] = a.u8[12];
    t.u8[13] = a.u8[10];
    t.u8[12] = a.u8[8];
    t.u8[11] = a.u8[6];
    t.u8[10] = a.u8[4];
    t.u8[9] = a.u8[2];
    t.u8[8] = a.u8[0];
    t.u8[7] = b.u8[14];
    t.u8[6] = b.u8[12];
    t.u8[5] = b.u8[10];
    t.u8[4] = b.u8[8];
    t.u8[3] = b.u8[6];
    t.u8[2] = b.u8[4];
    t.u8[1] = b.u8[2];
    t.u8[0] = b.u8[0];
  }
  return t;
}
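/* A minimal usage sketch (illustrative only): the zip operations interleave
 * the byte lanes of their two operands pairwise, ziplo from the low halves
 * and ziphi from the high halves, e.g. merging two chroma planes into an
 * interleaved layout. */
#if 0
static void v128_example_zip(c_v128 u_plane, c_v128 v_plane) {
  c_v128 uv_lo = c_v128_ziplo_8(v_plane, u_plane); /* interleaved low bytes */
  c_v128 uv_hi = c_v128_ziphi_8(v_plane, u_plane); /* interleaved high bytes */
  (void)uv_lo;
  (void)uv_hi;
}
#endif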
SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1)
                           : _c_v128_unzip_8(a, b, 0);
}

SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(b, a, 0)
                           : _c_v128_unzip_8(b, a, 1);
}

SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) {
  c_v128 t;
  if (mode) {
    t.u16[7] = b.u16[7];
    t.u16[6] = b.u16[5];
    t.u16[5] = b.u16[3];
    t.u16[4] = b.u16[1];
    t.u16[3] = a.u16[7];
    t.u16[2] = a.u16[5];
    t.u16[1] = a.u16[3];
    t.u16[0] = a.u16[1];
  } else {
    t.u16[7] = a.u16[6];
    t.u16[6] = a.u16[4];
    t.u16[5] = a.u16[2];
    t.u16[4] = a.u16[0];
    t.u16[3] = b.u16[6];
    t.u16[2] = b.u16[4];
    t.u16[1] = b.u16[2];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1)
                           : _c_v128_unzip_16(a, b, 0);
}

SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0)
                           : _c_v128_unzip_16(b, a, 1);
}

SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) {
  c_v128 t;
  if (mode) {
    t.u32[3] = b.u32[3];
    t.u32[2] = b.u32[1];
    t.u32[1] = a.u32[3];
    t.u32[0] = a.u32[1];
  } else {
    t.u32[3] = a.u32[2];
    t.u32[2] = a.u32[0];
    t.u32[1] = b.u32[2];
    t.u32[0] = b.u32[0];
  }
  return t;
}

SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1)
                           : _c_v128_unzip_32(a, b, 0);
}

SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0)
                           : _c_v128_unzip_32(b, a, 1);
}

SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a));
}

SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]),
                         c_v64_unpacklo_u8_s16(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]),
                         c_v64_unpacklo_u8_s16(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a));
}

SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]),
                         c_v64_unpacklo_s8_s16(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]),
                         c_v64_unpacklo_s8_s16(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]),
                         c_v64_pack_s32_s16(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]),
                         c_v64_pack_s32_u16(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]),
                         c_v64_pack_s16_u8(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]),
                         c_v64_pack_s16_s8(b.v64[1], b.v64[0]));
}
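/* A minimal usage sketch (illustrative only; the function name and offset
 * parameter are hypothetical): the common widen -> compute -> narrow
 * pattern, promoting bytes to 16 bits, adding an offset, then packing back
 * with unsigned saturation. Note pack_s16_u8 takes the high half first. */
#if 0
static c_v128 v128_example_add_offset(c_v128 pixels, int16_t offset) {
  c_v128 lo = c_v128_unpacklo_u8_s16(pixels); /* low 8 bytes -> 8 x s16 */
  c_v128 hi = c_v128_unpackhi_u8_s16(pixels); /* high 8 bytes -> 8 x s16 */
  c_v128 d = c_v128_dup_16((uint16_t)offset);
  lo = c_v128_add_16(lo, d);
  hi = c_v128_add_16(hi, d);
  return c_v128_pack_s16_u8(hi, lo); /* saturate each lane back to u8 */
}
#endif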
SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a));
}

SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a));
}

SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]),
                         c_v64_unpacklo_u16_s32(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]),
                         c_v64_unpacklo_s16_s32(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]),
                         c_v64_unpacklo_u16_s32(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]),
                         c_v64_unpacklo_s16_s32(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) {
  c_v128 t;
  int c;
  for (c = 0; c < 16; c++)
    t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15)
                                     : pattern.u8[c] & 15];

  return t;
}

SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]),
                         c_v64_cmpgt_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]),
                         c_v64_cmplt_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]),
                         c_v64_cmpeq_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]),
                         c_v64_cmpgt_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]),
                         c_v64_cmplt_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]),
                         c_v64_cmpeq_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]);
  return t;
}

SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]);
  return t;
}

SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]);
  return t;
}

SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
  if (n == 0) return a;
  if (n < 8)
    return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
                                    c_v64_shr_n_byte(a.v64[0], 8 - n)),
                           c_v64_shl_n_byte(a.v64[0], n));
  else
    return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero());
}

SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
  if (n == 0) return a;
  if (n < 8)
    return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
                           c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
                                    c_v64_shl_n_byte(a.v64[1], 8 - n)));
  else
    return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8));
}

SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) {
  if (SIMD_CHECK && c > 15) {
    fprintf(stderr, "Error: undefined alignment %u\n", c);
    abort();
  }
  return c ? c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c))
           : b;
}
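/* A minimal usage sketch (illustrative only; assumes a little-endian host
 * and a 16-byte-aligned buf): c_v128_align composes two registers into one,
 * the way a misaligned 16-byte window over two consecutive loads would:
 * bytes c..15 of b followed by bytes 0..c-1 of a. */
#if 0
static c_v128 v128_example_window(const uint8_t *buf) {
  c_v128 lo = c_v128_load_aligned(buf);
  c_v128 hi = c_v128_load_aligned(buf + 16);
  return c_v128_align(hi, lo, 3); /* the 16 bytes buf[3..18] */
}
#endif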
SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c),
                         c_v64_shr_u16(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c),
                         c_v64_shr_s16(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c),
                         c_v64_shr_u32(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c),
                         c_v64_shr_s32(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) {
  a.v64[1].u64 <<= c;
  a.v64[0].u64 <<= c;
  return c_v128_from_v64(a.v64[1], a.v64[0]);
}

SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) {
  a.v64[1].u64 >>= c;
  a.v64[0].u64 >>= c;
  return c_v128_from_v64(a.v64[1], a.v64[0]);
}

SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) {
  a.v64[1].s64 >>= c;
  a.v64[0].s64 >>= c;
  return c_v128_from_v64(a.v64[1], a.v64[0]);
}

SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) {
  return c_v128_shl_8(a, n);
}

SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) {
  return c_v128_shl_16(a, n);
}

SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) {
  return c_v128_shl_32(a, n);
}

SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) {
  return c_v128_shl_64(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) {
  return c_v128_shr_u8(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) {
  return c_v128_shr_u16(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) {
  return c_v128_shr_u32(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) {
  return c_v128_shr_u64(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) {
  return c_v128_shr_s8(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) {
  return c_v128_shr_s16(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) {
  return c_v128_shr_s32(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) {
  return c_v128_shr_s64(a, n);
}
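/* A minimal usage sketch (illustrative only): the _n_byte shifts above move
 * whole bytes across the 128-bit register, while the element shifts such as
 * shl_n_16 shift bits within each lane independently. */
#if 0
static void v128_example_shifts(void) {
  c_v128 v = c_v128_dup_16(0x0102u);
  c_v128 lanes = c_v128_shl_n_16(v, 4);   /* each u16 lane becomes 0x1020 */
  c_v128 bytes = c_v128_shl_n_byte(v, 2); /* moved up one u16 lane, lane 0
                                             zeroed */
  (void)lanes;
  (void)bytes;
}
#endif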
typedef uint32_t c_sad128_internal_u16;

SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init(void) { return 0; }

/* Implementation dependent return value. Result must be finalised with
 * v128_sad_u16_sum(). */
SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s,
                                                 c_v128 a, c_v128 b) {
  int c;
  for (c = 0; c < 8; c++)
    s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
  return s;
}

SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; }

typedef uint64_t c_ssd128_internal_s16;

SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init(void) { return 0; }

/* Implementation dependent return value. Result must be finalised with
 * v128_ssd_s16_sum(). */
SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s,
                                                 c_v128 a, c_v128 b) {
  int c;
  for (c = 0; c < 8; c++)
    s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
         (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
  return s;
}

SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; }

#endif  // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_