v64_intrinsics_c.h
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_

/* Note: This implements the intrinsics in plain, unoptimised C.
   Intended for reference, porting or debugging. */

#include <stdio.h>
#include <stdlib.h>

#include "config/aom_config.h"

typedef union {
  uint8_t u8[8];
  uint16_t u16[4];
  uint32_t u32[2];
  uint64_t u64;
  int8_t s8[8];
  int16_t s16[4];
  int32_t s32[2];
  int64_t s64;
} c_v64;
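/* A minimal usage sketch (illustrative only; every function referenced here
   is defined further down in this file):

     uint8_t src[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
     uint8_t dst[8];
     c_v64 a = c_v64_load_unaligned(src);            // load 8 bytes
     c_v64 b = c_v64_dup_8(10);                      // broadcast 10 to all lanes
     c_v64_store_unaligned(dst, c_v64_add_8(a, b));  // dst[i] == src[i] + 10
*/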
SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) {
  return a.u32[!!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) {
  return a.u32[!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) {
  return a.s32[!!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
  return a.s32[!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) {
  c_v64 t;
  t.u32[!CONFIG_BIG_ENDIAN] = x;
  t.u32[!!CONFIG_BIG_ENDIAN] = y;
  return t;
}

SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) {
  c_v64 t;
  t.u64 = x;
  return t;
}

SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; }

SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c,
                                uint16_t d) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    t.u16[0] = a;
    t.u16[1] = b;
    t.u16[2] = c;
    t.u16[3] = d;
  } else {
    t.u16[3] = a;
    t.u16[2] = b;
    t.u16[1] = c;
    t.u16[0] = d;
  }
  return t;
}

SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) {
  uint32_t t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 4; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) {
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&a;
  int c;
  for (c = 0; c < 4; c++) pp[c] = q[c];
}

SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 3) {
    fprintf(stderr, "Error: Unaligned u32 load at %p\n", p);
    abort();
  }
  return c_u32_load_unaligned(p);
}

SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) {
  if (SIMD_CHECK && (uintptr_t)p & 3) {
    fprintf(stderr, "Error: Unaligned u32 store at %p\n", p);
    abort();
  }
  c_u32_store_unaligned(p, a);
}

SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) {
  c_v64 t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 8; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 7) {
    fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p);
    abort();
  }
  return c_v64_load_unaligned(p);
}

SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) {
  uint8_t *q = (uint8_t *)p;
  uint8_t *r = (uint8_t *)&a;
  int c;
  for (c = 0; c < 8; c++) q[c] = r[c];
}

SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) {
  if (SIMD_CHECK && (uintptr_t)p & 7) {
    fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p);
    abort();
  }
  c_v64_store_unaligned(p, a);
}

SIMD_INLINE c_v64 c_v64_zero(void) {
  c_v64 t;
  t.u64 = 0;
  return t;
}

SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) {
  c_v64 t;
  t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] =
      t.u8[7] = x;
  return t;
}

SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) {
  c_v64 t;
  t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x;
  return t;
}

SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) {
  c_v64 t;
  t.u32[0] = t.u32[1] = x;
  return t;
}

SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] + b.u8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] + b.u16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_sadd_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++)
    t.u8[c] = SIMD_CLAMP((int16_t)a.u8[c] + (int16_t)b.u8[c], 0, 255);
  return t;
}

SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++)
    t.s8[c] = SIMD_CLAMP((int16_t)a.s8[c] + (int16_t)b.s8[c], -128, 127);
  return t;
}

SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.s16[c] = SIMD_CLAMP((int32_t)a.s16[c] + (int32_t)b.s16[c], -32768, 32767);
  return t;
}

SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]);
  t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]);
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] - b.u8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] < b.u8[c] ? 0 : a.u8[c] - b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) {
    int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c];
    t.s8[c] = SIMD_CLAMP(d, -128, 127);
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] - b.u16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.s16[c] = SIMD_CLAMP((int32_t)a.s16[c] - (int32_t)b.s16[c], -32768, 32767);
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.u16[c] =
        (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 0 : a.u16[c] - b.u16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]);
  t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]);
  return t;
}

SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.u16[c] = (uint16_t)((int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++)
    t.u8[c] = (uint8_t)((int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c]);
  return t;
}

SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u8[7] = a.u8[7];
    t.u8[6] = b.u8[7];
    t.u8[5] = a.u8[6];
    t.u8[4] = b.u8[6];
    t.u8[3] = a.u8[5];
    t.u8[2] = b.u8[5];
    t.u8[1] = a.u8[4];
    t.u8[0] = b.u8[4];
  } else {
    t.u8[7] = a.u8[3];
    t.u8[6] = b.u8[3];
    t.u8[5] = a.u8[2];
    t.u8[4] = b.u8[2];
    t.u8[3] = a.u8[1];
    t.u8[2] = b.u8[1];
    t.u8[1] = a.u8[0];
    t.u8[0] = b.u8[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u16[3] = a.u16[3];
    t.u16[2] = b.u16[3];
    t.u16[1] = a.u16[2];
    t.u16[0] = b.u16[2];
  } else {
    t.u16[3] = a.u16[1];
    t.u16[2] = b.u16[1];
    t.u16[1] = a.u16[0];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u32[1] = a.u32[1];
    t.u32[0] = b.u32[1];
  } else {
    t.u32[1] = a.u32[0];
    t.u32[0] = b.u32[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u8[7] = b.u8[7];
    t.u8[6] = b.u8[5];
    t.u8[5] = b.u8[3];
    t.u8[4] = b.u8[1];
    t.u8[3] = a.u8[7];
    t.u8[2] = a.u8[5];
    t.u8[1] = a.u8[3];
    t.u8[0] = a.u8[1];
  } else {
    t.u8[7] = a.u8[6];
    t.u8[6] = a.u8[4];
    t.u8[5] = a.u8[2];
    t.u8[4] = a.u8[0];
    t.u8[3] = b.u8[6];
    t.u8[2] = b.u8[4];
    t.u8[1] = b.u8[2];
    t.u8[0] = b.u8[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1);
}

SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u16[3] = b.u16[3];
    t.u16[2] = b.u16[1];
    t.u16[1] = a.u16[3];
    t.u16[0] = a.u16[1];
  } else {
    t.u16[3] = a.u16[2];
    t.u16[2] = a.u16[0];
    t.u16[1] = b.u16[2];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1)
                           : _c_v64_unzip_16(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(b, a, 0)
                           : _c_v64_unzip_16(b, a, 1);
}

SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.u8[3 + endian];
  t.s16[2] = (int16_t)a.u8[2 + endian];
  t.s16[1] = (int16_t)a.u8[1 + endian];
  t.s16[0] = (int16_t)a.u8[0 + endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.u8[7 - endian];
  t.s16[2] = (int16_t)a.u8[6 - endian];
  t.s16[1] = (int16_t)a.u8[5 - endian];
  t.s16[0] = (int16_t)a.u8[4 - endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.s8[3 + endian];
  t.s16[2] = (int16_t)a.s8[2 + endian];
  t.s16[1] = (int16_t)a.s8[1 + endian];
  t.s16[0] = (int16_t)a.s8[0 + endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.s8[7 - endian];
  t.s16[2] = (int16_t)a.s8[6 - endian];
  t.s16[1] = (int16_t)a.s8[5 - endian];
  t.s16[0] = (int16_t)a.s8[4 - endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.s16[3] = SIMD_CLAMP(a.s32[1], -32768, 32767);
  t.s16[2] = SIMD_CLAMP(a.s32[0], -32768, 32767);
  t.s16[1] = SIMD_CLAMP(b.s32[1], -32768, 32767);
  t.s16[0] = SIMD_CLAMP(b.s32[0], -32768, 32767);
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.u16[3] = SIMD_CLAMP(a.s32[1], 0, 65535);
  t.u16[2] = SIMD_CLAMP(a.s32[0], 0, 65535);
  t.u16[1] = SIMD_CLAMP(b.s32[1], 0, 65535);
  t.u16[0] = SIMD_CLAMP(b.s32[0], 0, 65535);
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.u8[7] = SIMD_CLAMP(a.s16[3], 0, 255);
  t.u8[6] = SIMD_CLAMP(a.s16[2], 0, 255);
  t.u8[5] = SIMD_CLAMP(a.s16[1], 0, 255);
  t.u8[4] = SIMD_CLAMP(a.s16[0], 0, 255);
  t.u8[3] = SIMD_CLAMP(b.s16[3], 0, 255);
  t.u8[2] = SIMD_CLAMP(b.s16[2], 0, 255);
  t.u8[1] = SIMD_CLAMP(b.s16[1], 0, 255);
  t.u8[0] = SIMD_CLAMP(b.s16[0], 0, 255);
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.s8[7] = SIMD_CLAMP(a.s16[3], -128, 127);
  t.s8[6] = SIMD_CLAMP(a.s16[2], -128, 127);
  t.s8[5] = SIMD_CLAMP(a.s16[1], -128, 127);
  t.s8[4] = SIMD_CLAMP(a.s16[0], -128, 127);
  t.s8[3] = SIMD_CLAMP(b.s16[3], -128, 127);
  t.s8[2] = SIMD_CLAMP(b.s16[2], -128, 127);
  t.s8[1] = SIMD_CLAMP(b.s16[1], -128, 127);
  t.s8[0] = SIMD_CLAMP(b.s16[0], -128, 127);
  return t;
}

SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2];
  return t;
}
SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) {
    if (SIMD_CHECK && (pattern.u8[c] & ~7)) {
      fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n",
              pattern.u8[c], c);
      abort();
    }
    t.u8[c] =
        a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7];
  }
  return t;
}

SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) {
  return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] +
         a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] +
         a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0];
}

SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) {
  return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) +
         (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]);
}

SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) {
  return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] +
         a.u8[0];
}

SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) {
  return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0];
}

typedef struct {
  uint32_t val;
  int count;
} c_sad64_internal;

SIMD_INLINE c_sad64_internal c_v64_sad_u8_init(void) {
  c_sad64_internal t;
  t.val = t.count = 0;
  return t;
}

/* Implementation dependent return value. Result must be finalised with
   v64_sad_u8_sum(). The result for more than 32 v64_sad_u8() calls is
   undefined. */
SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a,
                                          c_v64 b) {
  int c;
  for (c = 0; c < 8; c++)
    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
  s.count++;
  if (SIMD_CHECK && s.count > 32) {
    fprintf(stderr,
            "Error: sad called more than 32 times returning an undefined "
            "result\n");
    abort();
  }
  return s;
}

SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s.val; }
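/* Usage sketch for the SAD accumulator above (illustrative only): create the
   state with c_v64_sad_u8_init(), accumulate with at most 32 c_v64_sad_u8()
   calls, then read the total with c_v64_sad_u8_sum(). Here row_a and row_b
   are hypothetical uint8_t pointers to the two blocks being compared:

     c_sad64_internal s = c_v64_sad_u8_init();
     s = c_v64_sad_u8(s, c_v64_load_unaligned(row_a),
                      c_v64_load_unaligned(row_b));
     s = c_v64_sad_u8(s, c_v64_load_unaligned(row_a + 8),
                      c_v64_load_unaligned(row_b + 8));
     uint32_t sad = c_v64_sad_u8_sum(s);  // sum of absolute byte differences
*/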
typedef uint32_t c_ssd64_internal;

/* Implementation dependent return value. Result must be finalised with
 * v64_ssd_u8_sum(). */
SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init(void) { return 0; }

SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a,
                                          c_v64 b) {
  int c;
  for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
  return s;
}

SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; }

SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 | b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 ^ b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 & b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 & ~b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16;
  return t;
}

SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]);
  t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]);
  return t;
}

SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1];
  t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3];
  return t;
}

SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) {
  c_v64 t;
  int32_t u;
  u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1];
  t.s16[0] = SIMD_CLAMP(u, -32768, 32767);
  u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3];
  t.s16[1] = SIMD_CLAMP(u, -32768, 32767);
  u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5];
  t.s16[2] = SIMD_CLAMP(u, -32768, 32767);
  u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7];
  t.s16[3] = SIMD_CLAMP(u, -32768, 32767);
  return t;
}

SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c]) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? a.s16[c] : b.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 7) {
    fprintf(stderr, "Error: Undefined u8 shift left %d\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.s8[c] = (int8_t)(a.u8[c] << n);
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 7) {
    fprintf(stderr, "Error: Undefined u8 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 7) {
    fprintf(stderr, "Error: Undefined s8 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 15) {
    fprintf(stderr, "Error: Undefined u16 shift left %d\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] << n);
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 15) {
    fprintf(stderr, "Error: Undefined u16 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 15) {
    fprintf(stderr, "Error: undefined s16 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (SIMD_CHECK && n > 31) {
    fprintf(stderr, "Error: undefined u32 shift left %d\n", n);
    abort();
  }
  t.u32[1] = a.u32[1] << n;
  t.u32[0] = a.u32[0] << n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (SIMD_CHECK && n > 31) {
    fprintf(stderr, "Error: undefined u32 shift right %d\n", n);
    abort();
  }
  t.u32[1] = a.u32[1] >> n;
  t.u32[0] = a.u32[0] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (SIMD_CHECK && n > 31) {
    fprintf(stderr, "Error: undefined s32 shift right %d\n", n);
    abort();
  }
  t.s32[1] = a.s32[1] >> n;
  t.s32[0] = a.s32[0] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) {
  c_v64 t;
  t.u64 = x.u64 >> i * 8;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) {
  c_v64 t;
  t.u64 = x.u64 << i * 8;
  return t;
}

SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) {
  if (SIMD_CHECK && c > 7) {
    fprintf(stderr, "Error: undefined alignment %d\n", c);
    abort();
  }
  return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b;
}

SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) {
  return c_v64_shl_8(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) {
  return c_v64_shr_u8(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) {
  return c_v64_shr_s8(a, c);
}

SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) {
  return c_v64_shl_16(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) {
  return c_v64_shr_u16(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) {
  return c_v64_shr_s16(a, c);
}

SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) {
  return c_v64_shl_32(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) {
  return c_v64_shr_u32(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) {
  return c_v64_shr_s32(a, c);
}

#endif  // AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_