mem_neon.h (49354B)
1 /* 2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_ 13 #define AOM_AOM_DSP_ARM_MEM_NEON_H_ 14 15 #include <arm_neon.h> 16 #include <string.h> 17 #include "aom_dsp/aom_dsp_common.h" 18 19 #if defined(__arm__) || defined(_M_ARM) 20 #define ARM_32_BIT 21 #endif 22 23 // DEFICIENT_CLANG_32_BIT includes clang-cl. 24 #if defined(__clang__) && defined(ARM_32_BIT) && \ 25 (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7)) 26 #define DEFICIENT_CLANG_32_BIT 27 #endif 28 29 #if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT) && \ 30 __GNUC__ < 14 31 #define DEFICIENT_GCC_32_BIT 32 #endif 33 34 // Support for xN Neon intrinsics is lacking in some compilers. 
// Fallback implementations of the xN Neon load/store intrinsics for the
// deficient 32-bit toolchains detected above. Each helper decomposes the
// wide operation into consecutive single-vector loads/stores.
#if defined(DEFICIENT_CLANG_32_BIT) || defined(DEFICIENT_GCC_32_BIT)

// Load 3 consecutive 16-byte vectors.
static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res;
  res.val[0] = vld1q_u8(ptr);
  res.val[1] = vld1q_u8(ptr + 16);
  res.val[2] = vld1q_u8(ptr + 32);
  return res;
}

// Load 2 consecutive 16-byte vectors.
static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res;
  res.val[0] = vld1q_u8(ptr);
  res.val[1] = vld1q_u8(ptr + 16);
  return res;
}

// Load 2 consecutive 8-element uint16 vectors.
static inline uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
  uint16x8x2_t res;
  res.val[0] = vld1q_u16(ptr);
  res.val[1] = vld1q_u16(ptr + 8);
  return res;
}

// Load 4 consecutive 8-element uint16 vectors.
static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res;
  res.val[0] = vld1q_u16(ptr);
  res.val[1] = vld1q_u16(ptr + 8);
  res.val[2] = vld1q_u16(ptr + 16);
  res.val[3] = vld1q_u16(ptr + 24);
  return res;
}

// Load 2 consecutive 8-element int16 vectors.
static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) {
  int16x8x2_t res;
  res.val[0] = vld1q_s16(ptr);
  res.val[1] = vld1q_s16(ptr + 8);
  return res;
}

// Load 4 consecutive 8-element int16 vectors.
static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) {
  int16x8x4_t res;
  res.val[0] = vld1q_s16(ptr);
  res.val[1] = vld1q_s16(ptr + 8);
  res.val[2] = vld1q_s16(ptr + 16);
  res.val[3] = vld1q_s16(ptr + 24);
  return res;
}

// Store 2 consecutive 8-byte vectors.
static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) {
  vst1_u8(ptr, a.val[0]);
  vst1_u8(ptr + 8, a.val[1]);
}

// Store 4 consecutive 8-byte vectors.
static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) {
  vst1_u8(ptr, a.val[0]);
  vst1_u8(ptr + 8, a.val[1]);
  vst1_u8(ptr + 16, a.val[2]);
  vst1_u8(ptr + 24, a.val[3]);
}

// Store 2 consecutive 8-element uint16 vectors.
static inline void vst1q_u16_x2(uint16_t *ptr, uint16x8x2_t a) {
  vst1q_u16(ptr, a.val[0]);
  vst1q_u16(ptr + 8, a.val[1]);
}

// Store 4 consecutive 8-element uint16 vectors.
static inline void vst1q_u16_x4(uint16_t *ptr, uint16x8x4_t a) {
  vst1q_u16(ptr, a.val[0]);
  vst1q_u16(ptr + 8, a.val[1]);
  vst1q_u16(ptr + 16, a.val[2]);
  vst1q_u16(ptr + 24, a.val[3]);
}

#elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
// GCC gained the xN Neon intrinsics incrementally; each guard below provides
// a fallback for the GCC releases whose arm_neon.h lacks that intrinsic.
#if __GNUC__ < 8
static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}

static inline uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
  uint16x8x2_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8) } };
  return res;
}

static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) {
  int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } };
  return res;
}
#endif  // __GNUC__ < 8

#if __GNUC__ < 9
static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}
#endif  // __GNUC__ < 9

// ((major << 8) | minor) packs the GCC version for comparison; 0x805 == 8.5.
#if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}

static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) {
  int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8),
                        vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } };
  return res;
}

static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) {
  vst1_u8(ptr + 0 * 8, a.val[0]);
  vst1_u8(ptr + 1 * 8, a.val[1]);
}

static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) {
  vst1_u8(ptr + 0 * 8, a.val[0]);
  vst1_u8(ptr + 1 * 8, a.val[1]);
  vst1_u8(ptr + 2 * 8, a.val[2]);
  vst1_u8(ptr + 3 * 8, a.val[3]);
}

static inline void vst1q_u16_x2(uint16_t *ptr, uint16x8x2_t a) {
  vst1q_u16(ptr + 0 * 8, a.val[0]);
  vst1q_u16(ptr + 1 * 8, a.val[1]);
}

static inline void vst1q_u16_x4(uint16_t *ptr, uint16x8x4_t a) {
  vst1q_u16(ptr + 0 * 8, a.val[0]);
  vst1q_u16(ptr + 1 * 8, a.val[1]);
  vst1q_u16(ptr + 2 * 8, a.val[2]);
  vst1q_u16(ptr + 3 * 8, a.val[3]);
}
#endif  // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
#endif  // defined(__GNUC__) && !defined(__clang__)

// The helpers below load/store N rows of a fixed width from/to a strided
// 2-D buffer; `p` (or `dst_stride`) is the stride in *elements*, not bytes.

// Store 2 rows of 8 bytes.
static inline void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;  // NOTE(review): dead increment after the final store; harmless.
}

// Load 2 rows of 8 bytes into one 16-byte vector (row 0 in the low half).
static inline uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) {
  return vcombine_u8(vld1_u8(s), vld1_u8(s + p));
}

// Load four bytes into the low half of a uint8x8_t, zero the upper half.
static inline uint8x8_t load_u8_4x1(const uint8_t *p) {
  uint8x8_t ret = vdup_n_u8(0);
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
  return ret;
}

// Load 2 rows of 4 bytes into one 8-byte vector (row 0 in the low half).
// NOTE(review): assumes `p` is 4-byte aligned (lane load via uint32_t *).
static inline uint8x8_t load_u8_4x2(const uint8_t *p, ptrdiff_t stride) {
  uint8x8_t ret = vdup_n_u8(0);
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
  p += stride;
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1));
  return ret;
}

// Load 2 rows of 2 uint16s into one 4-element vector (row 0 in lanes 0-1).
static inline uint16x4_t load_u16_2x2(const uint16_t *p, ptrdiff_t stride) {
  uint16x4_t ret = vdup_n_u16(0);
  ret = vreinterpret_u16_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0));
  p += stride;
  ret = vreinterpret_u16_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1));
  return ret;
}

// Load 8 rows of 8 bytes.
static inline void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

// Load 7 rows of 8 bytes.
static inline void load_u8_8x7(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
}

// Load 6 rows of 8 bytes.
static inline void load_u8_8x6(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
}

// Load 4 rows of 8 bytes.
static inline void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

// Load 3 rows of 8 bytes.
static inline void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
}

// Load 4 rows of 4 uint16s.
static inline void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;  // NOTE(review): dead increment after the final load; harmless.
}

// Load 6 rows of 4 uint16s.
static inline void load_u16_4x6(const uint16_t *s, ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4, uint16x4_t *const s5) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
}

// Load 7 rows of 4 uint16s.
static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4, uint16x4_t *const s5,
                                uint16x4_t *const s6) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
}

// Load 8 rows of 4 uint16s.
static inline void load_u16_4x8(const uint16_t *s, ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4, uint16x4_t *const s5,
                                uint16x4_t *const s6, uint16x4_t *const s7) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
  s += p;
  *s7 = vld1_u16(s);
}

// Load 14 rows of 4 uint16s.
static inline void load_u16_4x14(const uint16_t *s, ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3,
                                 uint16x4_t *const s4, uint16x4_t *const s5,
                                 uint16x4_t *const s6, uint16x4_t *const s7,
                                 uint16x4_t *const s8, uint16x4_t *const s9,
                                 uint16x4_t *const s10, uint16x4_t *const s11,
                                 uint16x4_t *const s12, uint16x4_t *const s13) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
  s += p;
  *s7 = vld1_u16(s);
  s += p;
  *s8 = vld1_u16(s);
  s += p;
  *s9 = vld1_u16(s);
  s += p;
  *s10 = vld1_u16(s);
  s += p;
  *s11 = vld1_u16(s);
  s += p;
  *s12 = vld1_u16(s);
  s += p;
  *s13 = vld1_u16(s);
}

// Load 2 rows of 8 int16s.
static inline void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
}

// Load 2 rows of 8 uint16s.
static inline void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
}

// Load 3 rows of 8 uint16s.
static inline void load_u16_8x3(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
}

// Load 4 rows of 8 uint16s.
static inline void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;  // NOTE(review): dead increment after the final load; harmless.
}

// Load 12 rows of 4 int16s.
static inline void load_s16_4x12(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
                                 int16x4_t *const s6, int16x4_t *const s7,
                                 int16x4_t *const s8, int16x4_t *const s9,
                                 int16x4_t *const s10, int16x4_t *const s11) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
  s += p;
  *s8 = vld1_s16(s);
  s += p;
  *s9 = vld1_s16(s);
  s += p;
  *s10 = vld1_s16(s);
  s += p;
  *s11 = vld1_s16(s);
}

// Load 11 rows of 4 int16s.
static inline void load_s16_4x11(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
                                 int16x4_t *const s6, int16x4_t *const s7,
                                 int16x4_t *const s8, int16x4_t *const s9,
                                 int16x4_t *const s10) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
  s += p;
  *s8 = vld1_s16(s);
  s += p;
  *s9 = vld1_s16(s);
  s += p;
  *s10 = vld1_s16(s);
}

// Load 11 rows of 4 uint16s.
static inline void load_u16_4x11(const uint16_t *s, ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3,
                                 uint16x4_t *const s4, uint16x4_t *const s5,
                                 uint16x4_t *const s6, uint16x4_t *const s7,
                                 uint16x4_t *const s8, uint16x4_t *const s9,
                                 uint16x4_t *const s10) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
  s += p;
  *s7 = vld1_u16(s);
  s += p;
  *s8 = vld1_u16(s);
  s += p;
  *s9 = vld1_u16(s);
  s += p;
  *s10 = vld1_u16(s);
}

// Load 8 rows of 4 int16s.
static inline void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6, int16x4_t *const s7) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
}

// Load 7 rows of 4 int16s.
static inline void load_s16_4x7(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
}

// Load 6 rows of 4 int16s.
static inline void load_s16_4x6(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
}

// Load 5 rows of 4 int16s.
static inline void load_s16_4x5(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
}

// Load 5 rows of 4 uint16s.
static inline void load_u16_4x5(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;  // NOTE(review): dead increment after the final load; harmless.
}

// Load 5 rows of 8 bytes.
static inline void load_u8_8x5(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
}

// Load 5 rows of 8 uint16s.
static inline void load_u16_8x5(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3,
                                uint16x8_t *const s4) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;  // NOTE(review): dead increment after the final load; harmless.
}

// Load 4 rows of 4 int16s.
static inline void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

// Load 3 rows of 4 int16s.
static inline void load_s16_4x3(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
}

// Store 8 rows of 8 bytes.
static inline void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
                                const uint8x8_t s5, const uint8x8_t s6,
                                const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

// Store 4 rows of 8 bytes.
static inline void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
}

// Store 4 rows of 16 bytes.
static inline void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                 const uint8x16_t s1, const uint8x16_t s2,
                                 const uint8x16_t s3) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
}

// Store 8 rows of 8 uint16s.
static inline void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3,
                                 const uint16x8_t s4, const uint16x8_t s5,
                                 const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
  s += dst_stride;
  vst1q_u16(s, s4);
  s += dst_stride;
  vst1q_u16(s, s5);
  s += dst_stride;
  vst1q_u16(s, s6);
  s += dst_stride;
  vst1q_u16(s, s7);
}

// Store 3 rows of 4 uint16s.
static inline void store_u16_4x3(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
}

// Store 4 rows of 4 uint16s.
static inline void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
}

// Store 6 rows of 4 uint16s.
static inline void store_u16_4x6(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3,
                                 const uint16x4_t s4, const uint16x4_t s5) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
  s += dst_stride;
  vst1_u16(s, s4);
  s += dst_stride;
  vst1_u16(s, s5);
}

// Store 12 rows of 4 uint16s.
static inline void store_u16_4x12(uint16_t *s, ptrdiff_t dst_stride,
                                  const uint16x4_t s0, const uint16x4_t s1,
                                  const uint16x4_t s2, const uint16x4_t s3,
                                  const uint16x4_t s4, const uint16x4_t s5,
                                  const uint16x4_t s6, const uint16x4_t s7,
                                  const uint16x4_t s8, const uint16x4_t s9,
                                  const uint16x4_t s10, const uint16x4_t s11) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
  s += dst_stride;
  vst1_u16(s, s4);
  s += dst_stride;
  vst1_u16(s, s5);
  s += dst_stride;
  vst1_u16(s, s6);
  s += dst_stride;
  vst1_u16(s, s7);
  s += dst_stride;
  vst1_u16(s, s8);
  s += dst_stride;
  vst1_u16(s, s9);
  s += dst_stride;
  vst1_u16(s, s10);
  s += dst_stride;
  vst1_u16(s, s11);
  s += dst_stride;  // NOTE(review): dead increment after the final store.
}

// Store 2 rows of 8 uint16s.
static inline void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
}

// Store 3 rows of 8 uint16s.
static inline void store_u16_8x3(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
}

// Store 4 rows of 8 uint16s.
static inline void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
}

// Store 8 rows of 8 int16s.
static inline void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3,
                                 const int16x8_t s4, const int16x8_t s5,
                                 const int16x8_t s6, const int16x8_t s7) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
  s += dst_stride;
  vst1q_s16(s, s4);
  s += dst_stride;
  vst1q_s16(s, s5);
  s += dst_stride;
  vst1q_s16(s, s6);
  s += dst_stride;
  vst1q_s16(s, s7);
}

// Store 4 rows of 4 int16s.
static inline void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
}
// Store 8 rows of 4 int16s.
static inline void store_s16_4x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3,
                                 const int16x4_t s4, const int16x4_t s5,
                                 const int16x4_t s6, const int16x4_t s7) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
  s += dst_stride;
  vst1_s16(s, s4);
  s += dst_stride;
  vst1_s16(s, s5);
  s += dst_stride;
  vst1_s16(s, s6);
  s += dst_stride;
  vst1_s16(s, s7);
}

// Store 4 rows of 8 int16s.
static inline void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
}

// Store 2 rows of 8 int16s.
static inline void store_s16_8x2(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
}

// Load 11 rows of 8 bytes.
static inline void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
                                uint8x8_t *const s0, uint8x8_t *const s1,
                                uint8x8_t *const s2, uint8x8_t *const s3,
                                uint8x8_t *const s4, uint8x8_t *const s5,
                                uint8x8_t *const s6, uint8x8_t *const s7,
                                uint8x8_t *const s8, uint8x8_t *const s9,
                                uint8x8_t *const s10) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
  s += p;
  *s8 = vld1_u8(s);
  s += p;
  *s9 = vld1_u8(s);
  s += p;
  *s10 = vld1_u8(s);
}

// Load 10 rows of 8 int16s.
static inline void load_s16_8x10(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
}

// Load 11 rows of 8 int16s.
static inline void load_s16_8x11(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9,
                                 int16x8_t *const s10) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
  s += p;
  *s10 = vld1q_s16(s);
}

// Load 12 rows of 8 int16s.
static inline void load_s16_8x12(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9,
                                 int16x8_t *const s10, int16x8_t *const s11) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
  s += p;
  *s10 = vld1q_s16(s);
  s += p;
  *s11 = vld1q_s16(s);
}

// Load 11 rows of 8 uint16s.
static inline void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4, uint16x8_t *const s5,
                                 uint16x8_t *const s6, uint16x8_t *const s7,
                                 uint16x8_t *const s8, uint16x8_t *const s9,
                                 uint16x8_t *const s10) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
  s += p;
  *s8 = vld1q_u16(s);
  s += p;
  *s9 = vld1q_u16(s);
  s += p;
  *s10 = vld1q_u16(s);
}

// Load 8 rows of 8 int16s.
static inline void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

// Load 7 rows of 8 uint16s.
static inline void load_u16_8x7(const uint16_t *s, ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3,
                                uint16x8_t *const s4, uint16x8_t *const s5,
                                uint16x8_t *const s6) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
}

// Load 7 rows of 8 int16s.
static inline void load_s16_8x7(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
}

// Load 6 rows of 8 int16s.
static inline void load_s16_8x6(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
}

// Load 5 rows of 8 int16s.
static inline void load_s16_8x5(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
}

// Load 4 rows of 8 int16s.
static inline void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}

// Load 3 rows of 8 int16s.
static inline void load_s16_8x3(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
}

// Load one 32-bit lane of a vector from a possibly-unaligned address. On
// AArch64 the lane-load intrinsic is used directly; on 32-bit targets go
// through memcpy + vset_lane to avoid a misaligned lane access.
#if AOM_ARCH_AARCH64
#define load_unaligned_u32_2x1_lane(v, p, lane)              \
  do {                                                       \
    (v) = vld1_lane_u32((const uint32_t *)(p), (v), (lane)); \
  } while (0)

#define load_unaligned_u32_4x1_lane(v, p, lane)               \
  do {                                                        \
    (v) = vld1q_lane_u32((const uint32_t *)(p), (v), (lane)); \
  } while (0)
#else
#define load_unaligned_u32_2x1_lane(v, p, lane) \
  do {                                          \
    uint32_t tmp;                               \
    memcpy(&tmp, (p), 4);                       \
    (v) = vset_lane_u32(tmp, (v), (lane));      \
  } while (0)

#define load_unaligned_u32_4x1_lane(v, p, lane) \
  do {                                          \
    uint32_t tmp;                               \
    memcpy(&tmp, (p), 4);                       \
    (v) = vsetq_lane_u32(tmp, (v), (lane));     \
  } while (0)
#endif

// Load 2 sets of 4 bytes when alignment is not guaranteed.
static inline uint8x8_t load_unaligned_u8(const uint8_t *buf,
                                          ptrdiff_t stride) {
  uint32_t a;
  memcpy(&a, buf, 4);
  buf += stride;
  uint32x2_t a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

// Load 4 sets of 4 bytes when alignment is not guaranteed.
static inline uint8x16_t load_unaligned_u8q(const uint8_t *buf,
                                            ptrdiff_t stride) {
  uint32_t a;
  uint32x4_t a_u32;
  // With a stride of 4 the rows are contiguous: one 16-byte load suffices.
  if (stride == 4) return vld1q_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdupq_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 2);
  memcpy(&a, buf, 4);
  a_u32 = vsetq_lane_u32(a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}

// Load 2 rows of 2 bytes into lanes 0-1 of a 16-bit vector; upper lanes
// carry duplicated row-0 data from the vdup.
static inline uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf,
                                              ptrdiff_t stride) {
  uint16_t a;
  uint16x4_t a_u16;

  memcpy(&a, buf, 2);
  buf += stride;
  a_u16 = vdup_n_u16(a);
  memcpy(&a, buf, 2);
  a_u16 = vset_lane_u16(a, a_u16, 1);
  return vreinterpret_u8_u16(a_u16);
}

// Load 4 bytes into the low half of a uint8x8_t, upper half zeroed; no
// alignment requirement on buf.
static inline uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(0);
  a_u32 = vset_lane_u32(a, a_u32, 0);
  return vreinterpret_u8_u32(a_u32);
}

// Load 4 bytes and duplicate them into both halves of a uint8x8_t.
static inline uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(a);
  return vreinterpret_u8_u32(a_u32);
}

// Load 2 bytes and duplicate them into all four 16-bit lanes.
static inline uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) {
  uint16_t a;
  // NOTE(review): local is a uint16x4_t despite the _u32 name.
  uint16x4_t a_u32;

  memcpy(&a, buf, 2);
  a_u32 = vdup_n_u16(a);
  return vreinterpret_u8_u16(a_u32);
}

// Load 2 rows of 4 bytes (row 0 in the low half); no alignment requirement.
static inline uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf,
                                              ptrdiff_t stride) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

// Load 4 rows of 4 bytes into two vectors, two rows per vector.
static inline void load_unaligned_u8_4x4(const uint8_t *buf, ptrdiff_t stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1) {
  *tu0 = load_unaligned_u8_4x2(buf, stride);
  buf += 2 * stride;
  *tu1 = load_unaligned_u8_4x2(buf, stride);
}

// Load 6 rows of 4 bytes into three vectors, two rows per vector.
static inline void load_unaligned_u8_3x8(const uint8_t *buf, ptrdiff_t stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1,
                                         uint8x8_t *tu2) {
  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
  buf += 4 * stride;
  *tu2 = load_unaligned_u8_4x2(buf, stride);
}

// Load 8 rows of 4 bytes into four vectors, two rows per vector.
static inline void load_unaligned_u8_4x8(const uint8_t *buf, ptrdiff_t stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1,
                                         uint8x8_t *tu2, uint8x8_t *tu3) {
  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
  buf += 4 * stride;
  load_unaligned_u8_4x4(buf, stride, tu2, tu3);
}

// Load 8 rows of 16 bytes.
static inline void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
s += p; 1316 *s3 = vld1q_u8(s); 1317 s += p; 1318 *s4 = vld1q_u8(s); 1319 s += p; 1320 *s5 = vld1q_u8(s); 1321 s += p; 1322 *s6 = vld1q_u8(s); 1323 s += p; 1324 *s7 = vld1q_u8(s); 1325 } 1326 1327 static inline void load_u8_16x5(const uint8_t *s, ptrdiff_t p, 1328 uint8x16_t *const s0, uint8x16_t *const s1, 1329 uint8x16_t *const s2, uint8x16_t *const s3, 1330 uint8x16_t *const s4) { 1331 *s0 = vld1q_u8(s); 1332 s += p; 1333 *s1 = vld1q_u8(s); 1334 s += p; 1335 *s2 = vld1q_u8(s); 1336 s += p; 1337 *s3 = vld1q_u8(s); 1338 s += p; 1339 *s4 = vld1q_u8(s); 1340 } 1341 1342 static inline void load_u8_16x4(const uint8_t *s, ptrdiff_t p, 1343 uint8x16_t *const s0, uint8x16_t *const s1, 1344 uint8x16_t *const s2, uint8x16_t *const s3) { 1345 *s0 = vld1q_u8(s); 1346 s += p; 1347 *s1 = vld1q_u8(s); 1348 s += p; 1349 *s2 = vld1q_u8(s); 1350 s += p; 1351 *s3 = vld1q_u8(s); 1352 } 1353 1354 static inline void load_u8_16x3(const uint8_t *s, ptrdiff_t p, 1355 uint8x16_t *const s0, uint8x16_t *const s1, 1356 uint8x16_t *const s2) { 1357 *s0 = vld1q_u8(s); 1358 s += p; 1359 *s1 = vld1q_u8(s); 1360 s += p; 1361 *s2 = vld1q_u8(s); 1362 } 1363 1364 static inline void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, 1365 uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2, 1366 uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5, 1367 uint16x8_t *s6, uint16x8_t *s7) { 1368 *s0 = vld1q_u16(s); 1369 s += p; 1370 *s1 = vld1q_u16(s); 1371 s += p; 1372 *s2 = vld1q_u16(s); 1373 s += p; 1374 *s3 = vld1q_u16(s); 1375 s += p; 1376 *s4 = vld1q_u16(s); 1377 s += p; 1378 *s5 = vld1q_u16(s); 1379 s += p; 1380 *s6 = vld1q_u16(s); 1381 s += p; 1382 *s7 = vld1q_u16(s); 1383 } 1384 1385 static inline void load_u16_16x4(const uint16_t *s, ptrdiff_t p, 1386 uint16x8_t *const s0, uint16x8_t *const s1, 1387 uint16x8_t *const s2, uint16x8_t *const s3, 1388 uint16x8_t *const s4, uint16x8_t *const s5, 1389 uint16x8_t *const s6, uint16x8_t *const s7) { 1390 *s0 = vld1q_u16(s); 1391 *s1 = vld1q_u16(s + 8); 1392 s += p; 
1393 *s2 = vld1q_u16(s); 1394 *s3 = vld1q_u16(s + 8); 1395 s += p; 1396 *s4 = vld1q_u16(s); 1397 *s5 = vld1q_u16(s + 8); 1398 s += p; 1399 *s6 = vld1q_u16(s); 1400 *s7 = vld1q_u16(s + 8); 1401 } 1402 1403 static inline uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf, 1404 ptrdiff_t stride) { 1405 uint32_t a; 1406 uint32x2_t a_u32; 1407 1408 memcpy(&a, buf, 4); 1409 buf += stride; 1410 a_u32 = vdup_n_u32(a); 1411 memcpy(&a, buf, 4); 1412 a_u32 = vset_lane_u32(a, a_u32, 1); 1413 return vreinterpret_u16_u32(a_u32); 1414 } 1415 1416 static inline uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) { 1417 uint64_t a; 1418 uint64x1_t a_u64 = vdup_n_u64(0); 1419 memcpy(&a, buf, 8); 1420 a_u64 = vset_lane_u64(a, a_u64, 0); 1421 return vreinterpret_u16_u64(a_u64); 1422 } 1423 1424 static inline uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf, 1425 ptrdiff_t stride) { 1426 uint64_t a; 1427 uint64x2_t a_u64; 1428 1429 memcpy(&a, buf, 8); 1430 buf += stride; 1431 a_u64 = vdupq_n_u64(0); 1432 a_u64 = vsetq_lane_u64(a, a_u64, 0); 1433 memcpy(&a, buf, 8); 1434 buf += stride; 1435 a_u64 = vsetq_lane_u64(a, a_u64, 1); 1436 return vreinterpretq_u16_u64(a_u64); 1437 } 1438 1439 static inline int16x8_t load_unaligned_s16_4x2(const int16_t *buf, 1440 ptrdiff_t stride) { 1441 int64_t a; 1442 int64x2_t a_s64; 1443 memcpy(&a, buf, 8); 1444 buf += stride; 1445 a_s64 = vdupq_n_s64(0); 1446 a_s64 = vsetq_lane_s64(a, a_s64, 0); 1447 memcpy(&a, buf, 8); 1448 buf += stride; 1449 a_s64 = vsetq_lane_s64(a, a_s64, 1); 1450 return vreinterpretq_s16_s64(a_s64); 1451 } 1452 1453 static inline void load_unaligned_u16_4x4(const uint16_t *buf, ptrdiff_t stride, 1454 uint16x8_t *tu0, uint16x8_t *tu1) { 1455 *tu0 = load_unaligned_u16_4x2(buf, stride); 1456 buf += 2 * stride; 1457 *tu1 = load_unaligned_u16_4x2(buf, stride); 1458 } 1459 1460 static inline void load_s32_4x4(int32_t *s, ptrdiff_t p, int32x4_t *s1, 1461 int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) { 1462 *s1 = vld1q_s32(s); 1463 
s += p; 1464 *s2 = vld1q_s32(s); 1465 s += p; 1466 *s3 = vld1q_s32(s); 1467 s += p; 1468 *s4 = vld1q_s32(s); 1469 } 1470 1471 static inline void store_s32_4x4(int32_t *s, ptrdiff_t p, int32x4_t s1, 1472 int32x4_t s2, int32x4_t s3, int32x4_t s4) { 1473 vst1q_s32(s, s1); 1474 s += p; 1475 vst1q_s32(s, s2); 1476 s += p; 1477 vst1q_s32(s, s3); 1478 s += p; 1479 vst1q_s32(s, s4); 1480 } 1481 1482 static inline void load_u32_4x4(uint32_t *s, ptrdiff_t p, uint32x4_t *s1, 1483 uint32x4_t *s2, uint32x4_t *s3, 1484 uint32x4_t *s4) { 1485 *s1 = vld1q_u32(s); 1486 s += p; 1487 *s2 = vld1q_u32(s); 1488 s += p; 1489 *s3 = vld1q_u32(s); 1490 s += p; 1491 *s4 = vld1q_u32(s); 1492 } 1493 1494 static inline void store_u32_4x4(uint32_t *s, ptrdiff_t p, uint32x4_t s1, 1495 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) { 1496 vst1q_u32(s, s1); 1497 s += p; 1498 vst1q_u32(s, s2); 1499 s += p; 1500 vst1q_u32(s, s3); 1501 s += p; 1502 vst1q_u32(s, s4); 1503 } 1504 1505 static inline int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { 1506 const int32x4_t v0 = vld1q_s32(buf); 1507 const int32x4_t v1 = vld1q_s32(buf + 4); 1508 const int16x4_t s0 = vmovn_s32(v0); 1509 const int16x4_t s1 = vmovn_s32(v1); 1510 return vcombine_s16(s0, s1); 1511 } 1512 1513 static inline void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { 1514 const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); 1515 const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); 1516 vst1q_s32(buf, v0); 1517 vst1q_s32(buf + 4, v1); 1518 } 1519 1520 static inline void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) { 1521 const int32x4_t v0 = vmovl_s16(a); 1522 vst1q_s32(buf, v0); 1523 } 1524 1525 static inline uint8x8_t load_u8_gather_s16_x8(const uint8_t *src, 1526 int16x8_t indices) { 1527 // Recent Clang and GCC versions correctly identify that this zero-broadcast 1528 // is redundant. 
Alternatively we could load and broadcast the zeroth element 1529 // and then replace the other lanes, however this is slower than loading a 1530 // single element without broadcast on some micro-architectures. 1531 uint8x8_t ret = vdup_n_u8(0); 1532 ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 0), ret, 0); 1533 ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 1), ret, 1); 1534 ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 2), ret, 2); 1535 ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 3), ret, 3); 1536 ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 0), ret, 4); 1537 ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 1), ret, 5); 1538 ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 2), ret, 6); 1539 ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 3), ret, 7); 1540 return ret; 1541 } 1542 1543 // The `lane` parameter here must be an immediate. 1544 #define store_u8_2x1_lane(dst, src, lane) \ 1545 do { \ 1546 uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \ 1547 memcpy(dst, &a, 2); \ 1548 } while (0) 1549 1550 #define store_u8_4x1_lane(dst, src, lane) \ 1551 do { \ 1552 uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \ 1553 memcpy(dst, &a, 4); \ 1554 } while (0) 1555 1556 #define store_u16_2x1_lane(dst, src, lane) \ 1557 do { \ 1558 uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \ 1559 memcpy(dst, &a, 4); \ 1560 } while (0) 1561 1562 #define store_u16_4x1_lane(dst, src, lane) \ 1563 do { \ 1564 uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \ 1565 memcpy(dst, &a, 8); \ 1566 } while (0) 1567 1568 #define store_s16_4x1_lane(dst, src, lane) \ 1569 do { \ 1570 int64_t a = vgetq_lane_s64(vreinterpretq_s64_s16(src), lane); \ 1571 memcpy(dst, &a, 8); \ 1572 } while (0) 1573 1574 // Store the low 16-bits from a single vector. 
static inline void store_u8_2x1(uint8_t *dst, const uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
}

// Store the low 32-bits from a single vector.
static inline void store_u8_4x1(uint8_t *dst, const uint8x8_t src) {
  store_u8_4x1_lane(dst, src, 0);
}

// Scatter the two low 16-bit chunks of `src` to two rows of `dst`.
static inline void store_u8x2_strided_x2(uint8_t *dst, ptrdiff_t dst_stride,
                                         uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
  store_u8_2x1_lane(dst + dst_stride, src, 1);
}

// Scatter the four 16-bit chunks of `src` to four rows of `dst`.
static inline void store_u8x2_strided_x4(uint8_t *dst, ptrdiff_t dst_stride,
                                         uint8x8_t src) {
  store_u8_2x1_lane(dst + 0 * dst_stride, src, 0);
  store_u8_2x1_lane(dst + 1 * dst_stride, src, 1);
  store_u8_2x1_lane(dst + 2 * dst_stride, src, 2);
  store_u8_2x1_lane(dst + 3 * dst_stride, src, 3);
}

// Scatter the two 32-bit chunks of `src` to two rows of `dst`.
static inline void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride,
                                         uint8x8_t src) {
  store_u8_4x1_lane(dst, src, 0);
  store_u8_4x1_lane(dst + stride, src, 1);
}

// Scatter the four 32-bit chunks of `src` to four rows of `dst`.
static inline void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride,
                                         uint8x16_t src) {
  const uint8x8_t lo = vget_low_u8(src);
  const uint8x8_t hi = vget_high_u8(src);
  store_u8_4x1_lane(dst + 0 * stride, lo, 0);
  store_u8_4x1_lane(dst + 1 * stride, lo, 1);
  store_u8_4x1_lane(dst + 2 * stride, hi, 0);
  store_u8_4x1_lane(dst + 3 * stride, hi, 1);
}

// Store the low 32-bits (two uint16_t values) from a single vector.
static inline void store_u16_2x1(uint16_t *dst, const uint16x4_t src) {
  store_u16_2x1_lane(dst, src, 0);
}

// Store two blocks of 32-bits from a single vector.
static inline void store_u16x2_strided_x2(uint16_t *dst, ptrdiff_t dst_stride,
                                          uint16x4_t src) {
  store_u16_2x1_lane(dst, src, 0);
  store_u16_2x1_lane(dst + dst_stride, src, 1);
}

// Scatter the two 64-bit halves of `src` to two rows of `dst`.
static inline void store_u16x4_strided_x2(uint16_t *dst, ptrdiff_t dst_stride,
                                          uint16x8_t src) {
  store_u16_4x1_lane(dst, src, 0);
  store_u16_4x1_lane(dst + dst_stride, src, 1);
}

// Scatter the two 64-bit halves of `src` to two rows of `dst` (signed
// variant).
static inline void store_s16x4_strided_x2(int16_t *dst, ptrdiff_t dst_stride,
                                          int16x8_t src) {
  store_s16_4x1_lane(dst, src, 0);
  store_s16_4x1_lane(dst + dst_stride, src, 1);
}

// The lane-store helpers are internal to this header; do not leak them.
#undef store_u8_2x1_lane
#undef store_u8_4x1_lane
#undef store_u16_2x1_lane
#undef store_u16_4x1_lane
#undef store_s16_4x1_lane

#endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_