tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mem_neon.h (49354B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
     13 #define AOM_AOM_DSP_ARM_MEM_NEON_H_
     14 
     15 #include <arm_neon.h>
     16 #include <string.h>
     17 #include "aom_dsp/aom_dsp_common.h"
     18 
     19 #if defined(__arm__) || defined(_M_ARM)
     20 #define ARM_32_BIT
     21 #endif
     22 
     23 // DEFICIENT_CLANG_32_BIT includes clang-cl.
     24 #if defined(__clang__) && defined(ARM_32_BIT) && \
     25    (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
     26 #define DEFICIENT_CLANG_32_BIT
     27 #endif
     28 
     29 #if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT) && \
     30    __GNUC__ < 14
     31 #define DEFICIENT_GCC_32_BIT
     32 #endif
     33 
     34 // Support for xN Neon intrinsics is lacking in some compilers.
     35 #if defined(DEFICIENT_CLANG_32_BIT) || defined(DEFICIENT_GCC_32_BIT)
     36 
     37 static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
     38  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
     39                         vld1q_u8(ptr + 2 * 16) } };
     40  return res;
     41 }
     42 
     43 static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
     44  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
     45  return res;
     46 }
     47 
     48 static inline uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
     49  uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } };
     50  return res;
     51 }
     52 
     53 static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
     54  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
     55                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
     56  return res;
     57 }
     58 
     59 static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) {
     60  int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } };
     61  return res;
     62 }
     63 
     64 static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) {
     65  int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8),
     66                        vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } };
     67  return res;
     68 }
     69 
     70 static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) {
     71  vst1_u8(ptr + 0 * 8, a.val[0]);
     72  vst1_u8(ptr + 1 * 8, a.val[1]);
     73 }
     74 
     75 static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) {
     76  vst1_u8(ptr + 0 * 8, a.val[0]);
     77  vst1_u8(ptr + 1 * 8, a.val[1]);
     78  vst1_u8(ptr + 2 * 8, a.val[2]);
     79  vst1_u8(ptr + 3 * 8, a.val[3]);
     80 }
     81 
     82 static inline void vst1q_u16_x2(uint16_t *ptr, uint16x8x2_t a) {
     83  vst1q_u16(ptr + 0 * 8, a.val[0]);
     84  vst1q_u16(ptr + 1 * 8, a.val[1]);
     85 }
     86 
     87 static inline void vst1q_u16_x4(uint16_t *ptr, uint16x8x4_t a) {
     88  vst1q_u16(ptr + 0 * 8, a.val[0]);
     89  vst1q_u16(ptr + 1 * 8, a.val[1]);
     90  vst1q_u16(ptr + 2 * 8, a.val[2]);
     91  vst1q_u16(ptr + 3 * 8, a.val[3]);
     92 }
     93 
     94 #elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
     95 #if __GNUC__ < 8
     96 static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
     97  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
     98  return res;
     99 }
    100 
    101 static inline uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
    102  uint16x8x2_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8) } };
    103  return res;
    104 }
    105 
    106 static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) {
    107  int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } };
    108  return res;
    109 }
    110 #endif  // __GNUC__ < 8
    111 
    112 #if __GNUC__ < 9
    113 static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
    114  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
    115                         vld1q_u8(ptr + 2 * 16) } };
    116  return res;
    117 }
    118 #endif  // __GNUC__ < 9
    119 
    120 #if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
    121 static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
    122  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
    123                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
    124  return res;
    125 }
    126 
    127 static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) {
    128  int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8),
    129                        vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } };
    130  return res;
    131 }
    132 
    133 static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) {
    134  vst1_u8(ptr + 0 * 8, a.val[0]);
    135  vst1_u8(ptr + 1 * 8, a.val[1]);
    136 }
    137 
    138 static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) {
    139  vst1_u8(ptr + 0 * 8, a.val[0]);
    140  vst1_u8(ptr + 1 * 8, a.val[1]);
    141  vst1_u8(ptr + 2 * 8, a.val[2]);
    142  vst1_u8(ptr + 3 * 8, a.val[3]);
    143 }
    144 
    145 static inline void vst1q_u16_x2(uint16_t *ptr, uint16x8x2_t a) {
    146  vst1q_u16(ptr + 0 * 8, a.val[0]);
    147  vst1q_u16(ptr + 1 * 8, a.val[1]);
    148 }
    149 
    150 static inline void vst1q_u16_x4(uint16_t *ptr, uint16x8x4_t a) {
    151  vst1q_u16(ptr + 0 * 8, a.val[0]);
    152  vst1q_u16(ptr + 1 * 8, a.val[1]);
    153  vst1q_u16(ptr + 2 * 8, a.val[2]);
    154  vst1q_u16(ptr + 3 * 8, a.val[3]);
    155 }
    156 #endif  // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
    157 #endif  // defined(__GNUC__) && !defined(__clang__)
    158 
    159 static inline void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
    160                                const uint8x8_t s1) {
    161  vst1_u8(s, s0);
    162  s += p;
    163  vst1_u8(s, s1);
    164  s += p;
    165 }
    166 
    167 static inline uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) {
    168  return vcombine_u8(vld1_u8(s), vld1_u8(s + p));
    169 }
    170 
    171 // Load four bytes into the low half of a uint8x8_t, zero the upper half.
    172 static inline uint8x8_t load_u8_4x1(const uint8_t *p) {
    173  uint8x8_t ret = vdup_n_u8(0);
    174  ret = vreinterpret_u8_u32(
    175      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
    176  return ret;
    177 }
    178 
    179 static inline uint8x8_t load_u8_4x2(const uint8_t *p, ptrdiff_t stride) {
    180  uint8x8_t ret = vdup_n_u8(0);
    181  ret = vreinterpret_u8_u32(
    182      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
    183  p += stride;
    184  ret = vreinterpret_u8_u32(
    185      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1));
    186  return ret;
    187 }
    188 
    189 static inline uint16x4_t load_u16_2x2(const uint16_t *p, ptrdiff_t stride) {
    190  uint16x4_t ret = vdup_n_u16(0);
    191  ret = vreinterpret_u16_u32(
    192      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0));
    193  p += stride;
    194  ret = vreinterpret_u16_u32(
    195      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1));
    196  return ret;
    197 }
    198 
    199 static inline void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
    200                               uint8x8_t *const s0, uint8x8_t *const s1,
    201                               uint8x8_t *const s2, uint8x8_t *const s3,
    202                               uint8x8_t *const s4, uint8x8_t *const s5,
    203                               uint8x8_t *const s6, uint8x8_t *const s7) {
    204  *s0 = vld1_u8(s);
    205  s += p;
    206  *s1 = vld1_u8(s);
    207  s += p;
    208  *s2 = vld1_u8(s);
    209  s += p;
    210  *s3 = vld1_u8(s);
    211  s += p;
    212  *s4 = vld1_u8(s);
    213  s += p;
    214  *s5 = vld1_u8(s);
    215  s += p;
    216  *s6 = vld1_u8(s);
    217  s += p;
    218  *s7 = vld1_u8(s);
    219 }
    220 
    221 static inline void load_u8_8x7(const uint8_t *s, ptrdiff_t p,
    222                               uint8x8_t *const s0, uint8x8_t *const s1,
    223                               uint8x8_t *const s2, uint8x8_t *const s3,
    224                               uint8x8_t *const s4, uint8x8_t *const s5,
    225                               uint8x8_t *const s6) {
    226  *s0 = vld1_u8(s);
    227  s += p;
    228  *s1 = vld1_u8(s);
    229  s += p;
    230  *s2 = vld1_u8(s);
    231  s += p;
    232  *s3 = vld1_u8(s);
    233  s += p;
    234  *s4 = vld1_u8(s);
    235  s += p;
    236  *s5 = vld1_u8(s);
    237  s += p;
    238  *s6 = vld1_u8(s);
    239 }
    240 
    241 static inline void load_u8_8x6(const uint8_t *s, ptrdiff_t p,
    242                               uint8x8_t *const s0, uint8x8_t *const s1,
    243                               uint8x8_t *const s2, uint8x8_t *const s3,
    244                               uint8x8_t *const s4, uint8x8_t *const s5) {
    245  *s0 = vld1_u8(s);
    246  s += p;
    247  *s1 = vld1_u8(s);
    248  s += p;
    249  *s2 = vld1_u8(s);
    250  s += p;
    251  *s3 = vld1_u8(s);
    252  s += p;
    253  *s4 = vld1_u8(s);
    254  s += p;
    255  *s5 = vld1_u8(s);
    256 }
    257 
    258 static inline void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
    259                               uint8x8_t *const s0, uint8x8_t *const s1,
    260                               uint8x8_t *const s2, uint8x8_t *const s3) {
    261  *s0 = vld1_u8(s);
    262  s += p;
    263  *s1 = vld1_u8(s);
    264  s += p;
    265  *s2 = vld1_u8(s);
    266  s += p;
    267  *s3 = vld1_u8(s);
    268 }
    269 
    270 static inline void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
    271                               uint8x8_t *const s0, uint8x8_t *const s1,
    272                               uint8x8_t *const s2) {
    273  *s0 = vld1_u8(s);
    274  s += p;
    275  *s1 = vld1_u8(s);
    276  s += p;
    277  *s2 = vld1_u8(s);
    278 }
    279 
    280 static inline void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
    281                                uint16x4_t *const s0, uint16x4_t *const s1,
    282                                uint16x4_t *const s2, uint16x4_t *const s3) {
    283  *s0 = vld1_u16(s);
    284  s += p;
    285  *s1 = vld1_u16(s);
    286  s += p;
    287  *s2 = vld1_u16(s);
    288  s += p;
    289  *s3 = vld1_u16(s);
    290  s += p;
    291 }
    292 
    293 static inline void load_u16_4x6(const uint16_t *s, ptrdiff_t p,
    294                                uint16x4_t *const s0, uint16x4_t *const s1,
    295                                uint16x4_t *const s2, uint16x4_t *const s3,
    296                                uint16x4_t *const s4, uint16x4_t *const s5) {
    297  *s0 = vld1_u16(s);
    298  s += p;
    299  *s1 = vld1_u16(s);
    300  s += p;
    301  *s2 = vld1_u16(s);
    302  s += p;
    303  *s3 = vld1_u16(s);
    304  s += p;
    305  *s4 = vld1_u16(s);
    306  s += p;
    307  *s5 = vld1_u16(s);
    308 }
    309 
    310 static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
    311                                uint16x4_t *const s0, uint16x4_t *const s1,
    312                                uint16x4_t *const s2, uint16x4_t *const s3,
    313                                uint16x4_t *const s4, uint16x4_t *const s5,
    314                                uint16x4_t *const s6) {
    315  *s0 = vld1_u16(s);
    316  s += p;
    317  *s1 = vld1_u16(s);
    318  s += p;
    319  *s2 = vld1_u16(s);
    320  s += p;
    321  *s3 = vld1_u16(s);
    322  s += p;
    323  *s4 = vld1_u16(s);
    324  s += p;
    325  *s5 = vld1_u16(s);
    326  s += p;
    327  *s6 = vld1_u16(s);
    328 }
    329 
    330 static inline void load_u16_4x8(const uint16_t *s, ptrdiff_t p,
    331                                uint16x4_t *const s0, uint16x4_t *const s1,
    332                                uint16x4_t *const s2, uint16x4_t *const s3,
    333                                uint16x4_t *const s4, uint16x4_t *const s5,
    334                                uint16x4_t *const s6, uint16x4_t *const s7) {
    335  *s0 = vld1_u16(s);
    336  s += p;
    337  *s1 = vld1_u16(s);
    338  s += p;
    339  *s2 = vld1_u16(s);
    340  s += p;
    341  *s3 = vld1_u16(s);
    342  s += p;
    343  *s4 = vld1_u16(s);
    344  s += p;
    345  *s5 = vld1_u16(s);
    346  s += p;
    347  *s6 = vld1_u16(s);
    348  s += p;
    349  *s7 = vld1_u16(s);
    350 }
    351 
    352 static inline void load_u16_4x14(const uint16_t *s, ptrdiff_t p,
    353                                 uint16x4_t *const s0, uint16x4_t *const s1,
    354                                 uint16x4_t *const s2, uint16x4_t *const s3,
    355                                 uint16x4_t *const s4, uint16x4_t *const s5,
    356                                 uint16x4_t *const s6, uint16x4_t *const s7,
    357                                 uint16x4_t *const s8, uint16x4_t *const s9,
    358                                 uint16x4_t *const s10, uint16x4_t *const s11,
    359                                 uint16x4_t *const s12, uint16x4_t *const s13) {
    360  *s0 = vld1_u16(s);
    361  s += p;
    362  *s1 = vld1_u16(s);
    363  s += p;
    364  *s2 = vld1_u16(s);
    365  s += p;
    366  *s3 = vld1_u16(s);
    367  s += p;
    368  *s4 = vld1_u16(s);
    369  s += p;
    370  *s5 = vld1_u16(s);
    371  s += p;
    372  *s6 = vld1_u16(s);
    373  s += p;
    374  *s7 = vld1_u16(s);
    375  s += p;
    376  *s8 = vld1_u16(s);
    377  s += p;
    378  *s9 = vld1_u16(s);
    379  s += p;
    380  *s10 = vld1_u16(s);
    381  s += p;
    382  *s11 = vld1_u16(s);
    383  s += p;
    384  *s12 = vld1_u16(s);
    385  s += p;
    386  *s13 = vld1_u16(s);
    387 }
    388 
    389 static inline void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
    390                                int16x8_t *const s0, int16x8_t *const s1) {
    391  *s0 = vld1q_s16(s);
    392  s += p;
    393  *s1 = vld1q_s16(s);
    394 }
    395 
    396 static inline void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
    397                                uint16x8_t *const s0, uint16x8_t *const s1) {
    398  *s0 = vld1q_u16(s);
    399  s += p;
    400  *s1 = vld1q_u16(s);
    401 }
    402 
    403 static inline void load_u16_8x3(const uint16_t *s, const ptrdiff_t p,
    404                                uint16x8_t *const s0, uint16x8_t *const s1,
    405                                uint16x8_t *const s2) {
    406  *s0 = vld1q_u16(s);
    407  s += p;
    408  *s1 = vld1q_u16(s);
    409  s += p;
    410  *s2 = vld1q_u16(s);
    411 }
    412 
    413 static inline void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
    414                                uint16x8_t *const s0, uint16x8_t *const s1,
    415                                uint16x8_t *const s2, uint16x8_t *const s3) {
    416  *s0 = vld1q_u16(s);
    417  s += p;
    418  *s1 = vld1q_u16(s);
    419  s += p;
    420  *s2 = vld1q_u16(s);
    421  s += p;
    422  *s3 = vld1q_u16(s);
    423  s += p;
    424 }
    425 
    426 static inline void load_s16_4x12(const int16_t *s, ptrdiff_t p,
    427                                 int16x4_t *const s0, int16x4_t *const s1,
    428                                 int16x4_t *const s2, int16x4_t *const s3,
    429                                 int16x4_t *const s4, int16x4_t *const s5,
    430                                 int16x4_t *const s6, int16x4_t *const s7,
    431                                 int16x4_t *const s8, int16x4_t *const s9,
    432                                 int16x4_t *const s10, int16x4_t *const s11) {
    433  *s0 = vld1_s16(s);
    434  s += p;
    435  *s1 = vld1_s16(s);
    436  s += p;
    437  *s2 = vld1_s16(s);
    438  s += p;
    439  *s3 = vld1_s16(s);
    440  s += p;
    441  *s4 = vld1_s16(s);
    442  s += p;
    443  *s5 = vld1_s16(s);
    444  s += p;
    445  *s6 = vld1_s16(s);
    446  s += p;
    447  *s7 = vld1_s16(s);
    448  s += p;
    449  *s8 = vld1_s16(s);
    450  s += p;
    451  *s9 = vld1_s16(s);
    452  s += p;
    453  *s10 = vld1_s16(s);
    454  s += p;
    455  *s11 = vld1_s16(s);
    456 }
    457 
    458 static inline void load_s16_4x11(const int16_t *s, ptrdiff_t p,
    459                                 int16x4_t *const s0, int16x4_t *const s1,
    460                                 int16x4_t *const s2, int16x4_t *const s3,
    461                                 int16x4_t *const s4, int16x4_t *const s5,
    462                                 int16x4_t *const s6, int16x4_t *const s7,
    463                                 int16x4_t *const s8, int16x4_t *const s9,
    464                                 int16x4_t *const s10) {
    465  *s0 = vld1_s16(s);
    466  s += p;
    467  *s1 = vld1_s16(s);
    468  s += p;
    469  *s2 = vld1_s16(s);
    470  s += p;
    471  *s3 = vld1_s16(s);
    472  s += p;
    473  *s4 = vld1_s16(s);
    474  s += p;
    475  *s5 = vld1_s16(s);
    476  s += p;
    477  *s6 = vld1_s16(s);
    478  s += p;
    479  *s7 = vld1_s16(s);
    480  s += p;
    481  *s8 = vld1_s16(s);
    482  s += p;
    483  *s9 = vld1_s16(s);
    484  s += p;
    485  *s10 = vld1_s16(s);
    486 }
    487 
    488 static inline void load_u16_4x11(const uint16_t *s, ptrdiff_t p,
    489                                 uint16x4_t *const s0, uint16x4_t *const s1,
    490                                 uint16x4_t *const s2, uint16x4_t *const s3,
    491                                 uint16x4_t *const s4, uint16x4_t *const s5,
    492                                 uint16x4_t *const s6, uint16x4_t *const s7,
    493                                 uint16x4_t *const s8, uint16x4_t *const s9,
    494                                 uint16x4_t *const s10) {
    495  *s0 = vld1_u16(s);
    496  s += p;
    497  *s1 = vld1_u16(s);
    498  s += p;
    499  *s2 = vld1_u16(s);
    500  s += p;
    501  *s3 = vld1_u16(s);
    502  s += p;
    503  *s4 = vld1_u16(s);
    504  s += p;
    505  *s5 = vld1_u16(s);
    506  s += p;
    507  *s6 = vld1_u16(s);
    508  s += p;
    509  *s7 = vld1_u16(s);
    510  s += p;
    511  *s8 = vld1_u16(s);
    512  s += p;
    513  *s9 = vld1_u16(s);
    514  s += p;
    515  *s10 = vld1_u16(s);
    516 }
    517 
    518 static inline void load_s16_4x8(const int16_t *s, ptrdiff_t p,
    519                                int16x4_t *const s0, int16x4_t *const s1,
    520                                int16x4_t *const s2, int16x4_t *const s3,
    521                                int16x4_t *const s4, int16x4_t *const s5,
    522                                int16x4_t *const s6, int16x4_t *const s7) {
    523  *s0 = vld1_s16(s);
    524  s += p;
    525  *s1 = vld1_s16(s);
    526  s += p;
    527  *s2 = vld1_s16(s);
    528  s += p;
    529  *s3 = vld1_s16(s);
    530  s += p;
    531  *s4 = vld1_s16(s);
    532  s += p;
    533  *s5 = vld1_s16(s);
    534  s += p;
    535  *s6 = vld1_s16(s);
    536  s += p;
    537  *s7 = vld1_s16(s);
    538 }
    539 
    540 static inline void load_s16_4x7(const int16_t *s, ptrdiff_t p,
    541                                int16x4_t *const s0, int16x4_t *const s1,
    542                                int16x4_t *const s2, int16x4_t *const s3,
    543                                int16x4_t *const s4, int16x4_t *const s5,
    544                                int16x4_t *const s6) {
    545  *s0 = vld1_s16(s);
    546  s += p;
    547  *s1 = vld1_s16(s);
    548  s += p;
    549  *s2 = vld1_s16(s);
    550  s += p;
    551  *s3 = vld1_s16(s);
    552  s += p;
    553  *s4 = vld1_s16(s);
    554  s += p;
    555  *s5 = vld1_s16(s);
    556  s += p;
    557  *s6 = vld1_s16(s);
    558 }
    559 
    560 static inline void load_s16_4x6(const int16_t *s, ptrdiff_t p,
    561                                int16x4_t *const s0, int16x4_t *const s1,
    562                                int16x4_t *const s2, int16x4_t *const s3,
    563                                int16x4_t *const s4, int16x4_t *const s5) {
    564  *s0 = vld1_s16(s);
    565  s += p;
    566  *s1 = vld1_s16(s);
    567  s += p;
    568  *s2 = vld1_s16(s);
    569  s += p;
    570  *s3 = vld1_s16(s);
    571  s += p;
    572  *s4 = vld1_s16(s);
    573  s += p;
    574  *s5 = vld1_s16(s);
    575 }
    576 
    577 static inline void load_s16_4x5(const int16_t *s, ptrdiff_t p,
    578                                int16x4_t *const s0, int16x4_t *const s1,
    579                                int16x4_t *const s2, int16x4_t *const s3,
    580                                int16x4_t *const s4) {
    581  *s0 = vld1_s16(s);
    582  s += p;
    583  *s1 = vld1_s16(s);
    584  s += p;
    585  *s2 = vld1_s16(s);
    586  s += p;
    587  *s3 = vld1_s16(s);
    588  s += p;
    589  *s4 = vld1_s16(s);
    590 }
    591 
    592 static inline void load_u16_4x5(const uint16_t *s, const ptrdiff_t p,
    593                                uint16x4_t *const s0, uint16x4_t *const s1,
    594                                uint16x4_t *const s2, uint16x4_t *const s3,
    595                                uint16x4_t *const s4) {
    596  *s0 = vld1_u16(s);
    597  s += p;
    598  *s1 = vld1_u16(s);
    599  s += p;
    600  *s2 = vld1_u16(s);
    601  s += p;
    602  *s3 = vld1_u16(s);
    603  s += p;
    604  *s4 = vld1_u16(s);
    605  s += p;
    606 }
    607 
    608 static inline void load_u8_8x5(const uint8_t *s, ptrdiff_t p,
    609                               uint8x8_t *const s0, uint8x8_t *const s1,
    610                               uint8x8_t *const s2, uint8x8_t *const s3,
    611                               uint8x8_t *const s4) {
    612  *s0 = vld1_u8(s);
    613  s += p;
    614  *s1 = vld1_u8(s);
    615  s += p;
    616  *s2 = vld1_u8(s);
    617  s += p;
    618  *s3 = vld1_u8(s);
    619  s += p;
    620  *s4 = vld1_u8(s);
    621 }
    622 
    623 static inline void load_u16_8x5(const uint16_t *s, const ptrdiff_t p,
    624                                uint16x8_t *const s0, uint16x8_t *const s1,
    625                                uint16x8_t *const s2, uint16x8_t *const s3,
    626                                uint16x8_t *const s4) {
    627  *s0 = vld1q_u16(s);
    628  s += p;
    629  *s1 = vld1q_u16(s);
    630  s += p;
    631  *s2 = vld1q_u16(s);
    632  s += p;
    633  *s3 = vld1q_u16(s);
    634  s += p;
    635  *s4 = vld1q_u16(s);
    636  s += p;
    637 }
    638 
    639 static inline void load_s16_4x4(const int16_t *s, ptrdiff_t p,
    640                                int16x4_t *const s0, int16x4_t *const s1,
    641                                int16x4_t *const s2, int16x4_t *const s3) {
    642  *s0 = vld1_s16(s);
    643  s += p;
    644  *s1 = vld1_s16(s);
    645  s += p;
    646  *s2 = vld1_s16(s);
    647  s += p;
    648  *s3 = vld1_s16(s);
    649 }
    650 
    651 static inline void load_s16_4x3(const int16_t *s, ptrdiff_t p,
    652                                int16x4_t *const s0, int16x4_t *const s1,
    653                                int16x4_t *const s2) {
    654  *s0 = vld1_s16(s);
    655  s += p;
    656  *s1 = vld1_s16(s);
    657  s += p;
    658  *s2 = vld1_s16(s);
    659 }
    660 
    661 static inline void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
    662                                const uint8x8_t s1, const uint8x8_t s2,
    663                                const uint8x8_t s3, const uint8x8_t s4,
    664                                const uint8x8_t s5, const uint8x8_t s6,
    665                                const uint8x8_t s7) {
    666  vst1_u8(s, s0);
    667  s += p;
    668  vst1_u8(s, s1);
    669  s += p;
    670  vst1_u8(s, s2);
    671  s += p;
    672  vst1_u8(s, s3);
    673  s += p;
    674  vst1_u8(s, s4);
    675  s += p;
    676  vst1_u8(s, s5);
    677  s += p;
    678  vst1_u8(s, s6);
    679  s += p;
    680  vst1_u8(s, s7);
    681 }
    682 
    683 static inline void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
    684                                const uint8x8_t s1, const uint8x8_t s2,
    685                                const uint8x8_t s3) {
    686  vst1_u8(s, s0);
    687  s += p;
    688  vst1_u8(s, s1);
    689  s += p;
    690  vst1_u8(s, s2);
    691  s += p;
    692  vst1_u8(s, s3);
    693 }
    694 
    695 static inline void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
    696                                 const uint8x16_t s1, const uint8x16_t s2,
    697                                 const uint8x16_t s3) {
    698  vst1q_u8(s, s0);
    699  s += p;
    700  vst1q_u8(s, s1);
    701  s += p;
    702  vst1q_u8(s, s2);
    703  s += p;
    704  vst1q_u8(s, s3);
    705 }
    706 
    707 static inline void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
    708                                 const uint16x8_t s0, const uint16x8_t s1,
    709                                 const uint16x8_t s2, const uint16x8_t s3,
    710                                 const uint16x8_t s4, const uint16x8_t s5,
    711                                 const uint16x8_t s6, const uint16x8_t s7) {
    712  vst1q_u16(s, s0);
    713  s += dst_stride;
    714  vst1q_u16(s, s1);
    715  s += dst_stride;
    716  vst1q_u16(s, s2);
    717  s += dst_stride;
    718  vst1q_u16(s, s3);
    719  s += dst_stride;
    720  vst1q_u16(s, s4);
    721  s += dst_stride;
    722  vst1q_u16(s, s5);
    723  s += dst_stride;
    724  vst1q_u16(s, s6);
    725  s += dst_stride;
    726  vst1q_u16(s, s7);
    727 }
    728 
    729 static inline void store_u16_4x3(uint16_t *s, ptrdiff_t dst_stride,
    730                                 const uint16x4_t s0, const uint16x4_t s1,
    731                                 const uint16x4_t s2) {
    732  vst1_u16(s, s0);
    733  s += dst_stride;
    734  vst1_u16(s, s1);
    735  s += dst_stride;
    736  vst1_u16(s, s2);
    737 }
    738 
    739 static inline void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
    740                                 const uint16x4_t s0, const uint16x4_t s1,
    741                                 const uint16x4_t s2, const uint16x4_t s3) {
    742  vst1_u16(s, s0);
    743  s += dst_stride;
    744  vst1_u16(s, s1);
    745  s += dst_stride;
    746  vst1_u16(s, s2);
    747  s += dst_stride;
    748  vst1_u16(s, s3);
    749 }
    750 
    751 static inline void store_u16_4x6(uint16_t *s, ptrdiff_t dst_stride,
    752                                 const uint16x4_t s0, const uint16x4_t s1,
    753                                 const uint16x4_t s2, const uint16x4_t s3,
    754                                 const uint16x4_t s4, const uint16x4_t s5) {
    755  vst1_u16(s, s0);
    756  s += dst_stride;
    757  vst1_u16(s, s1);
    758  s += dst_stride;
    759  vst1_u16(s, s2);
    760  s += dst_stride;
    761  vst1_u16(s, s3);
    762  s += dst_stride;
    763  vst1_u16(s, s4);
    764  s += dst_stride;
    765  vst1_u16(s, s5);
    766 }
    767 
    768 static inline void store_u16_4x12(uint16_t *s, ptrdiff_t dst_stride,
    769                                  const uint16x4_t s0, const uint16x4_t s1,
    770                                  const uint16x4_t s2, const uint16x4_t s3,
    771                                  const uint16x4_t s4, const uint16x4_t s5,
    772                                  const uint16x4_t s6, const uint16x4_t s7,
    773                                  const uint16x4_t s8, const uint16x4_t s9,
    774                                  const uint16x4_t s10, const uint16x4_t s11) {
    775  vst1_u16(s, s0);
    776  s += dst_stride;
    777  vst1_u16(s, s1);
    778  s += dst_stride;
    779  vst1_u16(s, s2);
    780  s += dst_stride;
    781  vst1_u16(s, s3);
    782  s += dst_stride;
    783  vst1_u16(s, s4);
    784  s += dst_stride;
    785  vst1_u16(s, s5);
    786  s += dst_stride;
    787  vst1_u16(s, s6);
    788  s += dst_stride;
    789  vst1_u16(s, s7);
    790  s += dst_stride;
    791  vst1_u16(s, s8);
    792  s += dst_stride;
    793  vst1_u16(s, s9);
    794  s += dst_stride;
    795  vst1_u16(s, s10);
    796  s += dst_stride;
    797  vst1_u16(s, s11);
    798  s += dst_stride;
    799 }
    800 
    801 static inline void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
    802                                 const uint16x8_t s0, const uint16x8_t s1) {
    803  vst1q_u16(s, s0);
    804  s += dst_stride;
    805  vst1q_u16(s, s1);
    806 }
    807 
    808 static inline void store_u16_8x3(uint16_t *s, ptrdiff_t dst_stride,
    809                                 const uint16x8_t s0, const uint16x8_t s1,
    810                                 const uint16x8_t s2) {
    811  vst1q_u16(s, s0);
    812  s += dst_stride;
    813  vst1q_u16(s, s1);
    814  s += dst_stride;
    815  vst1q_u16(s, s2);
    816 }
    817 
    818 static inline void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
    819                                 const uint16x8_t s0, const uint16x8_t s1,
    820                                 const uint16x8_t s2, const uint16x8_t s3) {
    821  vst1q_u16(s, s0);
    822  s += dst_stride;
    823  vst1q_u16(s, s1);
    824  s += dst_stride;
    825  vst1q_u16(s, s2);
    826  s += dst_stride;
    827  vst1q_u16(s, s3);
    828 }
    829 
    830 static inline void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
    831                                 const int16x8_t s0, const int16x8_t s1,
    832                                 const int16x8_t s2, const int16x8_t s3,
    833                                 const int16x8_t s4, const int16x8_t s5,
    834                                 const int16x8_t s6, const int16x8_t s7) {
    835  vst1q_s16(s, s0);
    836  s += dst_stride;
    837  vst1q_s16(s, s1);
    838  s += dst_stride;
    839  vst1q_s16(s, s2);
    840  s += dst_stride;
    841  vst1q_s16(s, s3);
    842  s += dst_stride;
    843  vst1q_s16(s, s4);
    844  s += dst_stride;
    845  vst1q_s16(s, s5);
    846  s += dst_stride;
    847  vst1q_s16(s, s6);
    848  s += dst_stride;
    849  vst1q_s16(s, s7);
    850 }
    851 
    852 static inline void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
    853                                 const int16x4_t s0, const int16x4_t s1,
    854                                 const int16x4_t s2, const int16x4_t s3) {
    855  vst1_s16(s, s0);
    856  s += dst_stride;
    857  vst1_s16(s, s1);
    858  s += dst_stride;
    859  vst1_s16(s, s2);
    860  s += dst_stride;
    861  vst1_s16(s, s3);
    862 }
    863 
    864 static inline void store_s16_4x8(int16_t *s, ptrdiff_t dst_stride,
    865                                 const int16x4_t s0, const int16x4_t s1,
    866                                 const int16x4_t s2, const int16x4_t s3,
    867                                 const int16x4_t s4, const int16x4_t s5,
    868                                 const int16x4_t s6, const int16x4_t s7) {
    869  vst1_s16(s, s0);
    870  s += dst_stride;
    871  vst1_s16(s, s1);
    872  s += dst_stride;
    873  vst1_s16(s, s2);
    874  s += dst_stride;
    875  vst1_s16(s, s3);
    876  s += dst_stride;
    877  vst1_s16(s, s4);
    878  s += dst_stride;
    879  vst1_s16(s, s5);
    880  s += dst_stride;
    881  vst1_s16(s, s6);
    882  s += dst_stride;
    883  vst1_s16(s, s7);
    884 }
    885 
    886 static inline void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
    887                                 const int16x8_t s0, const int16x8_t s1,
    888                                 const int16x8_t s2, const int16x8_t s3) {
    889  vst1q_s16(s, s0);
    890  s += dst_stride;
    891  vst1q_s16(s, s1);
    892  s += dst_stride;
    893  vst1q_s16(s, s2);
    894  s += dst_stride;
    895  vst1q_s16(s, s3);
    896 }
    897 
    898 static inline void store_s16_8x2(int16_t *s, ptrdiff_t dst_stride,
    899                                 const int16x8_t s0, const int16x8_t s1) {
    900  vst1q_s16(s, s0);
    901  s += dst_stride;
    902  vst1q_s16(s, s1);
    903 }
    904 
    905 static inline void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
    906                                uint8x8_t *const s0, uint8x8_t *const s1,
    907                                uint8x8_t *const s2, uint8x8_t *const s3,
    908                                uint8x8_t *const s4, uint8x8_t *const s5,
    909                                uint8x8_t *const s6, uint8x8_t *const s7,
    910                                uint8x8_t *const s8, uint8x8_t *const s9,
    911                                uint8x8_t *const s10) {
    912  *s0 = vld1_u8(s);
    913  s += p;
    914  *s1 = vld1_u8(s);
    915  s += p;
    916  *s2 = vld1_u8(s);
    917  s += p;
    918  *s3 = vld1_u8(s);
    919  s += p;
    920  *s4 = vld1_u8(s);
    921  s += p;
    922  *s5 = vld1_u8(s);
    923  s += p;
    924  *s6 = vld1_u8(s);
    925  s += p;
    926  *s7 = vld1_u8(s);
    927  s += p;
    928  *s8 = vld1_u8(s);
    929  s += p;
    930  *s9 = vld1_u8(s);
    931  s += p;
    932  *s10 = vld1_u8(s);
    933 }
    934 
    935 static inline void load_s16_8x10(const int16_t *s, ptrdiff_t p,
    936                                 int16x8_t *const s0, int16x8_t *const s1,
    937                                 int16x8_t *const s2, int16x8_t *const s3,
    938                                 int16x8_t *const s4, int16x8_t *const s5,
    939                                 int16x8_t *const s6, int16x8_t *const s7,
    940                                 int16x8_t *const s8, int16x8_t *const s9) {
    941  *s0 = vld1q_s16(s);
    942  s += p;
    943  *s1 = vld1q_s16(s);
    944  s += p;
    945  *s2 = vld1q_s16(s);
    946  s += p;
    947  *s3 = vld1q_s16(s);
    948  s += p;
    949  *s4 = vld1q_s16(s);
    950  s += p;
    951  *s5 = vld1q_s16(s);
    952  s += p;
    953  *s6 = vld1q_s16(s);
    954  s += p;
    955  *s7 = vld1q_s16(s);
    956  s += p;
    957  *s8 = vld1q_s16(s);
    958  s += p;
    959  *s9 = vld1q_s16(s);
    960 }
    961 
    962 static inline void load_s16_8x11(const int16_t *s, ptrdiff_t p,
    963                                 int16x8_t *const s0, int16x8_t *const s1,
    964                                 int16x8_t *const s2, int16x8_t *const s3,
    965                                 int16x8_t *const s4, int16x8_t *const s5,
    966                                 int16x8_t *const s6, int16x8_t *const s7,
    967                                 int16x8_t *const s8, int16x8_t *const s9,
    968                                 int16x8_t *const s10) {
    969  *s0 = vld1q_s16(s);
    970  s += p;
    971  *s1 = vld1q_s16(s);
    972  s += p;
    973  *s2 = vld1q_s16(s);
    974  s += p;
    975  *s3 = vld1q_s16(s);
    976  s += p;
    977  *s4 = vld1q_s16(s);
    978  s += p;
    979  *s5 = vld1q_s16(s);
    980  s += p;
    981  *s6 = vld1q_s16(s);
    982  s += p;
    983  *s7 = vld1q_s16(s);
    984  s += p;
    985  *s8 = vld1q_s16(s);
    986  s += p;
    987  *s9 = vld1q_s16(s);
    988  s += p;
    989  *s10 = vld1q_s16(s);
    990 }
    991 
    992 static inline void load_s16_8x12(const int16_t *s, ptrdiff_t p,
    993                                 int16x8_t *const s0, int16x8_t *const s1,
    994                                 int16x8_t *const s2, int16x8_t *const s3,
    995                                 int16x8_t *const s4, int16x8_t *const s5,
    996                                 int16x8_t *const s6, int16x8_t *const s7,
    997                                 int16x8_t *const s8, int16x8_t *const s9,
    998                                 int16x8_t *const s10, int16x8_t *const s11) {
    999  *s0 = vld1q_s16(s);
   1000  s += p;
   1001  *s1 = vld1q_s16(s);
   1002  s += p;
   1003  *s2 = vld1q_s16(s);
   1004  s += p;
   1005  *s3 = vld1q_s16(s);
   1006  s += p;
   1007  *s4 = vld1q_s16(s);
   1008  s += p;
   1009  *s5 = vld1q_s16(s);
   1010  s += p;
   1011  *s6 = vld1q_s16(s);
   1012  s += p;
   1013  *s7 = vld1q_s16(s);
   1014  s += p;
   1015  *s8 = vld1q_s16(s);
   1016  s += p;
   1017  *s9 = vld1q_s16(s);
   1018  s += p;
   1019  *s10 = vld1q_s16(s);
   1020  s += p;
   1021  *s11 = vld1q_s16(s);
   1022 }
   1023 
   1024 static inline void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
   1025                                 uint16x8_t *const s0, uint16x8_t *const s1,
   1026                                 uint16x8_t *const s2, uint16x8_t *const s3,
   1027                                 uint16x8_t *const s4, uint16x8_t *const s5,
   1028                                 uint16x8_t *const s6, uint16x8_t *const s7,
   1029                                 uint16x8_t *const s8, uint16x8_t *const s9,
   1030                                 uint16x8_t *const s10) {
   1031  *s0 = vld1q_u16(s);
   1032  s += p;
   1033  *s1 = vld1q_u16(s);
   1034  s += p;
   1035  *s2 = vld1q_u16(s);
   1036  s += p;
   1037  *s3 = vld1q_u16(s);
   1038  s += p;
   1039  *s4 = vld1q_u16(s);
   1040  s += p;
   1041  *s5 = vld1q_u16(s);
   1042  s += p;
   1043  *s6 = vld1q_u16(s);
   1044  s += p;
   1045  *s7 = vld1q_u16(s);
   1046  s += p;
   1047  *s8 = vld1q_u16(s);
   1048  s += p;
   1049  *s9 = vld1q_u16(s);
   1050  s += p;
   1051  *s10 = vld1q_u16(s);
   1052 }
   1053 
   1054 static inline void load_s16_8x8(const int16_t *s, ptrdiff_t p,
   1055                                int16x8_t *const s0, int16x8_t *const s1,
   1056                                int16x8_t *const s2, int16x8_t *const s3,
   1057                                int16x8_t *const s4, int16x8_t *const s5,
   1058                                int16x8_t *const s6, int16x8_t *const s7) {
   1059  *s0 = vld1q_s16(s);
   1060  s += p;
   1061  *s1 = vld1q_s16(s);
   1062  s += p;
   1063  *s2 = vld1q_s16(s);
   1064  s += p;
   1065  *s3 = vld1q_s16(s);
   1066  s += p;
   1067  *s4 = vld1q_s16(s);
   1068  s += p;
   1069  *s5 = vld1q_s16(s);
   1070  s += p;
   1071  *s6 = vld1q_s16(s);
   1072  s += p;
   1073  *s7 = vld1q_s16(s);
   1074 }
   1075 
   1076 static inline void load_u16_8x7(const uint16_t *s, ptrdiff_t p,
   1077                                uint16x8_t *const s0, uint16x8_t *const s1,
   1078                                uint16x8_t *const s2, uint16x8_t *const s3,
   1079                                uint16x8_t *const s4, uint16x8_t *const s5,
   1080                                uint16x8_t *const s6) {
   1081  *s0 = vld1q_u16(s);
   1082  s += p;
   1083  *s1 = vld1q_u16(s);
   1084  s += p;
   1085  *s2 = vld1q_u16(s);
   1086  s += p;
   1087  *s3 = vld1q_u16(s);
   1088  s += p;
   1089  *s4 = vld1q_u16(s);
   1090  s += p;
   1091  *s5 = vld1q_u16(s);
   1092  s += p;
   1093  *s6 = vld1q_u16(s);
   1094 }
   1095 
   1096 static inline void load_s16_8x7(const int16_t *s, ptrdiff_t p,
   1097                                int16x8_t *const s0, int16x8_t *const s1,
   1098                                int16x8_t *const s2, int16x8_t *const s3,
   1099                                int16x8_t *const s4, int16x8_t *const s5,
   1100                                int16x8_t *const s6) {
   1101  *s0 = vld1q_s16(s);
   1102  s += p;
   1103  *s1 = vld1q_s16(s);
   1104  s += p;
   1105  *s2 = vld1q_s16(s);
   1106  s += p;
   1107  *s3 = vld1q_s16(s);
   1108  s += p;
   1109  *s4 = vld1q_s16(s);
   1110  s += p;
   1111  *s5 = vld1q_s16(s);
   1112  s += p;
   1113  *s6 = vld1q_s16(s);
   1114 }
   1115 
   1116 static inline void load_s16_8x6(const int16_t *s, ptrdiff_t p,
   1117                                int16x8_t *const s0, int16x8_t *const s1,
   1118                                int16x8_t *const s2, int16x8_t *const s3,
   1119                                int16x8_t *const s4, int16x8_t *const s5) {
   1120  *s0 = vld1q_s16(s);
   1121  s += p;
   1122  *s1 = vld1q_s16(s);
   1123  s += p;
   1124  *s2 = vld1q_s16(s);
   1125  s += p;
   1126  *s3 = vld1q_s16(s);
   1127  s += p;
   1128  *s4 = vld1q_s16(s);
   1129  s += p;
   1130  *s5 = vld1q_s16(s);
   1131 }
   1132 
   1133 static inline void load_s16_8x5(const int16_t *s, ptrdiff_t p,
   1134                                int16x8_t *const s0, int16x8_t *const s1,
   1135                                int16x8_t *const s2, int16x8_t *const s3,
   1136                                int16x8_t *const s4) {
   1137  *s0 = vld1q_s16(s);
   1138  s += p;
   1139  *s1 = vld1q_s16(s);
   1140  s += p;
   1141  *s2 = vld1q_s16(s);
   1142  s += p;
   1143  *s3 = vld1q_s16(s);
   1144  s += p;
   1145  *s4 = vld1q_s16(s);
   1146 }
   1147 
   1148 static inline void load_s16_8x4(const int16_t *s, ptrdiff_t p,
   1149                                int16x8_t *const s0, int16x8_t *const s1,
   1150                                int16x8_t *const s2, int16x8_t *const s3) {
   1151  *s0 = vld1q_s16(s);
   1152  s += p;
   1153  *s1 = vld1q_s16(s);
   1154  s += p;
   1155  *s2 = vld1q_s16(s);
   1156  s += p;
   1157  *s3 = vld1q_s16(s);
   1158 }
   1159 
   1160 static inline void load_s16_8x3(const int16_t *s, ptrdiff_t p,
   1161                                int16x8_t *const s0, int16x8_t *const s1,
   1162                                int16x8_t *const s2) {
   1163  *s0 = vld1q_s16(s);
   1164  s += p;
   1165  *s1 = vld1q_s16(s);
   1166  s += p;
   1167  *s2 = vld1q_s16(s);
   1168 }
   1169 
// Insert a 32-bit value loaded from a possibly-unaligned address `p` into
// lane `lane` of an existing vector `v`. `lane` must be an immediate.
#if AOM_ARCH_AARCH64
// AArch64 permits unaligned lane loads directly.
#define load_unaligned_u32_2x1_lane(v, p, lane)              \
 do {                                                       \
   (v) = vld1_lane_u32((const uint32_t *)(p), (v), (lane)); \
 } while (0)

#define load_unaligned_u32_4x1_lane(v, p, lane)               \
 do {                                                        \
   (v) = vld1q_lane_u32((const uint32_t *)(p), (v), (lane)); \
 } while (0)
#else
// On 32-bit Arm go through memcpy to avoid a potentially-misaligned vector
// lane load.
#define load_unaligned_u32_2x1_lane(v, p, lane) \
 do {                                          \
   uint32_t tmp;                               \
   memcpy(&tmp, (p), 4);                       \
   (v) = vset_lane_u32(tmp, (v), (lane));      \
 } while (0)

#define load_unaligned_u32_4x1_lane(v, p, lane) \
 do {                                          \
   uint32_t tmp;                               \
   memcpy(&tmp, (p), 4);                       \
   (v) = vsetq_lane_u32(tmp, (v), (lane));     \
 } while (0)
#endif
   1195 
   1196 // Load 2 sets of 4 bytes when alignment is not guaranteed.
   1197 static inline uint8x8_t load_unaligned_u8(const uint8_t *buf,
   1198                                          ptrdiff_t stride) {
   1199  uint32_t a;
   1200  memcpy(&a, buf, 4);
   1201  buf += stride;
   1202  uint32x2_t a_u32 = vdup_n_u32(a);
   1203  memcpy(&a, buf, 4);
   1204  a_u32 = vset_lane_u32(a, a_u32, 1);
   1205  return vreinterpret_u8_u32(a_u32);
   1206 }
   1207 
   1208 // Load 4 sets of 4 bytes when alignment is not guaranteed.
   1209 static inline uint8x16_t load_unaligned_u8q(const uint8_t *buf,
   1210                                            ptrdiff_t stride) {
   1211  uint32_t a;
   1212  uint32x4_t a_u32;
   1213  if (stride == 4) return vld1q_u8(buf);
   1214  memcpy(&a, buf, 4);
   1215  buf += stride;
   1216  a_u32 = vdupq_n_u32(a);
   1217  memcpy(&a, buf, 4);
   1218  buf += stride;
   1219  a_u32 = vsetq_lane_u32(a, a_u32, 1);
   1220  memcpy(&a, buf, 4);
   1221  buf += stride;
   1222  a_u32 = vsetq_lane_u32(a, a_u32, 2);
   1223  memcpy(&a, buf, 4);
   1224  a_u32 = vsetq_lane_u32(a, a_u32, 3);
   1225  return vreinterpretq_u8_u32(a_u32);
   1226 }
   1227 
   1228 static inline uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf,
   1229                                              ptrdiff_t stride) {
   1230  uint16_t a;
   1231  uint16x4_t a_u16;
   1232 
   1233  memcpy(&a, buf, 2);
   1234  buf += stride;
   1235  a_u16 = vdup_n_u16(a);
   1236  memcpy(&a, buf, 2);
   1237  a_u16 = vset_lane_u16(a, a_u16, 1);
   1238  return vreinterpret_u8_u16(a_u16);
   1239 }
   1240 
   1241 static inline uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
   1242  uint32_t a;
   1243  uint32x2_t a_u32;
   1244 
   1245  memcpy(&a, buf, 4);
   1246  a_u32 = vdup_n_u32(0);
   1247  a_u32 = vset_lane_u32(a, a_u32, 0);
   1248  return vreinterpret_u8_u32(a_u32);
   1249 }
   1250 
   1251 static inline uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) {
   1252  uint32_t a;
   1253  uint32x2_t a_u32;
   1254 
   1255  memcpy(&a, buf, 4);
   1256  a_u32 = vdup_n_u32(a);
   1257  return vreinterpret_u8_u32(a_u32);
   1258 }
   1259 
   1260 static inline uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) {
   1261  uint16_t a;
   1262  uint16x4_t a_u32;
   1263 
   1264  memcpy(&a, buf, 2);
   1265  a_u32 = vdup_n_u16(a);
   1266  return vreinterpret_u8_u16(a_u32);
   1267 }
   1268 
   1269 static inline uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf,
   1270                                              ptrdiff_t stride) {
   1271  uint32_t a;
   1272  uint32x2_t a_u32;
   1273 
   1274  memcpy(&a, buf, 4);
   1275  buf += stride;
   1276  a_u32 = vdup_n_u32(a);
   1277  memcpy(&a, buf, 4);
   1278  a_u32 = vset_lane_u32(a, a_u32, 1);
   1279  return vreinterpret_u8_u32(a_u32);
   1280 }
   1281 
   1282 static inline void load_unaligned_u8_4x4(const uint8_t *buf, ptrdiff_t stride,
   1283                                         uint8x8_t *tu0, uint8x8_t *tu1) {
   1284  *tu0 = load_unaligned_u8_4x2(buf, stride);
   1285  buf += 2 * stride;
   1286  *tu1 = load_unaligned_u8_4x2(buf, stride);
   1287 }
   1288 
   1289 static inline void load_unaligned_u8_3x8(const uint8_t *buf, ptrdiff_t stride,
   1290                                         uint8x8_t *tu0, uint8x8_t *tu1,
   1291                                         uint8x8_t *tu2) {
   1292  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
   1293  buf += 4 * stride;
   1294  *tu2 = load_unaligned_u8_4x2(buf, stride);
   1295 }
   1296 
   1297 static inline void load_unaligned_u8_4x8(const uint8_t *buf, ptrdiff_t stride,
   1298                                         uint8x8_t *tu0, uint8x8_t *tu1,
   1299                                         uint8x8_t *tu2, uint8x8_t *tu3) {
   1300  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
   1301  buf += 4 * stride;
   1302  load_unaligned_u8_4x4(buf, stride, tu2, tu3);
   1303 }
   1304 
   1305 static inline void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
   1306                                uint8x16_t *const s0, uint8x16_t *const s1,
   1307                                uint8x16_t *const s2, uint8x16_t *const s3,
   1308                                uint8x16_t *const s4, uint8x16_t *const s5,
   1309                                uint8x16_t *const s6, uint8x16_t *const s7) {
   1310  *s0 = vld1q_u8(s);
   1311  s += p;
   1312  *s1 = vld1q_u8(s);
   1313  s += p;
   1314  *s2 = vld1q_u8(s);
   1315  s += p;
   1316  *s3 = vld1q_u8(s);
   1317  s += p;
   1318  *s4 = vld1q_u8(s);
   1319  s += p;
   1320  *s5 = vld1q_u8(s);
   1321  s += p;
   1322  *s6 = vld1q_u8(s);
   1323  s += p;
   1324  *s7 = vld1q_u8(s);
   1325 }
   1326 
   1327 static inline void load_u8_16x5(const uint8_t *s, ptrdiff_t p,
   1328                                uint8x16_t *const s0, uint8x16_t *const s1,
   1329                                uint8x16_t *const s2, uint8x16_t *const s3,
   1330                                uint8x16_t *const s4) {
   1331  *s0 = vld1q_u8(s);
   1332  s += p;
   1333  *s1 = vld1q_u8(s);
   1334  s += p;
   1335  *s2 = vld1q_u8(s);
   1336  s += p;
   1337  *s3 = vld1q_u8(s);
   1338  s += p;
   1339  *s4 = vld1q_u8(s);
   1340 }
   1341 
   1342 static inline void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
   1343                                uint8x16_t *const s0, uint8x16_t *const s1,
   1344                                uint8x16_t *const s2, uint8x16_t *const s3) {
   1345  *s0 = vld1q_u8(s);
   1346  s += p;
   1347  *s1 = vld1q_u8(s);
   1348  s += p;
   1349  *s2 = vld1q_u8(s);
   1350  s += p;
   1351  *s3 = vld1q_u8(s);
   1352 }
   1353 
   1354 static inline void load_u8_16x3(const uint8_t *s, ptrdiff_t p,
   1355                                uint8x16_t *const s0, uint8x16_t *const s1,
   1356                                uint8x16_t *const s2) {
   1357  *s0 = vld1q_u8(s);
   1358  s += p;
   1359  *s1 = vld1q_u8(s);
   1360  s += p;
   1361  *s2 = vld1q_u8(s);
   1362 }
   1363 
   1364 static inline void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
   1365                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
   1366                                uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
   1367                                uint16x8_t *s6, uint16x8_t *s7) {
   1368  *s0 = vld1q_u16(s);
   1369  s += p;
   1370  *s1 = vld1q_u16(s);
   1371  s += p;
   1372  *s2 = vld1q_u16(s);
   1373  s += p;
   1374  *s3 = vld1q_u16(s);
   1375  s += p;
   1376  *s4 = vld1q_u16(s);
   1377  s += p;
   1378  *s5 = vld1q_u16(s);
   1379  s += p;
   1380  *s6 = vld1q_u16(s);
   1381  s += p;
   1382  *s7 = vld1q_u16(s);
   1383 }
   1384 
   1385 static inline void load_u16_16x4(const uint16_t *s, ptrdiff_t p,
   1386                                 uint16x8_t *const s0, uint16x8_t *const s1,
   1387                                 uint16x8_t *const s2, uint16x8_t *const s3,
   1388                                 uint16x8_t *const s4, uint16x8_t *const s5,
   1389                                 uint16x8_t *const s6, uint16x8_t *const s7) {
   1390  *s0 = vld1q_u16(s);
   1391  *s1 = vld1q_u16(s + 8);
   1392  s += p;
   1393  *s2 = vld1q_u16(s);
   1394  *s3 = vld1q_u16(s + 8);
   1395  s += p;
   1396  *s4 = vld1q_u16(s);
   1397  *s5 = vld1q_u16(s + 8);
   1398  s += p;
   1399  *s6 = vld1q_u16(s);
   1400  *s7 = vld1q_u16(s + 8);
   1401 }
   1402 
   1403 static inline uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf,
   1404                                                ptrdiff_t stride) {
   1405  uint32_t a;
   1406  uint32x2_t a_u32;
   1407 
   1408  memcpy(&a, buf, 4);
   1409  buf += stride;
   1410  a_u32 = vdup_n_u32(a);
   1411  memcpy(&a, buf, 4);
   1412  a_u32 = vset_lane_u32(a, a_u32, 1);
   1413  return vreinterpret_u16_u32(a_u32);
   1414 }
   1415 
   1416 static inline uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) {
   1417  uint64_t a;
   1418  uint64x1_t a_u64 = vdup_n_u64(0);
   1419  memcpy(&a, buf, 8);
   1420  a_u64 = vset_lane_u64(a, a_u64, 0);
   1421  return vreinterpret_u16_u64(a_u64);
   1422 }
   1423 
   1424 static inline uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf,
   1425                                                ptrdiff_t stride) {
   1426  uint64_t a;
   1427  uint64x2_t a_u64;
   1428 
   1429  memcpy(&a, buf, 8);
   1430  buf += stride;
   1431  a_u64 = vdupq_n_u64(0);
   1432  a_u64 = vsetq_lane_u64(a, a_u64, 0);
   1433  memcpy(&a, buf, 8);
   1434  buf += stride;
   1435  a_u64 = vsetq_lane_u64(a, a_u64, 1);
   1436  return vreinterpretq_u16_u64(a_u64);
   1437 }
   1438 
   1439 static inline int16x8_t load_unaligned_s16_4x2(const int16_t *buf,
   1440                                               ptrdiff_t stride) {
   1441  int64_t a;
   1442  int64x2_t a_s64;
   1443  memcpy(&a, buf, 8);
   1444  buf += stride;
   1445  a_s64 = vdupq_n_s64(0);
   1446  a_s64 = vsetq_lane_s64(a, a_s64, 0);
   1447  memcpy(&a, buf, 8);
   1448  buf += stride;
   1449  a_s64 = vsetq_lane_s64(a, a_s64, 1);
   1450  return vreinterpretq_s16_s64(a_s64);
   1451 }
   1452 
   1453 static inline void load_unaligned_u16_4x4(const uint16_t *buf, ptrdiff_t stride,
   1454                                          uint16x8_t *tu0, uint16x8_t *tu1) {
   1455  *tu0 = load_unaligned_u16_4x2(buf, stride);
   1456  buf += 2 * stride;
   1457  *tu1 = load_unaligned_u16_4x2(buf, stride);
   1458 }
   1459 
   1460 static inline void load_s32_4x4(int32_t *s, ptrdiff_t p, int32x4_t *s1,
   1461                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
   1462  *s1 = vld1q_s32(s);
   1463  s += p;
   1464  *s2 = vld1q_s32(s);
   1465  s += p;
   1466  *s3 = vld1q_s32(s);
   1467  s += p;
   1468  *s4 = vld1q_s32(s);
   1469 }
   1470 
   1471 static inline void store_s32_4x4(int32_t *s, ptrdiff_t p, int32x4_t s1,
   1472                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
   1473  vst1q_s32(s, s1);
   1474  s += p;
   1475  vst1q_s32(s, s2);
   1476  s += p;
   1477  vst1q_s32(s, s3);
   1478  s += p;
   1479  vst1q_s32(s, s4);
   1480 }
   1481 
   1482 static inline void load_u32_4x4(uint32_t *s, ptrdiff_t p, uint32x4_t *s1,
   1483                                uint32x4_t *s2, uint32x4_t *s3,
   1484                                uint32x4_t *s4) {
   1485  *s1 = vld1q_u32(s);
   1486  s += p;
   1487  *s2 = vld1q_u32(s);
   1488  s += p;
   1489  *s3 = vld1q_u32(s);
   1490  s += p;
   1491  *s4 = vld1q_u32(s);
   1492 }
   1493 
   1494 static inline void store_u32_4x4(uint32_t *s, ptrdiff_t p, uint32x4_t s1,
   1495                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
   1496  vst1q_u32(s, s1);
   1497  s += p;
   1498  vst1q_u32(s, s2);
   1499  s += p;
   1500  vst1q_u32(s, s3);
   1501  s += p;
   1502  vst1q_u32(s, s4);
   1503 }
   1504 
   1505 static inline int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
   1506  const int32x4_t v0 = vld1q_s32(buf);
   1507  const int32x4_t v1 = vld1q_s32(buf + 4);
   1508  const int16x4_t s0 = vmovn_s32(v0);
   1509  const int16x4_t s1 = vmovn_s32(v1);
   1510  return vcombine_s16(s0, s1);
   1511 }
   1512 
   1513 static inline void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
   1514  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
   1515  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
   1516  vst1q_s32(buf, v0);
   1517  vst1q_s32(buf + 4, v1);
   1518 }
   1519 
   1520 static inline void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) {
   1521  const int32x4_t v0 = vmovl_s16(a);
   1522  vst1q_s32(buf, v0);
   1523 }
   1524 
   1525 static inline uint8x8_t load_u8_gather_s16_x8(const uint8_t *src,
   1526                                              int16x8_t indices) {
   1527  // Recent Clang and GCC versions correctly identify that this zero-broadcast
   1528  // is redundant. Alternatively we could load and broadcast the zeroth element
   1529  // and then replace the other lanes, however this is slower than loading a
   1530  // single element without broadcast on some micro-architectures.
   1531  uint8x8_t ret = vdup_n_u8(0);
   1532  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 0), ret, 0);
   1533  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 1), ret, 1);
   1534  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 2), ret, 2);
   1535  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 3), ret, 3);
   1536  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 0), ret, 4);
   1537  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 1), ret, 5);
   1538  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 2), ret, 6);
   1539  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 3), ret, 7);
   1540  return ret;
   1541 }
   1542 
// Partial-vector stores: extract one 16/32/64-bit lane and write it to
// memory with memcpy so no alignment is required.
// The `lane` parameter here must be an immediate.
#define store_u8_2x1_lane(dst, src, lane)                       \
 do {                                                          \
   uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
   memcpy(dst, &a, 2);                                         \
 } while (0)

#define store_u8_4x1_lane(dst, src, lane)                       \
 do {                                                          \
   uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
   memcpy(dst, &a, 4);                                         \
 } while (0)

#define store_u16_2x1_lane(dst, src, lane)                       \
 do {                                                           \
   uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
   memcpy(dst, &a, 4);                                          \
 } while (0)

#define store_u16_4x1_lane(dst, src, lane)                         \
 do {                                                             \
   uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
   memcpy(dst, &a, 8);                                            \
 } while (0)

#define store_s16_4x1_lane(dst, src, lane)                        \
 do {                                                            \
   int64_t a = vgetq_lane_s64(vreinterpretq_s64_s16(src), lane); \
   memcpy(dst, &a, 8);                                           \
 } while (0)
   1573 
// Store the low 16-bits from a single vector.
// Thin wrapper over store_u8_2x1_lane with lane fixed at 0.
static inline void store_u8_2x1(uint8_t *dst, const uint8x8_t src) {
 store_u8_2x1_lane(dst, src, 0);
}
   1578 
// Store the low 32-bits from a single vector.
// Thin wrapper over store_u8_4x1_lane with lane fixed at 0.
static inline void store_u8_4x1(uint8_t *dst, const uint8x8_t src) {
 store_u8_4x1_lane(dst, src, 0);
}
   1583 
   1584 // Store two blocks of 16-bits from a single vector.
   1585 static inline void store_u8x2_strided_x2(uint8_t *dst, ptrdiff_t dst_stride,
   1586                                         uint8x8_t src) {
   1587  store_u8_2x1_lane(dst, src, 0);
   1588  dst += dst_stride;
   1589  store_u8_2x1_lane(dst, src, 1);
   1590 }
   1591 
   1592 static inline void store_u8x2_strided_x4(uint8_t *dst, ptrdiff_t dst_stride,
   1593                                         uint8x8_t src) {
   1594  store_u8_2x1_lane(dst, src, 0);
   1595  dst += dst_stride;
   1596  store_u8_2x1_lane(dst, src, 1);
   1597  dst += dst_stride;
   1598  store_u8_2x1_lane(dst, src, 2);
   1599  dst += dst_stride;
   1600  store_u8_2x1_lane(dst, src, 3);
   1601 }
   1602 
   1603 // Store two blocks of 32-bits from a single vector.
   1604 static inline void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride,
   1605                                         uint8x8_t src) {
   1606  store_u8_4x1_lane(dst, src, 0);
   1607  dst += stride;
   1608  store_u8_4x1_lane(dst, src, 1);
   1609 }
   1610 
   1611 // Store four blocks of 32-bits from a single vector.
   1612 static inline void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride,
   1613                                         uint8x16_t src) {
   1614  store_u8_4x1_lane(dst, vget_low_u8(src), 0);
   1615  dst += stride;
   1616  store_u8_4x1_lane(dst, vget_low_u8(src), 1);
   1617  dst += stride;
   1618  store_u8_4x1_lane(dst, vget_high_u8(src), 0);
   1619  dst += stride;
   1620  store_u8_4x1_lane(dst, vget_high_u8(src), 1);
   1621 }
   1622 
// Store the low 32-bits from a single vector.
// Thin wrapper over store_u16_2x1_lane with lane fixed at 0.
static inline void store_u16_2x1(uint16_t *dst, const uint16x4_t src) {
 store_u16_2x1_lane(dst, src, 0);
}
   1627 
   1628 // Store two blocks of 32-bits from a single vector.
   1629 static inline void store_u16x2_strided_x2(uint16_t *dst, ptrdiff_t dst_stride,
   1630                                          uint16x4_t src) {
   1631  store_u16_2x1_lane(dst, src, 0);
   1632  dst += dst_stride;
   1633  store_u16_2x1_lane(dst, src, 1);
   1634 }
   1635 
   1636 // Store two blocks of 64-bits from a single vector.
   1637 static inline void store_u16x4_strided_x2(uint16_t *dst, ptrdiff_t dst_stride,
   1638                                          uint16x8_t src) {
   1639  store_u16_4x1_lane(dst, src, 0);
   1640  dst += dst_stride;
   1641  store_u16_4x1_lane(dst, src, 1);
   1642 }
   1643 
   1644 // Store two blocks of 64-bits from a single vector.
   1645 static inline void store_s16x4_strided_x2(int16_t *dst, ptrdiff_t dst_stride,
   1646                                          int16x8_t src) {
   1647  store_s16_4x1_lane(dst, src, 0);
   1648  dst += dst_stride;
   1649  store_s16_4x1_lane(dst, src, 1);
   1650 }
   1651 
   1652 #undef store_u8_2x1_lane
   1653 #undef store_u8_4x1_lane
   1654 #undef store_u16_2x1_lane
   1655 #undef store_u16_4x1_lane
   1656 #undef store_s16_4x1_lane
   1657 
   1658 #endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_