tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

transpose_neon.h (58345B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
     13 #define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
     14 
     15 #include <arm_neon.h>
     16 
     17 #include "aom_dsp/aom_dsp_common.h"  // For AOM_FORCE_INLINE.
     18 #include "config/aom_config.h"
     19 
     20 static inline void transpose_concat_elems_u8_4x4(uint8x8_t a0, uint8x8_t a1,
     21                                                 uint8x8_t a2, uint8x8_t a3,
     22                                                 uint8x16_t *b) {
     23  // Transpose 8-bit elements and concatenate result rows as follows:
     24  // a0: 00, 01, 02, 03, XX, XX, XX, XX
     25  // a1: 10, 11, 12, 13, XX, XX, XX, XX
     26  // a2: 20, 21, 22, 23, XX, XX, XX, XX
     27  // a3: 30, 31, 32, 33, XX, XX, XX, XX
     28  //
     29  // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
     30 
     31  uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
     32  uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
     33  uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
     34  uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
     35 
     36  uint8x16_t a02 = vzipq_u8(a0q, a2q).val[0];
     37  uint8x16_t a13 = vzipq_u8(a1q, a3q).val[0];
     38 
     39  *b = vzipq_u8(a02, a13).val[0];
     40 }
     41 
     42 static inline void transpose_concat_elems_u8_8x4(uint8x8_t a0, uint8x8_t a1,
     43                                                 uint8x8_t a2, uint8x8_t a3,
     44                                                 uint8x16_t *b0,
     45                                                 uint8x16_t *b1) {
     46  // Transpose 8-bit elements and concatenate result rows as follows:
     47  // a0: 00, 01, 02, 03, 04, 05, 06, 07
     48  // a1: 10, 11, 12, 13, 14, 15, 16, 17
     49  // a2: 20, 21, 22, 23, 24, 25, 26, 27
     50  // a3: 30, 31, 32, 33, 34, 35, 36, 37
     51  //
     52  // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
     53  // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
     54 
     55  uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
     56  uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
     57  uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
     58  uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
     59 
     60  uint8x16_t a02 = vzipq_u8(a0q, a2q).val[0];
     61  uint8x16_t a13 = vzipq_u8(a1q, a3q).val[0];
     62 
     63  uint8x16x2_t a0123 = vzipq_u8(a02, a13);
     64 
     65  *b0 = a0123.val[0];
     66  *b1 = a0123.val[1];
     67 }
     68 
     69 static inline void transpose_concat_elems_s8_4x4(int8x8_t a0, int8x8_t a1,
     70                                                 int8x8_t a2, int8x8_t a3,
     71                                                 int8x16_t *b) {
     72  // Transpose 8-bit elements and concatenate result rows as follows:
     73  // a0: 00, 01, 02, 03, XX, XX, XX, XX
     74  // a1: 10, 11, 12, 13, XX, XX, XX, XX
     75  // a2: 20, 21, 22, 23, XX, XX, XX, XX
     76  // a3: 30, 31, 32, 33, XX, XX, XX, XX
     77  //
     78  // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
     79 
     80  int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
     81  int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
     82  int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
     83  int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
     84 
     85  int8x16_t a02 = vzipq_s8(a0q, a2q).val[0];
     86  int8x16_t a13 = vzipq_s8(a1q, a3q).val[0];
     87 
     88  *b = vzipq_s8(a02, a13).val[0];
     89 }
     90 
     91 static inline void transpose_concat_elems_s8_8x4(int8x8_t a0, int8x8_t a1,
     92                                                 int8x8_t a2, int8x8_t a3,
     93                                                 int8x16_t *b0, int8x16_t *b1) {
     94  // Transpose 8-bit elements and concatenate result rows as follows:
     95  // a0: 00, 01, 02, 03, 04, 05, 06, 07
     96  // a1: 10, 11, 12, 13, 14, 15, 16, 17
     97  // a2: 20, 21, 22, 23, 24, 25, 26, 27
     98  // a3: 30, 31, 32, 33, 34, 35, 36, 37
     99  //
    100  // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
    101  // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
    102 
    103  int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
    104  int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
    105  int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
    106  int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
    107 
    108  int8x16_t a02 = vzipq_s8(a0q, a2q).val[0];
    109  int8x16_t a13 = vzipq_s8(a1q, a3q).val[0];
    110 
    111  int8x16x2_t a0123 = vzipq_s8(a02, a13);
    112 
    113  *b0 = a0123.val[0];
    114  *b1 = a0123.val[1];
    115 }
    116 
    117 static inline void transpose_concat_elems_s16_4x4(int16x4_t s0, int16x4_t s1,
    118                                                  int16x4_t s2, int16x4_t s3,
    119                                                  int16x8_t res[2]) {
    120  // Transpose 16-bit elements and concatenate result rows as follows:
    121  // s0: 00, 01, 02, 03
    122  // s1: 10, 11, 12, 13
    123  // s2: 20, 21, 22, 23
    124  // s3: 30, 31, 32, 33
    125  //
    126  // res[0]: 00 10 20 30 01 11 21 31
    127  // res[1]: 02 12 22 32 03 13 23 33
    128 
    129  int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0));
    130  int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0));
    131  int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0));
    132  int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0));
    133 
    134  int16x8_t s02 = vzipq_s16(s0q, s2q).val[0];
    135  int16x8_t s13 = vzipq_s16(s1q, s3q).val[0];
    136 
    137  int16x8x2_t s0123 = vzipq_s16(s02, s13);
    138 
    139  res[0] = s0123.val[0];
    140  res[1] = s0123.val[1];
    141 }
    142 
    143 static inline void transpose_concat_elems_s16_8x4(int16x8_t s0, int16x8_t s1,
    144                                                  int16x8_t s2, int16x8_t s3,
    145                                                  int16x8_t res[4]) {
    146  // Transpose 16-bit elements and concatenate result rows as follows:
    147  // s0: 00, 01, 02, 03, 04, 05, 06, 07
    148  // s1: 10, 11, 12, 13, 14, 15, 16, 17
    149  // s2: 20, 21, 22, 23, 24, 25, 26, 27
    150  // s3: 30, 31, 32, 33, 34, 35, 36, 37
    151  //
    152  // res[0]: 00 10 20 30 01 11 21 31
    153  // res[1]: 02 12 22 32 03 13 23 33
    154  // res[2]: 04 14 24 34 05 15 25 35
    155  // res[3]: 06 16 26 36 07 17 27 37
    156 
    157  int16x8x2_t s02 = vzipq_s16(s0, s2);
    158  int16x8x2_t s13 = vzipq_s16(s1, s3);
    159 
    160  int16x8x2_t s0123_lo = vzipq_s16(s02.val[0], s13.val[0]);
    161  int16x8x2_t s0123_hi = vzipq_s16(s02.val[1], s13.val[1]);
    162 
    163  res[0] = s0123_lo.val[0];
    164  res[1] = s0123_lo.val[1];
    165  res[2] = s0123_hi.val[0];
    166  res[3] = s0123_hi.val[1];
    167 }
    168 
    169 static inline void transpose_elems_u8_8x8(
    170    uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4,
    171    uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1,
    172    uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
    173    uint8x8_t *o7) {
    174  // Swap 8 bit elements. Goes from:
    175  // a0: 00 01 02 03 04 05 06 07
    176  // a1: 10 11 12 13 14 15 16 17
    177  // a2: 20 21 22 23 24 25 26 27
    178  // a3: 30 31 32 33 34 35 36 37
    179  // a4: 40 41 42 43 44 45 46 47
    180  // a5: 50 51 52 53 54 55 56 57
    181  // a6: 60 61 62 63 64 65 66 67
    182  // a7: 70 71 72 73 74 75 76 77
    183  // to:
    184  // b0.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
    185  // b0.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
    186  // b1.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
    187  // b1.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77
    188 
    189  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(a0, a4), vcombine_u8(a1, a5));
    190  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(a2, a6), vcombine_u8(a3, a7));
    191 
    192  // Swap 16 bit elements resulting in:
    193  // c0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
    194  // c0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
    195  // c1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
    196  // c1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77
    197 
    198  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
    199                                    vreinterpretq_u16_u8(b1.val[0]));
    200  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
    201                                    vreinterpretq_u16_u8(b1.val[1]));
    202 
    203  // Unzip 32 bit elements resulting in:
    204  // d0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
    205  // d0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
    206  // d1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
    207  // d1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
    208  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
    209                                    vreinterpretq_u32_u16(c1.val[0]));
    210  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
    211                                    vreinterpretq_u32_u16(c1.val[1]));
    212 
    213  *o0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
    214  *o1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
    215  *o2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
    216  *o3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
    217  *o4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
    218  *o5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
    219  *o6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
    220  *o7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
    221 }
    222 
    223 static inline void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1,
    224                                                  uint8x8_t *a2, uint8x8_t *a3,
    225                                                  uint8x8_t *a4, uint8x8_t *a5,
    226                                                  uint8x8_t *a6,
    227                                                  uint8x8_t *a7) {
    228  transpose_elems_u8_8x8(*a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7, a0, a1, a2, a3,
    229                         a4, a5, a6, a7);
    230 }
    231 
    232 static inline void transpose_arrays_u8_8x8(const uint8x8_t *in,
    233                                           uint8x8_t *out) {
    234  transpose_elems_u8_8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
    235                         &out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
    236                         &out[6], &out[7]);
    237 }
    238 
// Transpose an 8x16 tile of 8-bit elements: sixteen 8-byte input rows
// (x[0..15]) become eight 16-byte output rows (d[0..7]), where output row i
// holds input column i in source-row order x[0]..x[15].
// NOTE: registers w4..w7 and w12..w15 are reused for the second half of the
// computation below, so statement order matters.
static AOM_FORCE_INLINE void transpose_arrays_u8_8x16(const uint8x8_t *x,
                                                      uint8x16_t *d) {
  // Stage 1: 8-bit zips of adjacent row pairs. w0..w3 cover rows 0-7,
  // w8..w11 cover rows 8-15; val[0] holds columns 0-3, val[1] columns 4-7.
  uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
  uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
  uint8x8x2_t w2 = vzip_u8(x[4], x[5]);
  uint8x8x2_t w3 = vzip_u8(x[6], x[7]);

  uint8x8x2_t w8 = vzip_u8(x[8], x[9]);
  uint8x8x2_t w9 = vzip_u8(x[10], x[11]);
  uint8x8x2_t w10 = vzip_u8(x[12], x[13]);
  uint8x8x2_t w11 = vzip_u8(x[14], x[15]);

  // Stage 2: 16-bit zips combine the low-column halves across four rows.
  uint16x4x2_t w4 =
      vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
  uint16x4x2_t w5 =
      vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
  uint16x4x2_t w12 =
      vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0]));
  uint16x4x2_t w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]),
                              vreinterpret_u16_u8(w11.val[0]));

  // Stage 3: 32-bit zips complete the transpose for columns 0-3 within each
  // 8-row half.
  uint32x2x2_t w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
                             vreinterpret_u32_u16(w5.val[0]));
  uint32x2x2_t w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
                             vreinterpret_u32_u16(w5.val[1]));
  uint32x2x2_t w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
                              vreinterpret_u32_u16(w13.val[0]));
  uint32x2x2_t w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
                              vreinterpret_u32_u16(w13.val[1]));

  // Store first 4-line result
  d[0] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
  d[1] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
  d[2] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
  d[3] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));

  // Repeat stages 2 and 3 for the high-column halves (val[1] of stage 1),
  // reusing w4..w7 and w12..w15.
  w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
  w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
  w12 =
      vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1]));
  w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]),
                 vreinterpret_u16_u8(w11.val[1]));

  w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
                vreinterpret_u32_u16(w5.val[0]));
  w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
                vreinterpret_u32_u16(w5.val[1]));
  w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
                 vreinterpret_u32_u16(w13.val[0]));
  w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
                 vreinterpret_u32_u16(w13.val[1]));

  // Store second 4-line result
  d[4] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
  d[5] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
  d[6] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
  d[7] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));
}
    297 
    298 static AOM_FORCE_INLINE void transpose_arrays_u8_16x8(const uint8x16_t *x,
    299                                                      uint8x8_t *d) {
    300  uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
    301  uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
    302  uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
    303  uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);
    304 
    305  uint16x8x2_t w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
    306                              vreinterpretq_u16_u8(w1.val[0]));
    307  uint16x8x2_t w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
    308                              vreinterpretq_u16_u8(w3.val[0]));
    309  uint16x8x2_t w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
    310                              vreinterpretq_u16_u8(w1.val[1]));
    311  uint16x8x2_t w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
    312                              vreinterpretq_u16_u8(w3.val[1]));
    313 
    314  uint32x4x2_t w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
    315                              vreinterpretq_u32_u16(w5.val[0]));
    316  uint32x4x2_t w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]),
    317                              vreinterpretq_u32_u16(w7.val[0]));
    318  uint32x4x2_t w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
    319                               vreinterpretq_u32_u16(w5.val[1]));
    320  uint32x4x2_t w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
    321                               vreinterpretq_u32_u16(w7.val[1]));
    322 
    323  d[0] = vreinterpret_u8_u32(vget_low_u32(w8.val[0]));
    324  d[1] = vreinterpret_u8_u32(vget_high_u32(w8.val[0]));
    325  d[2] = vreinterpret_u8_u32(vget_low_u32(w8.val[1]));
    326  d[3] = vreinterpret_u8_u32(vget_high_u32(w8.val[1]));
    327  d[4] = vreinterpret_u8_u32(vget_low_u32(w10.val[0]));
    328  d[5] = vreinterpret_u8_u32(vget_high_u32(w10.val[0]));
    329  d[6] = vreinterpret_u8_u32(vget_low_u32(w10.val[1]));
    330  d[7] = vreinterpret_u8_u32(vget_high_u32(w10.val[1]));
    331  d[8] = vreinterpret_u8_u32(vget_low_u32(w9.val[0]));
    332  d[9] = vreinterpret_u8_u32(vget_high_u32(w9.val[0]));
    333  d[10] = vreinterpret_u8_u32(vget_low_u32(w9.val[1]));
    334  d[11] = vreinterpret_u8_u32(vget_high_u32(w9.val[1]));
    335  d[12] = vreinterpret_u8_u32(vget_low_u32(w11.val[0]));
    336  d[13] = vreinterpret_u8_u32(vget_high_u32(w11.val[0]));
    337  d[14] = vreinterpret_u8_u32(vget_low_u32(w11.val[1]));
    338  d[15] = vreinterpret_u8_u32(vget_high_u32(w11.val[1]));
    339 }
    340 
    341 static inline uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
    342  uint16x8x2_t b0;
    343 #if AOM_ARCH_AARCH64
    344  b0.val[0] = vreinterpretq_u16_u64(
    345      vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
    346  b0.val[1] = vreinterpretq_u16_u64(
    347      vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
    348 #else
    349  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
    350                           vreinterpret_u16_u32(vget_low_u32(a1)));
    351  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
    352                           vreinterpret_u16_u32(vget_high_u32(a1)));
    353 #endif
    354  return b0;
    355 }
    356 
// Transpose a 16x16 tile of 8-bit elements: output row i (d[i]) holds input
// column i in source-row order x[0]..x[15].
// NOTE: registers w8..w15 are reused for the upper half of the computation
// below, so statement order matters.
static inline void transpose_arrays_u8_16x16(const uint8x16_t *x,
                                             uint8x16_t *d) {
  // Stage 1: 8-bit zips of adjacent row pairs. val[0] holds columns 0-7,
  // val[1] columns 8-15.
  uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
  uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
  uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
  uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);

  uint8x16x2_t w4 = vzipq_u8(x[8], x[9]);
  uint8x16x2_t w5 = vzipq_u8(x[10], x[11]);
  uint8x16x2_t w6 = vzipq_u8(x[12], x[13]);
  uint8x16x2_t w7 = vzipq_u8(x[14], x[15]);

  // Stage 2: 16-bit zips combine the low-column halves across four rows.
  uint16x8x2_t w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
                              vreinterpretq_u16_u8(w1.val[0]));
  uint16x8x2_t w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
                              vreinterpretq_u16_u8(w3.val[0]));
  uint16x8x2_t w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]),
                               vreinterpretq_u16_u8(w5.val[0]));
  uint16x8x2_t w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]),
                               vreinterpretq_u16_u8(w7.val[0]));

  // Stage 3: 32-bit zips combine across eight rows.
  uint32x4x2_t w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
                               vreinterpretq_u32_u16(w9.val[0]));
  uint32x4x2_t w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
                               vreinterpretq_u32_u16(w11.val[0]));
  uint32x4x2_t w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
                               vreinterpretq_u32_u16(w9.val[1]));
  uint32x4x2_t w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
                               vreinterpretq_u32_u16(w11.val[1]));

  // Stage 4: 64-bit transposes join the two 8-row halves into full rows.
  uint16x8x2_t d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
  d[0] = vreinterpretq_u8_u16(d01.val[0]);
  d[1] = vreinterpretq_u8_u16(d01.val[1]);
  uint16x8x2_t d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
  d[2] = vreinterpretq_u8_u16(d23.val[0]);
  d[3] = vreinterpretq_u8_u16(d23.val[1]);
  uint16x8x2_t d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
  d[4] = vreinterpretq_u8_u16(d45.val[0]);
  d[5] = vreinterpretq_u8_u16(d45.val[1]);
  uint16x8x2_t d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
  d[6] = vreinterpretq_u8_u16(d67.val[0]);
  d[7] = vreinterpretq_u8_u16(d67.val[1]);

  // upper half: repeat stages 2-4 for columns 8-15 (val[1] of stage 1),
  // reusing w8..w15.
  w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
                 vreinterpretq_u16_u8(w1.val[1]));
  w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
                 vreinterpretq_u16_u8(w3.val[1]));
  w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]),
                  vreinterpretq_u16_u8(w5.val[1]));
  w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]),
                  vreinterpretq_u16_u8(w7.val[1]));

  w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
                  vreinterpretq_u32_u16(w9.val[0]));
  w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
                  vreinterpretq_u32_u16(w11.val[0]));
  w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
                  vreinterpretq_u32_u16(w9.val[1]));
  w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
                  vreinterpretq_u32_u16(w11.val[1]));

  d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
  d[8] = vreinterpretq_u8_u16(d01.val[0]);
  d[9] = vreinterpretq_u8_u16(d01.val[1]);
  d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
  d[10] = vreinterpretq_u8_u16(d23.val[0]);
  d[11] = vreinterpretq_u8_u16(d23.val[1]);
  d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
  d[12] = vreinterpretq_u8_u16(d45.val[0]);
  d[13] = vreinterpretq_u8_u16(d45.val[1]);
  d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
  d[14] = vreinterpretq_u8_u16(d67.val[0]);
  d[15] = vreinterpretq_u8_u16(d67.val[1]);
}
    432 
    433 static AOM_FORCE_INLINE void transpose_arrays_u8_32x16(const uint8x16x2_t *x,
    434                                                       uint8x16_t *d) {
    435  uint8x16_t x2[32];
    436  for (int i = 0; i < 16; ++i) {
    437    x2[i] = x[i].val[0];
    438    x2[i + 16] = x[i].val[1];
    439  }
    440  transpose_arrays_u8_16x16(x2, d);
    441  transpose_arrays_u8_16x16(x2 + 16, d + 16);
    442 }
    443 
    444 static inline void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1,
    445                                                  uint8x8_t *a2,
    446                                                  uint8x8_t *a3) {
    447  // Swap 8 bit elements. Goes from:
    448  // a0: 00 01 02 03 04 05 06 07
    449  // a1: 10 11 12 13 14 15 16 17
    450  // a2: 20 21 22 23 24 25 26 27
    451  // a3: 30 31 32 33 34 35 36 37
    452  // to:
    453  // b0.val[0]: 00 10 02 12 04 14 06 16
    454  // b0.val[1]: 01 11 03 13 05 15 07 17
    455  // b1.val[0]: 20 30 22 32 24 34 26 36
    456  // b1.val[1]: 21 31 23 33 25 35 27 37
    457 
    458  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
    459  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
    460 
    461  // Swap 16 bit elements resulting in:
    462  // c0.val[0]: 00 10 20 30 04 14 24 34
    463  // c0.val[1]: 02 12 22 32 06 16 26 36
    464  // c1.val[0]: 01 11 21 31 05 15 25 35
    465  // c1.val[1]: 03 13 23 33 07 17 27 37
    466 
    467  const uint16x4x2_t c0 =
    468      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
    469  const uint16x4x2_t c1 =
    470      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
    471 
    472  *a0 = vreinterpret_u8_u16(c0.val[0]);
    473  *a1 = vreinterpret_u8_u16(c1.val[0]);
    474  *a2 = vreinterpret_u8_u16(c0.val[1]);
    475  *a3 = vreinterpret_u8_u16(c1.val[1]);
    476 }
    477 
    478 static inline void transpose_elems_inplace_u8_16x4(uint8x16_t *a0,
    479                                                   uint8x16_t *a1,
    480                                                   uint8x16_t *a2,
    481                                                   uint8x16_t *a3) {
    482  // Swap 8 bit elements. Goes from:
    483  // a0: 00 01 02 03 04 05 06 07 08 09 010 011 012 013 014 015
    484  // a1: 10 11 12 13 14 15 16 17 18 19 110 111 112 113 114 115
    485  // a2: 20 21 22 23 24 25 26 27 28 29 210 211 212 213 214 215
    486  // a3: 30 31 32 33 34 35 36 37 38 39 310 311 312 313 314 315
    487  // to:
    488  // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 010 110 012 112 014 114
    489  // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 011 111 013 113 015 115
    490  // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 210 310 212 312 214 314
    491  // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 211 311 213 313 215 315
    492 
    493  const uint8x16x2_t b0 = vtrnq_u8(*a0, *a1);
    494  const uint8x16x2_t b1 = vtrnq_u8(*a2, *a3);
    495 
    496  // Swap 16 bit elements resulting in:
    497  // c0.val[0]: 00 10 20 30 04 14 24 34 08  18  28  38  012 112 212 312
    498  // c0.val[1]: 02 12 22 32 06 16 26 36 09  19  29  39  013 113 213 313
    499  // c1.val[0]: 01 11 21 31 05 15 25 35 010 110 210 310 014 114 214 314
    500  // c1.val[1]: 03 13 23 33 07 17 27 37 011 111 211 311 015 115 215 315
    501 
    502  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
    503                                    vreinterpretq_u16_u8(b1.val[0]));
    504  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
    505                                    vreinterpretq_u16_u8(b1.val[1]));
    506 
    507  *a0 = vreinterpretq_u8_u16(c0.val[0]);
    508  *a1 = vreinterpretq_u8_u16(c1.val[0]);
    509  *a2 = vreinterpretq_u8_u16(c0.val[1]);
    510  *a3 = vreinterpretq_u8_u16(c1.val[1]);
    511 }
    512 
    513 static inline void transpose_elems_inplace_u8_4x4(uint8x8_t *a0,
    514                                                  uint8x8_t *a1) {
    515  // Swap 16 bit elements. Goes from:
    516  // a0: 00 01 02 03  10 11 12 13
    517  // a1: 20 21 22 23  30 31 32 33
    518  // to:
    519  // b0.val[0]: 00 01 20 21  10 11 30 31
    520  // b0.val[1]: 02 03 22 23  12 13 32 33
    521 
    522  const uint16x4x2_t b0 =
    523      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));
    524 
    525  // Swap 32 bit elements resulting in:
    526  // c0.val[0]: 00 01 20 21  02 03 22 23
    527  // c0.val[1]: 10 11 30 31  12 13 32 33
    528 
    529  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
    530                                   vreinterpret_u32_u16(b0.val[1]));
    531 
    532  // Swap 8 bit elements resulting in:
    533  // d0.val[0]: 00 10 20 30  02 12 22 32
    534  // d0.val[1]: 01 11 21 31  03 13 23 33
    535 
    536  const uint8x8x2_t d0 =
    537      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));
    538 
    539  *a0 = d0.val[0];
    540  *a1 = d0.val[1];
    541 }
    542 
    543 static inline void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1,
    544                                          uint8x8_t a2, uint8x8_t a3,
    545                                          uint8x8_t a4, uint8x8_t a5,
    546                                          uint8x8_t a6, uint8x8_t a7,
    547                                          uint8x8_t *o0, uint8x8_t *o1,
    548                                          uint8x8_t *o2, uint8x8_t *o3) {
    549  // Swap 32 bit elements. Goes from:
    550  // a0: 00 01 02 03 XX XX XX XX
    551  // a1: 10 11 12 13 XX XX XX XX
    552  // a2: 20 21 22 23 XX XX XX XX
    553  // a3; 30 31 32 33 XX XX XX XX
    554  // a4: 40 41 42 43 XX XX XX XX
    555  // a5: 50 51 52 53 XX XX XX XX
    556  // a6: 60 61 62 63 XX XX XX XX
    557  // a7: 70 71 72 73 XX XX XX XX
    558  // to:
    559  // b0.val[0]: 00 01 02 03 40 41 42 43
    560  // b1.val[0]: 10 11 12 13 50 51 52 53
    561  // b2.val[0]: 20 21 22 23 60 61 62 63
    562  // b3.val[0]: 30 31 32 33 70 71 72 73
    563 
    564  const uint32x2x2_t b0 =
    565      vtrn_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
    566  const uint32x2x2_t b1 =
    567      vtrn_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
    568  const uint32x2x2_t b2 =
    569      vtrn_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
    570  const uint32x2x2_t b3 =
    571      vtrn_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));
    572 
    573  // Swap 16 bit elements resulting in:
    574  // c0.val[0]: 00 01 20 21 40 41 60 61
    575  // c0.val[1]: 02 03 22 23 42 43 62 63
    576  // c1.val[0]: 10 11 30 31 50 51 70 71
    577  // c1.val[1]: 12 13 32 33 52 53 72 73
    578 
    579  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
    580                                   vreinterpret_u16_u32(b2.val[0]));
    581  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
    582                                   vreinterpret_u16_u32(b3.val[0]));
    583 
    584  // Swap 8 bit elements resulting in:
    585  // d0.val[0]: 00 10 20 30 40 50 60 70
    586  // d0.val[1]: 01 11 21 31 41 51 61 71
    587  // d1.val[0]: 02 12 22 32 42 52 62 72
    588  // d1.val[1]: 03 13 23 33 43 53 63 73
    589 
    590  const uint8x8x2_t d0 =
    591      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
    592  const uint8x8x2_t d1 =
    593      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));
    594 
    595  *o0 = d0.val[0];
    596  *o1 = d0.val[1];
    597  *o2 = d1.val[0];
    598  *o3 = d1.val[1];
    599 }
    600 
    601 static inline void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) {
    602  // Input:
    603  // 00 01 02 03
    604  // 10 11 12 13
    605  // 20 21 22 23
    606  // 30 31 32 33
    607 
    608  // b:
    609  // 00 10 02 12
    610  // 01 11 03 13
    611  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
    612  // c:
    613  // 20 30 22 32
    614  // 21 31 23 33
    615  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
    616  // d:
    617  // 00 10 20 30
    618  // 02 12 22 32
    619  const uint32x2x2_t d =
    620      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
    621  // e:
    622  // 01 11 21 31
    623  // 03 13 23 33
    624  const uint32x2x2_t e =
    625      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
    626 
    627  // Output:
    628  // 00 10 20 30
    629  // 01 11 21 31
    630  // 02 12 22 32
    631  // 03 13 23 33
    632  a[0] = vreinterpret_u16_u32(d.val[0]);
    633  a[1] = vreinterpret_u16_u32(e.val[0]);
    634  a[2] = vreinterpret_u16_u32(d.val[1]);
    635  a[3] = vreinterpret_u16_u32(e.val[1]);
    636 }
    637 
    638 static inline void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) {
    639  // 4x8 Input:
    640  // a[0]: 00 01 02 03 04 05 06 07
    641  // a[1]: 10 11 12 13 14 15 16 17
    642  // a[2]: 20 21 22 23 24 25 26 27
    643  // a[3]: 30 31 32 33 34 35 36 37
    644 
    645  // b0.val[0]: 00 10 02 12 04 14 06 16
    646  // b0.val[1]: 01 11 03 13 05 15 07 17
    647  // b1.val[0]: 20 30 22 32 24 34 26 36
    648  // b1.val[1]: 21 31 23 33 25 35 27 37
    649  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
    650  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
    651 
    652  // c0.val[0]: 00 10 20 30 04 14 24 34
    653  // c0.val[1]: 02 12 22 32 06 16 26 36
    654  // c1.val[0]: 01 11 21 31 05 15 25 35
    655  // c1.val[1]: 03 13 23 33 07 17 27 37
    656  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
    657                                    vreinterpretq_u32_u16(b1.val[0]));
    658  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
    659                                    vreinterpretq_u32_u16(b1.val[1]));
    660 
    661  // 8x4 Output:
    662  // a[0]: 00 10 20 30 04 14 24 34
    663  // a[1]: 01 11 21 31 05 15 25 35
    664  // a[2]: 02 12 22 32 06 16 26 36
    665  // a[3]: 03 13 23 33 07 17 27 37
    666  a[0] = vreinterpretq_u16_u32(c0.val[0]);
    667  a[1] = vreinterpretq_u16_u32(c1.val[0]);
    668  a[2] = vreinterpretq_u16_u32(c0.val[1]);
    669  a[3] = vreinterpretq_u16_u32(c1.val[1]);
    670 }
    671 
    672 // Special transpose for loop filter.
    673 // 4x8 Input:
    674 // p_q:  p3 p2 p1 p0 q0 q1 q2 q3
    675 // a[0]: 00 01 02 03 04 05 06 07
    676 // a[1]: 10 11 12 13 14 15 16 17
    677 // a[2]: 20 21 22 23 24 25 26 27
    678 // a[3]: 30 31 32 33 34 35 36 37
    679 // 8x4 Output:
    680 // a[0]: 03 13 23 33 04 14 24 34  p0q0
    681 // a[1]: 02 12 22 32 05 15 25 35  p1q1
    682 // a[2]: 01 11 21 31 06 16 26 36  p2q2
    683 // a[3]: 00 10 20 30 07 17 27 37  p3q3
    684 // Direct reapplication of the function will reset the high halves, but
    685 // reverse the low halves:
    686 // p_q:  p0 p1 p2 p3 q0 q1 q2 q3
    687 // a[0]: 33 32 31 30 04 05 06 07
    688 // a[1]: 23 22 21 20 14 15 16 17
    689 // a[2]: 13 12 11 10 24 25 26 27
    690 // a[3]: 03 02 01 00 34 35 36 37
    691 // Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
    692 // reverse the high halves.
    693 // The standard transpose_u16_4x8q will produce the same reversals, but with the
    694 // order of the low halves also restored relative to the high halves. This is
    695 // preferable because it puts all values from the same source row back together,
    696 // but some post-processing is inevitable.
static inline void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) {
 // See the block comment above for the overall contract: transpose the
 // 4x8 input so that each output vector pairs a p-row with its matching
 // q-row (p0q0, p1q1, p2q2, p3q3).

 // Stage 1: interleave 16-bit lanes of adjacent row pairs.
 // b0.val[0]: 00 10 02 12 04 14 06 16
 // b0.val[1]: 01 11 03 13 05 15 07 17
 // b1.val[0]: 20 30 22 32 24 34 26 36
 // b1.val[1]: 21 31 23 33 25 35 27 37
 const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
 const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

 // Reverse the odd vectors (b0.val[1] / b1.val[1]) to bring the
 // appropriate items to the front of the zips below. The four zip inputs
 // are then:
 // b0.val[0]: 00 10 02 12 04 14 06 16
 // r0       : 03 13 01 11 07 17 05 15   (b0.val[1] with 32-bit pairs
 //                                        swapped within each 64-bit half)
 // b1.val[0]: 20 30 22 32 24 34 26 36
 // r1       : 23 33 21 31 27 37 25 35   (b1.val[1], likewise reversed)
 const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
 const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));

 // Zip to complete the halves.
 // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
 // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
 // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
 // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
 const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                   vreinterpretq_u32_u16(b1.val[0]));
 const uint32x4x2_t c1 = vzipq_u32(r0, r1);

 // Stage 3: 64-bit transposes pair each p-half with its q-half.
 // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
 // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
 // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
 // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
 const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]);
 // The third row of c comes first here to swap p2 with q0.
 const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]);

 // 8x4 Output:
 // a[0]: 03 13 23 33 04 14 24 34  p0q0
 // a[1]: 02 12 22 32 05 15 25 35  p1q1
 // a[2]: 01 11 21 31 06 16 26 36  p2q2
 // a[3]: 00 10 20 30 07 17 27 37  p3q3
 a[0] = d1.val[0];  // p0q0
 a[1] = d0.val[1];  // p1q1
 a[2] = d1.val[1];  // p2q2
 a[3] = d0.val[0];  // p3q3
}
    740 
    741 static inline void transpose_elems_u16_4x8(
    742    const uint16x4_t a0, const uint16x4_t a1, const uint16x4_t a2,
    743    const uint16x4_t a3, const uint16x4_t a4, const uint16x4_t a5,
    744    const uint16x4_t a6, const uint16x4_t a7, uint16x8_t *o0, uint16x8_t *o1,
    745    uint16x8_t *o2, uint16x8_t *o3) {
    746  // Combine rows. Goes from:
    747  // a0: 00 01 02 03
    748  // a1: 10 11 12 13
    749  // a2: 20 21 22 23
    750  // a3: 30 31 32 33
    751  // a4: 40 41 42 43
    752  // a5: 50 51 52 53
    753  // a6: 60 61 62 63
    754  // a7: 70 71 72 73
    755  // to:
    756  // b0: 00 01 02 03 40 41 42 43
    757  // b1: 10 11 12 13 50 51 52 53
    758  // b2: 20 21 22 23 60 61 62 63
    759  // b3: 30 31 32 33 70 71 72 73
    760 
    761  const uint16x8_t b0 = vcombine_u16(a0, a4);
    762  const uint16x8_t b1 = vcombine_u16(a1, a5);
    763  const uint16x8_t b2 = vcombine_u16(a2, a6);
    764  const uint16x8_t b3 = vcombine_u16(a3, a7);
    765 
    766  // Swap 16 bit elements resulting in:
    767  // c0.val[0]: 00 10 02 12 40 50 42 52
    768  // c0.val[1]: 01 11 03 13 41 51 43 53
    769  // c1.val[0]: 20 30 22 32 60 70 62 72
    770  // c1.val[1]: 21 31 23 33 61 71 63 73
    771 
    772  const uint16x8x2_t c0 = vtrnq_u16(b0, b1);
    773  const uint16x8x2_t c1 = vtrnq_u16(b2, b3);
    774 
    775  // Swap 32 bit elements resulting in:
    776  // d0.val[0]: 00 10 20 30 40 50 60 70
    777  // d0.val[1]: 02 12 22 32 42 52 62 72
    778  // d1.val[0]: 01 11 21 31 41 51 61 71
    779  // d1.val[1]: 03 13 23 33 43 53 63 73
    780 
    781  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
    782                                    vreinterpretq_u32_u16(c1.val[0]));
    783  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
    784                                    vreinterpretq_u32_u16(c1.val[1]));
    785 
    786  *o0 = vreinterpretq_u16_u32(d0.val[0]);
    787  *o1 = vreinterpretq_u16_u32(d1.val[0]);
    788  *o2 = vreinterpretq_u16_u32(d0.val[1]);
    789  *o3 = vreinterpretq_u16_u32(d1.val[1]);
    790 }
    791 
    792 static inline void transpose_elems_s16_4x8(
    793    const int16x4_t a0, const int16x4_t a1, const int16x4_t a2,
    794    const int16x4_t a3, const int16x4_t a4, const int16x4_t a5,
    795    const int16x4_t a6, const int16x4_t a7, int16x8_t *o0, int16x8_t *o1,
    796    int16x8_t *o2, int16x8_t *o3) {
    797  // Combine rows. Goes from:
    798  // a0: 00 01 02 03
    799  // a1: 10 11 12 13
    800  // a2: 20 21 22 23
    801  // a3: 30 31 32 33
    802  // a4: 40 41 42 43
    803  // a5: 50 51 52 53
    804  // a6: 60 61 62 63
    805  // a7: 70 71 72 73
    806  // to:
    807  // b0: 00 01 02 03 40 41 42 43
    808  // b1: 10 11 12 13 50 51 52 53
    809  // b2: 20 21 22 23 60 61 62 63
    810  // b3: 30 31 32 33 70 71 72 73
    811 
    812  const int16x8_t b0 = vcombine_s16(a0, a4);
    813  const int16x8_t b1 = vcombine_s16(a1, a5);
    814  const int16x8_t b2 = vcombine_s16(a2, a6);
    815  const int16x8_t b3 = vcombine_s16(a3, a7);
    816 
    817  // Swap 16 bit elements resulting in:
    818  // c0.val[0]: 00 10 02 12 40 50 42 52
    819  // c0.val[1]: 01 11 03 13 41 51 43 53
    820  // c1.val[0]: 20 30 22 32 60 70 62 72
    821  // c1.val[1]: 21 31 23 33 61 71 63 73
    822 
    823  const int16x8x2_t c0 = vtrnq_s16(b0, b1);
    824  const int16x8x2_t c1 = vtrnq_s16(b2, b3);
    825 
    826  // Swap 32 bit elements resulting in:
    827  // d0.val[0]: 00 10 20 30 40 50 60 70
    828  // d0.val[1]: 02 12 22 32 42 52 62 72
    829  // d1.val[0]: 01 11 21 31 41 51 61 71
    830  // d1.val[1]: 03 13 23 33 43 53 63 73
    831 
    832  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
    833                                   vreinterpretq_s32_s16(c1.val[0]));
    834  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
    835                                   vreinterpretq_s32_s16(c1.val[1]));
    836 
    837  *o0 = vreinterpretq_s16_s32(d0.val[0]);
    838  *o1 = vreinterpretq_s16_s32(d1.val[0]);
    839  *o2 = vreinterpretq_s16_s32(d0.val[1]);
    840  *o3 = vreinterpretq_s16_s32(d1.val[1]);
    841 }
    842 
    843 static inline void transpose_elems_inplace_u16_8x8(
    844    uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3,
    845    uint16x8_t *a4, uint16x8_t *a5, uint16x8_t *a6, uint16x8_t *a7) {
    846  // Swap 16 bit elements. Goes from:
    847  // a0: 00 01 02 03 04 05 06 07
    848  // a1: 10 11 12 13 14 15 16 17
    849  // a2: 20 21 22 23 24 25 26 27
    850  // a3: 30 31 32 33 34 35 36 37
    851  // a4: 40 41 42 43 44 45 46 47
    852  // a5: 50 51 52 53 54 55 56 57
    853  // a6: 60 61 62 63 64 65 66 67
    854  // a7: 70 71 72 73 74 75 76 77
    855  // to:
    856  // b0.val[0]: 00 10 02 12 04 14 06 16
    857  // b0.val[1]: 01 11 03 13 05 15 07 17
    858  // b1.val[0]: 20 30 22 32 24 34 26 36
    859  // b1.val[1]: 21 31 23 33 25 35 27 37
    860  // b2.val[0]: 40 50 42 52 44 54 46 56
    861  // b2.val[1]: 41 51 43 53 45 55 47 57
    862  // b3.val[0]: 60 70 62 72 64 74 66 76
    863  // b3.val[1]: 61 71 63 73 65 75 67 77
    864 
    865  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
    866  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
    867  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
    868  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);
    869 
    870  // Swap 32 bit elements resulting in:
    871  // c0.val[0]: 00 10 20 30 04 14 24 34
    872  // c0.val[1]: 02 12 22 32 06 16 26 36
    873  // c1.val[0]: 01 11 21 31 05 15 25 35
    874  // c1.val[1]: 03 13 23 33 07 17 27 37
    875  // c2.val[0]: 40 50 60 70 44 54 64 74
    876  // c2.val[1]: 42 52 62 72 46 56 66 76
    877  // c3.val[0]: 41 51 61 71 45 55 65 75
    878  // c3.val[1]: 43 53 63 73 47 57 67 77
    879 
    880  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
    881                                    vreinterpretq_u32_u16(b1.val[0]));
    882  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
    883                                    vreinterpretq_u32_u16(b1.val[1]));
    884  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
    885                                    vreinterpretq_u32_u16(b3.val[0]));
    886  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
    887                                    vreinterpretq_u32_u16(b3.val[1]));
    888 
    889  // Swap 64 bit elements resulting in:
    890  // d0.val[0]: 00 10 20 30 40 50 60 70
    891  // d0.val[1]: 04 14 24 34 44 54 64 74
    892  // d1.val[0]: 01 11 21 31 41 51 61 71
    893  // d1.val[1]: 05 15 25 35 45 55 65 75
    894  // d2.val[0]: 02 12 22 32 42 52 62 72
    895  // d2.val[1]: 06 16 26 36 46 56 66 76
    896  // d3.val[0]: 03 13 23 33 43 53 63 73
    897  // d3.val[1]: 07 17 27 37 47 57 67 77
    898 
    899  const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
    900  const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
    901  const uint16x8x2_t d2 = aom_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
    902  const uint16x8x2_t d3 = aom_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);
    903 
    904  *a0 = d0.val[0];
    905  *a1 = d1.val[0];
    906  *a2 = d2.val[0];
    907  *a3 = d3.val[0];
    908  *a4 = d0.val[1];
    909  *a5 = d1.val[1];
    910  *a6 = d2.val[1];
    911  *a7 = d3.val[1];
    912 }
    913 
    914 static inline int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
    915  int16x8x2_t b0;
    916 #if AOM_ARCH_AARCH64
    917  b0.val[0] = vreinterpretq_s16_s64(
    918      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
    919  b0.val[1] = vreinterpretq_s16_s64(
    920      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
    921 #else
    922  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
    923                           vreinterpret_s16_s32(vget_low_s32(a1)));
    924  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
    925                           vreinterpret_s16_s32(vget_high_s32(a1)));
    926 #endif
    927  return b0;
    928 }
    929 
    930 static inline void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1,
    931                                                   int16x8_t *a2, int16x8_t *a3,
    932                                                   int16x8_t *a4, int16x8_t *a5,
    933                                                   int16x8_t *a6,
    934                                                   int16x8_t *a7) {
    935  // Swap 16 bit elements. Goes from:
    936  // a0: 00 01 02 03 04 05 06 07
    937  // a1: 10 11 12 13 14 15 16 17
    938  // a2: 20 21 22 23 24 25 26 27
    939  // a3: 30 31 32 33 34 35 36 37
    940  // a4: 40 41 42 43 44 45 46 47
    941  // a5: 50 51 52 53 54 55 56 57
    942  // a6: 60 61 62 63 64 65 66 67
    943  // a7: 70 71 72 73 74 75 76 77
    944  // to:
    945  // b0.val[0]: 00 10 02 12 04 14 06 16
    946  // b0.val[1]: 01 11 03 13 05 15 07 17
    947  // b1.val[0]: 20 30 22 32 24 34 26 36
    948  // b1.val[1]: 21 31 23 33 25 35 27 37
    949  // b2.val[0]: 40 50 42 52 44 54 46 56
    950  // b2.val[1]: 41 51 43 53 45 55 47 57
    951  // b3.val[0]: 60 70 62 72 64 74 66 76
    952  // b3.val[1]: 61 71 63 73 65 75 67 77
    953 
    954  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
    955  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
    956  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
    957  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);
    958 
    959  // Swap 32 bit elements resulting in:
    960  // c0.val[0]: 00 10 20 30 04 14 24 34
    961  // c0.val[1]: 02 12 22 32 06 16 26 36
    962  // c1.val[0]: 01 11 21 31 05 15 25 35
    963  // c1.val[1]: 03 13 23 33 07 17 27 37
    964  // c2.val[0]: 40 50 60 70 44 54 64 74
    965  // c2.val[1]: 42 52 62 72 46 56 66 76
    966  // c3.val[0]: 41 51 61 71 45 55 65 75
    967  // c3.val[1]: 43 53 63 73 47 57 67 77
    968 
    969  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
    970                                   vreinterpretq_s32_s16(b1.val[0]));
    971  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
    972                                   vreinterpretq_s32_s16(b1.val[1]));
    973  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
    974                                   vreinterpretq_s32_s16(b3.val[0]));
    975  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
    976                                   vreinterpretq_s32_s16(b3.val[1]));
    977 
    978  // Swap 64 bit elements resulting in:
    979  // d0.val[0]: 00 10 20 30 40 50 60 70
    980  // d0.val[1]: 04 14 24 34 44 54 64 74
    981  // d1.val[0]: 01 11 21 31 41 51 61 71
    982  // d1.val[1]: 05 15 25 35 45 55 65 75
    983  // d2.val[0]: 02 12 22 32 42 52 62 72
    984  // d2.val[1]: 06 16 26 36 46 56 66 76
    985  // d3.val[0]: 03 13 23 33 43 53 63 73
    986  // d3.val[1]: 07 17 27 37 47 57 67 77
    987 
    988  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
    989  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
    990  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
    991  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
    992 
    993  *a0 = d0.val[0];
    994  *a1 = d1.val[0];
    995  *a2 = d2.val[0];
    996  *a3 = d3.val[0];
    997  *a4 = d0.val[1];
    998  *a5 = d1.val[1];
    999  *a6 = d2.val[1];
   1000  *a7 = d3.val[1];
   1001 }
   1002 
   1003 static inline void transpose_arrays_s16_8x8(const int16x8_t *a,
   1004                                            int16x8_t *out) {
   1005  // Swap 16 bit elements. Goes from:
   1006  // a0: 00 01 02 03 04 05 06 07
   1007  // a1: 10 11 12 13 14 15 16 17
   1008  // a2: 20 21 22 23 24 25 26 27
   1009  // a3: 30 31 32 33 34 35 36 37
   1010  // a4: 40 41 42 43 44 45 46 47
   1011  // a5: 50 51 52 53 54 55 56 57
   1012  // a6: 60 61 62 63 64 65 66 67
   1013  // a7: 70 71 72 73 74 75 76 77
   1014  // to:
   1015  // b0.val[0]: 00 10 02 12 04 14 06 16
   1016  // b0.val[1]: 01 11 03 13 05 15 07 17
   1017  // b1.val[0]: 20 30 22 32 24 34 26 36
   1018  // b1.val[1]: 21 31 23 33 25 35 27 37
   1019  // b2.val[0]: 40 50 42 52 44 54 46 56
   1020  // b2.val[1]: 41 51 43 53 45 55 47 57
   1021  // b3.val[0]: 60 70 62 72 64 74 66 76
   1022  // b3.val[1]: 61 71 63 73 65 75 67 77
   1023 
   1024  const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
   1025  const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
   1026  const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
   1027  const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
   1028 
   1029  // Swap 32 bit elements resulting in:
   1030  // c0.val[0]: 00 10 20 30 04 14 24 34
   1031  // c0.val[1]: 02 12 22 32 06 16 26 36
   1032  // c1.val[0]: 01 11 21 31 05 15 25 35
   1033  // c1.val[1]: 03 13 23 33 07 17 27 37
   1034  // c2.val[0]: 40 50 60 70 44 54 64 74
   1035  // c2.val[1]: 42 52 62 72 46 56 66 76
   1036  // c3.val[0]: 41 51 61 71 45 55 65 75
   1037  // c3.val[1]: 43 53 63 73 47 57 67 77
   1038 
   1039  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
   1040                                   vreinterpretq_s32_s16(b1.val[0]));
   1041  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
   1042                                   vreinterpretq_s32_s16(b1.val[1]));
   1043  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
   1044                                   vreinterpretq_s32_s16(b3.val[0]));
   1045  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
   1046                                   vreinterpretq_s32_s16(b3.val[1]));
   1047 
   1048  // Swap 64 bit elements resulting in:
   1049  // d0.val[0]: 00 10 20 30 40 50 60 70
   1050  // d0.val[1]: 04 14 24 34 44 54 64 74
   1051  // d1.val[0]: 01 11 21 31 41 51 61 71
   1052  // d1.val[1]: 05 15 25 35 45 55 65 75
   1053  // d2.val[0]: 02 12 22 32 42 52 62 72
   1054  // d2.val[1]: 06 16 26 36 46 56 66 76
   1055  // d3.val[0]: 03 13 23 33 43 53 63 73
   1056  // d3.val[1]: 07 17 27 37 47 57 67 77
   1057 
   1058  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
   1059  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
   1060  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
   1061  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
   1062 
   1063  out[0] = d0.val[0];
   1064  out[1] = d1.val[0];
   1065  out[2] = d2.val[0];
   1066  out[3] = d3.val[0];
   1067  out[4] = d0.val[1];
   1068  out[5] = d1.val[1];
   1069  out[6] = d2.val[1];
   1070  out[7] = d3.val[1];
   1071 }
   1072 
   1073 static inline void transpose_elems_inplace_s16_8x4(int16x8_t *a0, int16x8_t *a1,
   1074                                                   int16x8_t *a2,
   1075                                                   int16x8_t *a3) {
   1076  // Swap 16 bit elements. Goes from:
   1077  // a0: 00 01 02 03 04 05 06 07
   1078  // a1: 10 11 12 13 14 15 16 17
   1079  // a2: 20 21 22 23 24 25 26 27
   1080  // a3: 30 31 32 33 34 35 36 37
   1081  // to:
   1082  // b0.val[0]: 00 10 02 12 04 14 06 16
   1083  // b0.val[1]: 01 11 03 13 05 15 07 17
   1084  // b1.val[0]: 20 30 22 32 24 34 26 36
   1085  // b1.val[1]: 21 31 23 33 25 35 27 37
   1086 
   1087  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
   1088  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
   1089 
   1090  // Swap 32 bit elements resulting in:
   1091  // c0.val[0]: 00 10 20 30 04 14 24 34
   1092  // c0.val[1]: 01 11 21 31 05 15 25 35
   1093  // c1.val[0]: 02 12 22 32 06 16 26 36
   1094  // c1.val[1]: 03 13 23 33 07 17 27 37
   1095 
   1096  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
   1097                                   vreinterpretq_s32_s16(b1.val[0]));
   1098  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
   1099                                   vreinterpretq_s32_s16(b1.val[1]));
   1100 
   1101  *a0 = vreinterpretq_s16_s32(c0.val[0]);
   1102  *a1 = vreinterpretq_s16_s32(c1.val[0]);
   1103  *a2 = vreinterpretq_s16_s32(c0.val[1]);
   1104  *a3 = vreinterpretq_s16_s32(c1.val[1]);
   1105 }
   1106 
   1107 static inline void transpose_elems_inplace_u16_4x4(uint16x4_t *a0,
   1108                                                   uint16x4_t *a1,
   1109                                                   uint16x4_t *a2,
   1110                                                   uint16x4_t *a3) {
   1111  // Swap 16 bit elements. Goes from:
   1112  // a0: 00 01 02 03
   1113  // a1: 10 11 12 13
   1114  // a2: 20 21 22 23
   1115  // a3: 30 31 32 33
   1116  // to:
   1117  // b0.val[0]: 00 10 02 12
   1118  // b0.val[1]: 01 11 03 13
   1119  // b1.val[0]: 20 30 22 32
   1120  // b1.val[1]: 21 31 23 33
   1121 
   1122  const uint16x4x2_t b0 = vtrn_u16(*a0, *a1);
   1123  const uint16x4x2_t b1 = vtrn_u16(*a2, *a3);
   1124 
   1125  // Swap 32 bit elements resulting in:
   1126  // c0.val[0]: 00 10 20 30
   1127  // c0.val[1]: 02 12 22 32
   1128  // c1.val[0]: 01 11 21 31
   1129  // c1.val[1]: 03 13 23 33
   1130 
   1131  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
   1132                                   vreinterpret_u32_u16(b1.val[0]));
   1133  const uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
   1134                                   vreinterpret_u32_u16(b1.val[1]));
   1135 
   1136  *a0 = vreinterpret_u16_u32(c0.val[0]);
   1137  *a1 = vreinterpret_u16_u32(c1.val[0]);
   1138  *a2 = vreinterpret_u16_u32(c0.val[1]);
   1139  *a3 = vreinterpret_u16_u32(c1.val[1]);
   1140 }
   1141 
   1142 static inline void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1,
   1143                                                   int16x4_t *a2,
   1144                                                   int16x4_t *a3) {
   1145  // Swap 16 bit elements. Goes from:
   1146  // a0: 00 01 02 03
   1147  // a1: 10 11 12 13
   1148  // a2: 20 21 22 23
   1149  // a3: 30 31 32 33
   1150  // to:
   1151  // b0.val[0]: 00 10 02 12
   1152  // b0.val[1]: 01 11 03 13
   1153  // b1.val[0]: 20 30 22 32
   1154  // b1.val[1]: 21 31 23 33
   1155 
   1156  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
   1157  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);
   1158 
   1159  // Swap 32 bit elements resulting in:
   1160  // c0.val[0]: 00 10 20 30
   1161  // c0.val[1]: 02 12 22 32
   1162  // c1.val[0]: 01 11 21 31
   1163  // c1.val[1]: 03 13 23 33
   1164 
   1165  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
   1166                                  vreinterpret_s32_s16(b1.val[0]));
   1167  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
   1168                                  vreinterpret_s32_s16(b1.val[1]));
   1169 
   1170  *a0 = vreinterpret_s16_s32(c0.val[0]);
   1171  *a1 = vreinterpret_s16_s32(c1.val[0]);
   1172  *a2 = vreinterpret_s16_s32(c0.val[1]);
   1173  *a3 = vreinterpret_s16_s32(c1.val[1]);
   1174 }
   1175 
   1176 static inline int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
   1177  int32x4x2_t b0;
   1178 #if AOM_ARCH_AARCH64
   1179  b0.val[0] = vreinterpretq_s32_s64(
   1180      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
   1181  b0.val[1] = vreinterpretq_s32_s64(
   1182      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
   1183 #else
   1184  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
   1185  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
   1186 #endif
   1187  return b0;
   1188 }
   1189 
   1190 static inline void transpose_elems_s32_4x4(const int32x4_t a0,
   1191                                           const int32x4_t a1,
   1192                                           const int32x4_t a2,
   1193                                           const int32x4_t a3, int32x4_t *o0,
   1194                                           int32x4_t *o1, int32x4_t *o2,
   1195                                           int32x4_t *o3) {
   1196  // Swap 32 bit elements. Goes from:
   1197  // a0: 00 01 02 03
   1198  // a1: 10 11 12 13
   1199  // a2: 20 21 22 23
   1200  // a3: 30 31 32 33
   1201  // to:
   1202  // b0.val[0]: 00 10 02 12
   1203  // b0.val[1]: 01 11 03 13
   1204  // b1.val[0]: 20 30 22 32
   1205  // b1.val[1]: 21 31 23 33
   1206 
   1207  const int32x4x2_t b0 = vtrnq_s32(a0, a1);
   1208  const int32x4x2_t b1 = vtrnq_s32(a2, a3);
   1209 
   1210  // Swap 64 bit elements resulting in:
   1211  // c0.val[0]: 00 10 20 30
   1212  // c0.val[1]: 02 12 22 32
   1213  // c1.val[0]: 01 11 21 31
   1214  // c1.val[1]: 03 13 23 33
   1215 
   1216  const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
   1217  const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
   1218 
   1219  *o0 = c0.val[0];
   1220  *o1 = c1.val[0];
   1221  *o2 = c0.val[1];
   1222  *o3 = c1.val[1];
   1223 }
   1224 
static inline void transpose_elems_inplace_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                                  int32x4_t *a2,
                                                  int32x4_t *a3) {
 // In-place 4x4 transpose of 32-bit elements: thin wrapper around
 // transpose_elems_s32_4x4() with the outputs aliasing the inputs.
 transpose_elems_s32_4x4(*a0, *a1, *a2, *a3, a0, a1, a2, a3);
}
   1230 
static inline void transpose_arrays_s32_4x4(const int32x4_t *in,
                                           int32x4_t *out) {
 // Transpose a 4x4 block of 32-bit elements from in[0..3] to out[0..3].
 // Array-based convenience wrapper around transpose_elems_s32_4x4().
 transpose_elems_s32_4x4(in[0], in[1], in[2], in[3], &out[0], &out[1], &out[2],
                         &out[3]);
}
   1236 
   1237 static AOM_FORCE_INLINE void transpose_arrays_s32_4nx4n(const int32x4_t *in,
   1238                                                        int32x4_t *out,
   1239                                                        const int width,
   1240                                                        const int height) {
   1241  const int h = height >> 2;
   1242  const int w = width >> 2;
   1243  for (int j = 0; j < w; j++) {
   1244    for (int i = 0; i < h; i++) {
   1245      transpose_arrays_s32_4x4(in + j * height + i * 4,
   1246                               out + i * width + j * 4);
   1247    }
   1248  }
   1249 }
   1250 
// Generate fixed-size transpose_arrays_s32_WxH() wrappers around the generic
// 4n x 4n tile transpose, one per block size used by the codec. The macro is
// #undef'd below once all instantiations have been emitted.
#define TRANSPOSE_ARRAYS_S32_WXH_NEON(w, h)                    \
 static AOM_FORCE_INLINE void transpose_arrays_s32_##w##x##h( \
     const int32x4_t *in, int32x4_t *out) {                   \
   transpose_arrays_s32_4nx4n(in, out, w, h);                 \
 }

TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 4)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 64)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 64)
TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 32)

#undef TRANSPOSE_ARRAYS_S32_WXH_NEON
   1275 
   1276 static inline int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) {
   1277 #if AOM_ARCH_AARCH64
   1278  return vtrn1q_s64(a, b);
   1279 #else
   1280  return vcombine_s64(vget_low_s64(a), vget_low_s64(b));
   1281 #endif
   1282 }
   1283 
   1284 static inline int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) {
   1285 #if AOM_ARCH_AARCH64
   1286  return vtrn2q_s64(a, b);
   1287 #else
   1288  return vcombine_s64(vget_high_s64(a), vget_high_s64(b));
   1289 #endif
   1290 }
   1291 
   1292 static inline void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1,
   1293                                           int32x4_t a2, int32x4_t a3,
   1294                                           int32x4_t a4, int32x4_t a5,
   1295                                           int32x4_t a6, int32x4_t a7,
   1296                                           int32x4x2_t *o0, int32x4x2_t *o1,
   1297                                           int32x4x2_t *o2, int32x4x2_t *o3) {
   1298  // Perform a 4 x 8 matrix transpose by building on top of the existing 4 x 4
   1299  // matrix transpose implementation:
   1300  // [ A ]^T => [ A^T B^T ]
   1301  // [ B ]
   1302 
   1303  transpose_elems_inplace_s32_4x4(&a0, &a1, &a2, &a3);  // A^T
   1304  transpose_elems_inplace_s32_4x4(&a4, &a5, &a6, &a7);  // B^T
   1305 
   1306  o0->val[0] = a0;
   1307  o1->val[0] = a1;
   1308  o2->val[0] = a2;
   1309  o3->val[0] = a3;
   1310 
   1311  o0->val[1] = a4;
   1312  o1->val[1] = a5;
   1313  o2->val[1] = a6;
   1314  o3->val[1] = a7;
   1315 }
   1316 
   1317 static inline void transpose_elems_inplace_s32_8x8(
   1318    int32x4x2_t *a0, int32x4x2_t *a1, int32x4x2_t *a2, int32x4x2_t *a3,
   1319    int32x4x2_t *a4, int32x4x2_t *a5, int32x4x2_t *a6, int32x4x2_t *a7) {
   1320  // Perform an 8 x 8 matrix transpose by building on top of the existing 4 x 4
   1321  // matrix transpose implementation:
   1322  // [ A B ]^T => [ A^T C^T ]
   1323  // [ C D ]      [ B^T D^T ]
   1324 
   1325  int32x4_t q0_v1 = a0->val[0];
   1326  int32x4_t q0_v2 = a1->val[0];
   1327  int32x4_t q0_v3 = a2->val[0];
   1328  int32x4_t q0_v4 = a3->val[0];
   1329 
   1330  int32x4_t q1_v1 = a0->val[1];
   1331  int32x4_t q1_v2 = a1->val[1];
   1332  int32x4_t q1_v3 = a2->val[1];
   1333  int32x4_t q1_v4 = a3->val[1];
   1334 
   1335  int32x4_t q2_v1 = a4->val[0];
   1336  int32x4_t q2_v2 = a5->val[0];
   1337  int32x4_t q2_v3 = a6->val[0];
   1338  int32x4_t q2_v4 = a7->val[0];
   1339 
   1340  int32x4_t q3_v1 = a4->val[1];
   1341  int32x4_t q3_v2 = a5->val[1];
   1342  int32x4_t q3_v3 = a6->val[1];
   1343  int32x4_t q3_v4 = a7->val[1];
   1344 
   1345  transpose_elems_inplace_s32_4x4(&q0_v1, &q0_v2, &q0_v3, &q0_v4);  // A^T
   1346  transpose_elems_inplace_s32_4x4(&q1_v1, &q1_v2, &q1_v3, &q1_v4);  // B^T
   1347  transpose_elems_inplace_s32_4x4(&q2_v1, &q2_v2, &q2_v3, &q2_v4);  // C^T
   1348  transpose_elems_inplace_s32_4x4(&q3_v1, &q3_v2, &q3_v3, &q3_v4);  // D^T
   1349 
   1350  a0->val[0] = q0_v1;
   1351  a1->val[0] = q0_v2;
   1352  a2->val[0] = q0_v3;
   1353  a3->val[0] = q0_v4;
   1354 
   1355  a0->val[1] = q2_v1;
   1356  a1->val[1] = q2_v2;
   1357  a2->val[1] = q2_v3;
   1358  a3->val[1] = q2_v4;
   1359 
   1360  a4->val[0] = q1_v1;
   1361  a5->val[0] = q1_v2;
   1362  a6->val[0] = q1_v3;
   1363  a7->val[0] = q1_v4;
   1364 
   1365  a4->val[1] = q3_v1;
   1366  a5->val[1] = q3_v2;
   1367  a6->val[1] = q3_v3;
   1368  a7->val[1] = q3_v4;
   1369 }
   1370 
   1371 static inline void transpose_arrays_s16_4x4(const int16x4_t *const in,
   1372                                            int16x4_t *const out) {
   1373  int16x4_t a0 = in[0];
   1374  int16x4_t a1 = in[1];
   1375  int16x4_t a2 = in[2];
   1376  int16x4_t a3 = in[3];
   1377 
   1378  transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);
   1379 
   1380  out[0] = a0;
   1381  out[1] = a1;
   1382  out[2] = a2;
   1383  out[3] = a3;
   1384 }
   1385 
static inline void transpose_arrays_s16_4x8(const int16x4_t *const in,
                                            int16x4_t *const out) {
  // Transpose an 8x4 matrix of 16-bit elements held as eight 4-wide rows
  // (in[0]..in[7]) into four 8-wide rows (out[0]..out[3]), so that
  // out[c] = { in[0][c], in[1][c], ..., in[7][c] }.
  //
  // Stage 1: interleave 16-bit lanes of adjacent input rows, producing:
  // a0: 00 10 01 11 02 12 03 13
  // a1: 20 30 21 31 22 32 23 33
  // a2: 40 50 41 51 42 52 43 53
  // a3: 60 70 61 71 62 72 63 73
#if AOM_ARCH_AARCH64
  // The high halves of the combined vectors are zero padding, so only the
  // low-half interleave (vzip1q) of each pair is needed.
  const int16x8_t a0 = vzip1q_s16(vcombine_s16(in[0], vdup_n_s16(0)),
                                  vcombine_s16(in[1], vdup_n_s16(0)));
  const int16x8_t a1 = vzip1q_s16(vcombine_s16(in[2], vdup_n_s16(0)),
                                  vcombine_s16(in[3], vdup_n_s16(0)));
  const int16x8_t a2 = vzip1q_s16(vcombine_s16(in[4], vdup_n_s16(0)),
                                  vcombine_s16(in[5], vdup_n_s16(0)));
  const int16x8_t a3 = vzip1q_s16(vcombine_s16(in[6], vdup_n_s16(0)),
                                  vcombine_s16(in[7], vdup_n_s16(0)));
#else
  // Armv7: zip the 64-bit rows directly and concatenate both zip halves to
  // form the same a0..a3 vectors as above.
  int16x4x2_t temp;
  temp = vzip_s16(in[0], in[1]);
  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[2], in[3]);
  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[4], in[5]);
  const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[6], in[7]);
  const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
#endif

  // Stage 2: interleave 32-bit lanes (i.e. 16-bit element pairs):
  // b02.val[0]: 00 10 20 30 01 11 21 31   b02.val[1]: 02 12 22 32 03 13 23 33
  // b13.val[0]: 40 50 60 70 41 51 61 71   b13.val[1]: 42 52 62 72 43 53 63 73
  const int32x4x2_t b02 =
      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
  const int32x4x2_t b13 =
      vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));

  // Stage 3: interleave 64-bit lanes (element quadruples) to complete the
  // transpose; each output vector is one column of the input.
#if AOM_ARCH_AARCH64
  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
                                            vreinterpretq_s64_s32(b13.val[0])));
  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
                                            vreinterpretq_s64_s32(b13.val[0])));
  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
                                            vreinterpretq_s64_s32(b13.val[1])));
  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
                                            vreinterpretq_s64_s32(b13.val[1])));
#else
  // Armv7 has no 64-bit zip; emulate it with vext:
  //   vextq(vextq(x, x, 2), y, 2) == { x[0..1], y[0..1] }  (zip1 of s64 lanes)
  //   vextq(x, vextq(y, y, 2), 2) == { x[2..3], y[2..3] }  (zip2 of s64 lanes)
  out[0] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
  out[2] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
  out[1] = vreinterpretq_s16_s32(
      vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
  out[3] = vreinterpretq_s16_s32(
      vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
#endif
}
   1434 
   1435 static inline void transpose_arrays_s16_8x4(const int16x8_t *const in,
   1436                                            int16x4_t *const out) {
   1437  // Swap 16 bit elements. Goes from:
   1438  // in[0]: 00 01 02 03 04 05 06 07
   1439  // in[1]: 10 11 12 13 14 15 16 17
   1440  // in[2]: 20 21 22 23 24 25 26 27
   1441  // in[3]: 30 31 32 33 34 35 36 37
   1442  // to:
   1443  // b0.val[0]: 00 10 02 12 04 14 06 16
   1444  // b0.val[1]: 01 11 03 13 05 15 07 17
   1445  // b1.val[0]: 20 30 22 32 24 34 26 36
   1446  // b1.val[1]: 21 31 23 33 25 35 27 37
   1447 
   1448  const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
   1449  const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);
   1450 
   1451  // Swap 32 bit elements resulting in:
   1452  // c0.val[0]: 00 10 20 30 04 14 24 34
   1453  // c0.val[1]: 02 12 22 32 06 16 26 36
   1454  // c1.val[0]: 01 11 21 31 05 15 25 35
   1455  // c1.val[1]: 03 13 23 33 07 17 27 37
   1456 
   1457  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[0]),
   1458                                    vreinterpretq_u32_s16(b1.val[0]));
   1459  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[1]),
   1460                                    vreinterpretq_u32_s16(b1.val[1]));
   1461 
   1462  // Unpack 64 bit elements resulting in:
   1463  // out[0]: 00 10 20 30
   1464  // out[1]: 01 11 21 31
   1465  // out[2]: 02 12 22 32
   1466  // out[3]: 03 13 23 33
   1467  // out[4]: 04 14 24 34
   1468  // out[5]: 05 15 25 35
   1469  // out[6]: 06 16 26 36
   1470  // out[7]: 07 17 27 37
   1471 
   1472  out[0] = vget_low_s16(vreinterpretq_s16_u32(c0.val[0]));
   1473  out[1] = vget_low_s16(vreinterpretq_s16_u32(c1.val[0]));
   1474  out[2] = vget_low_s16(vreinterpretq_s16_u32(c0.val[1]));
   1475  out[3] = vget_low_s16(vreinterpretq_s16_u32(c1.val[1]));
   1476  out[4] = vget_high_s16(vreinterpretq_s16_u32(c0.val[0]));
   1477  out[5] = vget_high_s16(vreinterpretq_s16_u32(c1.val[0]));
   1478  out[6] = vget_high_s16(vreinterpretq_s16_u32(c0.val[1]));
   1479  out[7] = vget_high_s16(vreinterpretq_s16_u32(c1.val[1]));
   1480 }
   1481 
   1482 static inline void transpose_arrays_s64_4x4(const int64x2_t *in,
   1483                                            int64x2_t *out) {
   1484  // Perform a 4x4 matrix transpose going from:
   1485  // in[0] = 00 01
   1486  // in[1] = 02 03
   1487  // in[2] = 10 11
   1488  // in[3] = 12 13
   1489  // in[4] = 20 21
   1490  // in[5] = 22 23
   1491  // in[6] = 30 31
   1492  // in[7] = 32 33
   1493  //
   1494  // to:
   1495  // out[0] = 00 10
   1496  // out[1] = 20 30
   1497  // out[2] = 01 11
   1498  // out[3] = 21 31
   1499  // out[4] = 02 12
   1500  // out[5] = 22 32
   1501  // out[6] = 03 13
   1502  // out[7] = 23 33
   1503 
   1504  out[0] = aom_vtrn1q_s64(in[0], in[2]);
   1505  out[1] = aom_vtrn1q_s64(in[4], in[6]);
   1506  out[2] = aom_vtrn2q_s64(in[0], in[2]);
   1507  out[3] = aom_vtrn2q_s64(in[4], in[6]);
   1508  out[4] = aom_vtrn1q_s64(in[1], in[3]);
   1509  out[5] = aom_vtrn1q_s64(in[5], in[7]);
   1510  out[6] = aom_vtrn2q_s64(in[1], in[3]);
   1511  out[7] = aom_vtrn2q_s64(in[5], in[7]);
   1512 }
   1513 
   1514 #endif  // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_