tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_inv_txfm_neon.c (222078B)


      1 /*
      2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 #include <assert.h>
     14 
     15 #include "av1/common/av1_inv_txfm1d_cfg.h"
     16 #include "av1/common/idct.h"
     17 #include "config/aom_config.h"
     18 #include "config/av1_rtcd.h"
     19 
#if AOM_ARCH_AARCH64
// Transpose a 4x4 matrix of 32-bit lanes: x0..x3 are the input rows,
// y0..y3 receive the transposed rows. AArch64 path: vtrnq_s32 swaps the
// 2x2 sub-blocks, then vzip1q_s64/vzip2q_s64 interleave the 64-bit halves
// to complete the transpose.
#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3)         \
  do {                                                        \
    int32x4x2_t swap_low = vtrnq_s32(x0, x1);                 \
    int32x4x2_t swap_high = vtrnq_s32(x2, x3);                \
    y0 = vreinterpretq_s32_s64(                               \
        vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[0]),    \
                   vreinterpretq_s64_s32(swap_high.val[0]))); \
    y1 = vreinterpretq_s32_s64(                               \
        vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[1]),    \
                   vreinterpretq_s64_s32(swap_high.val[1]))); \
    y2 = vreinterpretq_s32_s64(                               \
        vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[0]),    \
                   vreinterpretq_s64_s32(swap_high.val[0]))); \
    y3 = vreinterpretq_s32_s64(                               \
        vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[1]),    \
                   vreinterpretq_s64_s32(swap_high.val[1]))); \
  } while (0)
#else
// AArch32 fallback: vzip1q/vzip2q_s64 are unavailable, so the 64-bit
// interleave is emulated with pairs of vextq_s32 rotations.
#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3)                    \
  do {                                                                   \
    int32x4x2_t swap_low = vtrnq_s32(x0, x1);                            \
    int32x4x2_t swap_high = vtrnq_s32(x2, x3);                           \
    y0 = vextq_s32(vextq_s32(swap_low.val[0], swap_low.val[0], 2),       \
                   swap_high.val[0], 2);                                 \
    y1 = vextq_s32(vextq_s32(swap_low.val[1], swap_low.val[1], 2),       \
                   swap_high.val[1], 2);                                 \
    y2 = vextq_s32(swap_low.val[0],                                      \
                   vextq_s32(swap_high.val[0], swap_high.val[0], 2), 2); \
    y3 = vextq_s32(swap_low.val[1],                                      \
                   vextq_s32(swap_high.val[1], swap_high.val[1], 2), 2); \
  } while (0)
#endif  // AOM_ARCH_AARCH64
     53 
// Transpose the 4x4 block of 32-bit coefficients held in in[0..3],
// writing the transposed rows to out[0..3].
static inline void transpose_4x4(const int32x4_t *in, int32x4_t *out) {
  TRANSPOSE_4X4(in[0], in[1], in[2], in[3], out[0], out[1], out[2], out[3]);
}
     57 
// Transpose an 8x8 block stored as 16 int32x4_t vectors by transposing the
// four 4x4 quadrants and swapping the two off-diagonal quadrants.
// NOTE(review): the even/odd index interleaving implies rows are stored as
// pairs of 4-lane vectors with a stride-2 layout — confirm against callers.
static inline void transpose_8x8(const int32x4_t *in, int32x4_t *out) {
  TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
  TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
  TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
  TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
                out[15]);
}
     65 
     66 static inline void round_shift_array_32_neon(int32x4_t *input,
     67                                             int32x4_t *output, const int size,
     68                                             const int bit) {
     69  const int32x4_t v_bit = vdupq_n_s32(-bit);
     70  for (int i = 0; i < size; i++) {
     71    output[i] = vrshlq_s32(input[i], v_bit);
     72  }
     73 }
     74 
     75 static inline void round_shift_rect_array_32_neon(int32x4_t *input,
     76                                                  int32x4_t *output,
     77                                                  const int size) {
     78  for (int i = 0; i < size; i++) {
     79    const int32x4_t r0 = vmulq_n_s32(input[i], NewInvSqrt2);
     80    output[i] = vrshrq_n_s32(r0, NewSqrt2Bits);
     81  }
     82 }
     83 
     84 static inline int32x4_t half_btf_neon_r(const int32_t *n0, const int32x4_t *w0,
     85                                        const int32_t *n1, const int32x4_t *w1,
     86                                        const int32x4_t *v_bit,
     87                                        const int32x4_t *rnding) {
     88  int32x4_t x;
     89  x = vmlaq_n_s32(*rnding, *w0, *n0);
     90  x = vmlaq_n_s32(x, *w1, *n1);
     91  x = vshlq_s32(x, *v_bit);
     92  return x;
     93 }
     94 
     95 static inline int32x4_t half_btf_neon_mode11_r(
     96    const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
     97    const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
     98  int32x4_t x;
     99  x = vmlaq_n_s32(*rnding, *w0, -*n0);
    100  x = vmlaq_n_s32(x, *w1, -*n1);
    101  x = vshlq_s32(x, *v_bit);
    102  return x;
    103 }
    104 
    105 static inline int32x4_t half_btf_neon_mode01_r(
    106    const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
    107    const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
    108  int32x4_t x;
    109  x = vmlaq_n_s32(*rnding, *w0, *n0);
    110  x = vmlsq_n_s32(x, *w1, *n1);
    111  x = vshlq_s32(x, *v_bit);
    112  return x;
    113 }
    114 
    115 static inline int32x4_t half_btf_neon_mode10_r(
    116    const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
    117    const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
    118  int32x4_t x;
    119  x = vmlaq_n_s32(*rnding, *w1, *n1);
    120  x = vmlsq_n_s32(x, *w0, *n0);
    121  x = vshlq_s32(x, *v_bit);
    122  return x;
    123 }
    124 
    125 static inline int32x4_t half_btf_0_neon_r(const int32_t *n0,
    126                                          const int32x4_t *w0,
    127                                          const int32x4_t *v_bit,
    128                                          const int32x4_t *rnding) {
    129  int32x4_t x;
    130  x = vmlaq_n_s32(*rnding, *w0, *n0);
    131  x = vshlq_s32(x, *v_bit);
    132  return x;
    133 }
    134 
    135 static inline int32x4_t half_btf_0_m_neon_r(const int32_t *n0,
    136                                            const int32x4_t *w0,
    137                                            const int32x4_t *v_bit,
    138                                            const int32x4_t *rnding) {
    139  int32x4_t x;
    140  x = vmlaq_n_s32(*rnding, *w0, -*n0);
    141  x = vshlq_s32(x, *v_bit);
    142  return x;
    143 }
    144 
    145 static inline void flip_buf_neon(int32x4_t *in, int32x4_t *out, int size) {
    146  for (int i = 0; i < size; ++i) {
    147    out[size - i - 1] = in[i];
    148  }
    149 }
    150 
// 1-D forward transform kernel operating on 4-lane column vectors.
typedef void (*fwd_transform_1d_neon)(int32x4_t *in, int32x4_t *out, int bit,
                                      const int num_cols);

// 1-D inverse transform kernel; `do_cols` distinguishes the column pass
// from the row pass, and `out_shift` is the rounding shift applied when
// `do_cols` is 0 (see idct4x4_neon below for the pattern).
typedef void (*transform_1d_neon)(int32x4_t *in, int32x4_t *out, int32_t bit,
                                  int32_t do_cols, int32_t bd,
                                  int32_t out_shift);
    157 
    158 static inline uint16x8_t highbd_clamp_u16(uint16x8_t *u, const uint16x8_t *min,
    159                                          const uint16x8_t *max) {
    160  int16x8_t clamped;
    161  clamped = vminq_s16(vreinterpretq_s16_u16(*u), vreinterpretq_s16_u16(*max));
    162  clamped = vmaxq_s16(clamped, vreinterpretq_s16_u16(*min));
    163  return vreinterpretq_u16_s16(clamped);
    164 }
    165 
    166 static inline void round_shift_4x4(int32x4_t *in, int shift) {
    167  if (shift != 0) {
    168    const int32x4_t v_shift = vdupq_n_s32(-shift);
    169    in[0] = vrshlq_s32(in[0], v_shift);
    170    in[1] = vrshlq_s32(in[1], v_shift);
    171    in[2] = vrshlq_s32(in[2], v_shift);
    172    in[3] = vrshlq_s32(in[3], v_shift);
    173  }
    174 }
    175 
    176 static void round_shift_8x8(int32x4_t *in, int shift) {
    177  assert(shift != 0);
    178  const int32x4_t v_shift = vdupq_n_s32(-shift);
    179  in[0] = vrshlq_s32(in[0], v_shift);
    180  in[1] = vrshlq_s32(in[1], v_shift);
    181  in[2] = vrshlq_s32(in[2], v_shift);
    182  in[3] = vrshlq_s32(in[3], v_shift);
    183  in[4] = vrshlq_s32(in[4], v_shift);
    184  in[5] = vrshlq_s32(in[5], v_shift);
    185  in[6] = vrshlq_s32(in[6], v_shift);
    186  in[7] = vrshlq_s32(in[7], v_shift);
    187  in[8] = vrshlq_s32(in[8], v_shift);
    188  in[9] = vrshlq_s32(in[9], v_shift);
    189  in[10] = vrshlq_s32(in[10], v_shift);
    190  in[11] = vrshlq_s32(in[11], v_shift);
    191  in[12] = vrshlq_s32(in[12], v_shift);
    192  in[13] = vrshlq_s32(in[13], v_shift);
    193  in[14] = vrshlq_s32(in[14], v_shift);
    194  in[15] = vrshlq_s32(in[15], v_shift);
    195 }
    196 
    197 static void highbd_clamp_s32_neon(int32x4_t *in, int32x4_t *out,
    198                                  const int32x4_t *clamp_lo,
    199                                  const int32x4_t *clamp_hi, int size) {
    200  int32x4_t a0, a1;
    201  for (int i = 0; i < size; i += 4) {
    202    a0 = vmaxq_s32(in[i], *clamp_lo);
    203    out[i] = vminq_s32(a0, *clamp_hi);
    204 
    205    a1 = vmaxq_s32(in[i + 1], *clamp_lo);
    206    out[i + 1] = vminq_s32(a1, *clamp_hi);
    207 
    208    a0 = vmaxq_s32(in[i + 2], *clamp_lo);
    209    out[i + 2] = vminq_s32(a0, *clamp_hi);
    210 
    211    a1 = vmaxq_s32(in[i + 3], *clamp_lo);
    212    out[i + 3] = vminq_s32(a1, *clamp_hi);
    213  }
    214 }
    215 
// Reconstruct eight high-bitdepth pixels: widen the 16-bit prediction,
// add the two 4-lane residual vectors (res0 = low half, res1 = high half),
// clamp to [0, (1 << bd) - 1], and narrow back to 16 bits.
static inline uint16x8_t highbd_get_recon_8x8_neon(const uint16x8_t pred,
                                                   int32x4_t res0,
                                                   int32x4_t res1,
                                                   const int bd) {
  const uint16x8_t v_zero = vdupq_n_u16(0);
  // min_clip_val is just an all-zero vector reinterpreted as s32.
  int32x4_t min_clip_val = vreinterpretq_s32_u16(v_zero);
  int32x4_t max_clip_val = vdupq_n_s32((1 << bd) - 1);
  uint16x8_t x;
  // pred lanes are reinterpreted as s16 before widening; this assumes
  // prediction values fit in 15 bits (true for bd <= 12).
  x.val[0] = vreinterpretq_u16_s32(
      vaddw_s16(res0, vreinterpret_s16_u16(vget_low_u16(pred))));
  x.val[1] = vreinterpretq_u16_s32(
      vaddw_s16(res1, vreinterpret_s16_u16(vget_high_u16(pred))));
  // Clamp both 32-bit halves to the valid pixel range.
  x.val[0] = vreinterpretq_u16_s32(
      vmaxq_s32(vreinterpretq_s32_u16(x.val[0]), min_clip_val));
  x.val[0] = vreinterpretq_u16_s32(
      vminq_s32(vreinterpretq_s32_u16(x.val[0]), max_clip_val));
  x.val[1] = vreinterpretq_u16_s32(
      vmaxq_s32(vreinterpretq_s32_u16(x.val[1]), min_clip_val));
  x.val[1] = vreinterpretq_u16_s32(
      vminq_s32(vreinterpretq_s32_u16(x.val[1]), max_clip_val));
  // Saturating-narrow the clamped 32-bit results back to u16 lanes.
  uint16x8_t res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_u16(x.val[0])),
                                vqmovn_u32(vreinterpretq_u32_u16(x.val[1])));
  return res;
}
    240 
// Reconstruct four high-bitdepth pixels: pred (u16) + res0 (s32 residual),
// narrowed to 16 bits and clamped to [0, (1 << bd) - 1].
static inline uint16x4_t highbd_get_recon_4xn_neon(uint16x4_t pred,
                                                   int32x4_t res0,
                                                   const int bd) {
  uint16x4_t x0_ = vreinterpret_u16_s16(
      vmovn_s32(vaddw_s16(res0, vreinterpret_s16_u16(pred))));
  // Duplicate into a q-register so the 8-lane clamp helper can be reused;
  // only the low half is returned.
  uint16x8_t x0 = vcombine_u16(x0_, x0_);
  const uint16x8_t vmin = vdupq_n_u16(0);
  const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1);
  x0 = highbd_clamp_u16(&x0, &vmin, &vmax);
  return vget_low_u16(x0);
}
    252 
    253 static inline void highbd_write_buffer_4xn_neon(int32x4_t *in, uint16_t *output,
    254                                                int stride, int flipud,
    255                                                int height, const int bd) {
    256  int j = flipud ? (height - 1) : 0;
    257  const int step = flipud ? -1 : 1;
    258  for (int i = 0; i < height; ++i, j += step) {
    259    uint16x4_t v = vld1_u16(output + i * stride);
    260    uint16x4_t u = highbd_get_recon_4xn_neon(v, in[j], bd);
    261 
    262    vst1_u16(output + i * stride, u);
    263  }
    264 }
    265 
    266 static inline void highbd_write_buffer_8xn_neon(int32x4_t *in, uint16_t *output,
    267                                                int stride, int flipud,
    268                                                int height, const int bd) {
    269  int j = flipud ? (height - 1) : 0;
    270  const int step = flipud ? -1 : 1;
    271  for (int i = 0; i < height; ++i, j += step) {
    272    uint16x8_t v = vld1q_u16(output + i * stride);
    273    uint16x8_t u = highbd_get_recon_8x8_neon(v, in[j], in[j + height], bd);
    274 
    275    vst1q_u16(output + i * stride, u);
    276  }
    277 }
    278 
    279 static inline void load_buffer_32bit_input(const int32_t *in, int stride,
    280                                           int32x4_t *out, int out_size) {
    281  for (int i = 0; i < out_size; ++i) {
    282    out[i] = vld1q_s32(in + i * stride);
    283  }
    284 }
    285 
    286 static inline void load_buffer_4x4(const int32_t *coeff, int32x4_t *in) {
    287  in[0] = vld1q_s32(coeff + 0);
    288  in[1] = vld1q_s32(coeff + 4);
    289  in[2] = vld1q_s32(coeff + 8);
    290  in[3] = vld1q_s32(coeff + 12);
    291 }
    292 
    293 static void addsub_neon(const int32x4_t in0, const int32x4_t in1,
    294                        int32x4_t *out0, int32x4_t *out1,
    295                        const int32x4_t *clamp_lo, const int32x4_t *clamp_hi) {
    296  int32x4_t a0 = vaddq_s32(in0, in1);
    297  int32x4_t a1 = vsubq_s32(in0, in1);
    298 
    299  a0 = vmaxq_s32(a0, *clamp_lo);
    300  a0 = vminq_s32(a0, *clamp_hi);
    301  a1 = vmaxq_s32(a1, *clamp_lo);
    302  a1 = vminq_s32(a1, *clamp_hi);
    303 
    304  *out0 = a0;
    305  *out1 = a1;
    306 }
    307 
    308 static void shift_and_clamp_neon(int32x4_t *in0, int32x4_t *in1,
    309                                 const int32x4_t *clamp_lo,
    310                                 const int32x4_t *clamp_hi,
    311                                 const int32x4_t *v_shift) {
    312  int32x4_t in0_w_offset = vrshlq_s32(*in0, *v_shift);
    313  int32x4_t in1_w_offset = vrshlq_s32(*in1, *v_shift);
    314 
    315  in0_w_offset = vmaxq_s32(in0_w_offset, *clamp_lo);
    316  in0_w_offset = vminq_s32(in0_w_offset, *clamp_hi);
    317  in1_w_offset = vmaxq_s32(in1_w_offset, *clamp_lo);
    318  in1_w_offset = vminq_s32(in1_w_offset, *clamp_hi);
    319 
    320  *in0 = in0_w_offset;
    321  *in1 = in1_w_offset;
    322 }
    323 
// idct32 stage 4: rotate the butterfly pairs (17,30), (18,29), (21,26),
// (22,25) by the cospi[8]/cospi[56] and cospi[40]/cospi[24] angles.
// temp1/temp2 hold the new low element until the high element is computed
// from the original values.
static inline void idct32_stage4_neon(int32x4_t *bf1, const int32_t *cospi,
                                      const int32x4_t *v_bit,
                                      const int32x4_t *rnding) {
  int32x4_t temp1, temp2;
  temp1 = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30],
                                 v_bit, rnding);
  bf1[30] =
      half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], v_bit, rnding);
  bf1[17] = temp1;

  temp2 = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], &bf1[29],
                                 v_bit, rnding);
  bf1[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29],
                                   v_bit, rnding);
  bf1[18] = temp2;

  temp1 = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26],
                                 v_bit, rnding);
  bf1[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], v_bit,
                            rnding);
  bf1[21] = temp1;

  temp2 = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25],
                                 v_bit, rnding);
  bf1[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25],
                                   v_bit, rnding);
  bf1[22] = temp2;
}
    352 
// idct32 stage 5: rotate pairs (9,14) and (10,13) by cospi[16]/cospi[48],
// then run the clamped add/sub butterflies on elements 16..31.
static inline void idct32_stage5_neon(int32x4_t *bf1, const int32_t *cospi,
                                      const int32x4_t *clamp_lo,
                                      const int32x4_t *clamp_hi,
                                      const int32x4_t *v_bit,
                                      const int32x4_t *rnding) {
  int32x4_t temp1, temp2;
  temp1 = half_btf_neon_mode10_r(&cospi[16], &bf1[9], &cospi[48], &bf1[14],
                                 v_bit, rnding);
  bf1[14] =
      half_btf_neon_r(&cospi[48], &bf1[9], &cospi[16], &bf1[14], v_bit, rnding);
  bf1[9] = temp1;

  temp2 = half_btf_neon_mode11_r(&cospi[48], &bf1[10], &cospi[16], &bf1[13],
                                 v_bit, rnding);
  bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf1[10], &cospi[48], &bf1[13],
                                   v_bit, rnding);
  bf1[10] = temp2;

  addsub_neon(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
  addsub_neon(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
  addsub_neon(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
  addsub_neon(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
  addsub_neon(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
  addsub_neon(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
  addsub_neon(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
  addsub_neon(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
}
    380 
// idct32 stage 6: cospi[32] rotation on (5,6), butterflies on 8..15, and
// cospi[16]/cospi[48] rotations on the (18,29), (19,28), (20,27), (21,26)
// pairs of the 16..31 half.
static inline void idct32_stage6_neon(int32x4_t *bf1, const int32_t *cospi,
                                      const int32x4_t *clamp_lo,
                                      const int32x4_t *clamp_hi,
                                      const int32x4_t *v_bit,
                                      const int32x4_t *rnding) {
  int32x4_t temp1, temp2;
  temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6],
                                 v_bit, rnding);
  bf1[6] =
      half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], v_bit, rnding);
  bf1[5] = temp1;

  addsub_neon(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
  addsub_neon(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
  addsub_neon(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
  addsub_neon(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);

  temp1 = half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29],
                                 v_bit, rnding);
  bf1[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], v_bit,
                            rnding);
  bf1[18] = temp1;
  temp2 = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28],
                                 v_bit, rnding);
  bf1[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], v_bit,
                            rnding);
  bf1[19] = temp2;
  temp1 = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27],
                                 v_bit, rnding);
  bf1[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27],
                                   v_bit, rnding);
  bf1[20] = temp1;
  temp2 = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26],
                                 v_bit, rnding);
  bf1[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26],
                                   v_bit, rnding);
  bf1[21] = temp2;
}
    419 
// idct32 stage 7: butterflies on 0..7 and 16..31, plus cospi[32]
// rotations on the (10,13) and (11,12) pairs.
static inline void idct32_stage7_neon(int32x4_t *bf1, const int32_t *cospi,
                                      const int32x4_t *clamp_lo,
                                      const int32x4_t *clamp_hi,
                                      const int32x4_t *v_bit,
                                      const int32x4_t *rnding) {
  int32x4_t temp1, temp2;
  addsub_neon(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
  addsub_neon(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
  addsub_neon(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
  addsub_neon(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
  temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[10], &cospi[32], &bf1[13],
                                 v_bit, rnding);
  bf1[13] = half_btf_neon_r(&cospi[32], &bf1[10], &cospi[32], &bf1[13], v_bit,
                            rnding);
  bf1[10] = temp1;
  temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[11], &cospi[32], &bf1[12],
                                 v_bit, rnding);
  bf1[12] = half_btf_neon_r(&cospi[32], &bf1[11], &cospi[32], &bf1[12], v_bit,
                            rnding);
  bf1[11] = temp2;

  addsub_neon(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
  addsub_neon(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
  addsub_neon(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
  addsub_neon(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
  addsub_neon(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
  addsub_neon(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
  addsub_neon(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
  addsub_neon(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
}
    450 
// idct32 stage 8: butterflies on 0..15 and cospi[32] rotations on the
// (20,27), (21,26), (22,25), (23,24) pairs.
static inline void idct32_stage8_neon(int32x4_t *bf1, const int32_t *cospi,
                                      const int32x4_t *clamp_lo,
                                      const int32x4_t *clamp_hi,
                                      const int32x4_t *v_bit,
                                      const int32x4_t *rnding) {
  int32x4_t temp1, temp2;
  addsub_neon(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
  addsub_neon(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
  addsub_neon(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
  addsub_neon(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
  addsub_neon(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
  addsub_neon(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
  addsub_neon(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
  addsub_neon(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
  temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27],
                                 v_bit, rnding);
  bf1[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], v_bit,
                            rnding);
  bf1[20] = temp1;
  temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26],
                                 v_bit, rnding);
  bf1[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], v_bit,
                            rnding);
  bf1[21] = temp2;
  temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25],
                                 v_bit, rnding);
  bf1[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], v_bit,
                            rnding);
  bf1[22] = temp1;
  temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24],
                                 v_bit, rnding);
  bf1[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], v_bit,
                            rnding);
  bf1[23] = temp2;
}
    486 
// idct32 stage 9 (final): the full mirror butterfly bf1[i] +/- bf1[31-i]
// producing the 32 outputs. On the row pass (!do_cols), the outputs are
// additionally rounded down by out_shift and clamped to the intermediate
// range for bit depth `bd`.
static inline void idct32_stage9_neon(int32x4_t *bf1, int32x4_t *out,
                                      const int do_cols, const int bd,
                                      const int out_shift,
                                      const int32x4_t *clamp_lo,
                                      const int32x4_t *clamp_hi) {
  addsub_neon(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
  addsub_neon(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
  addsub_neon(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
  addsub_neon(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
  addsub_neon(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
  addsub_neon(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
  addsub_neon(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
  addsub_neon(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
  addsub_neon(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
  addsub_neon(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
  addsub_neon(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
  addsub_neon(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
  addsub_neon(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
  addsub_neon(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
  addsub_neon(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
  addsub_neon(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
    for (int i = 0; i < 32; i += 8) {
      round_shift_4x4(out + i, out_shift);
      round_shift_4x4(out + i + 4, out_shift);
    }
    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);
  }
}
    520 
    521 static void neg_shift_neon(const int32x4_t *in0, const int32x4_t *in1,
    522                           int32x4_t *out0, int32x4_t *out1,
    523                           const int32x4_t *clamp_lo, const int32x4_t *clamp_hi,
    524                           const int32x4_t *v_shift, int32x4_t *offset) {
    525  int32x4_t a0 = vaddq_s32(*offset, *in0);
    526  int32x4_t a1 = vsubq_s32(*offset, *in1);
    527 
    528  a0 = vshlq_s32(a0, *v_shift);
    529  a1 = vshlq_s32(a1, *v_shift);
    530 
    531  a0 = vmaxq_s32(a0, *clamp_lo);
    532  a0 = vminq_s32(a0, *clamp_hi);
    533  a1 = vmaxq_s32(a1, *clamp_lo);
    534  a1 = vminq_s32(a1, *clamp_hi);
    535 
    536  *out0 = a0;
    537  *out1 = a1;
    538 }
    539 
// 4-point inverse DCT on four 4-lane columns. `bit` selects the cospi
// table and the rounding shift applied inside each butterfly; on the row
// pass (!do_cols) results are further shifted by out_shift and re-clamped
// to the narrower inter-pass range.
static void idct4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
                         int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  // Intermediate range: bd + 8 bits during the column pass, bd + 6 after.
  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
  int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
  int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));

  int32x4_t u0, u1, u2, u3;
  int32x4_t v0, v1, v2, v3, x, y;

  // Stage 0-1-2

  u0 = in[0];
  u1 = in[1];
  u2 = in[2];
  u3 = in[3];

  const int32x4_t v_bit = vdupq_n_s32(-bit);

  // v0/v1 = (u0 +/- u2) * cospi[32], rounded and shifted down by `bit`.
  x = vmlaq_n_s32(rnding, u0, cospi[32]);
  y = vmulq_n_s32(u2, cospi[32]);
  v0 = vaddq_s32(x, y);
  v0 = vshlq_s32(v0, v_bit);

  v1 = vsubq_s32(x, y);
  v1 = vshlq_s32(v1, v_bit);

  // v2 = u1 * cospi[48] - u3 * cospi[16]; v3 = u1 * cospi[16] + u3 * cospi[48].
  x = vmlaq_n_s32(rnding, u1, cospi[48]);
  v2 = vmlsq_n_s32(x, u3, cospi[16]);
  v2 = vshlq_s32(v2, v_bit);

  x = vmlaq_n_s32(rnding, u1, cospi[16]);
  v3 = vmlaq_n_s32(x, u3, cospi[48]);
  v3 = vshlq_s32(v3, v_bit);
  // Stage 3
  addsub_neon(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
  addsub_neon(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);

  if (!do_cols) {
    // Row pass: apply the inter-pass rounding shift and tighter clamp.
    log_range = AOMMAX(16, bd + 6);
    clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
    clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
    shift_and_clamp_neon(out + 0, out + 3, &clamp_lo, &clamp_hi, &v_shift);
    shift_and_clamp_neon(out + 1, out + 2, &clamp_lo, &clamp_hi, &v_shift);
  }
}
    588 
    589 static void iadst4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
    590                          int bd, int out_shift) {
    591  const int32_t *sinpi = sinpi_arr(bit);
    592  const int32x4_t zero = vdupq_n_s32(0);
    593  int64x2_t rnding = vdupq_n_s64(1ll << (bit + 4 - 1));
    594  const int32x2_t mul = vdup_n_s32(1 << 4);
    595  int32x4_t t;
    596  int32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
    597  int32x4_t x0, x1, x2, x3;
    598  int32x4_t u0, u1, u2, u3;
    599 
    600  x0 = in[0];
    601  x1 = in[1];
    602  x2 = in[2];
    603  x3 = in[3];
    604 
    605  s0 = vmulq_n_s32(x0, sinpi[1]);
    606  s1 = vmulq_n_s32(x0, sinpi[2]);
    607  s2 = vmulq_n_s32(x1, sinpi[3]);
    608  s3 = vmulq_n_s32(x2, sinpi[4]);
    609  s4 = vmulq_n_s32(x2, sinpi[1]);
    610  s5 = vmulq_n_s32(x3, sinpi[2]);
    611  s6 = vmulq_n_s32(x3, sinpi[4]);
    612  t = vsubq_s32(x0, x2);
    613  s7 = vaddq_s32(t, x3);
    614 
    615  t = vaddq_s32(s0, s3);
    616  s0 = vaddq_s32(t, s5);
    617  t = vsubq_s32(s1, s4);
    618  s1 = vsubq_s32(t, s6);
    619  s3 = s2;
    620  s2 = vmulq_n_s32(s7, sinpi[3]);
    621 
    622  u0 = vaddq_s32(s0, s3);
    623  u1 = vaddq_s32(s1, s3);
    624  u2 = s2;
    625  t = vaddq_s32(s0, s1);
    626  u3 = vsubq_s32(t, s3);
    627 
    628  // u0
    629  int32x4x2_t u0x;
    630  u0x.val[0] = vreinterpretq_s32_s64(
    631      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u0)), mul));
    632  u0x.val[0] = vreinterpretq_s32_s64(
    633      vaddq_s64(vreinterpretq_s64_s32(u0x.val[0]), rnding));
    634 
    635  u0 = vextq_s32(u0, zero, 1);
    636  u0x.val[1] = vreinterpretq_s32_s64(
    637      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u0)), mul));
    638  u0x.val[1] = vreinterpretq_s32_s64(
    639      vaddq_s64(vreinterpretq_s64_s32(u0x.val[1]), rnding));
    640 
    641  u0x.val[0] = vreinterpretq_s32_s16(vextq_s16(
    642      vreinterpretq_s16_s32(u0x.val[0]), vreinterpretq_s16_s32(zero), 1));
    643  u0x.val[1] = vreinterpretq_s32_s16(vextq_s16(
    644      vreinterpretq_s16_s32(u0x.val[1]), vreinterpretq_s16_s32(zero), 1));
    645 
    646  u0x = vzipq_s32(u0x.val[0], u0x.val[1]);
    647 #if AOM_ARCH_AARCH64
    648  u0 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u0x.val[0]),
    649                                        vreinterpretq_s64_s32(u0x.val[1])));
    650 #else
    651  u0 = vcombine_s32(vget_low_s32(u0x.val[0]), vget_low_s32(u0x.val[1]));
    652 #endif  // AOM_ARCH_AARCH64
    653  // u1
    654  int32x4x2_t u1x;
    655  u1x.val[0] = vreinterpretq_s32_s64(
    656      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u1)), mul));
    657  u1x.val[0] = vreinterpretq_s32_s64(
    658      vaddq_s64(vreinterpretq_s64_s32(u1x.val[0]), rnding));
    659 
    660  u1 = vextq_s32(u1, zero, 1);
    661  u1x.val[1] = vreinterpretq_s32_s64(
    662      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u1)), mul));
    663  u1x.val[1] = vreinterpretq_s32_s64(
    664      vaddq_s64(vreinterpretq_s64_s32(u1x.val[1]), rnding));
    665 
    666  u1x.val[0] = vreinterpretq_s32_s16(vextq_s16(
    667      vreinterpretq_s16_s32(u1x.val[0]), vreinterpretq_s16_s32(zero), 1));
    668  u1x.val[1] = vreinterpretq_s32_s16(vextq_s16(
    669      vreinterpretq_s16_s32(u1x.val[1]), vreinterpretq_s16_s32(zero), 1));
    670 
    671  u1x = vzipq_s32(u1x.val[0], u1x.val[1]);
    672 #if AOM_ARCH_AARCH64
    673  u1 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u1x.val[0]),
    674                                        vreinterpretq_s64_s32(u1x.val[1])));
    675 #else
    676  u1 = vcombine_s32(vget_low_s32(u1x.val[0]), vget_low_s32(u1x.val[1]));
    677 #endif  // AOM_ARCH_AARCH64
    678 
    679  // u2
    680  int32x4x2_t u2x;
    681  u2x.val[0] = vreinterpretq_s32_s64(
    682      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u2)), mul));
    683  u2x.val[0] = vreinterpretq_s32_s64(
    684      vaddq_s64(vreinterpretq_s64_s32(u2x.val[0]), rnding));
    685 
    686  u2 = vextq_s32(u2, zero, 1);
    687  u2x.val[1] = vreinterpretq_s32_s64(
    688      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u2)), mul));
    689  u2x.val[1] = vreinterpretq_s32_s64(
    690      vaddq_s64(vreinterpretq_s64_s32(u2x.val[1]), rnding));
    691 
    692  u2x.val[0] = vreinterpretq_s32_s16(vextq_s16(
    693      vreinterpretq_s16_s32(u2x.val[0]), vreinterpretq_s16_s32(zero), 1));
    694  u2x.val[1] = vreinterpretq_s32_s16(vextq_s16(
    695      vreinterpretq_s16_s32(u2x.val[1]), vreinterpretq_s16_s32(zero), 1));
    696 
    697  u2x = vzipq_s32(u2x.val[0], u2x.val[1]);
    698 #if AOM_ARCH_AARCH64
    699  u2 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u2x.val[0]),
    700                                        vreinterpretq_s64_s32(u2x.val[1])));
    701 #else
    702  u2 = vcombine_s32(vget_low_s32(u2x.val[0]), vget_low_s32(u2x.val[1]));
    703 #endif  // AOM_ARCH_AARCH64
    704 
    705  // u3
    706  int32x4x2_t u3x;
    707  u3x.val[0] = vreinterpretq_s32_s64(
    708      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u3)), mul));
    709  u3x.val[0] = vreinterpretq_s32_s64(
    710      vaddq_s64(vreinterpretq_s64_s32(u3x.val[0]), rnding));
    711 
    712  u3 = vextq_s32(u3, zero, 1);
    713  u3x.val[1] = vreinterpretq_s32_s64(
    714      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u3)), mul));
    715  u3x.val[1] = vreinterpretq_s32_s64(
    716      vaddq_s64(vreinterpretq_s64_s32(u3x.val[1]), rnding));
    717 
    718  u3x.val[0] = vreinterpretq_s32_s16(vextq_s16(
    719      vreinterpretq_s16_s32(u3x.val[0]), vreinterpretq_s16_s32(zero), 1));
    720  u3x.val[1] = vreinterpretq_s32_s16(vextq_s16(
    721      vreinterpretq_s16_s32(u3x.val[1]), vreinterpretq_s16_s32(zero), 1));
    722 
    723  u3x = vzipq_s32(u3x.val[0], u3x.val[1]);
    724 #if AOM_ARCH_AARCH64
    725  u3 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u3x.val[0]),
    726                                        vreinterpretq_s64_s32(u3x.val[1])));
    727 #else
    728  u3 = vcombine_s32(vget_low_s32(u3x.val[0]), vget_low_s32(u3x.val[1]));
    729 #endif  // AOM_ARCH_AARCH64
    730 
    731  out[0] = u0;
    732  out[1] = u1;
    733  out[2] = u2;
    734  out[3] = u3;
    735 
    736  if (!do_cols) {
    737    const int log_range = AOMMAX(16, bd + 6);
    738    const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
    739    const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
    740    round_shift_4x4(out, out_shift);
    741    highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4);
    742  }
    743 }
    744 
    745 static void write_buffer_4x4(int32x4_t *in, uint16_t *output, int stride,
    746                             int fliplr, int flipud, int shift, int bd) {
    747  uint32x4_t u0, u1, u2, u3;
    748  uint16x4_t v0, v1, v2, v3;
    749  round_shift_4x4(in, shift);
    750 
    751  v0 = vld1_u16(output + 0 * stride);
    752  v1 = vld1_u16(output + 1 * stride);
    753  v2 = vld1_u16(output + 2 * stride);
    754  v3 = vld1_u16(output + 3 * stride);
    755 
    756  if (fliplr) {
    757    u0 = vrev64q_u32(vreinterpretq_u32_s32(in[0]));
    758    in[0] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
    759    u0 = vrev64q_u32(vreinterpretq_u32_s32(in[1]));
    760    in[1] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
    761    u0 = vrev64q_u32(vreinterpretq_u32_s32(in[2]));
    762    in[2] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
    763    u0 = vrev64q_u32(vreinterpretq_u32_s32(in[3]));
    764    in[3] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
    765  }
    766 
    767  if (flipud) {
    768    u0 = vaddw_u16(vreinterpretq_u32_s32(in[3]), v0);
    769    u1 = vaddw_u16(vreinterpretq_u32_s32(in[2]), v1);
    770    u2 = vaddw_u16(vreinterpretq_u32_s32(in[1]), v2);
    771    u3 = vaddw_u16(vreinterpretq_u32_s32(in[0]), v3);
    772  } else {
    773    u0 = vaddw_u16(vreinterpretq_u32_s32(in[0]), v0);
    774    u1 = vaddw_u16(vreinterpretq_u32_s32(in[1]), v1);
    775    u2 = vaddw_u16(vreinterpretq_u32_s32(in[2]), v2);
    776    u3 = vaddw_u16(vreinterpretq_u32_s32(in[3]), v3);
    777  }
    778 
    779  uint16x8_t u4 = vcombine_u16(vqmovn_u32(u0), vqmovn_u32(u1));
    780  uint16x8_t u5 = vcombine_u16(vqmovn_u32(u2), vqmovn_u32(u3));
    781  const uint16x8_t vmin = vdupq_n_u16(0);
    782  const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1);
    783  u4 = highbd_clamp_u16(&u4, &vmin, &vmax);
    784  u5 = highbd_clamp_u16(&u5, &vmin, &vmax);
    785 
    786  vst1_u16(output + 0 * stride, vget_low_u16(u4));
    787  vst1_u16(output + 1 * stride, vget_high_u16(u4));
    788  vst1_u16(output + 2 * stride, vget_low_u16(u5));
    789  vst1_u16(output + 3 * stride, vget_high_u16(u5));
    790 }
    791 
// 4-point identity "transform": scales each coefficient by sqrt(2),
// implemented as the fixed-point multiply (x * NewSqrt2 + round) >> NewSqrt2Bits
// computed in 64-bit precision to avoid overflow. The inter-stage `bit`
// parameter is unused because no trigonometric butterfly is performed.
static void iidentity4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
                            int bd, int out_shift) {
  (void)bit;
  int32x4_t zero = vdupq_n_s32(0);
  int32x2_t fact = vdup_n_s32(NewSqrt2);
  int32x4x2_t a0;
  // Rounding offset added before the arithmetic right shift by NewSqrt2Bits.
  const int64x2_t rnding = vdupq_n_s64(1 << (NewSqrt2Bits - 1));

  for (int i = 0; i < 4; i++) {
    // Lanes 0 and 2 of in[i]: vmovn_s64 on the s64 reinterpretation keeps the
    // low 32 bits of each 64-bit lane, i.e. the even s32 lanes. Widening
    // multiply-accumulate onto the rounding constant, then shift back down.
    a0.val[0] = vreinterpretq_s32_s64(
        vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(in[i])), fact));
    a0.val[0] = vreinterpretq_s32_s64(
        vshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits));
    // Shift the vector left by one lane so the odd lanes (1 and 3) move into
    // the even positions, then apply the same multiply/round/shift.
    a0.val[1] = vextq_s32(in[i], zero, 1);
    a0.val[1] = vreinterpretq_s32_s64(
        vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact));
    a0.val[1] = vreinterpretq_s32_s64(
        vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits));

    // Re-interleave the even/odd results back into their original lane order.
    a0 = vzipq_s32(a0.val[0], a0.val[1]);
#if AOM_ARCH_AARCH64
    out[i] = vreinterpretq_s32_s64(vzip1q_s64(
        vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1])));
#else
    // AArch32 fallback: same low-half combine built from two vext operations.
    out[i] = vextq_s32(vextq_s32(a0.val[0], a0.val[0], 2), a0.val[1], 2);
#endif
  }
  if (!do_cols) {
    // Row pass: apply the inter-pass rounding shift and clamp to the
    // intermediate dynamic range.
    const int log_range = AOMMAX(16, bd + 6);
    const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
    const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
    round_shift_4x4(out, out_shift);
    highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4);
  }
}
    827 
    828 void av1_inv_txfm2d_add_4x4_neon(const int32_t *input, uint16_t *output,
    829                                 int stride, TX_TYPE tx_type, int bd) {
    830  int32x4_t in[4];
    831 
    832  const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];
    833 
    834  switch (tx_type) {
    835    case DCT_DCT:
    836      load_buffer_4x4(input, in);
    837      idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
    838      transpose_4x4(in, in);
    839      idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
    840      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
    841      break;
    842    case ADST_DCT:
    843      load_buffer_4x4(input, in);
    844      idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
    845      transpose_4x4(in, in);
    846      iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
    847      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
    848      break;
    849    case DCT_ADST:
    850      load_buffer_4x4(input, in);
    851      iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
    852      transpose_4x4(in, in);
    853      idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
    854      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
    855      break;
    856    case ADST_ADST:
    857      load_buffer_4x4(input, in);
    858      iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
    859      transpose_4x4(in, in);
    860      iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
    861      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
    862      break;
    863    case FLIPADST_DCT:
    864      load_buffer_4x4(input, in);
    865      idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
    866      transpose_4x4(in, in);
    867      iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
    868      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
    869      break;
    870    case DCT_FLIPADST:
    871      load_buffer_4x4(input, in);
    872      iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
    873      transpose_4x4(in, in);
    874      idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
    875      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
    876      break;
    877    case FLIPADST_FLIPADST:
    878      load_buffer_4x4(input, in);
    879      iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
    880      transpose_4x4(in, in);
    881      iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
    882      write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
    883      break;
    884    case ADST_FLIPADST:
    885      load_buffer_4x4(input, in);
    886      iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
    887      transpose_4x4(in, in);
    888      iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
    889      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
    890      break;
    891    case FLIPADST_ADST:
    892      load_buffer_4x4(input, in);
    893      iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
    894      transpose_4x4(in, in);
    895      iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
    896      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
    897      break;
    898    case IDTX:
    899      load_buffer_4x4(input, in);
    900      iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0);
    901      transpose_4x4(in, in);
    902      iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0);
    903      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
    904      break;
    905    case V_DCT:
    906      load_buffer_4x4(input, in);
    907      iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0);
    908      transpose_4x4(in, in);
    909      idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
    910      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
    911      break;
    912    case H_DCT:
    913      load_buffer_4x4(input, in);
    914      idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
    915      transpose_4x4(in, in);
    916      iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0);
    917      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
    918      break;
    919    case V_ADST:
    920      load_buffer_4x4(input, in);
    921      iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0);
    922      transpose_4x4(in, in);
    923      iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
    924      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
    925      break;
    926    case H_ADST:
    927      load_buffer_4x4(input, in);
    928      iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
    929      transpose_4x4(in, in);
    930      iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0);
    931      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
    932      break;
    933    case V_FLIPADST:
    934      load_buffer_4x4(input, in);
    935      iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0);
    936      transpose_4x4(in, in);
    937      iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
    938      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
    939      break;
    940    case H_FLIPADST:
    941      load_buffer_4x4(input, in);
    942      iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
    943      transpose_4x4(in, in);
    944      iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0);
    945      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
    946      break;
    947    default: assert(0);
    948  }
    949 }
    950 
    951 // 8x8
    952 static void load_buffer_8x8(const int32_t *coeff, int32x4_t *in) {
    953  in[0] = vld1q_s32(coeff + 0);
    954  in[1] = vld1q_s32(coeff + 4);
    955  in[2] = vld1q_s32(coeff + 8);
    956  in[3] = vld1q_s32(coeff + 12);
    957  in[4] = vld1q_s32(coeff + 16);
    958  in[5] = vld1q_s32(coeff + 20);
    959  in[6] = vld1q_s32(coeff + 24);
    960  in[7] = vld1q_s32(coeff + 28);
    961  in[8] = vld1q_s32(coeff + 32);
    962  in[9] = vld1q_s32(coeff + 36);
    963  in[10] = vld1q_s32(coeff + 40);
    964  in[11] = vld1q_s32(coeff + 44);
    965  in[12] = vld1q_s32(coeff + 48);
    966  in[13] = vld1q_s32(coeff + 52);
    967  in[14] = vld1q_s32(coeff + 56);
    968  in[15] = vld1q_s32(coeff + 60);
    969 }
    970 
// 8-point inverse DCT over an 8x8 block held as 16 int32x4 vectors.
// `bit` is the conversion precision of the cospi constants; `do_cols`
// selects the column (final) pass, which skips the output rounding/clamp;
// `out_shift` is the inter-pass down-shift applied in the row pass.
static void idct8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
                         int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  // Intermediate dynamic range: wider (bd + 8) for the row pass.
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
  int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
  int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
  int32x4_t x, y;
  int col;
  // Rounding term and negated shift amount for the fixed-point butterflies:
  // each product is rounded by `rnding` then arithmetic-shifted right by
  // `bit` (vshlq with a negative count shifts right).
  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
  const int32x4_t v_bit = vdupq_n_s32(-bit);
  // Note:
  //  Even column: 0, 2, ..., 14
  //  Odd column: 1, 3, ..., 15
  //  one even column plus one odd column constructs one row (8 coeffs)
  //  total we have 8 rows (8x8).
  for (col = 0; col < 2; ++col) {
    // stage 0
    // stage 1
    // stage 2
    // Even part passes straight through; odd part is rotated by the
    // (8, 56) and (40, 24) cospi pairs.
    u0 = in[0 * 2 + col];
    u1 = in[4 * 2 + col];
    u2 = in[2 * 2 + col];
    u3 = in[6 * 2 + col];

    x = vmulq_n_s32(in[1 * 2 + col], cospi[56]);
    u4 = vmlaq_n_s32(x, in[7 * 2 + col], -cospi[8]);
    u4 = vaddq_s32(u4, rnding);
    u4 = vshlq_s32(u4, v_bit);

    x = vmulq_n_s32(in[1 * 2 + col], cospi[8]);
    u7 = vmlaq_n_s32(x, in[7 * 2 + col], cospi[56]);
    u7 = vaddq_s32(u7, rnding);
    u7 = vshlq_s32(u7, v_bit);

    x = vmulq_n_s32(in[5 * 2 + col], cospi[24]);
    u5 = vmlaq_n_s32(x, in[3 * 2 + col], -cospi[40]);
    u5 = vaddq_s32(u5, rnding);
    u5 = vshlq_s32(u5, v_bit);

    x = vmulq_n_s32(in[5 * 2 + col], cospi[40]);
    u6 = vmlaq_n_s32(x, in[3 * 2 + col], cospi[24]);
    u6 = vaddq_s32(u6, rnding);
    u6 = vshlq_s32(u6, v_bit);

    // stage 3
    // 4-point inverse DCT of the even part; butterflies of the odd part.
    x = vmulq_n_s32(u0, cospi[32]);
    y = vmulq_n_s32(u1, cospi[32]);
    v0 = vaddq_s32(x, y);
    v0 = vaddq_s32(v0, rnding);
    v0 = vshlq_s32(v0, v_bit);

    v1 = vsubq_s32(x, y);
    v1 = vaddq_s32(v1, rnding);
    v1 = vshlq_s32(v1, v_bit);

    x = vmulq_n_s32(u2, cospi[48]);
    v2 = vmlaq_n_s32(x, u3, -cospi[16]);
    v2 = vaddq_s32(v2, rnding);
    v2 = vshlq_s32(v2, v_bit);

    x = vmulq_n_s32(u2, cospi[16]);
    v3 = vmlaq_n_s32(x, u3, cospi[48]);
    v3 = vaddq_s32(v3, rnding);
    v3 = vshlq_s32(v3, v_bit);

    addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
    addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

    // stage 4
    addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
    addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
    u4 = v4;
    u7 = v7;

    // Middle odd terms rotated by cospi[32] (i.e. scaled by 1/sqrt(2)).
    x = vmulq_n_s32(v5, cospi[32]);
    y = vmulq_n_s32(v6, cospi[32]);
    u6 = vaddq_s32(y, x);
    u6 = vaddq_s32(u6, rnding);
    u6 = vshlq_s32(u6, v_bit);

    u5 = vsubq_s32(y, x);
    u5 = vaddq_s32(u5, rnding);
    u5 = vshlq_s32(u5, v_bit);

    // stage 5
    // Final butterfly writes the outputs in mirrored pairs (k, 7 - k).
    addsub_neon(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo,
                &clamp_hi);
    addsub_neon(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo,
                &clamp_hi);
    addsub_neon(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo,
                &clamp_hi);
    addsub_neon(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo,
                &clamp_hi);
  }

  if (!do_cols) {
    // Row pass: inter-pass rounding shift and clamp to the output range.
    const int log_range_out = AOMMAX(16, bd + 6);
    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
    round_shift_8x8(out, out_shift);
    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
  }
}
   1076 
// 8-point inverse ADST over an 8x8 block held as 16 int32x4 vectors.
// The block is processed in two independent, structurally identical passes:
// the first consumes/produces the even-indexed vectors (in[0], in[2], ...),
// the second the odd-indexed ones. Each butterfly product is rounded by
// `rnding` and arithmetic-shifted right by `bit` (vshlq with a negative
// count). In the row pass (!do_cols) the sign-alternation of the final
// stage is folded into neg_shift_neon together with the inter-pass shift.
static void iadst8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
                          int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const int32x4_t kZero = vdupq_n_s32(0);
  // Intermediate dynamic range: wider (bd + 8) for the row pass.
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
  int32x4_t u[8], v[8], x;
  const int32x4_t v_bit = vdupq_n_s32(-bit);
  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
  // stage 0-1-2
  // (1) Rotations by the (4, 60), (20, 44), (36, 28), (52, 12) cospi pairs.
  u[0] = vmlaq_n_s32(rnding, in[14], cospi[4]);
  u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]);
  u[0] = vshlq_s32(u[0], v_bit);

  u[1] = vmlaq_n_s32(rnding, in[14], cospi[60]);
  u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]);
  u[1] = vshlq_s32(u[1], v_bit);

  // (2)
  u[2] = vmlaq_n_s32(rnding, in[10], cospi[20]);
  u[2] = vmlaq_n_s32(u[2], in[4], cospi[44]);
  u[2] = vshlq_s32(u[2], v_bit);

  u[3] = vmlaq_n_s32(rnding, in[10], cospi[44]);
  u[3] = vmlsq_n_s32(u[3], in[4], cospi[20]);
  u[3] = vshlq_s32(u[3], v_bit);

  // (3)
  u[4] = vmlaq_n_s32(rnding, in[6], cospi[36]);
  u[4] = vmlaq_n_s32(u[4], in[8], cospi[28]);
  u[4] = vshlq_s32(u[4], v_bit);

  u[5] = vmlaq_n_s32(rnding, in[6], cospi[28]);
  u[5] = vmlsq_n_s32(u[5], in[8], cospi[36]);
  u[5] = vshlq_s32(u[5], v_bit);

  // (4)
  u[6] = vmlaq_n_s32(rnding, in[2], cospi[52]);
  u[6] = vmlaq_n_s32(u[6], in[12], cospi[12]);
  u[6] = vshlq_s32(u[6], v_bit);

  u[7] = vmlaq_n_s32(rnding, in[2], cospi[12]);
  u[7] = vmlsq_n_s32(u[7], in[12], cospi[52]);
  u[7] = vshlq_s32(u[7], v_bit);

  // stage 3: clamped add/subtract butterflies.
  addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4: lower half passes through; upper half rotated by (16, 48).
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
  u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
  u[4] = vshlq_s32(u[4], v_bit);

  u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
  u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
  u[5] = vshlq_s32(u[5], v_bit);

  u[6] = vmlaq_n_s32(rnding, v[7], cospi[16]);
  u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]);
  u[6] = vshlq_s32(u[6], v_bit);

  u[7] = vmlaq_n_s32(rnding, v[7], cospi[48]);
  u[7] = vmlaq_n_s32(u[7], v[6], cospi[16]);
  u[7] = vshlq_s32(u[7], v_bit);

  // stage 5
  addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6: pairs (2,3) and (6,7) rotated by cospi[32] (scale 1/sqrt(2)).
  // v[0] is reused here as a scratch accumulator seeded with the rounding.
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
  x = vmulq_n_s32(v[3], cospi[32]);
  u[2] = vaddq_s32(v[0], x);
  u[2] = vshlq_s32(u[2], v_bit);

  u[3] = vsubq_s32(v[0], x);
  u[3] = vshlq_s32(u[3], v_bit);

  v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
  x = vmulq_n_s32(v[7], cospi[32]);
  u[6] = vaddq_s32(v[0], x);
  u[6] = vshlq_s32(u[6], v_bit);

  u[7] = vsubq_s32(v[0], x);
  u[7] = vshlq_s32(u[7], v_bit);

  // stage 7: write results with alternating sign into the even outputs.
  if (do_cols) {
    out[0] = u[0];
    out[2] = vsubq_s32(kZero, u[4]);
    out[4] = u[6];
    out[6] = vsubq_s32(kZero, u[2]);
    out[8] = u[3];
    out[10] = vsubq_s32(kZero, u[7]);
    out[12] = u[5];
    out[14] = vsubq_s32(kZero, u[1]);
  } else {
    // Row pass: fold negation, rounding shift and clamp into one step.
    const int log_range_out = AOMMAX(16, bd + 6);
    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
    neg_shift_neon(&u[0], &u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
                   &v_shift, &offset);
    neg_shift_neon(&u[6], &u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
                   &v_shift, &offset);
    neg_shift_neon(&u[3], &u[7], out + 8, out + 10, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
    neg_shift_neon(&u[5], &u[1], out + 12, out + 14, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
  }

  // Odd 8 points: 1, 3, ..., 15
  // Second pass, identical structure applied to the odd-indexed vectors.
  // stage 0
  // stage 1
  // stage 2
  // (1)
  u[0] = vmlaq_n_s32(rnding, in[15], cospi[4]);
  u[0] = vmlaq_n_s32(u[0], in[1], cospi[60]);
  u[0] = vshlq_s32(u[0], v_bit);

  u[1] = vmlaq_n_s32(rnding, in[15], cospi[60]);
  u[1] = vmlsq_n_s32(u[1], in[1], cospi[4]);
  u[1] = vshlq_s32(u[1], v_bit);

  // (2)
  u[2] = vmlaq_n_s32(rnding, in[11], cospi[20]);
  u[2] = vmlaq_n_s32(u[2], in[5], cospi[44]);
  u[2] = vshlq_s32(u[2], v_bit);

  u[3] = vmlaq_n_s32(rnding, in[11], cospi[44]);
  u[3] = vmlsq_n_s32(u[3], in[5], cospi[20]);
  u[3] = vshlq_s32(u[3], v_bit);

  // (3)
  u[4] = vmlaq_n_s32(rnding, in[7], cospi[36]);
  u[4] = vmlaq_n_s32(u[4], in[9], cospi[28]);
  u[4] = vshlq_s32(u[4], v_bit);

  u[5] = vmlaq_n_s32(rnding, in[7], cospi[28]);
  u[5] = vmlsq_n_s32(u[5], in[9], cospi[36]);
  u[5] = vshlq_s32(u[5], v_bit);

  // (4)
  u[6] = vmlaq_n_s32(rnding, in[3], cospi[52]);
  u[6] = vmlaq_n_s32(u[6], in[13], cospi[12]);
  u[6] = vshlq_s32(u[6], v_bit);

  u[7] = vmlaq_n_s32(rnding, in[3], cospi[12]);
  u[7] = vmlsq_n_s32(u[7], in[13], cospi[52]);
  u[7] = vshlq_s32(u[7], v_bit);

  // stage 3
  addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
  u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
  u[4] = vshlq_s32(u[4], v_bit);

  u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
  u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
  u[5] = vshlq_s32(u[5], v_bit);

  u[6] = vmlaq_n_s32(rnding, v[7], cospi[16]);
  u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]);
  u[6] = vshlq_s32(u[6], v_bit);

  u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]);
  u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]);
  u[7] = vshlq_s32(u[7], v_bit);

  // stage 5
  addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
  x = vmulq_n_s32(v[3], cospi[32]);
  u[2] = vaddq_s32(v[0], x);
  u[2] = vshlq_s32(u[2], v_bit);

  u[3] = vsubq_s32(v[0], x);
  u[3] = vshlq_s32(u[3], v_bit);

  v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
  x = vmulq_n_s32(v[7], cospi[32]);
  u[6] = vaddq_s32(v[0], x);
  u[6] = vshlq_s32(u[6], v_bit);

  u[7] = vsubq_s32(v[0], x);
  u[7] = vshlq_s32(u[7], v_bit);

  // stage 7: same alternating-sign writeback, into the odd outputs.
  if (do_cols) {
    out[1] = u[0];
    out[3] = vsubq_s32(kZero, u[4]);
    out[5] = u[6];
    out[7] = vsubq_s32(kZero, u[2]);
    out[9] = u[3];
    out[11] = vsubq_s32(kZero, u[7]);
    out[13] = u[5];
    out[15] = vsubq_s32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
    neg_shift_neon(&u[0], &u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
                   &v_shift, &offset);
    neg_shift_neon(&u[6], &u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
                   &v_shift, &offset);
    neg_shift_neon(&u[3], &u[7], out + 9, out + 11, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
    neg_shift_neon(&u[5], &u[1], out + 13, out + 15, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
  }
}
   1328 
   1329 static void iidentity8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
   1330                            int bd, int out_shift) {
   1331  (void)bit;
   1332  out[0] = vaddq_s32(in[0], in[0]);
   1333  out[1] = vaddq_s32(in[1], in[1]);
   1334  out[2] = vaddq_s32(in[2], in[2]);
   1335  out[3] = vaddq_s32(in[3], in[3]);
   1336  out[4] = vaddq_s32(in[4], in[4]);
   1337  out[5] = vaddq_s32(in[5], in[5]);
   1338  out[6] = vaddq_s32(in[6], in[6]);
   1339  out[7] = vaddq_s32(in[7], in[7]);
   1340 
   1341  if (!do_cols) {
   1342    const int log_range = AOMMAX(16, bd + 6);
   1343    const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
   1344    const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
   1345    round_shift_4x4(out, out_shift);
   1346    round_shift_4x4(out + 4, out_shift);
   1347    highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 8);
   1348  }
   1349 }
   1350 
   1351 static uint16x8_t get_recon_8x8(const uint16x8_t pred, int32x4_t res_lo,
   1352                                int32x4_t res_hi, int fliplr, int bd) {
   1353  uint16x8x2_t x;
   1354 
   1355  if (fliplr) {
   1356    res_lo = vrev64q_s32(res_lo);
   1357    res_lo = vextq_s32(res_lo, res_lo, 2);
   1358    res_hi = vrev64q_s32(res_hi);
   1359    res_hi = vextq_s32(res_hi, res_hi, 2);
   1360    x.val[0] = vreinterpretq_u16_s32(
   1361        vaddw_s16(res_hi, vreinterpret_s16_u16(vget_low_u16(pred))));
   1362    x.val[1] = vreinterpretq_u16_s32(
   1363        vaddw_s16(res_lo, vreinterpret_s16_u16(vget_high_u16(pred))));
   1364 
   1365  } else {
   1366    x.val[0] = vreinterpretq_u16_s32(
   1367        vaddw_s16(res_lo, vreinterpret_s16_u16(vget_low_u16(pred))));
   1368    x.val[1] = vreinterpretq_u16_s32(
   1369        vaddw_s16(res_hi, vreinterpret_s16_u16(vget_high_u16(pred))));
   1370  }
   1371 
   1372  uint16x8_t x2 = vcombine_u16(vqmovn_u32(vreinterpretq_u32_u16(x.val[0])),
   1373                               vqmovn_u32(vreinterpretq_u32_u16(x.val[1])));
   1374  const uint16x8_t vmin = vdupq_n_u16(0);
   1375  const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1);
   1376  return highbd_clamp_u16(&x2, &vmin, &vmax);
   1377 }
   1378 
   1379 static void write_buffer_8x8(int32x4_t *in, uint16_t *output, int stride,
   1380                             int fliplr, int flipud, int shift, int bd) {
   1381  uint16x8_t u0, u1, u2, u3, u4, u5, u6, u7;
   1382  uint16x8_t v0, v1, v2, v3, v4, v5, v6, v7;
   1383  round_shift_8x8(in, shift);
   1384 
   1385  v0 = vld1q_u16(output + 0 * stride);
   1386  v1 = vld1q_u16(output + 1 * stride);
   1387  v2 = vld1q_u16(output + 2 * stride);
   1388  v3 = vld1q_u16(output + 3 * stride);
   1389  v4 = vld1q_u16(output + 4 * stride);
   1390  v5 = vld1q_u16(output + 5 * stride);
   1391  v6 = vld1q_u16(output + 6 * stride);
   1392  v7 = vld1q_u16(output + 7 * stride);
   1393 
   1394  if (flipud) {
   1395    u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
   1396    u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
   1397    u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
   1398    u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
   1399    u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
   1400    u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
   1401    u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
   1402    u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
   1403  } else {
   1404    u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
   1405    u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
   1406    u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
   1407    u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
   1408    u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
   1409    u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
   1410    u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
   1411    u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
   1412  }
   1413 
   1414  vst1q_u16(output + 0 * stride, u0);
   1415  vst1q_u16(output + 1 * stride, u1);
   1416  vst1q_u16(output + 2 * stride, u2);
   1417  vst1q_u16(output + 3 * stride, u3);
   1418  vst1q_u16(output + 4 * stride, u4);
   1419  vst1q_u16(output + 5 * stride, u5);
   1420  vst1q_u16(output + 6 * stride, u6);
   1421  vst1q_u16(output + 7 * stride, u7);
   1422 }
   1423 
   1424 void av1_inv_txfm2d_add_8x8_neon(const int32_t *input, uint16_t *output,
   1425                                 int stride, TX_TYPE tx_type, int bd) {
   1426  int32x4_t in[16], out[16];
   1427  const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8];
   1428 
   1429  switch (tx_type) {
   1430    case DCT_DCT:
   1431      load_buffer_8x8(input, in);
   1432      idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
   1433      transpose_8x8(out, in);
   1434      idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
   1435      write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
   1436      break;
   1437    case DCT_ADST:
   1438      load_buffer_8x8(input, in);
   1439      iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
   1440      transpose_8x8(out, in);
   1441      idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
   1442      write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
   1443      break;
   1444    case ADST_DCT:
   1445      load_buffer_8x8(input, in);
   1446      idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
   1447      transpose_8x8(out, in);
   1448      iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
   1449      write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
   1450      break;
   1451    case ADST_ADST:
   1452      load_buffer_8x8(input, in);
   1453      iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
   1454      transpose_8x8(out, in);
   1455      iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
   1456      write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
   1457      break;
   1458    case FLIPADST_DCT:
   1459      load_buffer_8x8(input, in);
   1460      idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
   1461      transpose_8x8(out, in);
   1462      iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
   1463      write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd);
   1464      break;
   1465    case DCT_FLIPADST:
   1466      load_buffer_8x8(input, in);
   1467      iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
   1468      transpose_8x8(out, in);
   1469      idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
   1470      write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd);
   1471      break;
   1472    case ADST_FLIPADST:
   1473      load_buffer_8x8(input, in);
   1474      iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
   1475      transpose_8x8(out, in);
   1476      iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
   1477      write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd);
   1478      break;
   1479    case FLIPADST_FLIPADST:
   1480      load_buffer_8x8(input, in);
   1481      iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
   1482      transpose_8x8(out, in);
   1483      iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
   1484      write_buffer_8x8(out, output, stride, 1, 1, -shift[1], bd);
   1485      break;
   1486    case FLIPADST_ADST:
   1487      load_buffer_8x8(input, in);
   1488      iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
   1489      transpose_8x8(out, in);
   1490      iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
   1491      write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd);
   1492      break;
   1493    default: assert(0);
   1494  }
   1495 }
   1496 
   1497 static void idct8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
   1498                              int do_cols, int bd, int out_shift) {
   1499  const int32_t *cospi = cospi_arr(bit);
   1500  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
   1501  int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
   1502  int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
   1503  int32x4_t x;
   1504  const int32x4_t v_bit = vdupq_n_s32(-bit);
   1505  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
   1506  // stage 0-1-2-3
   1507  x = vmulq_n_s32(in[0], cospi[32]);
   1508  x = vaddq_s32(vshlq_s32(x, v_bit), rnding);
   1509 
   1510  // stage 4-5
   1511  if (!do_cols) {
   1512    const int log_range_out = AOMMAX(16, bd + 6);
   1513    clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
   1514    clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
   1515 
   1516    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
   1517    x = vaddq_s32(x, offset);
   1518    x = vshlq_s32(x, vdupq_n_s32(-out_shift));
   1519  }
   1520 
   1521  x = vmaxq_s32(x, clamp_lo);
   1522  x = vminq_s32(x, clamp_hi);
   1523  out[0] = x;
   1524  out[1] = x;
   1525  out[2] = x;
   1526  out[3] = x;
   1527  out[4] = x;
   1528  out[5] = x;
   1529  out[6] = x;
   1530  out[7] = x;
   1531 }
   1532 
// Full 8-point inverse DCT butterfly over four columns at a time (each
// int32x4_t holds one value from four columns). `bit` is the cosine table
// precision, `do_cols` selects column-pass clamp ranges, and for the row
// pass (!do_cols) the results are round-shifted by `out_shift` and clamped
// to the 16-bit intermediate range.
static void idct8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit,
                            int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  // Intermediate clamp range: bd+6 bits for the column pass, bd+8 for rows.
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
  int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
  int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
  int32x4_t x, y;
  const int32x4_t v_bit = vdupq_n_s32(-bit);  // negative => right shift
  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));  // pre-shift rounding

  // stage 0
  // stage 1
  // stage 2
  // Even coefficients pass through; odd ones get the stage-2 rotations.
  u0 = in[0];
  u1 = in[4];
  u2 = in[2];
  u3 = in[6];

  // u4 = in[1]*cospi56 - in[7]*cospi8 (rounded, then >> bit).
  x = vmlaq_n_s32(rnding, in[1], cospi[56]);
  u4 = vmlaq_n_s32(x, in[7], -cospi[8]);
  u4 = vshlq_s32(u4, v_bit);

  // u7 = in[1]*cospi8 + in[7]*cospi56.
  x = vmlaq_n_s32(rnding, in[1], cospi[8]);
  u7 = vmlaq_n_s32(x, in[7], cospi[56]);
  u7 = vshlq_s32(u7, v_bit);

  // u5 = in[5]*cospi24 - in[3]*cospi40.
  x = vmlaq_n_s32(rnding, in[5], cospi[24]);
  u5 = vmlaq_n_s32(x, in[3], -cospi[40]);
  u5 = vshlq_s32(u5, v_bit);

  // u6 = in[5]*cospi40 + in[3]*cospi24.
  x = vmlaq_n_s32(rnding, in[5], cospi[40]);
  u6 = vmlaq_n_s32(x, in[3], cospi[24]);
  u6 = vshlq_s32(u6, v_bit);

  // stage 3
  // v0/v1 = (u0 +/- u1) * cospi32; x already carries the rounding term.
  x = vmlaq_n_s32(rnding, u0, cospi[32]);
  y = vmulq_n_s32(u1, cospi[32]);
  v0 = vaddq_s32(x, y);
  v0 = vshlq_s32(v0, v_bit);

  v1 = vsubq_s32(x, y);
  v1 = vshlq_s32(v1, v_bit);

  // v2 = u2*cospi48 - u3*cospi16.
  x = vmlaq_n_s32(rnding, u2, cospi[48]);
  v2 = vmlaq_n_s32(x, u3, -cospi[16]);
  v2 = vshlq_s32(v2, v_bit);

  // v3 = u2*cospi16 + u3*cospi48.
  x = vmlaq_n_s32(rnding, u2, cospi[16]);
  v3 = vmlaq_n_s32(x, u3, cospi[48]);
  v3 = vshlq_s32(v3, v_bit);

  addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
  addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

  // stage 4
  addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
  addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
  u4 = v4;
  u7 = v7;

  // u6/u5 = (v6 +/- v5) * cospi32.
  x = vmulq_n_s32(v5, cospi[32]);
  y = vmlaq_n_s32(rnding, v6, cospi[32]);
  u6 = vaddq_s32(y, x);
  u6 = vshlq_s32(u6, v_bit);

  u5 = vsubq_s32(y, x);
  u5 = vshlq_s32(u5, v_bit);

  // stage 5: final butterflies straight into the output buffer.
  addsub_neon(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
  addsub_neon(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
  addsub_neon(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
  addsub_neon(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);

  if (!do_cols) {
    // Row pass: round-shift down to the inter-pass range and re-clamp.
    const int log_range_out = AOMMAX(16, bd + 6);
    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
    round_shift_4x4(out, out_shift);
    round_shift_4x4(out + 4, out_shift);
    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 8);
  }
}
   1618 
   1619 static void iadst8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
   1620                               int do_cols, int bd, int out_shift) {
   1621  const int32_t *cospi = cospi_arr(bit);
   1622  int32x4_t u[8], x;
   1623  const int32x4_t v_bit = vdupq_n_s32(-bit);
   1624  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
   1625  // stage 0-2
   1626 
   1627  u[0] = vmlaq_n_s32(rnding, in[0], cospi[60]);
   1628  u[0] = vshlq_s32(u[0], v_bit);
   1629 
   1630  u[1] = vmlaq_n_s32(rnding, in[0], cospi[4]);
   1631  u[1] = vshlq_s32(vnegq_s32(u[1]), v_bit);
   1632 
   1633  // stage 3-4
   1634  int32x4_t temp1, temp2;
   1635  temp1 = vmlaq_n_s32(rnding, u[0], cospi[16]);
   1636  temp1 = vmlaq_n_s32(temp1, u[1], cospi[48]);
   1637  temp1 = vshlq_s32(temp1, v_bit);
   1638  u[4] = temp1;
   1639 
   1640  temp2 = vmlaq_n_s32(rnding, u[0], cospi[48]);
   1641  u[5] = vmlsq_n_s32(temp2, u[1], cospi[16]);
   1642  u[5] = vshlq_s32(u[5], v_bit);
   1643 
   1644  // stage 5-6
   1645  temp1 = vmlaq_n_s32(rnding, u[0], cospi[32]);
   1646  x = vmulq_n_s32(u[1], cospi[32]);
   1647  u[2] = vaddq_s32(temp1, x);
   1648  u[2] = vshlq_s32(u[2], v_bit);
   1649 
   1650  u[3] = vsubq_s32(temp1, x);
   1651  u[3] = vshlq_s32(u[3], v_bit);
   1652 
   1653  temp1 = vmlaq_n_s32(rnding, u[4], cospi[32]);
   1654  x = vmulq_n_s32(u[5], cospi[32]);
   1655  u[6] = vaddq_s32(temp1, x);
   1656  u[6] = vshlq_s32(u[6], v_bit);
   1657 
   1658  u[7] = vsubq_s32(temp1, x);
   1659  u[7] = vshlq_s32(u[7], v_bit);
   1660 
   1661  // stage 7
   1662  if (do_cols) {
   1663    out[0] = u[0];
   1664    out[1] = vnegq_s32(u[4]);
   1665    out[2] = u[6];
   1666    out[3] = vnegq_s32(u[2]);
   1667    out[4] = u[3];
   1668    out[5] = vnegq_s32(u[7]);
   1669    out[6] = u[5];
   1670    out[7] = vnegq_s32(u[1]);
   1671  } else {
   1672    const int log_range_out = AOMMAX(16, bd + 6);
   1673    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
   1674    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
   1675    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
   1676    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
   1677    neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
   1678                   &v_shift, &offset);
   1679    neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
   1680                   &v_shift, &offset);
   1681    neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
   1682                   &v_shift, &offset);
   1683    neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
   1684                   &v_shift, &offset);
   1685  }
   1686 }
   1687 
// Full 8-point inverse ADST over four columns at a time. Computes the
// stage-2 rotations of all input pairs, then three rounds of butterflies
// and cospi16/48 and cospi32 rotations, and finally writes the outputs in
// the ADST permutation with alternating sign flips.
static void iadst8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit,
                             int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  // Intermediate clamp range: bd+6 bits for the column pass, bd+8 for rows.
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
  int32x4_t u[8], v[8], x;
  const int32x4_t v_bit = vdupq_n_s32(-bit);  // negative => right shift
  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));  // pre-shift rounding
  // stage 0-2
  // (1) rotate (in[7], in[0]) by (cospi4, cospi60).
  u[0] = vmlaq_n_s32(rnding, in[7], cospi[4]);
  u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]);
  u[0] = vshlq_s32(u[0], v_bit);

  u[1] = vmlaq_n_s32(rnding, in[7], cospi[60]);
  u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]);
  u[1] = vshlq_s32(u[1], v_bit);

  // (2) rotate (in[5], in[2]) by (cospi20, cospi44).
  u[2] = vmlaq_n_s32(rnding, in[5], cospi[20]);
  u[2] = vmlaq_n_s32(u[2], in[2], cospi[44]);
  u[2] = vshlq_s32(u[2], v_bit);

  u[3] = vmlaq_n_s32(rnding, in[5], cospi[44]);
  u[3] = vmlsq_n_s32(u[3], in[2], cospi[20]);
  u[3] = vshlq_s32(u[3], v_bit);

  // (3) rotate (in[3], in[4]) by (cospi36, cospi28).
  u[4] = vmlaq_n_s32(rnding, in[3], cospi[36]);
  u[4] = vmlaq_n_s32(u[4], in[4], cospi[28]);
  u[4] = vshlq_s32(u[4], v_bit);

  u[5] = vmlaq_n_s32(rnding, in[3], cospi[28]);
  u[5] = vmlsq_n_s32(u[5], in[4], cospi[36]);
  u[5] = vshlq_s32(u[5], v_bit);

  // (4) rotate (in[1], in[6]) by (cospi52, cospi12).
  u[6] = vmulq_n_s32(in[1], cospi[52]);
  u[6] = vmlaq_n_s32(u[6], in[6], cospi[12]);
  u[6] = vaddq_s32(u[6], rnding);
  u[6] = vshlq_s32(u[6], v_bit);

  u[7] = vmulq_n_s32(in[1], cospi[12]);
  u[7] = vmlsq_n_s32(u[7], in[6], cospi[52]);
  u[7] = vaddq_s32(u[7], rnding);
  u[7] = vshlq_s32(u[7], v_bit);

  // stage 3: clamped add/sub butterflies.
  addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4: upper half passes through; lower half rotated by cospi16/48.
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
  u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
  u[4] = vshlq_s32(u[4], v_bit);

  u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
  u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
  u[5] = vshlq_s32(u[5], v_bit);

  // u[6] = -v[6]*cospi48 + v[7]*cospi16 (negated-coefficient rotation).
  u[6] = vmlsq_n_s32(rnding, v[6], cospi[48]);
  u[6] = vmlaq_n_s32(u[6], v[7], cospi[16]);
  u[6] = vshlq_s32(u[6], v_bit);

  u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]);
  u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]);
  u[7] = vshlq_s32(u[7], v_bit);

  // stage 5
  addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6: cospi32 butterflies; v[0] is reused as a scratch register.
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
  x = vmulq_n_s32(v[3], cospi[32]);
  u[2] = vaddq_s32(v[0], x);
  u[2] = vshlq_s32(u[2], v_bit);

  u[3] = vsubq_s32(v[0], x);
  u[3] = vshlq_s32(u[3], v_bit);

  v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
  x = vmulq_n_s32(v[7], cospi[32]);
  u[6] = vaddq_s32(v[0], x);
  u[6] = vshlq_s32(u[6], v_bit);

  u[7] = vsubq_s32(v[0], x);
  u[7] = vshlq_s32(u[7], v_bit);

  // stage 7: ADST output permutation; for the row pass the sign flip is
  // fused with the rounding shift in neg_shift_neon.
  if (do_cols) {
    out[0] = u[0];
    out[1] = vnegq_s32(u[4]);
    out[2] = u[6];
    out[3] = vnegq_s32(u[2]);
    out[4] = u[3];
    out[5] = vnegq_s32(u[7]);
    out[6] = u[5];
    out[7] = vnegq_s32(u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
    neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                   &v_shift, &offset);
    neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
                   &v_shift, &offset);
    neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
                   &v_shift, &offset);
    neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
                   &v_shift, &offset);
  }
}
   1819 
   1820 static void idct16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
   1821                                int do_cols, int bd, int out_shift) {
   1822  const int32_t *cospi = cospi_arr(bit);
   1823  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
   1824  int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
   1825  int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
   1826  const int32x4_t v_bit = vdupq_n_s32(-bit);
   1827  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
   1828  // stage 0-4
   1829  in[0] = vmlaq_n_s32(rnding, in[0], cospi[32]);
   1830  in[0] = vshlq_s32(in[0], v_bit);
   1831 
   1832  // stage 5-7
   1833  if (!do_cols) {
   1834    log_range = AOMMAX(16, bd + 6);
   1835    clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
   1836    clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
   1837    if (out_shift != 0) {
   1838      int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
   1839      in[0] = vaddq_s32(in[0], offset);
   1840      in[0] = vshlq_s32(in[0], vdupq_n_s32(-out_shift));
   1841    }
   1842  }
   1843 
   1844  in[0] = vmaxq_s32(in[0], clamp_lo);
   1845  in[0] = vminq_s32(in[0], clamp_hi);
   1846  out[0] = in[0];
   1847  out[1] = in[0];
   1848  out[2] = in[0];
   1849  out[3] = in[0];
   1850  out[4] = in[0];
   1851  out[5] = in[0];
   1852  out[6] = in[0];
   1853  out[7] = in[0];
   1854  out[8] = in[0];
   1855  out[9] = in[0];
   1856  out[10] = in[0];
   1857  out[11] = in[0];
   1858  out[12] = in[0];
   1859  out[13] = in[0];
   1860  out[14] = in[0];
   1861  out[15] = in[0];
   1862 }
   1863 
// 16-point inverse DCT for blocks whose non-zero coefficients fit in the
// top-left 8x8 quadrant: only in[0..7] are read, so the stage-2/3 rotations
// degenerate to single-input half-butterflies (half_btf_0_*).
static void idct16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
                               int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  // Intermediate clamp range: bd+6 bits for the column pass, bd+8 for rows.
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
  const int32x4_t v_bit = vdupq_n_s32(-bit);  // negative => right shift
  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));  // pre-shift rounding
  int32x4_t u[16], x, y;
  // stage 0-1: bit-reversed reordering of the eight live inputs.
  u[0] = in[0];
  u[2] = in[4];
  u[4] = in[2];
  u[6] = in[6];
  u[8] = in[1];
  u[10] = in[5];
  u[12] = in[3];
  u[14] = in[7];

  // stage 2: single-input rotations (the paired coefficient is zero).
  u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
  u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);

  u[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding);
  u[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding);

  u[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding);
  u[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding);

  u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
  u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);

  // stage 3
  u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding);
  u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding);
  u[5] = half_btf_0_m_neon_r(&cospi[40], &u[6], &v_bit, &rnding);
  u[6] = half_btf_0_neon_r(&cospi[24], &u[6], &v_bit, &rnding);

  addsub_neon(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
  addsub_neon(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
  addsub_neon(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
  addsub_neon(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);

  // stage 4
  // DC path: u[0]*cospi32 feeds both u[0] and u[1] (u[1] input is zero).
  x = vmlaq_n_s32(rnding, u[0], cospi[32]);
  u[0] = vshlq_s32(x, v_bit);
  u[1] = u[0];

  u[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding);
  u[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding);

  addsub_neon(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
  addsub_neon(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);

  // Two-input cospi16/48 rotations on the odd half.
  x = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
                             &rnding);
  u[14] =
      half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
  u[9] = x;
  y = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13], &v_bit,
                             &rnding);
  u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], &v_bit,
                                 &rnding);
  u[10] = y;

  // stage 5
  addsub_neon(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
  addsub_neon(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);

  // u[5]/u[6] = (u[6] -/+ u[5]) * cospi32.
  x = vmulq_n_s32(u[5], cospi[32]);
  y = vmlaq_n_s32(rnding, u[6], cospi[32]);
  u[5] = vsubq_s32(y, x);
  u[5] = vshlq_s32(u[5], v_bit);

  u[6] = vaddq_s32(y, x);
  u[6] = vshlq_s32(u[6], v_bit);

  addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
  addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
  addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
  addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

  // stage 6
  addsub_neon(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
  addsub_neon(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
  addsub_neon(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
  addsub_neon(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);

  // u[10]/u[13] = (u[13] -/+ u[10]) * cospi32.
  x = vmulq_n_s32(u[10], cospi[32]);
  y = vmlaq_n_s32(rnding, u[13], cospi[32]);
  u[10] = vsubq_s32(y, x);
  u[10] = vshlq_s32(u[10], v_bit);

  u[13] = vaddq_s32(x, y);
  u[13] = vshlq_s32(u[13], v_bit);

  // u[11]/u[12] = (u[12] -/+ u[11]) * cospi32.
  x = vmulq_n_s32(u[11], cospi[32]);
  y = vmlaq_n_s32(rnding, u[12], cospi[32]);
  u[11] = vsubq_s32(y, x);
  u[11] = vshlq_s32(u[11], v_bit);

  u[12] = vaddq_s32(x, y);
  u[12] = vshlq_s32(u[12], v_bit);
  // stage 7: final butterflies straight into the output buffer.
  addsub_neon(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
  addsub_neon(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
  addsub_neon(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
  addsub_neon(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
  addsub_neon(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
  addsub_neon(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
  addsub_neon(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
  addsub_neon(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);

  if (!do_cols) {
    // Row pass: round-shift down to the inter-pass range and re-clamp.
    const int log_range_out = AOMMAX(16, bd + 6);
    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
    round_shift_8x8(out, out_shift);
    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
  }
}
   1985 
// 16x16 inverse ADST fast path for blocks where only in[0] is non-zero:
// only the v[0]/v[1] stage-2 products exist, and each later stage copies or
// rotates them into the remaining lanes.
static void iadst16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
                                int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  int32x4_t v[16], x, y, temp1, temp2;
  const int32x4_t v_bit = vdupq_n_s32(-bit);  // negative => right shift
  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));  // pre-shift rounding
  // stage 0
  // stage 1
  // stage 2: v[0] = in[0]*cospi62, v[1] = -in[0]*cospi2 (both rounded).
  v[0] = vmlaq_n_s32(rnding, in[0], cospi[62]);
  v[0] = vshlq_s32(v[0], v_bit);

  v[1] = vmlsq_n_s32(rnding, in[0], cospi[2]);
  v[1] = vshlq_s32(v[1], v_bit);

  // stage 3: butterflies with zero partners reduce to copies.
  v[8] = v[0];
  v[9] = v[1];

  // stage 4: rotate (v[8], v[9]) by (cospi8, cospi56).
  temp1 = vmlaq_n_s32(rnding, v[8], cospi[8]);
  temp1 = vmlaq_n_s32(temp1, v[9], cospi[56]);
  temp1 = vshlq_s32(temp1, v_bit);

  temp2 = vmlaq_n_s32(rnding, v[8], cospi[56]);
  temp2 = vmlsq_n_s32(temp2, v[9], cospi[8]);
  temp2 = vshlq_s32(temp2, v_bit);
  v[8] = temp1;
  v[9] = temp2;

  // stage 5: more zero-partner copies.
  v[4] = v[0];
  v[5] = v[1];
  v[12] = v[8];
  v[13] = v[9];

  // stage 6: rotate (v[4], v[5]) and (v[12], v[13]) by (cospi16, cospi48).
  temp1 = vmlaq_n_s32(rnding, v[4], cospi[16]);
  temp1 = vmlaq_n_s32(temp1, v[5], cospi[48]);
  temp1 = vshlq_s32(temp1, v_bit);

  temp2 = vmlaq_n_s32(rnding, v[4], cospi[48]);
  temp2 = vmlsq_n_s32(temp2, v[5], cospi[16]);
  temp2 = vshlq_s32(temp2, v_bit);
  v[4] = temp1;
  v[5] = temp2;

  temp1 = vmlaq_n_s32(rnding, v[12], cospi[16]);
  temp1 = vmlaq_n_s32(temp1, v[13], cospi[48]);
  temp1 = vshlq_s32(temp1, v_bit);

  temp2 = vmlaq_n_s32(rnding, v[12], cospi[48]);
  temp2 = vmlsq_n_s32(temp2, v[13], cospi[16]);
  temp2 = vshlq_s32(temp2, v_bit);
  v[12] = temp1;
  v[13] = temp2;

  // stage 7: final zero-partner copies.
  v[2] = v[0];
  v[3] = v[1];
  v[6] = v[4];
  v[7] = v[5];
  v[10] = v[8];
  v[11] = v[9];
  v[14] = v[12];
  v[15] = v[13];

  // stage 8: cospi32 butterflies on each even/odd pair.
  y = vmlaq_n_s32(rnding, v[2], cospi[32]);
  x = vmulq_n_s32(v[3], cospi[32]);
  v[2] = vaddq_s32(y, x);
  v[2] = vshlq_s32(v[2], v_bit);

  v[3] = vsubq_s32(y, x);
  v[3] = vshlq_s32(v[3], v_bit);

  y = vmlaq_n_s32(rnding, v[6], cospi[32]);
  x = vmulq_n_s32(v[7], cospi[32]);
  v[6] = vaddq_s32(y, x);
  v[6] = vshlq_s32(v[6], v_bit);

  v[7] = vsubq_s32(y, x);
  v[7] = vshlq_s32(v[7], v_bit);

  y = vmlaq_n_s32(rnding, v[10], cospi[32]);
  x = vmulq_n_s32(v[11], cospi[32]);
  v[10] = vaddq_s32(y, x);
  v[10] = vshlq_s32(v[10], v_bit);

  v[11] = vsubq_s32(y, x);
  v[11] = vshlq_s32(v[11], v_bit);

  y = vmlaq_n_s32(rnding, v[14], cospi[32]);
  x = vmulq_n_s32(v[15], cospi[32]);
  v[14] = vaddq_s32(y, x);
  v[14] = vshlq_s32(v[14], v_bit);

  v[15] = vsubq_s32(y, x);
  v[15] = vshlq_s32(v[15], v_bit);

  // stage 9: ADST output permutation with alternating sign flips; for the
  // row pass the negation is fused with the rounding shift in neg_shift_neon.
  if (do_cols) {
    out[0] = v[0];
    out[1] = vnegq_s32(v[8]);
    out[2] = v[12];
    out[3] = vnegq_s32(v[4]);
    out[4] = v[6];
    out[5] = vnegq_s32(v[14]);
    out[6] = v[10];
    out[7] = vnegq_s32(v[2]);
    out[8] = v[3];
    out[9] = vnegq_s32(v[11]);
    out[10] = v[15];
    out[11] = vnegq_s32(v[7]);
    out[12] = v[5];
    out[13] = vnegq_s32(v[13]);
    out[14] = v[9];
    out[15] = vnegq_s32(v[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
    neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                   &v_shift, &offset);
    neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
    neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
    neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
    neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
    neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
    neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
    neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
  }
}
   2128 
   2129 static void iadst16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
   2130                                 int do_cols, int bd, int out_shift) {
   2131  const int32_t *cospi = cospi_arr(bit);
   2132  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
   2133  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
   2134  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
   2135  int32x4_t zero = vdupq_n_s32(0);
   2136  int32x4_t u[16], x, y;
   2137  const int32x4_t v_bit = vdupq_n_s32(-bit);
   2138  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
   2139  // stage 0-2
   2140  u[0] = vmlaq_n_s32(rnding, in[0], cospi[62]);
   2141  u[0] = vshlq_s32(u[0], v_bit);
   2142 
   2143  u[1] = vmlsq_n_s32(rnding, in[0], cospi[2]);
   2144  u[1] = vshlq_s32(u[1], v_bit);
   2145 
   2146  u[2] = vmlaq_n_s32(rnding, in[2], cospi[54]);
   2147  u[2] = vshlq_s32(u[2], v_bit);
   2148 
   2149  u[3] = vmlsq_n_s32(rnding, in[2], cospi[10]);
   2150  u[3] = vshlq_s32(u[3], v_bit);
   2151 
   2152  u[4] = vmlaq_n_s32(rnding, in[4], cospi[46]);
   2153  u[4] = vshlq_s32(u[4], v_bit);
   2154 
   2155  u[5] = vmlsq_n_s32(rnding, in[4], cospi[18]);
   2156  u[5] = vshlq_s32(u[5], v_bit);
   2157 
   2158  u[6] = vmlaq_n_s32(rnding, in[6], cospi[38]);
   2159  u[6] = vshlq_s32(u[6], v_bit);
   2160 
   2161  u[7] = vmlsq_n_s32(rnding, in[6], cospi[26]);
   2162  u[7] = vshlq_s32(u[7], v_bit);
   2163 
   2164  u[8] = vmlaq_n_s32(rnding, in[7], cospi[34]);
   2165  u[8] = vshlq_s32(u[8], v_bit);
   2166 
   2167  u[9] = vmlaq_n_s32(rnding, in[7], cospi[30]);
   2168  u[9] = vshlq_s32(u[9], v_bit);
   2169 
   2170  u[10] = vmlaq_n_s32(rnding, in[5], cospi[42]);
   2171  u[10] = vshlq_s32(u[10], v_bit);
   2172 
   2173  u[11] = vmlaq_n_s32(rnding, in[5], cospi[22]);
   2174  u[11] = vshlq_s32(u[11], v_bit);
   2175 
   2176  u[12] = vmlaq_n_s32(rnding, in[3], cospi[50]);
   2177  u[12] = vshlq_s32(u[12], v_bit);
   2178 
   2179  u[13] = vmlaq_n_s32(rnding, in[3], cospi[14]);
   2180  u[13] = vshlq_s32(u[13], v_bit);
   2181 
   2182  u[14] = vmlaq_n_s32(rnding, in[1], cospi[58]);
   2183  u[14] = vshlq_s32(u[14], v_bit);
   2184 
   2185  u[15] = vmlaq_n_s32(rnding, in[1], cospi[6]);
   2186  u[15] = vshlq_s32(u[15], v_bit);
   2187 
   2188  // stage 3
   2189  addsub_neon(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
   2190  addsub_neon(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
   2191  addsub_neon(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
   2192  addsub_neon(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
   2193  addsub_neon(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
   2194  addsub_neon(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
   2195  addsub_neon(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
   2196  addsub_neon(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
   2197 
   2198  // stage 4
   2199  y = vmlaq_n_s32(rnding, u[8], cospi[56]);
   2200  u[8] = vmlaq_n_s32(rnding, u[8], cospi[8]);
   2201  u[8] = vmlaq_n_s32(u[8], u[9], cospi[56]);
   2202  u[8] = vshlq_s32(u[8], v_bit);
   2203 
   2204  u[9] = vmlsq_n_s32(y, u[9], cospi[8]);
   2205  u[9] = vshlq_s32(u[9], v_bit);
   2206 
   2207  y = vmlaq_n_s32(rnding, u[10], cospi[24]);
   2208  u[10] = vmlaq_n_s32(rnding, u[10], cospi[40]);
   2209  u[10] = vmlaq_n_s32(u[10], u[11], cospi[24]);
   2210  u[10] = vshlq_s32(u[10], v_bit);
   2211 
   2212  u[11] = vmlsq_n_s32(y, u[11], cospi[40]);
   2213  u[11] = vshlq_s32(u[11], v_bit);
   2214 
   2215  y = vmlaq_n_s32(rnding, u[12], cospi[8]);
   2216  u[12] = vmlsq_n_s32(rnding, u[12], cospi[56]);
   2217  u[12] = vmlaq_n_s32(u[12], u[13], cospi[8]);
   2218  u[12] = vshlq_s32(u[12], v_bit);
   2219 
   2220  u[13] = vmlaq_n_s32(y, u[13], cospi[56]);
   2221  u[13] = vshlq_s32(u[13], v_bit);
   2222 
   2223  y = vmlaq_n_s32(rnding, u[14], cospi[40]);
   2224  u[14] = vmlsq_n_s32(rnding, u[14], cospi[24]);
   2225  u[14] = vmlaq_n_s32(u[14], u[15], cospi[40]);
   2226  u[14] = vshlq_s32(u[14], v_bit);
   2227 
   2228  u[15] = vmlaq_n_s32(y, u[15], cospi[24]);
   2229  u[15] = vshlq_s32(u[15], v_bit);
   2230 
   2231  // stage 5
   2232  addsub_neon(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
   2233  addsub_neon(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
   2234  addsub_neon(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
   2235  addsub_neon(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
   2236  addsub_neon(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
   2237  addsub_neon(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
   2238  addsub_neon(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
   2239  addsub_neon(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
   2240 
   2241  // stage 6
   2242  y = vmlaq_n_s32(rnding, u[4], cospi[48]);
   2243  u[4] = vmlaq_n_s32(rnding, u[4], cospi[16]);
   2244  u[4] = vmlaq_n_s32(u[4], u[5], cospi[48]);
   2245  u[4] = vshlq_s32(u[4], v_bit);
   2246 
   2247  u[5] = vmlsq_n_s32(y, u[5], cospi[16]);
   2248  u[5] = vshlq_s32(u[5], v_bit);
   2249 
   2250  y = vmlaq_n_s32(rnding, u[6], cospi[16]);
   2251  u[6] = vmlsq_n_s32(rnding, u[6], cospi[48]);
   2252  u[6] = vmlaq_n_s32(u[6], u[7], cospi[16]);
   2253  u[6] = vshlq_s32(u[6], v_bit);
   2254 
   2255  u[7] = vmlaq_n_s32(y, u[7], cospi[48]);
   2256  u[7] = vshlq_s32(u[7], v_bit);
   2257 
   2258  y = vmlaq_n_s32(rnding, u[12], cospi[48]);
   2259  u[12] = vmulq_n_s32(u[12], cospi[16]);
   2260  u[12] = vmlaq_n_s32(u[12], u[13], cospi[48]);
   2261  u[12] = vshlq_s32(u[12], v_bit);
   2262 
   2263  u[13] = vmlsq_n_s32(y, u[13], cospi[16]);
   2264  u[13] = vshlq_s32(u[13], v_bit);
   2265 
   2266  y = vmlaq_n_s32(rnding, u[14], cospi[16]);
   2267  u[14] = vmlsq_n_s32(rnding, u[14], cospi[48]);
   2268  u[14] = vmlaq_n_s32(u[14], u[15], cospi[16]);
   2269  u[14] = vshlq_s32(u[14], v_bit);
   2270 
   2271  u[15] = vmlaq_n_s32(y, u[15], cospi[48]);
   2272  u[15] = vshlq_s32(u[15], v_bit);
   2273 
   2274  // stage 7
   2275  addsub_neon(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
   2276  addsub_neon(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
   2277  addsub_neon(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
   2278  addsub_neon(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
   2279  addsub_neon(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
   2280  addsub_neon(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
   2281  addsub_neon(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
   2282  addsub_neon(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
   2283 
   2284  // stage 8
   2285  y = vmlaq_n_s32(rnding, u[2], cospi[32]);
   2286  x = vmulq_n_s32(u[3], cospi[32]);
   2287  u[2] = vaddq_s32(y, x);
   2288  u[2] = vshlq_s32(u[2], v_bit);
   2289 
   2290  u[3] = vsubq_s32(y, x);
   2291  u[3] = vshlq_s32(u[3], v_bit);
   2292  y = vmlaq_n_s32(rnding, u[6], cospi[32]);
   2293  x = vmulq_n_s32(u[7], cospi[32]);
   2294  u[6] = vaddq_s32(y, x);
   2295  u[6] = vshlq_s32(u[6], v_bit);
   2296 
   2297  u[7] = vsubq_s32(y, x);
   2298  u[7] = vshlq_s32(u[7], v_bit);
   2299 
   2300  y = vmlaq_n_s32(rnding, u[10], cospi[32]);
   2301  x = vmulq_n_s32(u[11], cospi[32]);
   2302  u[10] = vaddq_s32(y, x);
   2303  u[10] = vshlq_s32(u[10], v_bit);
   2304 
   2305  u[11] = vsubq_s32(y, x);
   2306  u[11] = vshlq_s32(u[11], v_bit);
   2307 
   2308  y = vmlaq_n_s32(rnding, u[14], cospi[32]);
   2309  x = vmulq_n_s32(u[15], cospi[32]);
   2310  u[14] = vaddq_s32(y, x);
   2311  u[14] = vshlq_s32(u[14], v_bit);
   2312 
   2313  u[15] = vsubq_s32(y, x);
   2314  u[15] = vshlq_s32(u[15], v_bit);
   2315 
   2316  // stage 9
   2317  if (do_cols) {
   2318    out[0] = u[0];
   2319    out[1] = vsubq_s32(zero, u[8]);
   2320    out[2] = u[12];
   2321    out[3] = vsubq_s32(zero, u[4]);
   2322    out[4] = u[6];
   2323    out[5] = vsubq_s32(zero, u[14]);
   2324    out[6] = u[10];
   2325    out[7] = vsubq_s32(zero, u[2]);
   2326    out[8] = u[3];
   2327    out[9] = vsubq_s32(zero, u[11]);
   2328    out[10] = u[15];
   2329    out[11] = vsubq_s32(zero, u[7]);
   2330    out[12] = u[5];
   2331    out[13] = vsubq_s32(zero, u[13]);
   2332    out[14] = u[9];
   2333    out[15] = vsubq_s32(zero, u[1]);
   2334  } else {
   2335    const int log_range_out = AOMMAX(16, bd + 6);
   2336    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
   2337    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
   2338    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
   2339    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
   2340    neg_shift_neon(&u[0], &u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
   2341                   &v_shift, &offset);
   2342    neg_shift_neon(&u[12], &u[4], out + 2, out + 3, &clamp_lo_out,
   2343                   &clamp_hi_out, &v_shift, &offset);
   2344    neg_shift_neon(&u[6], &u[14], out + 4, out + 5, &clamp_lo_out,
   2345                   &clamp_hi_out, &v_shift, &offset);
   2346    neg_shift_neon(&u[10], &u[2], out + 6, out + 7, &clamp_lo_out,
   2347                   &clamp_hi_out, &v_shift, &offset);
   2348    neg_shift_neon(&u[3], &u[11], out + 8, out + 9, &clamp_lo_out,
   2349                   &clamp_hi_out, &v_shift, &offset);
   2350    neg_shift_neon(&u[15], &u[7], out + 10, out + 11, &clamp_lo_out,
   2351                   &clamp_hi_out, &v_shift, &offset);
   2352    neg_shift_neon(&u[5], &u[13], out + 12, out + 13, &clamp_lo_out,
   2353                   &clamp_hi_out, &v_shift, &offset);
   2354    neg_shift_neon(&u[9], &u[1], out + 14, out + 15, &clamp_lo_out,
   2355                   &clamp_hi_out, &v_shift, &offset);
   2356  }
   2357 }
   2358 
// 16-point inverse DCT over four packed columns: each of the 16 entries of
// `in` is one transform row holding 4 column lanes.  The stage numbering
// below mirrors the scalar av1_idct16 flow graph.
//
// in        : 16 input coefficient vectors (reordered into butterfly order
//             in stage 0-1 below).
// out       : 16 output vectors, written in stage 7.
// bit       : cosine-table precision; selects cospi_arr(bit) and the
//             matching per-stage rounding (1 << (bit - 1)) and down-shift.
// do_cols   : nonzero for the column pass; the row pass (!do_cols) applies
//             the extra final round/clamp at the bottom.
// bd        : coded bit depth, used to size the intermediate clamp range.
// out_shift : final down-shift, applied only in the row pass.
static void idct16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
                           int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  // Intermediates are saturated to log_range bits, matching the range the
  // scalar reference implementation asserts per pass.
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
  int32x4_t u[16], v[16], x, y;
  // vshlq_s32 with a negative count performs the >> bit rounding shift.
  const int32x4_t v_bit = vdupq_n_s32(-bit);
  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));

  {
    // stage 0-1: permute inputs into butterfly (bit-reversed) order.
    u[0] = in[0];
    u[1] = in[8];
    u[2] = in[4];
    u[3] = in[12];
    u[4] = in[2];
    u[5] = in[10];
    u[6] = in[6];
    u[7] = in[14];
    u[8] = in[1];
    u[9] = in[9];
    u[10] = in[5];
    u[11] = in[13];
    u[12] = in[3];
    u[13] = in[11];
    u[14] = in[7];
    u[15] = in[15];

    // stage 2: rotate the odd half (u[8..15]); even half passes through.
    v[0] = u[0];
    v[1] = u[1];
    v[2] = u[2];
    v[3] = u[3];
    v[4] = u[4];
    v[5] = u[5];
    v[6] = u[6];
    v[7] = u[7];

    v[8] = half_btf_neon_mode01_r(&cospi[60], &u[8], &cospi[4], &u[15], &v_bit,
                                  &rnding);
    v[9] = half_btf_neon_mode01_r(&cospi[28], &u[9], &cospi[36], &u[14], &v_bit,
                                  &rnding);
    v[10] = half_btf_neon_mode01_r(&cospi[44], &u[10], &cospi[20], &u[13],
                                   &v_bit, &rnding);
    v[11] = half_btf_neon_mode01_r(&cospi[12], &u[11], &cospi[52], &u[12],
                                   &v_bit, &rnding);
    v[12] = half_btf_neon_r(&cospi[52], &u[11], &cospi[12], &u[12], &v_bit,
                            &rnding);
    v[13] = half_btf_neon_r(&cospi[20], &u[10], &cospi[44], &u[13], &v_bit,
                            &rnding);
    v[14] =
        half_btf_neon_r(&cospi[36], &u[9], &cospi[28], &u[14], &v_bit, &rnding);
    v[15] =
        half_btf_neon_r(&cospi[4], &u[8], &cospi[60], &u[15], &v_bit, &rnding);

    // stage 3: rotate u[4..7]; butterfly the odd half into add/sub pairs.
    u[0] = v[0];
    u[1] = v[1];
    u[2] = v[2];
    u[3] = v[3];
    u[4] = half_btf_neon_mode01_r(&cospi[56], &v[4], &cospi[8], &v[7], &v_bit,
                                  &rnding);
    u[5] = half_btf_neon_mode01_r(&cospi[24], &v[5], &cospi[40], &v[6], &v_bit,
                                  &rnding);
    u[6] =
        half_btf_neon_r(&cospi[40], &v[5], &cospi[24], &v[6], &v_bit, &rnding);
    u[7] =
        half_btf_neon_r(&cospi[8], &v[4], &cospi[56], &v[7], &v_bit, &rnding);
    addsub_neon(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
    addsub_neon(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
    addsub_neon(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
    addsub_neon(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);

    // stage 4: the cospi[32] butterfly on u[0]/u[1] is expanded manually
    // (shared multiply-accumulate against the rounding vector), the rest
    // use the half_btf helpers.
    x = vmlaq_n_s32(rnding, u[0], cospi[32]);
    y = vmulq_n_s32(u[1], cospi[32]);
    v[0] = vaddq_s32(x, y);
    v[0] = vshlq_s32(v[0], v_bit);

    v[1] = vsubq_s32(x, y);
    v[1] = vshlq_s32(v[1], v_bit);

    v[2] = half_btf_neon_mode01_r(&cospi[48], &u[2], &cospi[16], &u[3], &v_bit,
                                  &rnding);
    v[3] =
        half_btf_neon_r(&cospi[16], &u[2], &cospi[48], &u[3], &v_bit, &rnding);
    addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
    addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
    v[8] = u[8];
    v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
                                  &rnding);
    v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13],
                                   &v_bit, &rnding);
    v[11] = u[11];
    v[12] = u[12];
    v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
                                   &v_bit, &rnding);
    v[14] =
        half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
    v[15] = u[15];

    // stage 5: butterflies plus a (y - x)/(y + x) cospi[32] rotation on
    // v[5]/v[6].
    addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
    addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
    u[4] = v[4];

    x = vmulq_n_s32(v[5], cospi[32]);
    y = vmlaq_n_s32(rnding, v[6], cospi[32]);
    u[5] = vsubq_s32(y, x);
    u[5] = vshlq_s32(u[5], v_bit);

    u[6] = vaddq_s32(y, x);
    u[6] = vshlq_s32(u[6], v_bit);

    u[7] = v[7];
    addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    // stage 6: final even-half butterflies; cospi[32] rotations on the
    // (u[10],u[13]) and (u[11],u[12]) pairs.
    addsub_neon(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
    addsub_neon(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
    addsub_neon(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
    addsub_neon(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
    v[8] = u[8];
    v[9] = u[9];

    x = vmulq_n_s32(u[10], cospi[32]);
    y = vmlaq_n_s32(rnding, u[13], cospi[32]);
    v[10] = vsubq_s32(y, x);
    v[10] = vshlq_s32(v[10], v_bit);

    v[13] = vaddq_s32(x, y);
    v[13] = vshlq_s32(v[13], v_bit);

    x = vmulq_n_s32(u[11], cospi[32]);
    y = vmlaq_n_s32(rnding, u[12], cospi[32]);
    v[11] = vsubq_s32(y, x);
    v[11] = vshlq_s32(v[11], v_bit);

    v[12] = vaddq_s32(x, y);
    v[12] = vshlq_s32(v[12], v_bit);

    v[14] = u[14];
    v[15] = u[15];

    // stage 7: mirror-image butterflies write the 16 outputs directly.
    addsub_neon(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
    addsub_neon(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
    addsub_neon(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
    addsub_neon(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
    addsub_neon(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
    addsub_neon(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
    addsub_neon(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
    addsub_neon(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);

    // Row pass only: round by out_shift and clamp to the output range.
    if (!do_cols) {
      const int log_range_out = AOMMAX(16, bd + 6);
      const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
      const int32x4_t clamp_hi_out =
          vdupq_n_s32((1 << (log_range_out - 1)) - 1);
      round_shift_8x8(out, out_shift);
      highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
    }
  }
}
   2527 
// 16-point inverse ADST over four packed columns (one int32x4_t per row).
// The stage numbering mirrors the scalar av1_iadst16 flow graph; each
// rotation is expanded into vmlaq_n/vmlsq_n multiply-accumulates against
// the shared rounding vector followed by the >> bit shift.
//
// Parameters are identical in meaning to idct16x16_neon: `bit` selects the
// cosine table and rounding shift, `do_cols` marks the column pass, `bd`
// sizes the intermediate clamp range, and `out_shift` is the row-pass
// output rounding.
static void iadst16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
                            int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
  const int32x4_t zero = vdupq_n_s32(0);
  // vshlq_s32 with a negative count performs the >> bit rounding shift.
  const int32x4_t v_bit = vdupq_n_s32(-bit);
  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
  int32x4_t u[16], v[16], x, y;
  // Calculate the column 0, 1, 2, 3
  // stage 0
  // stage 1
  // stage 2: initial rotations pairing input k with input 15-k; each pair
  // produces (cos*hi + sin*lo, sin*hi - cos*lo) style outputs.
  v[0] = vmlaq_n_s32(rnding, in[15], cospi[2]);
  v[0] = vmlaq_n_s32(v[0], in[0], cospi[62]);
  v[0] = vshlq_s32(v[0], v_bit);

  v[1] = vmlaq_n_s32(rnding, in[15], cospi[62]);
  v[1] = vmlsq_n_s32(v[1], in[0], cospi[2]);
  v[1] = vshlq_s32(v[1], v_bit);

  v[2] = vmlaq_n_s32(rnding, in[13], cospi[10]);
  v[2] = vmlaq_n_s32(v[2], in[2], cospi[54]);
  v[2] = vshlq_s32(v[2], v_bit);

  v[3] = vmlaq_n_s32(rnding, in[13], cospi[54]);
  v[3] = vmlsq_n_s32(v[3], in[2], cospi[10]);
  v[3] = vshlq_s32(v[3], v_bit);

  v[4] = vmlaq_n_s32(rnding, in[11], cospi[18]);
  v[4] = vmlaq_n_s32(v[4], in[4], cospi[46]);
  v[4] = vshlq_s32(v[4], v_bit);

  v[5] = vmlaq_n_s32(rnding, in[11], cospi[46]);
  v[5] = vmlsq_n_s32(v[5], in[4], cospi[18]);
  v[5] = vshlq_s32(v[5], v_bit);

  v[6] = vmlaq_n_s32(rnding, in[9], cospi[26]);
  v[6] = vmlaq_n_s32(v[6], in[6], cospi[38]);
  v[6] = vshlq_s32(v[6], v_bit);

  v[7] = vmlaq_n_s32(rnding, in[9], cospi[38]);
  v[7] = vmlsq_n_s32(v[7], in[6], cospi[26]);
  v[7] = vshlq_s32(v[7], v_bit);

  v[8] = vmlaq_n_s32(rnding, in[7], cospi[34]);
  v[8] = vmlaq_n_s32(v[8], in[8], cospi[30]);
  v[8] = vshlq_s32(v[8], v_bit);

  v[9] = vmlaq_n_s32(rnding, in[7], cospi[30]);
  v[9] = vmlsq_n_s32(v[9], in[8], cospi[34]);
  v[9] = vshlq_s32(v[9], v_bit);

  v[10] = vmlaq_n_s32(rnding, in[5], cospi[42]);
  v[10] = vmlaq_n_s32(v[10], in[10], cospi[22]);
  v[10] = vshlq_s32(v[10], v_bit);

  v[11] = vmlaq_n_s32(rnding, in[5], cospi[22]);
  v[11] = vmlsq_n_s32(v[11], in[10], cospi[42]);
  v[11] = vshlq_s32(v[11], v_bit);

  v[12] = vmlaq_n_s32(rnding, in[3], cospi[50]);
  v[12] = vmlaq_n_s32(v[12], in[12], cospi[14]);
  v[12] = vshlq_s32(v[12], v_bit);

  v[13] = vmlaq_n_s32(rnding, in[3], cospi[14]);
  v[13] = vmlsq_n_s32(v[13], in[12], cospi[50]);
  v[13] = vshlq_s32(v[13], v_bit);

  v[14] = vmlaq_n_s32(rnding, in[1], cospi[58]);
  v[14] = vmlaq_n_s32(v[14], in[14], cospi[6]);
  v[14] = vshlq_s32(v[14], v_bit);

  v[15] = vmlaq_n_s32(rnding, in[1], cospi[6]);
  v[15] = vmlsq_n_s32(v[15], in[14], cospi[58]);
  v[15] = vshlq_s32(v[15], v_bit);

  // stage 3: butterfly k with k+8, clamped to the intermediate range.
  addsub_neon(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
  addsub_neon(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
  addsub_neon(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
  addsub_neon(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
  addsub_neon(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
  addsub_neon(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
  addsub_neon(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
  addsub_neon(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);

  // stage 4: rotate u[8..15] by cospi 8/56 and 24/40; the negated
  // constants (-cospi[56], -cospi[24]) realize the sign-flipped halves of
  // the ADST butterfly.
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  v[4] = u[4];
  v[5] = u[5];
  v[6] = u[6];
  v[7] = u[7];

  v[8] = vmlaq_n_s32(rnding, u[8], cospi[8]);
  v[8] = vmlaq_n_s32(v[8], u[9], cospi[56]);
  v[8] = vshlq_s32(v[8], v_bit);

  v[9] = vmlaq_n_s32(rnding, u[8], cospi[56]);
  v[9] = vmlsq_n_s32(v[9], u[9], cospi[8]);
  v[9] = vshlq_s32(v[9], v_bit);

  v[10] = vmlaq_n_s32(rnding, u[10], cospi[40]);
  v[10] = vmlaq_n_s32(v[10], u[11], cospi[24]);
  v[10] = vshlq_s32(v[10], v_bit);

  v[11] = vmlaq_n_s32(rnding, u[10], cospi[24]);
  v[11] = vmlsq_n_s32(v[11], u[11], cospi[40]);
  v[11] = vshlq_s32(v[11], v_bit);

  v[12] = vmlaq_n_s32(rnding, u[12], -cospi[56]);
  v[12] = vmlaq_n_s32(v[12], u[13], cospi[8]);
  v[12] = vshlq_s32(v[12], v_bit);

  v[13] = vmlaq_n_s32(rnding, u[12], cospi[8]);
  v[13] = vmlsq_n_s32(v[13], u[13], -cospi[56]);
  v[13] = vshlq_s32(v[13], v_bit);

  v[14] = vmlaq_n_s32(rnding, u[14], -cospi[24]);
  v[14] = vmlaq_n_s32(v[14], u[15], cospi[40]);
  v[14] = vshlq_s32(v[14], v_bit);

  v[15] = vmlaq_n_s32(rnding, u[14], cospi[40]);
  v[15] = vmlsq_n_s32(v[15], u[15], -cospi[24]);
  v[15] = vshlq_s32(v[15], v_bit);

  // stage 5: butterfly k with k+4 inside each half.
  addsub_neon(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
  addsub_neon(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
  addsub_neon(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
  addsub_neon(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
  addsub_neon(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
  addsub_neon(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
  addsub_neon(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
  addsub_neon(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);

  // stage 6: cospi 16/48 rotations on the two quartets u[4..7], u[12..15].
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];

  v[4] = vmlaq_n_s32(rnding, u[4], cospi[16]);
  v[4] = vmlaq_n_s32(v[4], u[5], cospi[48]);
  v[4] = vshlq_s32(v[4], v_bit);

  v[5] = vmlaq_n_s32(rnding, u[4], cospi[48]);
  v[5] = vmlsq_n_s32(v[5], u[5], cospi[16]);
  v[5] = vshlq_s32(v[5], v_bit);

  v[6] = vmlaq_n_s32(rnding, u[6], -cospi[48]);
  v[6] = vmlaq_n_s32(v[6], u[7], cospi[16]);
  v[6] = vshlq_s32(v[6], v_bit);

  v[7] = vmlaq_n_s32(rnding, u[6], cospi[16]);
  v[7] = vmlsq_n_s32(v[7], u[7], -cospi[48]);
  v[7] = vshlq_s32(v[7], v_bit);

  v[8] = u[8];
  v[9] = u[9];
  v[10] = u[10];
  v[11] = u[11];

  v[12] = vmlaq_n_s32(rnding, u[12], cospi[16]);
  v[12] = vmlaq_n_s32(v[12], u[13], cospi[48]);
  v[12] = vshlq_s32(v[12], v_bit);

  v[13] = vmlaq_n_s32(rnding, u[12], cospi[48]);
  v[13] = vmlsq_n_s32(v[13], u[13], cospi[16]);
  v[13] = vshlq_s32(v[13], v_bit);

  v[14] = vmlaq_n_s32(rnding, u[14], -cospi[48]);
  v[14] = vmlaq_n_s32(v[14], u[15], cospi[16]);
  v[14] = vshlq_s32(v[14], v_bit);

  v[15] = vmlaq_n_s32(rnding, u[14], cospi[16]);
  v[15] = vmlsq_n_s32(v[15], u[15], -cospi[48]);
  v[15] = vshlq_s32(v[15], v_bit);

  // stage 7: butterfly k with k+2 inside each quartet.
  addsub_neon(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
  addsub_neon(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
  addsub_neon(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
  addsub_neon(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
  addsub_neon(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
  addsub_neon(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
  addsub_neon(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
  addsub_neon(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);

  // stage 8: final cospi[32] rotations on each odd-index pair, expanded as
  // a shared (y, x) pair so each product is computed once.
  v[0] = u[0];
  v[1] = u[1];

  y = vmlaq_n_s32(rnding, u[2], cospi[32]);
  x = vmulq_n_s32(u[3], cospi[32]);
  v[2] = vaddq_s32(y, x);
  v[2] = vshlq_s32(v[2], v_bit);

  v[3] = vsubq_s32(y, x);
  v[3] = vshlq_s32(v[3], v_bit);

  v[4] = u[4];
  v[5] = u[5];

  y = vmlaq_n_s32(rnding, u[6], cospi[32]);
  x = vmulq_n_s32(u[7], cospi[32]);
  v[6] = vaddq_s32(y, x);
  v[6] = vshlq_s32(v[6], v_bit);

  v[7] = vsubq_s32(y, x);
  v[7] = vshlq_s32(v[7], v_bit);

  v[8] = u[8];
  v[9] = u[9];

  y = vmlaq_n_s32(rnding, u[10], cospi[32]);
  x = vmulq_n_s32(u[11], cospi[32]);
  v[10] = vaddq_s32(y, x);
  v[10] = vshlq_s32(v[10], v_bit);

  v[11] = vsubq_s32(y, x);
  v[11] = vshlq_s32(v[11], v_bit);

  v[12] = u[12];
  v[13] = u[13];

  y = vmlaq_n_s32(rnding, u[14], cospi[32]);
  x = vmulq_n_s32(u[15], cospi[32]);
  v[14] = vaddq_s32(y, x);
  v[14] = vshlq_s32(v[14], v_bit);

  v[15] = vsubq_s32(y, x);
  v[15] = vshlq_s32(v[15], v_bit);

  // stage 9: permute into output order; every second output is negated
  // (0 - v[k]) per the ADST sign pattern.  The row pass instead folds the
  // negation into neg_shift_neon together with the output rounding shift.
  if (do_cols) {
    out[0] = v[0];
    out[1] = vsubq_s32(zero, v[8]);
    out[2] = v[12];
    out[3] = vsubq_s32(zero, v[4]);
    out[4] = v[6];
    out[5] = vsubq_s32(zero, v[14]);
    out[6] = v[10];
    out[7] = vsubq_s32(zero, v[2]);
    out[8] = v[3];
    out[9] = vsubq_s32(zero, v[11]);
    out[10] = v[15];
    out[11] = vsubq_s32(zero, v[7]);
    out[12] = v[5];
    out[13] = vsubq_s32(zero, v[13]);
    out[14] = v[9];
    out[15] = vsubq_s32(zero, v[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
    neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                   &v_shift, &offset);
    neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
    neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
    neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
    neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
    neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
    neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
    neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out,
                   &clamp_hi_out, &v_shift, &offset);
  }
}
   2808 
// 16-point inverse identity transform: scales each coefficient by
// 2 * NewSqrt2 with NewSqrt2Bits rounding (offset 1 << (NewSqrt2Bits - 1),
// then >> NewSqrt2Bits).  The multiply is widened to 64 bits via vmlal_s32
// to avoid overflow before the shift.  `bit` is unused for identity.
static void iidentity16_neon(int32x4_t *in, int32x4_t *out, int bit,
                             int do_cols, int bd, int out_shift) {
  (void)bit;
  int32x2_t fact = vdup_n_s32(2 * NewSqrt2);
  int32x4x2_t a0;
  int32x4_t zero = vdupq_n_s32(0);
  const int64x2_t rnding = vdupq_n_s64(1 << (NewSqrt2Bits - 1));
  for (int i = 0; i < 16; i++) {
    // Lanes 0 and 2: reinterpret as two s64 lanes, narrow to their low
    // halves, widen-multiply-accumulate onto the rounding offset.
    a0.val[0] = vreinterpretq_s32_s64(
        vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(in[i])), fact));
    a0.val[0] = vreinterpretq_s32_s64(
        vshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits));
    // Lanes 1 and 3: shift them down by one position first, then apply the
    // same widening multiply/round.
    a0.val[1] = vextq_s32(in[i], zero, 1);
    a0.val[1] = vreinterpretq_s32_s64(
        vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact));
    a0.val[1] = vreinterpretq_s32_s64(
        vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits));
    // Re-interleave the even/odd results back into original lane order.
    a0 = vzipq_s32(a0.val[0], a0.val[1]);
#if AOM_ARCH_AARCH64
    out[i] = vreinterpretq_s32_s64(vzip1q_s64(
        vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1])));
#else
    // Armv7 has no vzip1q_s64; build the same lane order with two vext ops.
    out[i] = vextq_s32(vextq_s32(a0.val[0], a0.val[0], 2), a0.val[1], 2);
#endif
  }

  // Row pass only: round by out_shift and clamp to the output range.
  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
    const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
    round_shift_8x8(out, out_shift);
    highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 16);
  }
}
   2843 
// Stage 8 of the 64-point inverse DCT, operating in place on the 64-entry
// working array `u`.  Performs the cospi[32] rotations on u[10..13], the
// butterflies inside u[16..31], and the cospi 16/48 rotations pairing
// u[36..43] with u[52..59].
static inline void idct64_stage8_neon(int32x4_t *u, const int32_t *cospi,
                                      const int32x4_t *clamp_lo,
                                      const int32x4_t *clamp_hi,
                                      const int32x4_t *v_bit,
                                      const int32x4_t *rnding) {
  int i;
  int32x4_t temp1, temp2, temp3, temp4;
  // (u[10], u[13]) and (u[11], u[12]) rotate by cospi[32]; temporaries keep
  // the original operands live until both outputs are computed.
  temp1 = half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13], v_bit,
                                 rnding);
  u[13] =
      half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], v_bit, rnding);
  u[10] = temp1;
  temp2 = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12], v_bit,
                                 rnding);
  u[12] =
      half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], v_bit, rnding);
  u[11] = temp2;

  // Butterfly pairs inside u[16..31]: (i, i^7) covers (16,23)..(19,20) and
  // (i^15, i^8) covers (31,24)..(28,27).
  for (i = 16; i < 20; ++i) {
    addsub_neon(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
    addsub_neon(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
  }

  // cospi 16/48 rotations pairing u[36..39] with u[59..56].
  temp1 = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59], v_bit,
                                 rnding);
  temp2 = half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58], v_bit,
                                 rnding);
  temp3 = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57], v_bit,
                                 rnding);
  temp4 = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56], v_bit,
                                 rnding);
  u[56] =
      half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], v_bit, rnding);
  u[57] =
      half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], v_bit, rnding);
  u[58] =
      half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], v_bit, rnding);
  u[59] =
      half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], v_bit, rnding);
  u[36] = temp1;
  u[37] = temp2;
  u[38] = temp3;
  u[39] = temp4;

  // Sign-flipped cospi 48/16 rotations pairing u[40..43] with u[55..52].
  temp1 = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55], v_bit,
                                 rnding);
  temp2 = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54], v_bit,
                                 rnding);
  temp3 = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53], v_bit,
                                 rnding);
  temp4 = half_btf_neon_mode11_r(&cospi[48], &u[43], &cospi[16], &u[52], v_bit,
                                 rnding);
  u[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52], v_bit,
                                 rnding);
  u[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53], v_bit,
                                 rnding);
  u[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54], v_bit,
                                 rnding);
  u[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55], v_bit,
                                 rnding);
  u[40] = temp1;
  u[41] = temp2;
  u[42] = temp3;
  u[43] = temp4;
}
   2909 
// Stage 9 of the 64-point inverse DCT, in place on `u`.  Butterflies
// u[0..15], rotates u[20..27] by cospi[32], and butterflies the pairs
// inside u[32..47] and u[48..63].
static inline void idct64_stage9_neon(int32x4_t *u, const int32_t *cospi,
                                      const int32x4_t *clamp_lo,
                                      const int32x4_t *clamp_hi,
                                      const int32x4_t *v_bit,
                                      const int32x4_t *rnding) {
  int i;
  int32x4_t temp1, temp2, temp3, temp4;
  // Mirror butterflies across the first 16 entries: (0,15)..(7,8).
  for (i = 0; i < 8; ++i) {
    addsub_neon(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
  }
  // cospi[32] rotations pairing u[20..23] with u[27..24]; temporaries keep
  // the original operands live until both outputs are computed.
  temp1 = half_btf_neon_mode10_r(&cospi[32], &u[20], &cospi[32], &u[27], v_bit,
                                 rnding);
  temp2 = half_btf_neon_mode10_r(&cospi[32], &u[21], &cospi[32], &u[26], v_bit,
                                 rnding);
  temp3 = half_btf_neon_mode10_r(&cospi[32], &u[22], &cospi[32], &u[25], v_bit,
                                 rnding);
  temp4 = half_btf_neon_mode10_r(&cospi[32], &u[23], &cospi[32], &u[24], v_bit,
                                 rnding);
  u[24] =
      half_btf_neon_r(&cospi[32], &u[23], &cospi[32], &u[24], v_bit, rnding);
  u[25] =
      half_btf_neon_r(&cospi[32], &u[22], &cospi[32], &u[25], v_bit, rnding);
  u[26] =
      half_btf_neon_r(&cospi[32], &u[21], &cospi[32], &u[26], v_bit, rnding);
  u[27] =
      half_btf_neon_r(&cospi[32], &u[20], &cospi[32], &u[27], v_bit, rnding);
  u[20] = temp1;
  u[21] = temp2;
  u[22] = temp3;
  u[23] = temp4;
  // Butterflies (i, i^15): (32,47)..(39,40).
  for (i = 32; i < 40; i++) {
    addsub_neon(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
  }

  // Butterflies (i^15, i): (63,48)..(56,55).
  for (i = 48; i < 56; i++) {
    addsub_neon(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
  }
}
   2948 
// Stage 10 of the 64-point inverse DCT (vectorized over 4 columns):
//  * butterfly-combines u[0..15] with u[31..16],
//  * replaces u[40..47] and their partners u[55..48] with cospi[32]-scaled
//    rotations (difference via the mode10 helper, sum via half_btf_neon_r --
//    presumably (c0*a +/- c1*b + rnd) >> bit; TODO confirm against the
//    helper definitions earlier in this file).
// Add/sub results are clamped to [clamp_lo, clamp_hi].
static inline void idct64_stage10_neon(int32x4_t *u, const int32_t *cospi,
                                       const int32x4_t *clamp_lo,
                                       const int32x4_t *clamp_hi,
                                       const int32x4_t *v_bit,
                                       const int32x4_t *rnding) {
  int32x4_t temp1, temp2, temp3, temp4;
  for (int i = 0; i < 16; i++) {
    addsub_neon(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
  }
  // Pairs (40,55)..(43,52): temporaries hold the new u[40..43] until the
  // original values have fed the u[52..55] computations.
  temp1 = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55], v_bit,
                                 rnding);
  temp2 = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54], v_bit,
                                 rnding);
  temp3 = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53], v_bit,
                                 rnding);
  temp4 = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52], v_bit,
                                 rnding);
  u[52] =
      half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], v_bit, rnding);
  u[53] =
      half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], v_bit, rnding);
  u[54] =
      half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], v_bit, rnding);
  u[55] =
      half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], v_bit, rnding);
  u[40] = temp1;
  u[41] = temp2;
  u[42] = temp3;
  u[43] = temp4;

  // Pairs (44,51)..(47,48): same rotation pattern as above.
  temp1 = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51], v_bit,
                                 rnding);
  temp2 = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50], v_bit,
                                 rnding);
  temp3 = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49], v_bit,
                                 rnding);
  temp4 = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48], v_bit,
                                 rnding);
  u[48] =
      half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], v_bit, rnding);
  u[49] =
      half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], v_bit, rnding);
  u[50] =
      half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], v_bit, rnding);
  u[51] =
      half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], v_bit, rnding);
  u[44] = temp1;
  u[45] = temp2;
  u[46] = temp3;
  u[47] = temp4;
}
   3000 
   3001 static inline void idct64_stage11_neon(int32x4_t *u, int32x4_t *out,
   3002                                       int do_cols, int bd, int out_shift,
   3003                                       const int32x4_t *clamp_lo,
   3004                                       const int32x4_t *clamp_hi) {
   3005  for (int i = 0; i < 32; i++) {
   3006    addsub_neon(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi);
   3007  }
   3008 
   3009  if (!do_cols) {
   3010    const int log_range_out = AOMMAX(16, bd + 6);
   3011    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
   3012    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
   3013    for (int i = 0; i < 64; i += 4) {
   3014      round_shift_4x4(out + i, out_shift);
   3015      highbd_clamp_s32_neon(out + i, out + i, &clamp_lo_out, &clamp_hi_out, 4);
   3016    }
   3017  }
   3018 }
   3019 
   3020 static void idct64x64_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
   3021                                int do_cols, int bd, int out_shift) {
   3022  const int32_t *cospi = cospi_arr(bit);
   3023  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
   3024  int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
   3025  int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
   3026 
   3027  const int32x4_t v_bit = vdupq_n_s32(-bit);
   3028  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
   3029  {
   3030    int32x4_t x;
   3031 
   3032    // stage 1
   3033    // stage 2
   3034    // stage 3
   3035    // stage 4
   3036    // stage 5
   3037    // stage 6
   3038    x = half_btf_0_neon_r(&cospi[32], &in[0], &v_bit, &rnding);
   3039 
   3040    // stage 8
   3041    // stage 9
   3042    // stage 10
   3043    // stage 11
   3044    if (!do_cols) {
   3045      const int log_range_out = AOMMAX(16, bd + 6);
   3046      clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
   3047      clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
   3048      if (out_shift != 0) {
   3049        int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
   3050        x = vaddq_s32(x, offset);
   3051        x = vshlq_s32(x, vdupq_n_s32(-out_shift));
   3052      }
   3053    }
   3054    x = vmaxq_s32(x, clamp_lo);
   3055    x = vminq_s32(x, clamp_hi);
   3056    out[0] = x;
   3057    out[1] = x;
   3058    out[2] = x;
   3059    out[3] = x;
   3060    out[4] = x;
   3061    out[5] = x;
   3062    out[6] = x;
   3063    out[7] = x;
   3064    out[8] = x;
   3065    out[9] = x;
   3066    out[10] = x;
   3067    out[11] = x;
   3068    out[12] = x;
   3069    out[13] = x;
   3070    out[14] = x;
   3071    out[15] = x;
   3072    out[16] = x;
   3073    out[17] = x;
   3074    out[18] = x;
   3075    out[19] = x;
   3076    out[20] = x;
   3077    out[21] = x;
   3078    out[22] = x;
   3079    out[23] = x;
   3080    out[24] = x;
   3081    out[25] = x;
   3082    out[26] = x;
   3083    out[27] = x;
   3084    out[28] = x;
   3085    out[29] = x;
   3086    out[30] = x;
   3087    out[31] = x;
   3088    out[32] = x;
   3089    out[33] = x;
   3090    out[34] = x;
   3091    out[35] = x;
   3092    out[36] = x;
   3093    out[37] = x;
   3094    out[38] = x;
   3095    out[39] = x;
   3096    out[40] = x;
   3097    out[41] = x;
   3098    out[42] = x;
   3099    out[43] = x;
   3100    out[44] = x;
   3101    out[45] = x;
   3102    out[46] = x;
   3103    out[47] = x;
   3104    out[48] = x;
   3105    out[49] = x;
   3106    out[50] = x;
   3107    out[51] = x;
   3108    out[52] = x;
   3109    out[53] = x;
   3110    out[54] = x;
   3111    out[55] = x;
   3112    out[56] = x;
   3113    out[57] = x;
   3114    out[58] = x;
   3115    out[59] = x;
   3116    out[60] = x;
   3117    out[61] = x;
   3118    out[62] = x;
   3119    out[63] = x;
   3120  }
   3121 }
   3122 
   3123 static void idct64x64_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
   3124                                int do_cols, int bd, int out_shift) {
   3125  int i, j;
   3126  const int32_t *cospi = cospi_arr(bit);
   3127  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
   3128  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
   3129  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
   3130  const int32x4_t v_bit = vdupq_n_s32(-bit);
   3131  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
   3132  {
   3133    int32x4_t u[64];
   3134 
   3135    // stage 1
   3136    u[0] = in[0];
   3137    u[8] = in[4];
   3138    u[16] = in[2];
   3139    u[24] = in[6];
   3140    u[32] = in[1];
   3141    u[40] = in[5];
   3142    u[48] = in[3];
   3143    u[56] = in[7];
   3144 
   3145    // stage 2
   3146    u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
   3147    u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
   3148    u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
   3149    u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
   3150    u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
   3151    u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
   3152    u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
   3153    u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
   3154 
   3155    // stage 3
   3156    u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding);
   3157    u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding);
   3158    u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding);
   3159    u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding);
   3160    u[33] = u[32];
   3161    u[38] = u[39];
   3162    u[41] = u[40];
   3163    u[46] = u[47];
   3164    u[49] = u[48];
   3165    u[54] = u[55];
   3166    u[57] = u[56];
   3167    u[62] = u[63];
   3168 
   3169    // stage 4
   3170    int32x4_t temp1, temp2;
   3171    u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
   3172    u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
   3173    u[17] = u[16];
   3174    u[22] = u[23];
   3175    u[25] = u[24];
   3176    u[30] = u[31];
   3177 
   3178    temp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62],
   3179                                   &v_bit, &rnding);
   3180    u[62] =
   3181        half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
   3182    u[33] = temp1;
   3183 
   3184    temp2 = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
   3185                                   &v_bit, &rnding);
   3186    u[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
   3187                                   &v_bit, &rnding);
   3188    u[57] = temp2;
   3189 
   3190    temp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
   3191                                   &v_bit, &rnding);
   3192    u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
   3193                            &rnding);
   3194    u[41] = temp1;
   3195 
   3196    temp2 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
   3197                                   &v_bit, &rnding);
   3198    u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
   3199                                   &v_bit, &rnding);
   3200    u[46] = temp2;
   3201 
   3202    // stage 5
   3203    u[9] = u[8];
   3204    u[14] = u[15];
   3205 
   3206    temp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30],
   3207                                   &v_bit, &rnding);
   3208    u[30] =
   3209        half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding);
   3210    u[17] = temp1;
   3211 
   3212    temp2 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25],
   3213                                   &v_bit, &rnding);
   3214    u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25],
   3215                                   &v_bit, &rnding);
   3216    u[22] = temp2;
   3217 
   3218    u[35] = u[32];
   3219    u[34] = u[33];
   3220    u[36] = u[39];
   3221    u[37] = u[38];
   3222    u[43] = u[40];
   3223    u[42] = u[41];
   3224    u[44] = u[47];
   3225    u[45] = u[46];
   3226    u[51] = u[48];
   3227    u[50] = u[49];
   3228    u[52] = u[55];
   3229    u[53] = u[54];
   3230    u[59] = u[56];
   3231    u[58] = u[57];
   3232    u[60] = u[63];
   3233    u[61] = u[62];
   3234 
   3235    // stage 6
   3236    temp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
   3237    u[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
   3238    u[0] = temp1;
   3239 
   3240    temp2 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14],
   3241                                   &v_bit, &rnding);
   3242    u[14] =
   3243        half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
   3244    u[9] = temp2;
   3245    u[19] = u[16];
   3246    u[18] = u[17];
   3247    u[20] = u[23];
   3248    u[21] = u[22];
   3249    u[27] = u[24];
   3250    u[26] = u[25];
   3251    u[28] = u[31];
   3252    u[29] = u[30];
   3253 
   3254    temp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61],
   3255                                   &v_bit, &rnding);
   3256    u[61] =
   3257        half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
   3258    u[34] = temp1;
   3259    temp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60],
   3260                                   &v_bit, &rnding);
   3261    u[60] =
   3262        half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
   3263    u[35] = temp2;
   3264    temp1 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59],
   3265                                   &v_bit, &rnding);
   3266    u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
   3267                                   &v_bit, &rnding);
   3268    u[36] = temp1;
   3269    temp2 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58],
   3270                                   &v_bit, &rnding);
   3271    u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
   3272                                   &v_bit, &rnding);
   3273    u[37] = temp2;
   3274    temp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
   3275                                   &v_bit, &rnding);
   3276    u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
   3277                            &rnding);
   3278    u[42] = temp1;
   3279    temp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
   3280                                   &v_bit, &rnding);
   3281    u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
   3282                            &rnding);
   3283    u[43] = temp2;
   3284    temp1 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
   3285                                   &v_bit, &rnding);
   3286    u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
   3287                                   &v_bit, &rnding);
   3288    u[44] = temp1;
   3289    temp2 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
   3290                                   &v_bit, &rnding);
   3291    u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
   3292                                   &v_bit, &rnding);
   3293    u[45] = temp2;
   3294 
   3295    // stage 7
   3296    u[3] = u[0];
   3297    u[2] = u[1];
   3298    u[11] = u[8];
   3299    u[10] = u[9];
   3300    u[12] = u[15];
   3301    u[13] = u[14];
   3302 
   3303    temp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29],
   3304                                   &v_bit, &rnding);
   3305    u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit,
   3306                            &rnding);
   3307    u[18] = temp1;
   3308    temp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28],
   3309                                   &v_bit, &rnding);
   3310    u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit,
   3311                            &rnding);
   3312    u[19] = temp2;
   3313    temp1 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27],
   3314                                   &v_bit, &rnding);
   3315    u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27],
   3316                                   &v_bit, &rnding);
   3317    u[20] = temp1;
   3318    temp2 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26],
   3319                                   &v_bit, &rnding);
   3320    u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26],
   3321                                   &v_bit, &rnding);
   3322    u[21] = temp2;
   3323    for (i = 32; i < 64; i += 16) {
   3324      for (j = i; j < i + 4; j++) {
   3325        addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
   3326        addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
   3327                    &clamp_hi);
   3328      }
   3329    }
   3330 
   3331    // stage 8
   3332    u[7] = u[0];
   3333    u[6] = u[1];
   3334    u[5] = u[2];
   3335    u[4] = u[3];
   3336    u[9] = u[9];
   3337 
   3338    idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
   3339 
   3340    // stage 9
   3341    idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
   3342 
   3343    // stage 10
   3344    idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
   3345 
   3346    // stage 11
   3347    idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
   3348  }
   3349 }
   3350 
// 64x64 inverse DCT specialized for the case where only the first 16 input
// coefficients (in[0..15]) are non-zero. Stages 1-7 are written out with the
// zero-input butterflies folded into single-coefficient helpers; stages 8-11
// reuse the shared idct64_stage{8,9,10,11}_neon helpers. Intermediate
// results are clamped to [clamp_lo, clamp_hi] after each add/sub.
static void idct64x64_low16_neon(int32x4_t *in, int32x4_t *out, int bit,
                                 int do_cols, int bd, int out_shift) {
  int i, j;
  const int32_t *cospi = cospi_arr(bit);
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
  const int32x4_t v_bit = vdupq_n_s32(-bit);
  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));

  {
    int32x4_t u[64];
    int32x4_t tmp1, tmp2, tmp3, tmp4;
    // stage 1: scatter the 16 live inputs to their butterfly positions.
    u[0] = in[0];
    u[32] = in[1];
    u[36] = in[9];
    u[40] = in[5];
    u[44] = in[13];
    u[48] = in[3];
    u[52] = in[11];
    u[56] = in[7];
    u[60] = in[15];
    u[16] = in[2];
    u[20] = in[10];
    u[24] = in[6];
    u[28] = in[14];
    u[4] = in[8];
    u[8] = in[4];
    u[12] = in[12];

    // stage 2: single-input butterflies (partner inputs are zero).
    u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
    u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
    u[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding);
    u[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding);
    u[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding);
    u[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding);
    u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
    u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
    u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
    u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
    u[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding);
    u[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding);
    u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
    u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
    u[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding);
    u[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding);

    // stage 3
    u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding);
    u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding);
    u[19] = half_btf_0_m_neon_r(&cospi[50], &u[28], &v_bit, &rnding);
    u[28] = half_btf_0_neon_r(&cospi[14], &u[28], &v_bit, &rnding);
    u[27] = half_btf_0_neon_r(&cospi[10], &u[20], &v_bit, &rnding);
    u[20] = half_btf_0_neon_r(&cospi[54], &u[20], &v_bit, &rnding);
    u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding);
    u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding);
    u[33] = u[32];
    u[34] = u[35];
    u[37] = u[36];
    u[38] = u[39];
    u[41] = u[40];
    u[42] = u[43];
    u[45] = u[44];
    u[46] = u[47];
    u[49] = u[48];
    u[50] = u[51];
    u[53] = u[52];
    u[54] = u[55];
    u[57] = u[56];
    u[58] = u[59];
    u[61] = u[60];
    u[62] = u[63];

    // stage 4
    u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
    u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
    u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
    u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);

    u[17] = u[16];
    u[18] = u[19];
    u[21] = u[20];
    u[22] = u[23];
    u[25] = u[24];
    u[26] = u[27];
    u[29] = u[28];
    u[30] = u[31];

    // tmp1..tmp4 keep the new low-index values alive until their mirror
    // partners (which read the old values) have been computed.
    tmp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62], &v_bit,
                                  &rnding);
    tmp2 = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61], &v_bit,
                                  &rnding);
    tmp3 = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58],
                                  &v_bit, &rnding);
    tmp4 = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
                                  &v_bit, &rnding);
    u[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
                                   &v_bit, &rnding);
    u[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit,
                            &rnding);
    u[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61],
                                   &v_bit, &rnding);
    u[62] =
        half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
    u[33] = tmp1;
    u[34] = tmp2;
    u[37] = tmp3;
    u[38] = tmp4;

    tmp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
                                  &v_bit, &rnding);
    tmp2 = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53],
                                  &v_bit, &rnding);
    tmp3 = half_btf_neon_r(&cospi[52], &u[45], &cospi[12], &u[50], &v_bit,
                           &rnding);
    tmp4 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
                                  &v_bit, &rnding);
    u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
                                   &v_bit, &rnding);
    u[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit,
                            &rnding);
    u[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53],
                                   &v_bit, &rnding);
    u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
                            &rnding);
    u[41] = tmp1;
    u[42] = tmp2;
    u[45] = tmp3;
    u[46] = tmp4;

    // stage 5
    u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding);
    u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding);

    u[9] = u[8];
    u[10] = u[11];
    u[13] = u[12];
    u[14] = u[15];

    tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30], &v_bit,
                                  &rnding);
    tmp2 = half_btf_neon_mode11_r(&cospi[56], &u[18], &cospi[8], &u[29], &v_bit,
                                  &rnding);
    tmp3 = half_btf_neon_mode10_r(&cospi[40], &u[21], &cospi[24], &u[26],
                                  &v_bit, &rnding);
    tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25],
                                  &v_bit, &rnding);
    u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25],
                                   &v_bit, &rnding);
    u[26] = half_btf_neon_r(&cospi[24], &u[21], &cospi[40], &u[26], &v_bit,
                            &rnding);
    u[29] = half_btf_neon_mode10_r(&cospi[8], &u[18], &cospi[56], &u[29],
                                   &v_bit, &rnding);
    u[30] =
        half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding);
    u[17] = tmp1;
    u[18] = tmp2;
    u[21] = tmp3;
    u[22] = tmp4;

    // Butterflies within each 8-element group of the 32..63 half.
    for (i = 32; i < 64; i += 8) {
      addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                  &clamp_hi);
      addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                  &clamp_hi);

      addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                  &clamp_hi);
      addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                  &clamp_hi);
    }

    // stage 6
    tmp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
    u[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
    u[0] = tmp1;
    u[5] = u[4];
    u[6] = u[7];

    tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
                                  &rnding);
    u[14] =
        half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
    u[9] = tmp1;
    // NOTE(review): mode01 here (vs. mode10/mode11 elsewhere) presumably
    // encodes a different sign pair for the two coefficients -- confirm
    // against the helper definitions earlier in this file.
    tmp2 = half_btf_neon_mode01_r(&cospi[48], &u[10], &cospi[16], &u[13],
                                  &v_bit, &rnding);
    u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
                                   &v_bit, &rnding);
    u[10] = tmp2;

    for (i = 16; i < 32; i += 8) {
      addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                  &clamp_hi);
      addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                  &clamp_hi);

      addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                  &clamp_hi);
      addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                  &clamp_hi);
    }

    tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61], &v_bit,
                                  &rnding);
    tmp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60], &v_bit,
                                  &rnding);
    tmp3 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59], &v_bit,
                                  &rnding);
    tmp4 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58], &v_bit,
                                  &rnding);
    u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
                                   &v_bit, &rnding);
    u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
                                   &v_bit, &rnding);
    u[60] =
        half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
    u[61] =
        half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
    u[34] = tmp1;
    u[35] = tmp2;
    u[36] = tmp3;
    u[37] = tmp4;

    tmp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
                                  &v_bit, &rnding);
    tmp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
                                  &v_bit, &rnding);
    tmp3 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
                                  &v_bit, &rnding);
    tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
                                  &v_bit, &rnding);
    u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
                                   &v_bit, &rnding);
    u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
                                   &v_bit, &rnding);
    u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
                            &rnding);
    u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
                            &rnding);
    u[42] = tmp1;
    u[43] = tmp2;
    u[44] = tmp3;
    u[45] = tmp4;

    // stage 7
    u[3] = u[0];
    u[2] = u[1];
    tmp1 = half_btf_neon_mode10_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit,
                                  &rnding);
    u[6] =
        half_btf_neon_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit, &rnding);
    u[5] = tmp1;
    addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29],
                                  &v_bit, &rnding);
    tmp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28],
                                  &v_bit, &rnding);
    tmp3 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27],
                                  &v_bit, &rnding);
    tmp4 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26],
                                  &v_bit, &rnding);
    u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26],
                                   &v_bit, &rnding);
    u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27],
                                   &v_bit, &rnding);
    u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit,
                            &rnding);
    u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit,
                            &rnding);
    u[18] = tmp1;
    u[19] = tmp2;
    u[20] = tmp3;
    u[21] = tmp4;

    // Butterflies on the 32..63 half: j ^ 7 pairs within each 8-group and
    // j ^ 15 / j ^ 8 pairs across the mirrored 8-group.
    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                    &clamp_hi);
      }
    }

    // stage 8
    for (i = 0; i < 4; ++i) {
      addsub_neon(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
    }

    idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);

    // stage 9
    idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);

    // stage 10
    idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);

    // stage 11
    idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
  }
}
   3656 
// Full 64-point inverse DCT over columns of 4 samples packed in int32x4_t
// lanes, computed as an 11-stage butterfly network with the `u`/`v` arrays
// used as ping-pong scratch buffers.
// Only in[0..31] are read: AV1 zeroes all coefficients outside the top-left
// 32x32 region of a 64-point transform, so the upper half is implicitly 0.
//
// in        - 64 input vectors (only the first 32 are consumed, see above).
// out       - 64 output vectors.
// bit       - rounding-shift amount used by the butterfly multiplies.
// do_cols   - nonzero for the column pass (wider intermediate clamp range,
//             no final round-shift); zero for the row pass.
// bd        - bit depth, used to derive the clamp range.
// out_shift - final right-shift applied only in the row pass (!do_cols).
static void idct64x64_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
                          int bd, int out_shift) {
 int i, j;
 const int32_t *cospi = cospi_arr(bit);
 // vrshlq with a negative shift performs the rounding right-shift by `bit`.
 const int32x4_t v_bit = vdupq_n_s32(-bit);
 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));

 // Intermediate clamp range: bd+6 bits for the column pass, bd+8 for rows.
 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);

 {
   int32x4_t u[64], v[64];

   // stage 1
   // Bit-reversal-style permutation of the 32 nonzero coefficients into the
   // slots each later stage expects; odd slots are produced by stage 2/3
   // multiplies and are never read before being written.
   u[32] = in[1];
   u[34] = in[17];
   u[36] = in[9];
   u[38] = in[25];
   u[40] = in[5];
   u[42] = in[21];
   u[44] = in[13];
   u[46] = in[29];
   u[48] = in[3];
   u[50] = in[19];
   u[52] = in[11];
   u[54] = in[27];
   u[56] = in[7];
   u[58] = in[23];
   u[60] = in[15];
   u[62] = in[31];

   v[16] = in[2];
   v[18] = in[18];
   v[20] = in[10];
   v[22] = in[26];
   v[24] = in[6];
   v[26] = in[22];
   v[28] = in[14];
   v[30] = in[30];

   u[8] = in[4];
   u[10] = in[20];
   u[12] = in[12];
   u[14] = in[28];

   v[4] = in[8];
   v[6] = in[24];

   u[0] = in[0];
   u[2] = in[16];

   // stage 2
   // Single-input butterflies for the 32 odd-frequency outputs (u[32..63]);
   // the `_m` variants negate the cosine term.
   v[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
   v[33] = half_btf_0_m_neon_r(&cospi[33], &u[62], &v_bit, &rnding);
   v[34] = half_btf_0_neon_r(&cospi[47], &u[34], &v_bit, &rnding);
   v[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding);
   v[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding);
   v[37] = half_btf_0_m_neon_r(&cospi[41], &u[58], &v_bit, &rnding);
   v[38] = half_btf_0_neon_r(&cospi[39], &u[38], &v_bit, &rnding);
   v[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
   v[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
   v[41] = half_btf_0_m_neon_r(&cospi[37], &u[54], &v_bit, &rnding);
   v[42] = half_btf_0_neon_r(&cospi[43], &u[42], &v_bit, &rnding);
   v[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding);
   v[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding);
   v[45] = half_btf_0_m_neon_r(&cospi[45], &u[50], &v_bit, &rnding);
   v[46] = half_btf_0_neon_r(&cospi[35], &u[46], &v_bit, &rnding);
   v[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
   v[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
   v[49] = half_btf_0_neon_r(&cospi[29], &u[46], &v_bit, &rnding);
   v[50] = half_btf_0_neon_r(&cospi[19], &u[50], &v_bit, &rnding);
   v[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding);
   v[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding);
   v[53] = half_btf_0_neon_r(&cospi[21], &u[42], &v_bit, &rnding);
   v[54] = half_btf_0_neon_r(&cospi[27], &u[54], &v_bit, &rnding);
   v[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
   v[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
   v[57] = half_btf_0_neon_r(&cospi[25], &u[38], &v_bit, &rnding);
   v[58] = half_btf_0_neon_r(&cospi[23], &u[58], &v_bit, &rnding);
   v[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding);
   v[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding);
   v[61] = half_btf_0_neon_r(&cospi[17], &u[34], &v_bit, &rnding);
   v[62] = half_btf_0_neon_r(&cospi[31], &u[62], &v_bit, &rnding);
   v[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);

   // stage 3
   // Same single-input butterflies for the 16-point band (u[16..31]), then
   // add/sub pairs for the 32-point band.
   u[16] = half_btf_0_neon_r(&cospi[62], &v[16], &v_bit, &rnding);
   u[17] = half_btf_0_m_neon_r(&cospi[34], &v[30], &v_bit, &rnding);
   u[18] = half_btf_0_neon_r(&cospi[46], &v[18], &v_bit, &rnding);
   u[19] = half_btf_0_m_neon_r(&cospi[50], &v[28], &v_bit, &rnding);
   u[20] = half_btf_0_neon_r(&cospi[54], &v[20], &v_bit, &rnding);
   u[21] = half_btf_0_m_neon_r(&cospi[42], &v[26], &v_bit, &rnding);
   u[22] = half_btf_0_neon_r(&cospi[38], &v[22], &v_bit, &rnding);
   u[23] = half_btf_0_m_neon_r(&cospi[58], &v[24], &v_bit, &rnding);
   u[24] = half_btf_0_neon_r(&cospi[6], &v[24], &v_bit, &rnding);
   u[25] = half_btf_0_neon_r(&cospi[26], &v[22], &v_bit, &rnding);
   u[26] = half_btf_0_neon_r(&cospi[22], &v[26], &v_bit, &rnding);
   u[27] = half_btf_0_neon_r(&cospi[10], &v[20], &v_bit, &rnding);
   u[28] = half_btf_0_neon_r(&cospi[14], &v[28], &v_bit, &rnding);
   u[29] = half_btf_0_neon_r(&cospi[18], &v[18], &v_bit, &rnding);
   u[30] = half_btf_0_neon_r(&cospi[30], &v[30], &v_bit, &rnding);
   u[31] = half_btf_0_neon_r(&cospi[2], &v[16], &v_bit, &rnding);

   for (i = 32; i < 64; i += 4) {
     addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
                 &clamp_hi);
     addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
                 &clamp_hi);
   }

   // stage 4
   v[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
   v[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding);
   v[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding);
   v[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
   v[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);
   v[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding);
   v[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding);
   v[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);

   for (i = 16; i < 32; i += 4) {
     addsub_neon(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
                 &clamp_hi);
     addsub_neon(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
                 &clamp_hi);
   }

   // Pass-through entries of the 32-band; the middle two of each group of
   // four go through two-input rotation butterflies below.
   for (i = 32; i < 64; i += 4) {
     v[i + 0] = u[i + 0];
     v[i + 3] = u[i + 3];
   }

   v[33] = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62],
                                  &v_bit, &rnding);
   v[34] = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61],
                                  &v_bit, &rnding);
   v[37] = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58],
                                  &v_bit, &rnding);
   v[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
                                  &v_bit, &rnding);
   v[41] = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
                                  &v_bit, &rnding);
   v[42] = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53],
                                  &v_bit, &rnding);
   v[45] = half_btf_neon_mode10_r(&cospi[52], &u[45], &cospi[12], &u[50],
                                  &v_bit, &rnding);
   v[46] = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
                                  &v_bit, &rnding);
   v[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
                                  &v_bit, &rnding);
   v[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit,
                           &rnding);
   v[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53],
                                  &v_bit, &rnding);
   v[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
                           &rnding);
   v[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
                                  &v_bit, &rnding);
   v[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit,
                           &rnding);
   v[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61],
                                  &v_bit, &rnding);
   v[62] =
       half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);

   // stage 5
   u[4] = half_btf_0_neon_r(&cospi[56], &v[4], &v_bit, &rnding);
   u[5] = half_btf_0_m_neon_r(&cospi[40], &v[6], &v_bit, &rnding);
   u[6] = half_btf_0_neon_r(&cospi[24], &v[6], &v_bit, &rnding);
   u[7] = half_btf_0_neon_r(&cospi[8], &v[4], &v_bit, &rnding);

   for (i = 8; i < 16; i += 4) {
     addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
                 &clamp_hi);
     addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
                 &clamp_hi);
   }

   for (i = 16; i < 32; i += 4) {
     u[i + 0] = v[i + 0];
     u[i + 3] = v[i + 3];
   }

   u[17] = half_btf_neon_mode10_r(&cospi[8], &v[17], &cospi[56], &v[30],
                                  &v_bit, &rnding);
   u[18] = half_btf_neon_mode11_r(&cospi[56], &v[18], &cospi[8], &v[29],
                                  &v_bit, &rnding);
   u[21] = half_btf_neon_mode10_r(&cospi[40], &v[21], &cospi[24], &v[26],
                                  &v_bit, &rnding);
   u[22] = half_btf_neon_mode11_r(&cospi[24], &v[22], &cospi[40], &v[25],
                                  &v_bit, &rnding);
   u[25] = half_btf_neon_mode10_r(&cospi[40], &v[22], &cospi[24], &v[25],
                                  &v_bit, &rnding);
   u[26] = half_btf_neon_r(&cospi[24], &v[21], &cospi[40], &v[26], &v_bit,
                           &rnding);
   u[29] = half_btf_neon_mode10_r(&cospi[8], &v[18], &cospi[56], &v[29],
                                  &v_bit, &rnding);
   u[30] =
       half_btf_neon_r(&cospi[56], &v[17], &cospi[8], &v[30], &v_bit, &rnding);

   for (i = 32; i < 64; i += 8) {
     addsub_neon(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                 &clamp_hi);
     addsub_neon(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                 &clamp_hi);

     addsub_neon(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                 &clamp_hi);
     addsub_neon(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                 &clamp_hi);
   }

   // stage 6
   v[0] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
   v[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
   v[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding);
   v[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding);

   addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
   addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);

   for (i = 8; i < 16; i += 4) {
     v[i + 0] = u[i + 0];
     v[i + 3] = u[i + 3];
   }

   v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
                                 &rnding);
   v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13],
                                  &v_bit, &rnding);
   v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
                                  &v_bit, &rnding);
   v[14] =
       half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);

   for (i = 16; i < 32; i += 8) {
     addsub_neon(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
                 &clamp_hi);
     addsub_neon(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
                 &clamp_hi);

     addsub_neon(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
                 &clamp_hi);
     addsub_neon(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
                 &clamp_hi);
   }

   for (i = 32; i < 64; i += 8) {
     v[i + 0] = u[i + 0];
     v[i + 1] = u[i + 1];
     v[i + 6] = u[i + 6];
     v[i + 7] = u[i + 7];
   }

   v[34] = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61],
                                  &v_bit, &rnding);
   v[35] = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60],
                                  &v_bit, &rnding);
   v[36] = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59],
                                  &v_bit, &rnding);
   v[37] = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58],
                                  &v_bit, &rnding);
   v[42] = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
                                  &v_bit, &rnding);
   v[43] = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
                                  &v_bit, &rnding);
   v[44] = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
                                  &v_bit, &rnding);
   v[45] = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
                                  &v_bit, &rnding);
   v[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
                                  &v_bit, &rnding);
   v[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
                                  &v_bit, &rnding);
   v[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
                           &rnding);
   v[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
                           &rnding);
   v[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
                                  &v_bit, &rnding);
   v[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
                                  &v_bit, &rnding);
   v[60] =
       half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
   v[61] =
       half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);

   // stage 7
   addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
   addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);

   u[4] = v[4];
   u[7] = v[7];
   u[5] = half_btf_neon_mode10_r(&cospi[32], &v[5], &cospi[32], &v[6], &v_bit,
                                 &rnding);
   u[6] =
       half_btf_neon_r(&cospi[32], &v[5], &cospi[32], &v[6], &v_bit, &rnding);

   addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
   addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
   addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
   addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

   for (i = 16; i < 32; i += 8) {
     u[i + 0] = v[i + 0];
     u[i + 1] = v[i + 1];
     u[i + 6] = v[i + 6];
     u[i + 7] = v[i + 7];
   }

   u[18] = half_btf_neon_mode10_r(&cospi[16], &v[18], &cospi[48], &v[29],
                                  &v_bit, &rnding);
   u[19] = half_btf_neon_mode10_r(&cospi[16], &v[19], &cospi[48], &v[28],
                                  &v_bit, &rnding);
   u[20] = half_btf_neon_mode11_r(&cospi[48], &v[20], &cospi[16], &v[27],
                                  &v_bit, &rnding);
   u[21] = half_btf_neon_mode11_r(&cospi[48], &v[21], &cospi[16], &v[26],
                                  &v_bit, &rnding);
   u[26] = half_btf_neon_mode10_r(&cospi[16], &v[21], &cospi[48], &v[26],
                                  &v_bit, &rnding);
   u[27] = half_btf_neon_mode10_r(&cospi[16], &v[20], &cospi[48], &v[27],
                                  &v_bit, &rnding);
   u[28] = half_btf_neon_r(&cospi[48], &v[19], &cospi[16], &v[28], &v_bit,
                           &rnding);
   u[29] = half_btf_neon_r(&cospi[48], &v[18], &cospi[16], &v[29], &v_bit,
                           &rnding);

   // XOR indexing pairs j with its mirror inside each 16-wide group.
   for (i = 32; i < 64; i += 16) {
     for (j = i; j < i + 4; j++) {
       addsub_neon(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
       addsub_neon(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                   &clamp_hi);
     }
   }

   // stage 8
   for (i = 0; i < 4; ++i) {
     addsub_neon(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
   }

   v[8] = u[8];
   v[9] = u[9];
   v[14] = u[14];
   v[15] = u[15];

   v[10] = half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13],
                                  &v_bit, &rnding);
   v[11] = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12],
                                  &v_bit, &rnding);
   v[12] = half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], &v_bit,
                           &rnding);
   v[13] = half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], &v_bit,
                           &rnding);

   for (i = 16; i < 20; ++i) {
     addsub_neon(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
     addsub_neon(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
                 &clamp_hi);
   }

   for (i = 32; i < 36; ++i) {
     v[i] = u[i];
     v[i + 12] = u[i + 12];
     v[i + 16] = u[i + 16];
     v[i + 28] = u[i + 28];
   }

   v[36] = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59],
                                  &v_bit, &rnding);
   v[37] = half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58],
                                  &v_bit, &rnding);
   v[38] = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57],
                                  &v_bit, &rnding);
   v[39] = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56],
                                  &v_bit, &rnding);
   v[40] = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55],
                                  &v_bit, &rnding);
   v[41] = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54],
                                  &v_bit, &rnding);
   v[42] = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53],
                                  &v_bit, &rnding);
   v[43] = half_btf_neon_mode11_r(&cospi[48], &u[43], &cospi[16], &u[52],
                                  &v_bit, &rnding);
   v[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52],
                                  &v_bit, &rnding);
   v[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53],
                                  &v_bit, &rnding);
   v[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54],
                                  &v_bit, &rnding);
   v[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55],
                                  &v_bit, &rnding);
   v[56] = half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], &v_bit,
                           &rnding);
   v[57] = half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], &v_bit,
                           &rnding);
   v[58] = half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], &v_bit,
                           &rnding);
   v[59] = half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], &v_bit,
                           &rnding);

   // stage 9
   for (i = 0; i < 8; ++i) {
     addsub_neon(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
   }

   for (i = 16; i < 20; ++i) {
     u[i] = v[i];
     u[i + 12] = v[i + 12];
   }

   u[20] = half_btf_neon_mode10_r(&cospi[32], &v[20], &cospi[32], &v[27],
                                  &v_bit, &rnding);
   u[21] = half_btf_neon_mode10_r(&cospi[32], &v[21], &cospi[32], &v[26],
                                  &v_bit, &rnding);
   u[22] = half_btf_neon_mode10_r(&cospi[32], &v[22], &cospi[32], &v[25],
                                  &v_bit, &rnding);
   u[23] = half_btf_neon_mode10_r(&cospi[32], &v[23], &cospi[32], &v[24],
                                  &v_bit, &rnding);
   u[24] = half_btf_neon_r(&cospi[32], &v[23], &cospi[32], &v[24], &v_bit,
                           &rnding);
   u[25] = half_btf_neon_r(&cospi[32], &v[22], &cospi[32], &v[25], &v_bit,
                           &rnding);
   u[26] = half_btf_neon_r(&cospi[32], &v[21], &cospi[32], &v[26], &v_bit,
                           &rnding);
   u[27] = half_btf_neon_r(&cospi[32], &v[20], &cospi[32], &v[27], &v_bit,
                           &rnding);

   for (i = 32; i < 40; i++) {
     addsub_neon(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
   }

   for (i = 48; i < 56; i++) {
     addsub_neon(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
   }

   // stage 10
   for (i = 0; i < 16; i++) {
     addsub_neon(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
   }

   for (i = 32; i < 40; i++) v[i] = u[i];

   v[40] = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55],
                                  &v_bit, &rnding);
   v[41] = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54],
                                  &v_bit, &rnding);
   v[42] = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53],
                                  &v_bit, &rnding);
   v[43] = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52],
                                  &v_bit, &rnding);
   v[44] = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51],
                                  &v_bit, &rnding);
   v[45] = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50],
                                  &v_bit, &rnding);
   v[46] = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49],
                                  &v_bit, &rnding);
   v[47] = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48],
                                  &v_bit, &rnding);
   v[48] = half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], &v_bit,
                           &rnding);
   v[49] = half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], &v_bit,
                           &rnding);
   v[50] = half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], &v_bit,
                           &rnding);
   v[51] = half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], &v_bit,
                           &rnding);
   v[52] = half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], &v_bit,
                           &rnding);
   v[53] = half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], &v_bit,
                           &rnding);
   v[54] = half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], &v_bit,
                           &rnding);
   v[55] = half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], &v_bit,
                           &rnding);

   for (i = 56; i < 64; i++) v[i] = u[i];

   // stage 11
   // Final mirror add/sub produces the 64 outputs.
   for (i = 0; i < 32; i++) {
     addsub_neon(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
                 &clamp_hi);
   }

   // Row pass only: round-shift by out_shift and clamp to the bd+6-bit
   // intermediate range expected by the subsequent column pass.
   if (!do_cols) {
     const int log_range_out = AOMMAX(16, bd + 6);
     const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
     const int32x4_t clamp_hi_out =
         vdupq_n_s32((1 << (log_range_out - 1)) - 1);
     for (i = 0; i < 64; i += 4) {
       round_shift_4x4(out + i, out_shift);
       highbd_clamp_s32_neon(out + i, out + i, &clamp_lo_out, &clamp_hi_out,
                             4);
     }
   }
 }
}
   4154 
   4155 static void idct32x32_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
   4156                                int do_cols, int bd, int out_shift) {
   4157  const int32_t *cospi = cospi_arr(bit);
   4158  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
   4159  int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
   4160  int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
   4161  int32x4_t bf1;
   4162  const int32x4_t v_bit = vdupq_n_s32(-bit);
   4163  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
   4164  // stage 0-1
   4165  bf1 = in[0];
   4166 
   4167  // stage 2-5
   4168  bf1 = half_btf_0_neon_r(&cospi[32], &bf1, &v_bit, &rnding);
   4169 
   4170  // stage 6-9
   4171  if (do_cols) {
   4172    bf1 = vmaxq_s32(bf1, clamp_lo);
   4173    bf1 = vminq_s32(bf1, clamp_hi);
   4174  } else {
   4175    const int log_range_out = AOMMAX(16, bd + 6);
   4176    clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
   4177    clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
   4178    if (out_shift != 0) {
   4179      bf1 = vrshlq_s32(bf1, vdupq_n_s32(-out_shift));
   4180    }
   4181  }
   4182 
   4183  bf1 = vmaxq_s32(bf1, clamp_lo);
   4184  bf1 = vminq_s32(bf1, clamp_hi);
   4185 
   4186  for (int i = 0; i < 32; i++) out[i] = bf1;
   4187 }
   4188 
// 32-point inverse DCT specialized for the case where only the 8 lowest
// frequency coefficients (the top-left 8x8 region after the transform's
// coefficient scan) are nonzero: stage 1 loads just 8 inputs and later
// stages exploit the implied zeros (single-input butterflies, copies
// instead of add/sub pairs). Stages 4-9 reuse the shared idct32_stage*_neon
// helpers.
//
// in        - input coefficient vectors (only in[0..7] are read).
// out       - 32 output vectors.
// bit       - rounding-shift amount for the butterfly multiplies.
// do_cols   - nonzero for the column pass (affects clamp range and whether
//             the stage-9 helper applies out_shift).
// bd        - bit depth, used to derive the clamp range.
// out_shift - final right-shift, applied by idct32_stage9_neon.
static void idct32x32_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
                               int do_cols, int bd, int out_shift) {
 const int32_t *cospi = cospi_arr(bit);
 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
 int32x4_t bf1[32];
 const int32x4_t v_bit = vdupq_n_s32(-bit);
 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));

 // stage 0-1
 // Permute the 8 nonzero coefficients into their butterfly slots; all
 // other bf1 entries are produced by later stages before being read.
 bf1[0] = in[0];
 bf1[4] = in[4];
 bf1[8] = in[2];
 bf1[12] = in[6];
 bf1[16] = in[1];
 bf1[20] = in[5];
 bf1[24] = in[3];
 bf1[28] = in[7];

 // stage 2
 // Single-input butterflies (partner inputs are zero). Each source slot is
 // read for the high output first, then overwritten in place.
 bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding);
 bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding);
 bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding);
 bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding);
 bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding);
 bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding);
 bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding);
 bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding);

 // stage 3
 bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding);
 bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding);

 bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding);
 bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding);
 // Add/sub pairs with a zero partner degenerate to plain copies.
 bf1[17] = bf1[16];
 bf1[18] = bf1[19];
 bf1[21] = bf1[20];
 bf1[22] = bf1[23];
 bf1[25] = bf1[24];
 bf1[26] = bf1[27];
 bf1[29] = bf1[28];
 bf1[30] = bf1[31];

 // stage 4 :
 bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding);
 bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding);

 bf1[9] = bf1[8];
 bf1[10] = bf1[11];
 bf1[13] = bf1[12];
 bf1[14] = bf1[15];

 idct32_stage4_neon(bf1, cospi, &v_bit, &rnding);

 // stage 5
 bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding);
 bf1[1] = bf1[0];
 bf1[5] = bf1[4];
 bf1[6] = bf1[7];

 idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);

 // stage 6
 bf1[3] = bf1[0];
 bf1[2] = bf1[1];

 idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);

 // stage 7
 idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);

 // stage 8
 idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);

 // stage 9
 // Writes the 32 outputs and, in the row pass, applies out_shift/clamping.
 idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
}
   4268 
// 1-D 32-point inverse DCT over 4-lane int32 vectors, specialized for inputs
// whose coefficients beyond the first 16 are zero.  The zero tail lets
// stage 1 load only 16 inputs and lets stages 2-5 use the half_btf_0 forms,
// which multiply a single input instead of a pair.
//
// in        : input coefficient vectors (only the 16 slots mapped below are
//             read)
// out       : 32 result vectors
// bit       : cos-bit precision of the cospi table; every butterfly result is
//             rounded and right-shifted by this amount (v_bit/rnding below)
// do_cols   : non-zero for the column (second) pass; selects the intermediate
//             clamp range (bd + 6 for columns, bd + 8 for rows)
// bd        : pixel bit depth
// out_shift : extra down-shift applied in stage 9 when do_cols == 0
static void idct32x32_low16_neon(int32x4_t *in, int32x4_t *out, int bit,
                                 int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  // Clamp bounds for intermediate values: at least 16 bits of range.
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
  int32x4_t bf1[32];
  const int32x4_t v_bit = vdupq_n_s32(-bit);
  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));

  // stage 0-1
  // Permute the 16 potentially non-zero coefficients into butterfly order;
  // the odd bf1 slots stay unwritten here because their stage-2/3/4 values
  // are produced directly from the even slots below.
  bf1[0] = in[0];
  bf1[2] = in[8];
  bf1[4] = in[4];
  bf1[6] = in[12];
  bf1[8] = in[2];
  bf1[10] = in[10];
  bf1[12] = in[6];
  bf1[14] = in[14];
  bf1[16] = in[1];
  bf1[18] = in[9];
  bf1[20] = in[5];
  bf1[22] = in[13];
  bf1[24] = in[3];
  bf1[26] = in[11];
  bf1[28] = in[7];
  bf1[30] = in[15];

  // stage 2
  // Single-input butterflies for the odd half (slots 16..31).  Each pair
  // (16,31), (17,30), ... is generated from one source vector; note bf1[31]
  // must be computed before bf1[16] overwrites the shared source (same
  // pattern for every pair).
  bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding);
  bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding);
  bf1[17] = half_btf_0_m_neon_r(&cospi[34], &bf1[30], &v_bit, &rnding);
  bf1[30] = half_btf_0_neon_r(&cospi[30], &bf1[30], &v_bit, &rnding);
  bf1[29] = half_btf_0_neon_r(&cospi[18], &bf1[18], &v_bit, &rnding);
  bf1[18] = half_btf_0_neon_r(&cospi[46], &bf1[18], &v_bit, &rnding);
  bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding);
  bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding);
  bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding);
  bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding);
  bf1[21] = half_btf_0_m_neon_r(&cospi[42], &bf1[26], &v_bit, &rnding);
  bf1[26] = half_btf_0_neon_r(&cospi[22], &bf1[26], &v_bit, &rnding);
  bf1[25] = half_btf_0_neon_r(&cospi[26], &bf1[22], &v_bit, &rnding);
  bf1[22] = half_btf_0_neon_r(&cospi[38], &bf1[22], &v_bit, &rnding);
  bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding);
  bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding);

  // stage 3
  // Same single-input pattern for slots 8..15.
  bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding);
  bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding);
  bf1[9] = half_btf_0_m_neon_r(&cospi[36], &bf1[14], &v_bit, &rnding);
  bf1[14] = half_btf_0_neon_r(&cospi[28], &bf1[14], &v_bit, &rnding);
  bf1[13] = half_btf_0_neon_r(&cospi[20], &bf1[10], &v_bit, &rnding);
  bf1[10] = half_btf_0_neon_r(&cospi[44], &bf1[10], &v_bit, &rnding);
  bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding);
  bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding);

  // Clamped add/sub butterflies on the stage-2 results.
  addsub_neon(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
  // stage 4
  bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding);
  bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding);
  bf1[5] = half_btf_0_m_neon_r(&cospi[40], &bf1[6], &v_bit, &rnding);
  bf1[6] = half_btf_0_neon_r(&cospi[24], &bf1[6], &v_bit, &rnding);

  addsub_neon(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);

  // Remaining stage-4 work on the odd half is shared with the other idct32
  // variants.
  idct32_stage4_neon(bf1, cospi, &v_bit, &rnding);

  // stage 5
  // With in[16] == 0, bf1[1] collapses to bf1[0] * cospi[32].
  bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding);
  bf1[1] = bf1[0];
  bf1[3] = half_btf_0_neon_r(&cospi[16], &bf1[2], &v_bit, &rnding);
  bf1[2] = half_btf_0_neon_r(&cospi[48], &bf1[2], &v_bit, &rnding);

  addsub_neon(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);

  idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);

  // stage 6
  addsub_neon(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);

  idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);

  // stage 7
  idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);

  // stage 8
  idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
  // stage 9: final add/sub mirror plus optional out_shift/clamp for rows.
  idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
}
   4372 
// General 1-D 32-point inverse DCT over 4-lane int32 vectors; all 32 input
// coefficients may be non-zero.  bf1/bf0 ping-pong as source/destination
// buffers across the nine butterfly stages.
//
// in        : 32 input coefficient vectors
// out       : 32 result vectors
// bit       : cos-bit precision of the cospi table; per-butterfly rounding
//             shift (v_bit/rnding below)
// do_cols   : non-zero for the column (second) pass; selects the intermediate
//             clamp range (bd + 6 for columns, bd + 8 for rows)
// bd        : pixel bit depth
// out_shift : down-shift applied to the final output when do_cols == 0
static void idct32x32_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
                           int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  // Clamp bounds for intermediate values: at least 16 bits of range.
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
  int32x4_t bf1[32], bf0[32];
  const int32x4_t v_bit = vdupq_n_s32(-bit);
  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
  // stage 0
  // stage 1
  // Permute the input coefficients into butterfly order.
  bf1[0] = in[0];
  bf1[1] = in[16];
  bf1[2] = in[8];
  bf1[3] = in[24];
  bf1[4] = in[4];
  bf1[5] = in[20];
  bf1[6] = in[12];
  bf1[7] = in[28];
  bf1[8] = in[2];
  bf1[9] = in[18];
  bf1[10] = in[10];
  bf1[11] = in[26];
  bf1[12] = in[6];
  bf1[13] = in[22];
  bf1[14] = in[14];
  bf1[15] = in[30];
  bf1[16] = in[1];
  bf1[17] = in[17];
  bf1[18] = in[9];
  bf1[19] = in[25];
  bf1[20] = in[5];
  bf1[21] = in[21];
  bf1[22] = in[13];
  bf1[23] = in[29];
  bf1[24] = in[3];
  bf1[25] = in[19];
  bf1[26] = in[11];
  bf1[27] = in[27];
  bf1[28] = in[7];
  bf1[29] = in[23];
  bf1[30] = in[15];
  bf1[31] = in[31];

  // stage 2
  // Two-input rotation butterflies on the odd half; pairs are
  // (16,31), (17,30), ..., (23,24).  The even half passes through.
  for (int i = 0; i < 16; i++) bf0[i] = bf1[i];

  bf0[16] = half_btf_neon_mode01_r(&cospi[62], &bf1[16], &cospi[2], &bf1[31],
                                   &v_bit, &rnding);
  bf0[17] = half_btf_neon_mode01_r(&cospi[30], &bf1[17], &cospi[34], &bf1[30],
                                   &v_bit, &rnding);
  bf0[18] = half_btf_neon_mode01_r(&cospi[46], &bf1[18], &cospi[18], &bf1[29],
                                   &v_bit, &rnding);
  bf0[19] = half_btf_neon_mode01_r(&cospi[14], &bf1[19], &cospi[50], &bf1[28],
                                   &v_bit, &rnding);
  bf0[20] = half_btf_neon_mode01_r(&cospi[54], &bf1[20], &cospi[10], &bf1[27],
                                   &v_bit, &rnding);
  bf0[21] = half_btf_neon_mode01_r(&cospi[22], &bf1[21], &cospi[42], &bf1[26],
                                   &v_bit, &rnding);
  bf0[22] = half_btf_neon_mode01_r(&cospi[38], &bf1[22], &cospi[26], &bf1[25],
                                   &v_bit, &rnding);
  bf0[23] = half_btf_neon_mode01_r(&cospi[6], &bf1[23], &cospi[58], &bf1[24],
                                   &v_bit, &rnding);
  bf0[24] = half_btf_neon_r(&cospi[58], &bf1[23], &cospi[6], &bf1[24], &v_bit,
                            &rnding);
  bf0[25] = half_btf_neon_r(&cospi[26], &bf1[22], &cospi[38], &bf1[25], &v_bit,
                            &rnding);
  bf0[26] = half_btf_neon_r(&cospi[42], &bf1[21], &cospi[22], &bf1[26], &v_bit,
                            &rnding);
  bf0[27] = half_btf_neon_r(&cospi[10], &bf1[20], &cospi[54], &bf1[27], &v_bit,
                            &rnding);
  bf0[28] = half_btf_neon_r(&cospi[50], &bf1[19], &cospi[14], &bf1[28], &v_bit,
                            &rnding);
  bf0[29] = half_btf_neon_r(&cospi[18], &bf1[18], &cospi[46], &bf1[29], &v_bit,
                            &rnding);
  bf0[30] = half_btf_neon_r(&cospi[34], &bf1[17], &cospi[30], &bf1[30], &v_bit,
                            &rnding);
  bf0[31] = half_btf_neon_r(&cospi[2], &bf1[16], &cospi[62], &bf1[31], &v_bit,
                            &rnding);

  // stage 3
  // Rotations for slots 8..15 plus clamped add/sub on 16..31.
  for (int i = 0; i < 8; i++) bf1[i] = bf0[i];

  bf1[8] = half_btf_neon_mode01_r(&cospi[60], &bf0[8], &cospi[4], &bf0[15],
                                  &v_bit, &rnding);
  bf1[9] = half_btf_neon_mode01_r(&cospi[28], &bf0[9], &cospi[36], &bf0[14],
                                  &v_bit, &rnding);
  bf1[10] = half_btf_neon_mode01_r(&cospi[44], &bf0[10], &cospi[20], &bf0[13],
                                   &v_bit, &rnding);
  bf1[11] = half_btf_neon_mode01_r(&cospi[12], &bf0[11], &cospi[52], &bf0[12],
                                   &v_bit, &rnding);
  bf1[12] = half_btf_neon_r(&cospi[52], &bf0[11], &cospi[12], &bf0[12], &v_bit,
                            &rnding);
  bf1[13] = half_btf_neon_r(&cospi[20], &bf0[10], &cospi[44], &bf0[13], &v_bit,
                            &rnding);
  bf1[14] = half_btf_neon_r(&cospi[36], &bf0[9], &cospi[28], &bf0[14], &v_bit,
                            &rnding);
  bf1[15] = half_btf_neon_r(&cospi[4], &bf0[8], &cospi[60], &bf0[15], &v_bit,
                            &rnding);

  addsub_neon(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);

  // stage 4
  bf0[0] = bf1[0];
  bf0[1] = bf1[1];
  bf0[2] = bf1[2];
  bf0[3] = bf1[3];
  bf0[4] = half_btf_neon_mode01_r(&cospi[56], &bf1[4], &cospi[8], &bf1[7],
                                  &v_bit, &rnding);
  bf0[5] = half_btf_neon_mode01_r(&cospi[24], &bf1[5], &cospi[40], &bf1[6],
                                  &v_bit, &rnding);
  bf0[6] = half_btf_neon_r(&cospi[40], &bf1[5], &cospi[24], &bf1[6], &v_bit,
                           &rnding);
  bf0[7] =
      half_btf_neon_r(&cospi[8], &bf1[4], &cospi[56], &bf1[7], &v_bit, &rnding);

  addsub_neon(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);

  bf0[16] = bf1[16];
  bf0[17] = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30],
                                   &v_bit, &rnding);
  bf0[18] = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], &bf1[29],
                                   &v_bit, &rnding);
  bf0[19] = bf1[19];
  bf0[20] = bf1[20];
  bf0[21] = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26],
                                   &v_bit, &rnding);
  bf0[22] = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25],
                                   &v_bit, &rnding);
  bf0[23] = bf1[23];
  bf0[24] = bf1[24];
  bf0[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25],
                                   &v_bit, &rnding);
  bf0[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], &v_bit,
                            &rnding);
  bf0[27] = bf1[27];
  bf0[28] = bf1[28];
  bf0[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29],
                                   &v_bit, &rnding);
  bf0[30] = half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], &v_bit,
                            &rnding);
  bf0[31] = bf1[31];

  // stage 5
  bf1[0] = half_btf_neon_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1], &v_bit,
                           &rnding);
  bf1[1] = half_btf_neon_mode01_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1],
                                  &v_bit, &rnding);
  bf1[2] = half_btf_neon_mode01_r(&cospi[48], &bf0[2], &cospi[16], &bf0[3],
                                  &v_bit, &rnding);
  bf1[3] = half_btf_neon_r(&cospi[16], &bf0[2], &cospi[48], &bf0[3], &v_bit,
                           &rnding);
  addsub_neon(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
  bf1[8] = bf0[8];
  bf1[9] = half_btf_neon_mode10_r(&cospi[16], &bf0[9], &cospi[48], &bf0[14],
                                  &v_bit, &rnding);
  bf1[10] = half_btf_neon_mode11_r(&cospi[48], &bf0[10], &cospi[16], &bf0[13],
                                   &v_bit, &rnding);
  bf1[11] = bf0[11];
  bf1[12] = bf0[12];
  bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf0[10], &cospi[48], &bf0[13],
                                   &v_bit, &rnding);
  bf1[14] = half_btf_neon_r(&cospi[48], &bf0[9], &cospi[16], &bf0[14], &v_bit,
                            &rnding);
  bf1[15] = bf0[15];
  addsub_neon(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);

  // stage 6
  addsub_neon(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
  bf0[4] = bf1[4];
  bf0[5] = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6],
                                  &v_bit, &rnding);
  bf0[6] = half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], &v_bit,
                           &rnding);
  bf0[7] = bf1[7];
  addsub_neon(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
  bf0[16] = bf1[16];
  bf0[17] = bf1[17];
  bf0[18] = half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29],
                                   &v_bit, &rnding);
  bf0[19] = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28],
                                   &v_bit, &rnding);
  bf0[20] = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27],
                                   &v_bit, &rnding);
  bf0[21] = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26],
                                   &v_bit, &rnding);
  bf0[22] = bf1[22];
  bf0[23] = bf1[23];
  bf0[24] = bf1[24];
  bf0[25] = bf1[25];
  bf0[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26],
                                   &v_bit, &rnding);
  bf0[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27],
                                   &v_bit, &rnding);
  bf0[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], &v_bit,
                            &rnding);
  bf0[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], &v_bit,
                            &rnding);
  bf0[30] = bf1[30];
  bf0[31] = bf1[31];

  // stage 7
  addsub_neon(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = half_btf_neon_mode10_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13],
                                   &v_bit, &rnding);
  bf1[11] = half_btf_neon_mode10_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12],
                                   &v_bit, &rnding);
  bf1[12] = half_btf_neon_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12], &v_bit,
                            &rnding);
  bf1[13] = half_btf_neon_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13], &v_bit,
                            &rnding);
  bf1[14] = bf0[14];
  bf1[15] = bf0[15];
  addsub_neon(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);

  // stage 8
  addsub_neon(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
  addsub_neon(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
  bf0[16] = bf1[16];
  bf0[17] = bf1[17];
  bf0[18] = bf1[18];
  bf0[19] = bf1[19];
  bf0[20] = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27],
                                   &v_bit, &rnding);
  bf0[21] = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26],
                                   &v_bit, &rnding);
  bf0[22] = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25],
                                   &v_bit, &rnding);
  bf0[23] = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24],
                                   &v_bit, &rnding);
  bf0[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], &v_bit,
                            &rnding);
  bf0[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], &v_bit,
                            &rnding);
  bf0[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], &v_bit,
                            &rnding);
  bf0[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], &v_bit,
                            &rnding);
  bf0[28] = bf1[28];
  bf0[29] = bf1[29];
  bf0[30] = bf1[30];
  bf0[31] = bf1[31];

  // stage 9
  // Final mirror add/sub producing the 32 outputs.
  addsub_neon(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
  addsub_neon(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);

  // Row pass only: apply the output shift and clamp to the row range.
  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
    round_shift_8x8(out, out_shift);
    round_shift_8x8(out + 16, out_shift);
    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);
  }
}
   4683 
   4684 static void iidentity32_neon(int32x4_t *in, int32x4_t *out, int bit,
   4685                             int do_cols, int bd, int out_shift) {
   4686  (void)bit;
   4687  for (int i = 0; i < 32; i += 16) {
   4688    out[i] = vshlq_n_s32(in[i], 2);
   4689    out[i + 1] = vshlq_n_s32(in[i + 1], 2);
   4690    out[i + 2] = vshlq_n_s32(in[i + 2], 2);
   4691    out[i + 3] = vshlq_n_s32(in[i + 3], 2);
   4692    out[i + 4] = vshlq_n_s32(in[i + 4], 2);
   4693    out[i + 5] = vshlq_n_s32(in[i + 5], 2);
   4694    out[i + 6] = vshlq_n_s32(in[i + 6], 2);
   4695    out[i + 7] = vshlq_n_s32(in[i + 7], 2);
   4696    out[i + 8] = vshlq_n_s32(in[i + 8], 2);
   4697    out[i + 9] = vshlq_n_s32(in[i + 9], 2);
   4698    out[i + 10] = vshlq_n_s32(in[i + 10], 2);
   4699    out[i + 11] = vshlq_n_s32(in[i + 11], 2);
   4700    out[i + 12] = vshlq_n_s32(in[i + 12], 2);
   4701    out[i + 13] = vshlq_n_s32(in[i + 13], 2);
   4702    out[i + 14] = vshlq_n_s32(in[i + 14], 2);
   4703    out[i + 15] = vshlq_n_s32(in[i + 15], 2);
   4704  }
   4705 
   4706  if (!do_cols) {
   4707    const int log_range_out = AOMMAX(16, bd + 6);
   4708    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
   4709    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
   4710    round_shift_8x8(out, out_shift);
   4711    round_shift_8x8(out + 16, out_shift);
   4712    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);
   4713  }
   4714 }
   4715 
// 1D itx types
// Reduced set of 1-D inverse-transform kernels.  FLIPADST uses the same
// kernel as ADST, so IFLIPADST_1D aliases IADST_1D; the flip itself is
// applied by the 2-D wrappers (see get_flip_cfg uses below).
typedef enum ATTRIBUTE_PACKED {
  IDCT_1D,
  IADST_1D,
  IFLIPADST_1D = IADST_1D,
  IIDENTITY_1D,
  ITX_TYPES_1D,  // number of distinct 1-D kernel kinds
} ITX_TYPE_1D;
   4724 
// Vertical (column) 1-D kernel for each TX_TYPE.  FLIPADST entries alias
// IADST_1D (see ITX_TYPE_1D); the vertical flip is handled by the callers.
static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
};
// Horizontal (row) 1-D kernel for each TX_TYPE.  FLIPADST entries alias
// IADST_1D (see ITX_TYPE_1D); the horizontal flip is handled by the callers.
static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
};
   4737 
// 1-D kernel dispatch table, indexed as [tx-size][1-D type][specialization].
// The last index selects a kernel specialized for partially zeroed inputs
// (compare idct32x32_low1/low8/low16 vs. the full idct32x32_neon); NULL
// entries mark combinations with no kernel (e.g. no 32- or 64-point ADST).
// Callers assert the selected entry is non-NULL before use.
static const transform_1d_neon
    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { idct4x4_neon, NULL, NULL, NULL },
          { iadst4x4_neon, NULL, NULL, NULL },
          { iidentity4_neon, iidentity4_neon, iidentity4_neon, NULL },
      },
      { { idct8x8_low1_neon, idct8x8_new_neon, NULL, NULL },
        { iadst8x8_low1_neon, iadst8x8_new_neon, NULL, NULL },
        { iidentity8_neon, iidentity8_neon, NULL, NULL } },
      {
          { idct16x16_low1_neon, idct16x16_low8_neon, idct16x16_neon, NULL },
          { iadst16x16_low1_neon, iadst16x16_low8_neon, iadst16x16_neon, NULL },
          { iidentity16_neon, NULL, iidentity16_neon, NULL },
      },
      { { idct32x32_low1_neon, idct32x32_low8_neon, idct32x32_low16_neon,
          idct32x32_neon },
        { NULL, NULL, NULL, NULL },
        { iidentity32_neon, NULL, NULL, NULL } },
      { { idct64x64_low1_neon, idct64x64_low8_neon, idct64x64_low16_neon,
          idct64x64_neon },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };
   4762 
// 2-D inverse transform + reconstruction for a high-bitdepth 4x8 block:
// 4-point row pass, 4x4 transposes, 8-point column pass, final round shift,
// then accumulate into the output frame.
//
// input   : dequantized coefficients
// output  : destination pixel buffer (uint16_t), row pitch `stride`
// tx_type : 2-D transform type; selects row/col kernels and flips
// bd      : pixel bit depth
void av1_inv_txfm2d_add_4x8_neon(const tran_low_t *input, uint16_t *output,
                                 int stride, TX_TYPE tx_type, const int bd) {
  TX_SIZE tx_size = TX_4X8;
  int32x4_t buf1[32] = { vdupq_n_s32(0) };

  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];  // 4
  const int txfm_size_row = tx_size_high[tx_size];  // 8
  // [0] selects the 4-point row kernel, [1] the 8-point column kernel.
  const transform_1d_neon row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_neon col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];
  const int input_stride = AOMMIN(32, txfm_size_row);

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform (the original comment said "column", but
  // row_txfm is applied here).  Rectangular 4x8 blocks get the extra
  // round_shift_rect pre-scale.  The 8 rows are processed as two groups of
  // four vectors.
  int32x4_t buf0[8];
  load_buffer_32bit_input(input, input_stride, buf0, txfm_size_col);
  load_buffer_32bit_input(input + 4, input_stride, buf0 + 4, txfm_size_col);
  round_shift_rect_array_32_neon(buf0, buf0, txfm_size_row);
  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]);

  // Transpose each 4x4 tile; when lr_flip, reverse the rows within a tile.
  if (lr_flip) {
    TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
                  buf1[3]);

    TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
                  buf1[7]);
  } else {
    TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
                  buf1[3]);

    TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
                  buf1[7]);
  }

  // 2nd stage: column transform
  col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);

  round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]);

  // write to buffer (vertical flip applied here when ud_flip)
  highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row,
                               bd);
}
   4815 
   4816 void av1_inv_txfm2d_add_8x4_neon(const int32_t *input, uint16_t *output,
   4817                                 int stride, TX_TYPE tx_type, const int bd) {
   4818  TX_SIZE tx_size = TX_8X4;
   4819  int32x4_t buf1[8];
   4820  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   4821  const int txw_idx = get_txw_idx(tx_size);
   4822  const int txh_idx = get_txh_idx(tx_size);
   4823  const int txfm_size_col = tx_size_wide[tx_size];
   4824  const int txfm_size_row = tx_size_high[tx_size];
   4825  const transform_1d_neon row_txfm =
   4826      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];
   4827  const transform_1d_neon col_txfm =
   4828      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
   4829 
   4830  assert(col_txfm != NULL);
   4831  assert(row_txfm != NULL);
   4832  int ud_flip, lr_flip;
   4833  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   4834 
   4835  // 1st stage: column transform
   4836  int32x4_t buf0[8];
   4837  const int32_t *input_row = input;
   4838  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
   4839 
   4840  round_shift_rect_array_32_neon(buf0, buf0, txfm_size_col);
   4841  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
   4842 
   4843  int32x4_t *buf1_ptr;
   4844  if (lr_flip) {
   4845    flip_buf_neon(buf0, buf1, txfm_size_col);
   4846    buf1_ptr = buf1;
   4847  } else {
   4848    buf1_ptr = buf0;
   4849  }
   4850 
   4851  // 2nd stage: column transform
   4852  for (int i = 0; i < 2; i++) {
   4853    int32x4_t *buf1_cur = buf1_ptr + i * txfm_size_row;
   4854    transpose_4x4(buf1_cur, buf1_cur);
   4855    col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
   4856  }
   4857  round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
   4858  // write to buffer
   4859  highbd_write_buffer_8xn_neon(buf1_ptr, output, stride, ud_flip, txfm_size_row,
   4860                               bd);
   4861 }
   4862 
   4863 void av1_inv_txfm2d_add_4x16_neon(const int32_t *input, uint16_t *output,
   4864                                  int stride, TX_TYPE tx_type, const int bd) {
   4865  TX_SIZE tx_size = TX_4X16;
   4866  int32x4_t buf1[16];
   4867  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   4868  const int txw_idx = get_txw_idx(tx_size);
   4869  const int txh_idx = get_txh_idx(tx_size);
   4870  const int txfm_size_col = tx_size_wide[tx_size];
   4871  const int txfm_size_row = tx_size_high[tx_size];
   4872  const int buf_size_h_div8 = txfm_size_row >> 2;
   4873  const transform_1d_neon row_txfm =
   4874      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
   4875  const transform_1d_neon col_txfm =
   4876      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
   4877  const int input_stride = AOMMIN(32, txfm_size_row);
   4878 
   4879  assert(col_txfm != NULL);
   4880  assert(row_txfm != NULL);
   4881  int ud_flip, lr_flip;
   4882  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   4883 
   4884  // 1st stage: column transform
   4885  int32x4_t buf0[16];
   4886  for (int i = 0; i < (txfm_size_row >> 2); i++) {
   4887    const int32_t *input_row = input + i * 4;
   4888    int32x4_t *buf0_cur = buf0 + i * 4;
   4889    load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_col);
   4890    row_txfm(buf0 + (i << 2), buf0 + (i << 2), INV_COS_BIT, 0, bd, -shift[0]);
   4891  }
   4892 
   4893  if (lr_flip) {
   4894    for (int j = 0; j < buf_size_h_div8; ++j) {
   4895      TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
   4896                    buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
   4897                    buf1[4 * j + 3]);
   4898    }
   4899  } else {
   4900    for (int j = 0; j < buf_size_h_div8; ++j) {
   4901      TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
   4902                    buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
   4903                    buf1[4 * j + 2], buf1[4 * j + 3]);
   4904    }
   4905  }
   4906 
   4907  // 2nd stage: column transform
   4908  col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
   4909 
   4910  round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]);
   4911 
   4912  // write to buffer
   4913  highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row,
   4914                               bd);
   4915 }
   4916 
// 16x4 high-bitdepth inverse transform + add.
// Row pass processes all 16 columns at once (fun_idx 2 = full width), then
// each 4-wide group is transposed in place and column-transformed, and the
// result is added to `output` in 8-wide stripes.
void av1_inv_txfm2d_add_16x4_neon(const int32_t *input, uint16_t *output,
                                 int stride, TX_TYPE tx_type, const int bd) {
 TX_SIZE tx_size = TX_16X4;
 int32x4_t buf1[16];
 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 const int txw_idx = get_txw_idx(tx_size);
 const int txh_idx = get_txh_idx(tx_size);
 const int txfm_size_col = tx_size_wide[tx_size];  // 16
 const int txfm_size_row = tx_size_high[tx_size];  // 4
 // Number of 4-column groups (name kept from the template; divides by 4).
 const int buf_size_w_div8 = txfm_size_col >> 2;
 const transform_1d_neon row_txfm =
     highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
 const transform_1d_neon col_txfm =
     highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];

 assert(col_txfm != NULL);
 assert(row_txfm != NULL);
 int ud_flip, lr_flip;
 get_flip_cfg(tx_type, &ud_flip, &lr_flip);

 // 1st stage: row transform (the whole 16x4 block fits in one pass).
 int32x4_t buf0[16];
 const int32_t *input_row = input;
 load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);

 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

 int32x4_t *buf1_ptr;
 if (lr_flip) {
   // Horizontal flip types reverse the column order before the column pass.
   flip_buf_neon(buf0, buf1, txfm_size_col);
   buf1_ptr = buf1;
 } else {
   buf1_ptr = buf0;
 }

 // 2nd stage: column transform, one 4x4 tile at a time.
 for (int i = 0; i < buf_size_w_div8; i++) {
   int32x4_t *buf1_cur = buf1_ptr + i * txfm_size_row;
   transpose_4x4(buf1_cur, buf1_cur);
   col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
 }
 round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);

 // Reconstruct: add the residual to the destination, 8 columns per stripe.
 for (int i = 0; i < (txfm_size_col >> 3); i++) {
   highbd_write_buffer_8xn_neon(buf1_ptr + i * txfm_size_row * 2,
                                output + 8 * i, stride, ud_flip, txfm_size_row,
                                bd);
 }
}
   4967 
// Maps a last-nonzero coordinate (eobx or eoby, 0..31) to the index of the
// 1-D transform variant in highbd_txfm_all_1d_zeros_w8_arr: 0 -> only the
// first coefficient may be nonzero, 1 -> first 8, 2 -> first 16, 3 -> all.
static const int lowbd_txfm_all_1d_zeros_idx[32] = {
 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
};
   4972 
// Transform block width in log2 for eob (size of 64 map to 32).
// Used to turn a raster eob into the row index of the last nonzero
// coefficient: eob_row = (eob - 1) >> tx_size_wide_log2_eob[tx_size].
static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
};
   4977 
// Per-tx-size lookup tables: index is the row of the last nonzero
// coefficient (eob_row), value packs the worst-case last-nonzero extent
// as (eoby << 8) | eobx — decoded in highbd_get_eobx_eoby_scan_default.
// E.g. 0x0f07 means eobx = 7, eoby = 15.
DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
};

DECLARE_ALIGNED(16, static const int16_t,
               av1_eob_to_eobxy_16x16_default[16]) = {
 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
};

DECLARE_ALIGNED(16, static const int16_t,
               av1_eob_to_eobxy_32x32_default[32]) = {
 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
};

DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
};

DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
};

DECLARE_ALIGNED(16, static const int16_t,
               av1_eob_to_eobxy_16x32_default[32]) = {
 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
};

DECLARE_ALIGNED(16, static const int16_t,
               av1_eob_to_eobxy_32x16_default[16]) = {
 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
};

DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
};

DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
};

// Dispatch table indexed by TX_SIZE. NULL entries are sizes that never
// reach the table lookup (4x4 and the 4-wide/4-high rectangles handle
// eob == 1 separately or use dedicated paths); 64-dimension sizes reuse
// the 32 tables since coefficients beyond 32 are always zero.
DECLARE_ALIGNED(16, static const int16_t *,
               av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
 NULL,
 av1_eob_to_eobxy_8x8_default,
 av1_eob_to_eobxy_16x16_default,
 av1_eob_to_eobxy_32x32_default,
 av1_eob_to_eobxy_32x32_default,
 NULL,
 NULL,
 av1_eob_to_eobxy_8x16_default,
 av1_eob_to_eobxy_16x8_default,
 av1_eob_to_eobxy_16x32_default,
 av1_eob_to_eobxy_32x16_default,
 av1_eob_to_eobxy_32x32_default,
 av1_eob_to_eobxy_32x32_default,
 NULL,
 NULL,
 av1_eob_to_eobxy_8x32_default,
 av1_eob_to_eobxy_32x8_default,
 av1_eob_to_eobxy_16x32_default,
 av1_eob_to_eobxy_32x16_default,
};
   5052 
   5053 static inline void highbd_get_eobx_eoby_scan_default(int *eobx, int *eoby,
   5054                                                     TX_SIZE tx_size, int eob) {
   5055  if (eob == 1) {
   5056    *eobx = 0;
   5057    *eoby = 0;
   5058    return;
   5059  }
   5060 
   5061  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
   5062  const int eob_row = (eob - 1) >> tx_w_log2;
   5063  const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
   5064  *eobx = eobxy & 0xFF;
   5065  *eoby = eobxy >> 8;
   5066 }
   5067 
   5068 static inline void get_eobx_eoby_scan_default(int *eobx, int *eoby,
   5069                                              TX_SIZE tx_size) {
   5070  if (tx_size == 2) {
   5071    *eoby = 15, *eobx = 15;
   5072  } else if (tx_size == 3) {
   5073    *eoby = 31, *eobx = 31;
   5074  } else if (tx_size == 4) {
   5075    *eoby = 31, *eobx = 31;
   5076  } else if (tx_size == 7) {
   5077    *eoby = 15, *eobx = 7;
   5078  } else if (tx_size == 8) {
   5079    *eoby = 7, *eobx = 15;
   5080  } else if (tx_size == 9) {
   5081    *eoby = 31, *eobx = 15;
   5082  } else if (tx_size == 10) {
   5083    *eoby = 15, *eobx = 31;
   5084  } else if (tx_size == 11) {
   5085    *eoby = 31, *eobx = 31;
   5086  } else if (tx_size == 12) {
   5087    *eoby = 31, *eobx = 31;
   5088  } else if (tx_size == 15) {
   5089    *eoby = 31, *eobx = 7;
   5090  } else if (tx_size == 16) {
   5091    *eoby = 7, *eobx = 31;
   5092  } else if (tx_size == 17) {
   5093    *eoby = 31, *eobx = 15;
   5094  } else if (tx_size == 18) {
   5095    *eoby = 15, *eobx = 31;
   5096  } else {
   5097    *eoby = 0, *eobx = 0;
   5098  }
   5099 }
   5100 
   5101 static inline void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
   5102                                                 TX_SIZE tx_size) {
   5103  const int txfm_size_row = tx_size_high[tx_size];
   5104  *eoby = AOMMIN(32, txfm_size_row) - 1;
   5105  *eobx = 0;
   5106 }
   5107 
   5108 static inline void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
   5109                                                 TX_SIZE tx_size) {
   5110  const int txfm_size_col = tx_size_wide[tx_size];
   5111  *eobx = AOMMIN(32, txfm_size_col) - 1;
   5112  *eoby = 0;
   5113 }
   5114 
// Inverse transform + add for V_DCT/V_ADST/V_FLIPADST: the horizontal
// (row) pass is an identity scale, the vertical (column) pass is a real
// transform. Coefficients are read column-major (stride = clamped height).
static void inv_txfm2d_add_h_identity_neon(const int32_t *input,
                                          uint16_t *output, int stride,
                                          TX_TYPE tx_type, TX_SIZE tx_size,
                                          const int bd) {
 int32x4_t buf1[64];
 int eobx, eoby;
 // Identity row pass => nonzero region is bounded by the column extent.
 get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size);
 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 const int txw_idx = get_txw_idx(tx_size);
 const int txh_idx = get_txh_idx(tx_size);
 const int txfm_size_col = tx_size_wide[tx_size];
 const int txfm_size_row = tx_size_high[tx_size];
 const int buf_size_w = AOMMIN(32, txfm_size_col);
 const int buf_size_w_div4 = buf_size_w >> 2;
 // Number of 8-row strips that contain nonzero coefficients.
 const int buf_size_h_div8 = (eoby + 8) >> 3;
 const int row_max = AOMMIN(32, txfm_size_row);
 const int input_stride = row_max;
 // Rectangular blocks with a 2:1 aspect need an extra 1/sqrt(2) scale.
 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 // Pick the column-transform variant that skips all-zero coefficient tails.
 const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
 const transform_1d_neon row_txfm =
     highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
 assert(row_txfm != NULL);
 const transform_1d_neon col_txfm =
     highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
 assert(col_txfm != NULL);
 int ud_flip, lr_flip;
 get_flip_cfg(tx_type, &ud_flip, &lr_flip);

 // 1st stage: identity row pass over each 4-row slice, then transpose the
 // 4x4 tiles into column-major order in buf1 for the column pass.
 for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
   int32x4_t buf0[16];
   load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
   if (rect_type == 1 || rect_type == -1) {
     round_shift_rect_array_32_neon(buf0, buf0, buf_size_w);
   }
   row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

   int32x4_t *_buf1 = buf1 + i * 4;

   for (int j = 0; j < buf_size_w_div4; ++j) {
     int32x4_t *buf0_cur = buf0 + j * 4;
     TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
                   buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
     _buf1[j * txfm_size_row + 0] = buf0_cur[0];
     _buf1[j * txfm_size_row + 1] = buf0_cur[1];
     _buf1[j * txfm_size_row + 2] = buf0_cur[2];
     _buf1[j * txfm_size_row + 3] = buf0_cur[3];
   }
 }
 // 2nd stage: real column transform over each 4-column group.
 for (int i = 0; i < buf_size_w_div4; i++) {
   col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
            bd, 0);

   round_shift_array_32_neon(buf1 + i * txfm_size_row,
                             buf1 + i * txfm_size_row, txfm_size_row,
                             -shift[1]);
 }

 // Add the residual to the destination, 8 columns per stripe.
 for (int i = 0; i < (txfm_size_col >> 3); i++) {
   highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
                                stride, ud_flip, txfm_size_row, bd);
 }
}
   5178 
// Inverse transform + add for H_DCT/H_ADST/H_FLIPADST: the horizontal
// (row) pass is a real transform, the vertical (column) pass is an
// identity scale. Coefficients are read column-major (stride = clamped
// height).
static void inv_txfm2d_add_v_identity_neon(const int32_t *input,
                                          uint16_t *output, int stride,
                                          TX_TYPE tx_type, TX_SIZE tx_size,
                                          const int bd) {
 int32x4_t buf1[64];
 int eobx, eoby;
 // Identity column pass => nonzero region is bounded by the row extent.
 get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size);
 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 const int txw_idx = get_txw_idx(tx_size);
 const int txh_idx = get_txh_idx(tx_size);
 const int txfm_size_col = tx_size_wide[tx_size];
 const int txfm_size_row = tx_size_high[tx_size];
 const int buf_size_w_div4 = AOMMIN(32, txfm_size_col) >> 2;
 const int row_max = AOMMIN(32, txfm_size_row);
 const int input_stride = row_max;
 // Width of the region that can hold nonzero coefficients, rounded up to 8.
 const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
 const int buf_size_nonzero_w = buf_size_nonzero_w_div8 << 3;
 // Rectangular blocks with a 2:1 aspect need an extra 1/sqrt(2) scale.
 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 // Pick the row-transform variant that skips all-zero coefficient tails.
 const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
 const transform_1d_neon row_txfm =
     highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
 assert(row_txfm != NULL);
 const transform_1d_neon col_txfm =
     highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
 assert(col_txfm != NULL);
 int ud_flip, lr_flip;
 get_flip_cfg(tx_type, &ud_flip, &lr_flip);

 // 1st stage: real row transform over each 4-row slice; transpose 4x4
 // tiles into buf1, reversing group order when the type flips horizontally.
 for (int i = 0; i < (row_max >> 2); ++i) {
   int32x4_t buf0[16];
   load_buffer_32bit_input(input + i * 4, input_stride, buf0,
                           buf_size_nonzero_w);
   if (rect_type == 1 || rect_type == -1) {
     round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w);
   }
   row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

   int32x4_t *_buf1 = buf1 + i * 4;
   if (lr_flip) {
     // Reversed source-row order inside the tile plus reversed group
     // placement implements the left-right flip during the transpose.
     for (int j = 0; j < buf_size_w_div4; ++j) {
       TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                     buf0[4 * j],
                     _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
                     _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
                     _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
                     _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
     }
   } else {
     for (int j = 0; j < buf_size_w_div4; ++j) {
       TRANSPOSE_4X4(
           buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
           _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
           _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
     }
   }
 }
 // 2nd stage: identity column pass over each 4-column group.
 for (int i = 0; i < buf_size_w_div4; i++) {
   col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
            bd, 0);

   round_shift_array_32_neon(buf1 + i * txfm_size_row,
                             buf1 + i * txfm_size_row, txfm_size_row,
                             -shift[1]);
 }

 // Add the residual to the destination, 8 columns per stripe.
 {
   for (int i = 0; i < (txfm_size_col >> 3); i++) {
     highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
                                  stride, ud_flip, txfm_size_row, bd);
   }
 }
}
   5252 
// Inverse transform + add for IDTX: both passes are identity scales, so no
// eob-based pruning is applied beyond the 32x32 clamp, and no flipping is
// possible (write pass uses ud_flip = 0).
static void inv_txfm2d_add_idtx_neon(const int32_t *input, uint16_t *output,
                                    int stride, TX_TYPE tx_type,
                                    TX_SIZE tx_size, const int bd) {
 int32x4_t buf1[64 * 4];
 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 const int txw_idx = get_txw_idx(tx_size);
 const int txh_idx = get_txh_idx(tx_size);
 const int txfm_size_col = tx_size_wide[tx_size];
 const int txfm_size_row = tx_size_high[tx_size];
 // Coefficients beyond 32 in either dimension are always zero.
 const int row_max = AOMMIN(32, txfm_size_row);
 const int input_stride = row_max;
 const int buf_size_w = AOMMIN(32, txfm_size_col);
 const int buf_size_w_div4 = buf_size_w >> 2;
 // Rectangular blocks with a 2:1 aspect need an extra 1/sqrt(2) scale.
 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 const transform_1d_neon row_txfm =
     highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
 assert(row_txfm != NULL);
 const transform_1d_neon col_txfm =
     highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
 assert(col_txfm != NULL);
 // 1st stage: identity row pass over each 4-row slice; transpose the 4x4
 // tiles into column-major order in buf1.
 for (int i = 0; i < (row_max >> 2); ++i) {
   int32x4_t buf0[32];
   load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
   if (rect_type == 1 || rect_type == -1) {
     round_shift_rect_array_32_neon(buf0, buf0, buf_size_w);
   }
   row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

   int32x4_t *_buf1 = buf1 + i * 4;
   for (int j = 0; j < buf_size_w_div4; ++j) {
     int32x4_t *buf0_cur = buf0 + j * 4;
     TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
                   buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
     _buf1[j * txfm_size_row + 0] = buf0_cur[0];
     _buf1[j * txfm_size_row + 1] = buf0_cur[1];
     _buf1[j * txfm_size_row + 2] = buf0_cur[2];
     _buf1[j * txfm_size_row + 3] = buf0_cur[3];
   }
 }
 // 2nd stage: identity column pass over each 4-column group.
 for (int i = 0; i < buf_size_w_div4; i++) {
   col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
            bd, 0);

   round_shift_array_32_neon(buf1 + i * txfm_size_row,
                             buf1 + i * txfm_size_row, txfm_size_row,
                             -shift[1]);
 }

 // Add the residual to the destination, 8 columns per stripe (no flip).
 {
   for (int i = 0; i < (txfm_size_col >> 3); i++) {
     highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
                                  stride, 0, txfm_size_row, bd);
   }
 }
}
   5309 
// Inverse transform + add for all non-identity type combinations
// (DCT/ADST/FLIPADST in both directions), using worst-case eob bounds from
// get_eobx_eoby_scan_default. Coefficients are read column-major
// (stride = clamped height).
static void inv_txfm2d_add_no_identity_neon(const int32_t *input,
                                           uint16_t *output, int stride,
                                           TX_TYPE tx_type, TX_SIZE tx_size,
                                           const int bd) {
 int32x4_t buf1[64 * 16];
 int eobx, eoby;
 get_eobx_eoby_scan_default(&eobx, &eoby, tx_size);
 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 const int txw_idx = get_txw_idx(tx_size);
 const int txh_idx = get_txh_idx(tx_size);
 const int txfm_size_col = tx_size_wide[tx_size];
 const int txfm_size_row = tx_size_high[tx_size];
 const int buf_size_w_div4 = txfm_size_col >> 2;
 // Nonzero coefficient extents rounded up to a multiple of 8.
 const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
 const int input_stride = AOMMIN(32, txfm_size_row);
 // Rectangular blocks with a 2:1 aspect need an extra 1/sqrt(2) scale.
 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

 // Select the transform variants that skip all-zero coefficient tails.
 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
 const transform_1d_neon row_txfm =
     highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
 const transform_1d_neon col_txfm =
     highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

 assert(col_txfm != NULL);
 assert(row_txfm != NULL);
 int ud_flip, lr_flip;
 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 // 1st stage: row transform over each 4-row slice that holds nonzero
 // coefficients; transpose the 4x4 tiles into buf1 for the column pass.
 for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
   int32x4_t buf0[64];
   load_buffer_32bit_input(input + i * 4, input_stride, buf0,
                           buf_size_nonzero_w);
   if (rect_type == 1 || rect_type == -1) {
     round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w);
   }
   row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

   int32x4_t *_buf1 = &buf1[i * 4];

   if (lr_flip) {
     // Reversed source-row order inside the tile plus reversed group
     // placement implements the left-right flip during the transpose.
     for (int j = 0; j < buf_size_w_div4; ++j) {
       TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                     buf0[4 * j],
                     _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
                     _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
                     _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
                     _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
     }
   } else {
     for (int j = 0; j < buf_size_w_div4; ++j) {
       TRANSPOSE_4X4(
           buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
           _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
           _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
     }
   }
 }
 // 2nd stage: column transform over each 4-column group.
 for (int i = 0; i < buf_size_w_div4; i++) {
   col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
            bd, 0);

   round_shift_array_32_neon(buf1 + i * txfm_size_row,
                             buf1 + i * txfm_size_row, txfm_size_row,
                             -shift[1]);
 }

 // Add the residual to the destination, 8 columns per stripe.
 {
   for (int i = 0; i < (txfm_size_col >> 3); i++) {
     highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
                                  stride, ud_flip, txfm_size_row, bd);
   }
 }
}
   5387 
// Eob-aware variant of the non-identity path: uses the actual raster eob
// to bound the nonzero region. Unlike inv_txfm2d_add_no_identity_neon,
// coefficients here are laid out row-major (input_stride is the clamped
// width), so each 4x4 tile is transposed at load time before the row pass.
static void highbd_inv_txfm2d_add_no_identity_neon(const int32_t *input,
                                                  uint16_t *output, int stride,
                                                  TX_TYPE tx_type,
                                                  TX_SIZE tx_size, int eob,
                                                  const int bd) {
 int32x4_t buf1[64 * 16];
 int eobx, eoby;
 highbd_get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 const int txw_idx = get_txw_idx(tx_size);
 const int txh_idx = get_txh_idx(tx_size);
 const int txfm_size_col = tx_size_wide[tx_size];
 const int txfm_size_row = tx_size_high[tx_size];
 // Name kept from the template; these actually count groups of 4.
 const int buf_size_w_div8 = txfm_size_col >> 2;
 const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
 const int input_stride = AOMMIN(32, txfm_size_col);
 // Rectangular blocks with a 2:1 aspect need an extra 1/sqrt(2) scale.
 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

 // Select the transform variants that skip all-zero coefficient tails.
 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
 const transform_1d_neon row_txfm =
     highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
 const transform_1d_neon col_txfm =
     highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

 assert(col_txfm != NULL);
 assert(row_txfm != NULL);
 int ud_flip, lr_flip;
 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 // 1st stage: load and transpose each nonzero 4x4 tile (row-major source),
 // then run the row transform on the 4-row slice.
 for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
   int32x4_t buf0[64];
   const int32_t *input_row = input + i * input_stride * 4;
   for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
     int32x4_t *buf0_cur = &buf0[j * 4];
     load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);

     TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
                   buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
   }
   if (rect_type == 1 || rect_type == -1) {
     round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w_div8 << 3);
   }
   row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

   int32x4_t *_buf1 = &buf1[i * 4];

   if (lr_flip) {
     // Reversed source-row order inside the tile plus reversed group
     // placement implements the left-right flip during the transpose.
     for (int j = 0; j < buf_size_w_div8; ++j) {
       TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                     buf0[4 * j],
                     _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
                     _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
                     _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
                     _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
     }
   } else {
     for (int j = 0; j < buf_size_w_div8; ++j) {
       TRANSPOSE_4X4(
           buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
           _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
           _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
     }
   }
 }
 // 2nd stage: column transform over each 4-column group.
 for (int i = 0; i < buf_size_w_div8; i++) {
   col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
            bd, 0);

   round_shift_array_32_neon(buf1 + i * txfm_size_row,
                             buf1 + i * txfm_size_row, txfm_size_row,
                             -shift[1]);
 }

 // Add the residual to the destination, 8 columns per stripe.
 {
   for (int i = 0; i < (txfm_size_col >> 3); i++) {
     highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
                                  stride, ud_flip, txfm_size_row, bd);
   }
 }
}
   5472 
   5473 static void highbd_inv_txfm2d_add_universe_neon(const int32_t *input,
   5474                                                uint8_t *output, int stride,
   5475                                                TX_TYPE tx_type,
   5476                                                TX_SIZE tx_size, int eob,
   5477                                                const int bd) {
   5478  switch (tx_type) {
   5479    case DCT_DCT:
   5480    case ADST_DCT:
   5481    case DCT_ADST:
   5482    case ADST_ADST:
   5483    case FLIPADST_DCT:
   5484    case DCT_FLIPADST:
   5485    case FLIPADST_FLIPADST:
   5486    case ADST_FLIPADST:
   5487    case FLIPADST_ADST:
   5488      highbd_inv_txfm2d_add_no_identity_neon(input, CONVERT_TO_SHORTPTR(output),
   5489                                             stride, tx_type, tx_size, eob, bd);
   5490      break;
   5491    case V_DCT:
   5492    case V_ADST:
   5493    case V_FLIPADST:
   5494      inv_txfm2d_add_h_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
   5495                                     tx_type, tx_size, bd);
   5496      break;
   5497    case H_DCT:
   5498    case H_ADST:
   5499    case H_FLIPADST:
   5500      inv_txfm2d_add_v_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
   5501                                     tx_type, tx_size, bd);
   5502      break;
   5503    case IDTX:
   5504      inv_txfm2d_add_idtx_neon(input, CONVERT_TO_SHORTPTR(output), stride,
   5505                               tx_type, tx_size, bd);
   5506      break;
   5507    default: assert(0); break;
   5508  }
   5509 }
   5510 
   5511 static void inv_txfm2d_add_universe_neon(const int32_t *input, uint8_t *output,
   5512                                         int stride, TX_TYPE tx_type,
   5513                                         TX_SIZE tx_size, const int bd) {
   5514  switch (tx_type) {
   5515    case DCT_DCT:
   5516    case ADST_DCT:
   5517    case DCT_ADST:
   5518    case ADST_ADST:
   5519    case FLIPADST_DCT:
   5520    case DCT_FLIPADST:
   5521    case FLIPADST_FLIPADST:
   5522    case ADST_FLIPADST:
   5523    case FLIPADST_ADST:
   5524      inv_txfm2d_add_no_identity_neon(input, CONVERT_TO_SHORTPTR(output),
   5525                                      stride, tx_type, tx_size, bd);
   5526      break;
   5527    case V_DCT:
   5528    case V_ADST:
   5529    case V_FLIPADST:
   5530      inv_txfm2d_add_h_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
   5531                                     tx_type, tx_size, bd);
   5532      break;
   5533    case H_DCT:
   5534    case H_ADST:
   5535    case H_FLIPADST:
   5536      inv_txfm2d_add_v_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
   5537                                     tx_type, tx_size, bd);
   5538      break;
   5539    case IDTX:
   5540      inv_txfm2d_add_idtx_neon(input, CONVERT_TO_SHORTPTR(output), stride,
   5541                               tx_type, tx_size, bd);
   5542      break;
   5543    default: assert(0); break;
   5544  }
   5545 }
   5546 
   5547 static void highbd_inv_txfm_add_8x8_neon(const tran_low_t *input, uint8_t *dest,
   5548                                         int stride,
   5549                                         const TxfmParam *txfm_param) {
   5550  int bd = txfm_param->bd;
   5551  const TX_TYPE tx_type = txfm_param->tx_type;
   5552  const int32_t *src = cast_to_int32(input);
   5553  switch (tx_type) {
   5554    case IDTX:
   5555    case H_DCT:
   5556    case H_ADST:
   5557    case H_FLIPADST:
   5558    case V_DCT:
   5559    case V_ADST:
   5560    case V_FLIPADST:
   5561      highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type,
   5562                                          txfm_param->tx_size, txfm_param->eob,
   5563                                          bd);
   5564      break;
   5565    default:
   5566      av1_inv_txfm2d_add_8x8_neon(src, CONVERT_TO_SHORTPTR(dest), stride,
   5567                                  tx_type, bd);
   5568      break;
   5569  }
   5570 }
   5571 
   5572 static void highbd_inv_txfm_add_4x4_neon(const tran_low_t *input, uint8_t *dest,
   5573                                         int stride,
   5574                                         const TxfmParam *txfm_param) {
   5575  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   5576  int eob = txfm_param->eob;
   5577  int bd = txfm_param->bd;
   5578  int lossless = txfm_param->lossless;
   5579  const int32_t *src = cast_to_int32(input);
   5580  const TX_TYPE tx_type = txfm_param->tx_type;
   5581  if (lossless) {
   5582    assert(tx_type == DCT_DCT);
   5583    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
   5584    return;
   5585  }
   5586  av1_inv_txfm2d_add_4x4_neon(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
   5587                              bd);
   5588 }
   5589 
// Per-size public entry points: each forwards to the common dispatcher
// with its TX_SIZE baked in. `dest` is a genuine uint16_t pointer here;
// the cast to uint8_t* matches the dispatcher's signature (which converts
// back via CONVERT_TO_SHORTPTR).
void av1_inv_txfm2d_add_8x16_neon(const tran_low_t *input, uint16_t *dest,
                                 int stride, TX_TYPE tx_type, const int bd) {
 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_8X16,
                              bd);
}

void av1_inv_txfm2d_add_16x8_neon(const tran_low_t *input, uint16_t *dest,
                                 int stride, TX_TYPE tx_type, const int bd) {
 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_16X8,
                              bd);
}

void av1_inv_txfm2d_add_16x32_neon(const tran_low_t *input, uint16_t *dest,
                                  int stride, TX_TYPE tx_type, const int bd) {
 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
                              TX_16X32, bd);
}

void av1_inv_txfm2d_add_32x16_neon(const tran_low_t *input, uint16_t *dest,
                                  int stride, TX_TYPE tx_type, const int bd) {
 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
                              TX_32X16, bd);
}

void av1_inv_txfm2d_add_32x32_neon(const tran_low_t *input, uint16_t *dest,
                                  int stride, TX_TYPE tx_type, const int bd) {
 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
                              TX_32X32, bd);
}

void av1_inv_txfm2d_add_64x64_neon(const tran_low_t *input, uint16_t *dest,
                                  int stride, TX_TYPE tx_type, const int bd) {
 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
                              TX_64X64, bd);
}

void av1_inv_txfm2d_add_32x64_neon(const tran_low_t *input, uint16_t *dest,
                                  int stride, TX_TYPE tx_type, const int bd) {
 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
                              TX_32X64, bd);
}

void av1_inv_txfm2d_add_64x32_neon(const tran_low_t *input, uint16_t *dest,
                                  int stride, TX_TYPE tx_type, const int bd) {
 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
                              TX_64X32, bd);
}

void av1_inv_txfm2d_add_64x16_neon(const tran_low_t *input, uint16_t *dest,
                                  int stride, TX_TYPE tx_type, const int bd) {
 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
                              TX_64X16, bd);
}
   5643 
   5644 void av1_inv_txfm2d_add_16x64_neon(const tran_low_t *input, uint16_t *dest,
   5645                                   int stride, TX_TYPE tx_type, const int bd) {
   5646  inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
   5647                               TX_16X64, bd);
   5648 }
   5649 
   5650 static void av1_inv_txfm2d_add_16x16_neon(const tran_low_t *input,
   5651                                          uint16_t *dest, int stride,
   5652                                          TX_TYPE tx_type, const int bd) {
   5653  inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
   5654                               TX_16X16, bd);
   5655 }
   5656 
   5657 void av1_inv_txfm2d_add_32x8_neon(const tran_low_t *input, uint16_t *dest,
   5658                                  int stride, TX_TYPE tx_type, const int bd) {
   5659  inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_32X8,
   5660                               bd);
   5661 }
   5662 
   5663 void av1_inv_txfm2d_add_8x32_neon(const tran_low_t *input, uint16_t *dest,
   5664                                  int stride, TX_TYPE tx_type, const int bd) {
   5665  inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_8X32,
   5666                               bd);
   5667 }
   5668 
   5669 void av1_highbd_inv_txfm_add_neon(const tran_low_t *input, uint8_t *dest,
   5670                                  int stride, const TxfmParam *txfm_param) {
   5671  const TX_SIZE tx_size = txfm_param->tx_size;
   5672 
   5673  TX_TYPE tx_type = txfm_param->tx_type;
   5674  int bd = txfm_param->bd;
   5675  switch (tx_size) {
   5676    case TX_8X8:
   5677      highbd_inv_txfm_add_8x8_neon(input, dest, stride, txfm_param);
   5678      break;
   5679    case TX_4X8:
   5680      av1_inv_txfm2d_add_4x8_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
   5681                                  txfm_param->tx_type, txfm_param->bd);
   5682      break;
   5683    case TX_8X4:
   5684      av1_inv_txfm2d_add_8x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
   5685                                  txfm_param->tx_type, txfm_param->bd);
   5686      break;
   5687    case TX_4X4:
   5688      highbd_inv_txfm_add_4x4_neon(input, dest, stride, txfm_param);
   5689      break;
   5690    case TX_16X4:
   5691      av1_inv_txfm2d_add_16x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
   5692                                   txfm_param->tx_type, txfm_param->bd);
   5693      break;
   5694    case TX_4X16:
   5695      av1_inv_txfm2d_add_4x16_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
   5696                                   txfm_param->tx_type, txfm_param->bd);
   5697      break;
   5698    case TX_8X16:
   5699      av1_inv_txfm2d_add_8x16_neon(input, (uint16_t *)dest, stride, tx_type,
   5700                                   bd);
   5701      break;
   5702    case TX_16X8:
   5703      av1_inv_txfm2d_add_16x8_neon(input, (uint16_t *)dest, stride, tx_type,
   5704                                   bd);
   5705      break;
   5706    case TX_16X32:
   5707      av1_inv_txfm2d_add_16x32_neon(input, (uint16_t *)dest, stride, tx_type,
   5708                                    bd);
   5709      break;
   5710    case TX_32X16:
   5711      av1_inv_txfm2d_add_32x16_neon(input, (uint16_t *)dest, stride, tx_type,
   5712                                    bd);
   5713      break;
   5714    case TX_16X16:
   5715      av1_inv_txfm2d_add_16x16_neon(input, (uint16_t *)dest, stride, tx_type,
   5716                                    bd);
   5717      break;
   5718    case TX_32X32:
   5719      av1_inv_txfm2d_add_32x32_neon(input, (uint16_t *)dest, stride, tx_type,
   5720                                    bd);
   5721      break;
   5722    case TX_64X64:
   5723      av1_inv_txfm2d_add_64x64_neon(input, (uint16_t *)dest, stride, tx_type,
   5724                                    bd);
   5725      break;
   5726    case TX_32X64:
   5727      av1_inv_txfm2d_add_32x64_neon(input, (uint16_t *)dest, stride, tx_type,
   5728                                    bd);
   5729      break;
   5730    case TX_64X32:
   5731      av1_inv_txfm2d_add_64x32_neon(input, (uint16_t *)dest, stride, tx_type,
   5732                                    bd);
   5733      break;
   5734    case TX_16X64:
   5735      av1_inv_txfm2d_add_16x64_neon(input, (uint16_t *)dest, stride, tx_type,
   5736                                    bd);
   5737      break;
   5738    case TX_64X16:
   5739      av1_inv_txfm2d_add_64x16_neon(input, (uint16_t *)dest, stride, tx_type,
   5740                                    bd);
   5741      break;
   5742    case TX_32X8:
   5743      av1_inv_txfm2d_add_32x8_neon(input, (uint16_t *)dest, stride, tx_type,
   5744                                   bd);
   5745      break;
   5746    case TX_8X32:
   5747      av1_inv_txfm2d_add_8x32_neon(input, (uint16_t *)dest, stride, tx_type,
   5748                                   bd);
   5749      break;
   5750  }
   5751 }