tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

av1_fwd_txfm2d_neon.c (127018B)


      1 /*
      2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 #include <assert.h>
     14 
     15 #include "aom_dsp/arm/mem_neon.h"
     16 #include "aom_dsp/arm/transpose_neon.h"
     17 #include "aom_dsp/txfm_common.h"
     18 #include "aom_ports/mem.h"
     19 #include "av1/common/av1_txfm.h"
     20 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
     21 #include "config/aom_config.h"
     22 #include "config/av1_rtcd.h"
     23 #include "shift_neon.h"
     24 #include "txfm_neon.h"
     25 
     26 #define TXFM_COS_BIT_MAX 13
     27 
     28 // A note on butterfly helper naming:
     29 //
     30 // butterfly_[input_ty]_[acc_ty]_[input_num]_[weight_num]_[weight_neg]_neon
     31 // e.g. butterfly_s32_s32_x4_0231_neon
     32 //                |   |   |  ^ Weights are applied as indices 0, 2, 3, 1
     33 //                |   |   |    (see more detail below)
     34 //                |   |   ^ (int32)x4 input/output parameters
     35 //                |   ^ 32-bit accumulators internally
     36 //                ^ 32-bit input/output parameters
     37 //
     38 // Weights are stored as 4-tuples in Q2.13 format as (w0, 1-w0, -w0, w0-1) to
     39 // avoid needing separate negation instructions. This is represented in the
     40 // helper naming by referring to the lane index in the loaded tuple that each
     41 // multiply is performed with:
     42 //
     43 //        in0  in1
     44 //      /----------
     45 // out0 |  w0   w1   ==>  out0 = in0 * w0 + in1 * w1
     46 // out1 |  w2   w3   ==>  out1 = in0 * w2 + in1 * w3
     47 //
     48 // So for indices 0231 from the earlier example, we end up with:
     49 //
     50 //          in0       in1
     51 //      /------------------
     52 // out0 | (lane 0) (lane 2)   ==>  out0 = in0 *   w0   + in1 *  -w0
     53 // out1 | (lane 3) (lane 1)   ==>  out1 = in0 * (w0-1) + in1 * (1-w0)
     54 
     55 static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0112_neon(
     56    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
     57    int32x4_t *out0, int32x4_t *out1) {
     58  int32x4_t w0101 = vmovl_s16(w0101_s16);
     59  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
     60  o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 1);
     61  int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
     62  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
     63  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
     64  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
     65 }
     66 
     67 static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0332_neon(
     68    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
     69    int32x4_t *out0, int32x4_t *out1) {
     70  int32x4_t w0101 = vmovl_s16(w0101_s16);
     71  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
     72  o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 1);
     73  int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 1);
     74  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
     75  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
     76  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
     77 }
     78 
     79 static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1003_neon(
     80    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
     81    int32x4_t *out0, int32x4_t *out1) {
     82  int32x4_t w0101 = vmovl_s16(w0101_s16);
     83  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
     84  o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 0);
     85  int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
     86  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
     87  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
     88  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
     89 }
     90 
     91 static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1223_neon(
     92    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
     93    int32x4_t *out0, int32x4_t *out1) {
     94  int32x4_t w0101 = vmovl_s16(w0101_s16);
     95  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
     96  o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 0);
     97  int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 0);
     98  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
     99  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
    100  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
    101 }
    102 
// Widening 4-lane butterfly: multiply-accumulate the int16x4_t inputs against
// lanes of the weight vector into 32-bit accumulators, then narrow each result
// with a saturating rounding shift by TXFM_COS_BIT_MAX:
//   *out0 = sat_narrow(in0 * wvec[lane0] + in1 * wvec[lane1])
//   *out1 = sat_narrow(in0 * wvec[lane2] + in1 * wvec[lane3])
#define butterfly_s16_s32_x4_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
                                  out0, out1)                                 \
  do {                                                                        \
    int32x4_t u0 = vmull_lane_s16(in0, wvec, lane0);                          \
    u0 = vmlal_lane_s16(u0, in1, wvec, lane1);                                \
    int32x4_t v0 = vmull_lane_s16(in0, wvec, lane2);                          \
    v0 = vmlal_lane_s16(v0, in1, wvec, lane3);                                \
    *out0 = vqrshrn_n_s32(u0, TXFM_COS_BIT_MAX);                              \
    *out1 = vqrshrn_n_s32(v0, TXFM_COS_BIT_MAX);                              \
  } while (0)
    113 
// *out0 = in0*w[0] + in1*w[1]; *out1 = in0*w[1] + in1*w[2] (lane note above).
static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0112_neon(
    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
    int16x4_t *out0, int16x4_t *out1) {
  butterfly_s16_s32_x4_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
}
    119 
// *out0 = in0*w[0] + in1*w[3]; *out1 = in0*w[3] + in1*w[2] (lane note above).
static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0332_neon(
    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
    int16x4_t *out0, int16x4_t *out1) {
  butterfly_s16_s32_x4_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
}
    125 
// *out0 = in0*w[1] + in1*w[0]; *out1 = in0*w[0] + in1*w[3] (lane note above).
static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1003_neon(
    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
    int16x4_t *out0, int16x4_t *out1) {
  butterfly_s16_s32_x4_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
}
    131 
// *out0 = in0*w[1] + in1*w[2]; *out1 = in0*w[2] + in1*w[3] (lane note above).
static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1223_neon(
    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
    int16x4_t *out0, int16x4_t *out1) {
  butterfly_s16_s32_x4_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
}
    137 
// 8-lane variant of butterfly_s16_s32_x4_neon: splits each int16x8_t input
// into low/high halves, widens the multiplies into 32-bit accumulators, then
// narrows back with a rounding shift by TXFM_COS_BIT_MAX and recombines.
// Note this uses vrshrn_n_s32 (non-saturating narrow) whereas the x4 macro
// uses the saturating vqrshrn_n_s32.
#define butterfly_s16_s32_x8_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
                                  out0, out1)                                 \
  do {                                                                        \
    int32x4_t u0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane0);            \
    u0 = vmlal_lane_s16(u0, vget_low_s16(in1), wvec, lane1);                  \
    int32x4_t u1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane0);           \
    u1 = vmlal_lane_s16(u1, vget_high_s16(in1), wvec, lane1);                 \
    int32x4_t v0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane2);            \
    v0 = vmlal_lane_s16(v0, vget_low_s16(in1), wvec, lane3);                  \
    int32x4_t v1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane2);           \
    v1 = vmlal_lane_s16(v1, vget_high_s16(in1), wvec, lane3);                 \
    const int16x4_t c0 = vrshrn_n_s32(u0, TXFM_COS_BIT_MAX);                  \
    const int16x4_t c1 = vrshrn_n_s32(u1, TXFM_COS_BIT_MAX);                  \
    const int16x4_t d0 = vrshrn_n_s32(v0, TXFM_COS_BIT_MAX);                  \
    const int16x4_t d1 = vrshrn_n_s32(v1, TXFM_COS_BIT_MAX);                  \
    *out0 = vcombine_s16(c0, c1);                                             \
    *out1 = vcombine_s16(d0, d1);                                             \
  } while (0)
    156 
// *out0 = in0*w[0] + in1*w[1]; *out1 = in0*w[1] + in1*w[2] (lane note above).
static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0112_neon(
    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
    int16x8_t *out0, int16x8_t *out1) {
  butterfly_s16_s32_x8_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
}
    162 
// *out0 = in0*w[0] + in1*w[3]; *out1 = in0*w[3] + in1*w[2] (lane note above).
static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0332_neon(
    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
    int16x8_t *out0, int16x8_t *out1) {
  butterfly_s16_s32_x8_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
}
    168 
// *out0 = in0*w[1] + in1*w[0]; *out1 = in0*w[0] + in1*w[3] (lane note above).
static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1003_neon(
    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
    int16x8_t *out0, int16x8_t *out1) {
  butterfly_s16_s32_x8_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
}
    174 
// *out0 = in0*w[1] + in1*w[2]; *out1 = in0*w[2] + in1*w[3] (lane note above).
static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1223_neon(
    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
    int16x8_t *out0, int16x8_t *out1) {
  butterfly_s16_s32_x8_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
}
    180 
    181 static AOM_FORCE_INLINE void flip_buf_4_neon(int16x4_t *in, int16x4_t *out,
    182                                             int size) {
    183  for (int i = 0; i < size; ++i) {
    184    out[size - i - 1] = in[i];
    185  }
    186 }
    187 
    188 static AOM_FORCE_INLINE void flip_buf_8_neon(int16x8_t *in, int16x8_t *out,
    189                                             int size) {
    190  for (int i = 0; i < size; ++i) {
    191    out[size - i - 1] = in[i];
    192  }
    193 }
    194 
    195 static AOM_FORCE_INLINE void store_buffer_interleaved_s32_x8(
    196    int32_t *const out, const int32x4_t *const in1, const int32x4_t *const in2,
    197    const int stride, const int out_size) {
    198  for (int i = 0; i < out_size; ++i) {
    199    vst1q_s32(out + stride * i, in1[i]);
    200    vst1q_s32(out + stride * i + 4, in2[i]);
    201  }
    202 }
    203 
    204 static AOM_FORCE_INLINE void load_buffer_s16_x4(const int16_t *in,
    205                                                const int stride,
    206                                                int16x4_t *const out,
    207                                                const int out_size) {
    208  for (int i = 0; i < out_size; ++i) {
    209    out[i] = vld1_s16(in);
    210    in += stride;
    211  }
    212 }
    213 
    214 static AOM_FORCE_INLINE void load_buffer_s16_x8(const int16_t *in, int stride,
    215                                                int16x8_t *out, int out_size) {
    216  for (int i = 0; i < out_size; ++i) {
    217    out[i] = vld1q_s16(in + i * stride);
    218  }
    219 }
    220 
    221 static AOM_FORCE_INLINE void store_buffer_s16_x4(const int16x4_t *const in,
    222                                                 int32_t *const out,
    223                                                 const int stride,
    224                                                 const int out_size) {
    225  for (int i = 0; i < out_size; ++i) {
    226    vst1q_s32(out + i * stride, vmovl_s16(in[i]));
    227  }
    228 }
    229 
    230 static AOM_FORCE_INLINE void store_buffer_s16_x8(const int16x8_t *const in,
    231                                                 int32_t *const out,
    232                                                 const int stride,
    233                                                 const int out_size) {
    234  for (int i = 0; i < out_size; ++i) {
    235    vst1q_s32(out + i * stride + 0, vmovl_s16(vget_low_s16(in[i])));
    236    vst1q_s32(out + i * stride + 4, vmovl_s16(vget_high_s16(in[i])));
    237  }
    238 }
    239 
    240 // A note on naming:
    241 //   round_shift_[sqrt2]_s16_s32_4x1_neon(...)
    242 //                |      |   |     ^ 1 => a single vector
    243 //                |      |   |       n => an array of vectors
    244 //                |      |   |   ^ input/output vector element count
    245 //                |      |   ^ output type
    246 //                |      ^ input type
    247 //                ^ multiplicand and shift identifier
    248 
    249 static AOM_FORCE_INLINE int16x4_t
    250 round_shift_sqrt2_s16_s16_4x1_neon(int16x4_t a) {
    251  return vqrshrn_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
    252 }
    253 
    254 static AOM_FORCE_INLINE int16x8_t
    255 round_shift_sqrt2_s16_s16_8x1_neon(int16x8_t a) {
    256  return vcombine_s16(round_shift_sqrt2_s16_s16_4x1_neon(vget_low_s16(a)),
    257                      round_shift_sqrt2_s16_s16_4x1_neon(vget_high_s16(a)));
    258 }
    259 
    260 static AOM_FORCE_INLINE int16x4_t
    261 round_shift_2sqrt2_s16_s16_4x1_neon(int16x4_t a) {
    262  return vqrshrn_n_s32(vmull_n_s16(a, 2 * NewSqrt2), NewSqrt2Bits);
    263 }
    264 
    265 static AOM_FORCE_INLINE int16x8_t
    266 round_shift_2sqrt2_s16_s16_8x1_neon(int16x8_t a) {
    267  return vcombine_s16(round_shift_2sqrt2_s16_s16_4x1_neon(vget_low_s16(a)),
    268                      round_shift_2sqrt2_s16_s16_4x1_neon(vget_high_s16(a)));
    269 }
    270 
    271 static AOM_FORCE_INLINE int32x4_t
    272 round_shift_sqrt2_s16_s32_4x1_neon(int16x4_t a) {
    273  return vrshrq_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
    274 }
    275 
    276 static AOM_FORCE_INLINE int32x4_t
    277 round_shift_sqrt2_s32_s32_4x1_neon(int32x4_t a) {
    278  return vrshrq_n_s32(vmulq_n_s32(a, NewSqrt2), NewSqrt2Bits);
    279 }
    280 
// Declares a helper named `name` that applies the per-vector function `fn`
// elementwise over an array of `size` vectors, reading from `in` (type0) and
// writing to `out` (type1).
#define ROUND_SHIFT_SQRT_LOOP_HELPER(name, type0, type1, fn)                 \
  static AOM_FORCE_INLINE void name(const type0 *in, type1 *out, int size) { \
    for (int i = 0; i < size; ++i) {                                         \
      out[i] = fn(in[i]);                                                    \
    }                                                                        \
  }
    287 
// Instantiate the array (xn) forms of the scalar round-shift helpers above.
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s32_s32_4xn_neon, int32x4_t,
                             int32x4_t, round_shift_sqrt2_s32_s32_4x1_neon)
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_4xn_neon, int16x4_t,
                             int16x4_t, round_shift_sqrt2_s16_s16_4x1_neon)
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_8xn_neon, int16x8_t,
                             int16x8_t, round_shift_sqrt2_s16_s16_8x1_neon)
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_4xn_neon, int16x4_t,
                             int16x4_t, round_shift_2sqrt2_s16_s16_4x1_neon)
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_8xn_neon, int16x8_t,
                             int16x8_t, round_shift_2sqrt2_s16_s16_8x1_neon)
    298 
    299 static AOM_FORCE_INLINE void store_rect_buffer_s16_x4(const int16x4_t *const in,
    300                                                      int32_t *const out,
    301                                                      const int stride,
    302                                                      const int out_size) {
    303  for (int i = 0; i < out_size; ++i) {
    304    vst1q_s32(out + i * stride, round_shift_sqrt2_s16_s32_4x1_neon(in[i]));
    305  }
    306 }
    307 
    308 static AOM_FORCE_INLINE void store_rect_buffer_s16_x8(const int16x8_t *const in,
    309                                                      int32_t *const out,
    310                                                      const int stride,
    311                                                      const int out_size) {
    312  for (int i = 0; i < out_size; ++i) {
    313    vst1q_s32(out + i * stride + 0,
    314              round_shift_sqrt2_s16_s32_4x1_neon(vget_low_s16(in[i])));
    315    vst1q_s32(out + i * stride + 4,
    316              round_shift_sqrt2_s16_s32_4x1_neon(vget_high_s16(in[i])));
    317  }
    318 }
    319 
// 4-point forward ADST applied to four columns held in int16x4_t lanes, using
// the Q2.13 sinpi constants selected by cos_bit. Products are accumulated in
// 32 bits and narrowed with a rounding shift by TXFM_COS_BIT_MAX.
static AOM_FORCE_INLINE void fadst4x4_neon(const int16x4_t *input,
                                           int16x4_t *output, int cos_bit) {
  int32x4_t u[6], v[6];
  const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
  // in0 + in1 (saturating), feeds the shared sinpi[2] term for output[1].
  const int16x4_t u01 = vqadd_s16(input[0], input[1]);

  // v[5] = sinpi[2] * in2 is computed once and reused for v[1], v[4] and u[3].
  v[5] = vmull_lane_s16(input[2], sinpi, 2);
  v[0] = vmull_lane_s16(input[1], sinpi, 1);
  v[0] = vmlal_lane_s16(v[0], input[0], sinpi, 0);
  v[1] = vmlal_lane_s16(v[5], input[3], sinpi, 3);
  v[2] = vmull_lane_s16(u01, sinpi, 2);
  v[3] = vmull_lane_s16(input[0], sinpi, 3);
  v[3] = vmlsl_lane_s16(v[3], input[1], sinpi, 0);
  v[4] = vmlsl_lane_s16(v[5], input[3], sinpi, 1);

  u[0] = vaddq_s32(v[0], v[1]);
  u[1] = vmlsl_lane_s16(v[2], input[3], sinpi, 2);
  u[2] = vsubq_s32(v[3], v[4]);
  // u[3] = u[2] - u[0] + 3 * sinpi[2] * in2.
  u[3] = vsubq_s32(u[2], u[0]);
  u[3] = vmlaq_n_s32(u[3], v[5], 3);

  output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
  output[1] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
  output[2] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
  output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
}
    346 
// 8-point forward ADST applied to four columns held in eight int16x4_t
// vectors, using the Q2.13 cospi constants selected by cos_bit.
static AOM_FORCE_INLINE void fadst4x8_neon(const int16x4_t *input,
                                           int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  // Each vld1q picks up two adjacent 4-element weight tuples; the low/high
  // halves below split them apart.
  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);

  // stage 1-2
  int16x4_t x2[8];
  butterfly_s16_s32_x4_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
  butterfly_s16_s32_x4_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);

  // stage 3: saturating add/sub butterflies against the stage-2 outputs.
  int16x4_t x3[8];
  x3[0] = vqadd_s16(input[0], x2[2]);
  x3[1] = vqsub_s16(x2[3], input[7]);
  x3[2] = vqsub_s16(input[0], x2[2]);
  x3[3] = vqadd_s16(input[7], x2[3]);
  x3[4] = vqsub_s16(x2[6], input[1]);
  x3[5] = vqadd_s16(input[6], x2[7]);
  x3[6] = vqadd_s16(input[1], x2[6]);
  x3[7] = vqsub_s16(input[6], x2[7]);

  // stage 4 (only elements 4..7 of x4 are produced/needed).
  int16x4_t x4[8];
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x4[4], &x4[5]);
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x4[6], &x4[7]);

  // stage 5
  int16x4_t x5[8];
  x5[0] = vqadd_s16(x3[0], x4[4]);
  x5[1] = vqadd_s16(x3[1], x4[5]);
  x5[2] = vqadd_s16(x3[2], x4[6]);
  x5[3] = vqsub_s16(x4[7], x3[3]);
  x5[4] = vqsub_s16(x3[0], x4[4]);
  x5[5] = vqsub_s16(x3[1], x4[5]);
  x5[6] = vqsub_s16(x3[2], x4[6]);
  x5[7] = vqadd_s16(x3[3], x4[7]);

  // stage 6-7: final butterflies write the outputs in their permuted order.
  butterfly_s16_s32_x4_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
  butterfly_s16_s32_x4_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
  butterfly_s16_s32_x4_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
  butterfly_s16_s32_x4_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
}
    400 
// 4-point forward ADST applied to eight columns: each int16x8_t holds eight
// columns, processed as independent low/high 4-lane halves in parallel.
static AOM_FORCE_INLINE void fadst8x4_neon(const int16x8_t *input,
                                           int16x8_t *output, int cos_bit) {
  int32x4_t u_lo[4], u_hi[4];
  const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
  // in0 + in1 (saturating), feeds the shared sinpi[2] term for output[1].
  const int16x8_t u01 = vqaddq_s16(input[0], input[1]);

  // u[0] = sinpi[1]*in1 + sinpi[0]*in0 + sinpi[3]*in3 + sinpi[2]*in2.
  u_lo[0] = vmull_lane_s16(vget_low_s16(input[1]), sinpi, 1);
  u_hi[0] = vmull_lane_s16(vget_high_s16(input[1]), sinpi, 1);

  u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[0]), sinpi, 0);
  u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[0]), sinpi, 0);

  u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[3]), sinpi, 3);
  u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[3]), sinpi, 3);

  u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[2]), sinpi, 2);
  u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[2]), sinpi, 2);

  // u[1] = sinpi[2] * (in0 + in1 - in3).
  u_lo[1] = vmull_lane_s16(vget_low_s16(u01), sinpi, 2);
  u_hi[1] = vmull_lane_s16(vget_high_s16(u01), sinpi, 2);

  // u[2] = sinpi[3]*in0 - sinpi[0]*in1 + sinpi[1]*in3 - sinpi[2]*in2.
  u_lo[2] = vmull_lane_s16(vget_low_s16(input[0]), sinpi, 3);
  u_hi[2] = vmull_lane_s16(vget_high_s16(input[0]), sinpi, 3);

  u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[1]), sinpi, 0);
  u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[1]), sinpi, 0);

  u_lo[2] = vmlal_lane_s16(u_lo[2], vget_low_s16(input[3]), sinpi, 1);
  u_hi[2] = vmlal_lane_s16(u_hi[2], vget_high_s16(input[3]), sinpi, 1);

  u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[2]), sinpi, 2);
  u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[2]), sinpi, 2);

  u_lo[1] = vmlsl_lane_s16(u_lo[1], vget_low_s16(input[3]), sinpi, 2);
  u_hi[1] = vmlsl_lane_s16(u_hi[1], vget_high_s16(input[3]), sinpi, 2);

  // u[3] = u[2] - u[0] + 3*sinpi[2]*in2, using a pre-scaled weight vector.
  u_lo[3] = vsubq_s32(u_lo[2], u_lo[0]);
  u_hi[3] = vsubq_s32(u_hi[2], u_hi[0]);

  const int16x4_t sinpix3 = vmul_n_s16(sinpi, 3);
  u_lo[3] = vmlal_lane_s16(u_lo[3], vget_low_s16(input[2]), sinpix3, 2);
  u_hi[3] = vmlal_lane_s16(u_hi[3], vget_high_s16(input[2]), sinpix3, 2);

  // Narrow each half with a rounding shift and recombine.
  output[0] = vcombine_s16(vrshrn_n_s32(u_lo[0], TXFM_COS_BIT_MAX),
                           vrshrn_n_s32(u_hi[0], TXFM_COS_BIT_MAX));
  output[1] = vcombine_s16(vrshrn_n_s32(u_lo[1], TXFM_COS_BIT_MAX),
                           vrshrn_n_s32(u_hi[1], TXFM_COS_BIT_MAX));
  output[2] = vcombine_s16(vrshrn_n_s32(u_lo[2], TXFM_COS_BIT_MAX),
                           vrshrn_n_s32(u_hi[2], TXFM_COS_BIT_MAX));
  output[3] = vcombine_s16(vrshrn_n_s32(u_lo[3], TXFM_COS_BIT_MAX),
                           vrshrn_n_s32(u_hi[3], TXFM_COS_BIT_MAX));
}
    453 
// 4-point forward DCT applied to four columns held in int16x4_t lanes, using
// the Q2.13 cospi constants selected by cos_bit.
static AOM_FORCE_INLINE void fdct4x4_neon(const int16x4_t *input,
                                          int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);
  // Second 4-element tuple of the cospi table; lanes 0 and 1 are used below.
  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 1]);

  // Stage 1: sums and differences of mirrored inputs (non-saturating).
  int16x4_t in12a = vadd_s16(input[1], input[2]);
  int16x4_t in12s = vsub_s16(input[1], input[2]);
  int16x4_t in03a = vadd_s16(input[0], input[3]);
  int16x4_t in03s = vsub_s16(input[0], input[3]);

  // Stage 2: even outputs scale the sums by cospi[0] (first table entry).
  int32x4_t u0ad1 = vmull_n_s16(in12a, cospi[4 * 0]);
  int32x4_t u0ad2 = vmull_n_s16(in03a, cospi[4 * 0]);

  int32x4_t u[4];
  u[0] = vaddq_s32(u0ad1, u0ad2);
  u[1] = vsubq_s32(u0ad2, u0ad1);
  // Odd outputs combine the differences with lanes 0/1 of the cospi16 tuple.
  u[2] = vmull_lane_s16(in12s, cospi16, 1);
  u[2] = vmlal_lane_s16(u[2], in03s, cospi16, 0);
  u[3] = vmull_lane_s16(in03s, cospi16, 1);
  u[3] = vmlsl_lane_s16(u[3], in12s, cospi16, 0);

  // Narrow with rounding shift; note the even/odd interleaved output order.
  output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
  output[1] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
  output[2] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
  output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
}
    480 
    481 // Butterfly pre-processing:
    482 // e.g. n=4:
    483 //   out[0] = in[0] + in[3]
    484 //   out[1] = in[1] + in[2]
    485 //   out[2] = in[1] - in[2]
    486 //   out[3] = in[0] - in[3]
    487 
    488 static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x4(const int16x4_t *input,
    489                                                      int16x4_t *output,
    490                                                      int n) {
    491  for (int i = 0; i < n / 2; ++i) {
    492    output[i] = vqadd_s16(input[i], input[n - i - 1]);
    493  }
    494  for (int i = 0; i < n / 2; ++i) {
    495    output[n / 2 + i] = vqsub_s16(input[n / 2 - i - 1], input[n / 2 + i]);
    496  }
    497 }
    498 
    499 static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x8(const int16x8_t *input,
    500                                                      int16x8_t *output,
    501                                                      int n) {
    502  for (int i = 0; i < n / 2; ++i) {
    503    output[i] = vqaddq_s16(input[i], input[n - i - 1]);
    504  }
    505  for (int i = 0; i < n / 2; ++i) {
    506    output[n / 2 + i] = vqsubq_s16(input[n / 2 - i - 1], input[n / 2 + i]);
    507  }
    508 }
    509 
    510 static AOM_FORCE_INLINE void butterfly_dct_pre_s32_x4(const int32x4_t *input,
    511                                                      int32x4_t *output,
    512                                                      int n) {
    513  for (int i = 0; i < n / 2; ++i) {
    514    output[i] = vqaddq_s32(input[i], input[n - i - 1]);
    515  }
    516  for (int i = 0; i < n / 2; ++i) {
    517    output[n / 2 + i] = vqsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]);
    518  }
    519 }
    520 
    521 // Butterfly post-processing:
    522 // e.g. n=8:
    523 //   out[0] = in0[0] + in1[3];
    524 //   out[1] = in0[1] + in1[2];
    525 //   out[2] = in0[1] - in1[2];
    526 //   out[3] = in0[0] - in1[3];
    527 //   out[4] = in0[7] - in1[4];
    528 //   out[5] = in0[6] - in1[5];
    529 //   out[6] = in0[6] + in1[5];
    530 //   out[7] = in0[7] + in1[4];
    531 
    532 static AOM_FORCE_INLINE void butterfly_dct_post_s16_x4(const int16x4_t *in0,
    533                                                       const int16x4_t *in1,
    534                                                       int16x4_t *output,
    535                                                       int n) {
    536  for (int i = 0; i < n / 4; ++i) {
    537    output[i] = vqadd_s16(in0[i], in1[n / 2 - i - 1]);
    538  }
    539  for (int i = 0; i < n / 4; ++i) {
    540    output[n / 4 + i] = vqsub_s16(in0[n / 4 - i - 1], in1[n / 4 + i]);
    541  }
    542  for (int i = 0; i < n / 4; ++i) {
    543    output[n / 2 + i] = vqsub_s16(in0[n - i - 1], in1[n / 2 + i]);
    544  }
    545  for (int i = 0; i < n / 4; ++i) {
    546    output[(3 * n) / 4 + i] =
    547        vqadd_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
    548  }
    549 }
    550 
    551 static AOM_FORCE_INLINE void butterfly_dct_post_s16_x8(const int16x8_t *in0,
    552                                                       const int16x8_t *in1,
    553                                                       int16x8_t *output,
    554                                                       int n) {
    555  for (int i = 0; i < n / 4; ++i) {
    556    output[i] = vqaddq_s16(in0[i], in1[n / 2 - i - 1]);
    557  }
    558  for (int i = 0; i < n / 4; ++i) {
    559    output[n / 4 + i] = vqsubq_s16(in0[n / 4 - i - 1], in1[n / 4 + i]);
    560  }
    561  for (int i = 0; i < n / 4; ++i) {
    562    output[n / 2 + i] = vqsubq_s16(in0[n - i - 1], in1[n / 2 + i]);
    563  }
    564  for (int i = 0; i < n / 4; ++i) {
    565    output[(3 * n) / 4 + i] =
    566        vqaddq_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
    567  }
    568 }
    569 
    570 static AOM_FORCE_INLINE void butterfly_dct_post_s32_x4(const int32x4_t *in0,
    571                                                       const int32x4_t *in1,
    572                                                       int32x4_t *output,
    573                                                       int n) {
    574  for (int i = 0; i < n / 4; ++i) {
    575    output[i] = vqaddq_s32(in0[i], in1[n / 2 - i - 1]);
    576  }
    577  for (int i = 0; i < n / 4; ++i) {
    578    output[n / 4 + i] = vqsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]);
    579  }
    580  for (int i = 0; i < n / 4; ++i) {
    581    output[n / 2 + i] = vqsubq_s32(in0[n - i - 1], in1[n / 2 + i]);
    582  }
    583  for (int i = 0; i < n / 4; ++i) {
    584    output[(3 * n) / 4 + i] =
    585        vqaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
    586  }
    587 }
    588 
// 4-point forward DCT applied to eight columns (int16x8_t lanes), using the
// Q2.13 cospi constants selected by cos_bit.
static AOM_FORCE_INLINE void fdct8x4_neon(const int16x8_t *input,
                                          int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  // First two 4-element weight tuples, split into their halves.
  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);

  // stage 1: mirrored saturating sums/differences.
  int16x8_t x1[4];
  butterfly_dct_pre_s16_x8(input, x1, 4);

  // stage 2: weighted butterflies on the sum and difference pairs.
  int16x8_t x2[4];
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[0], x1[1], &x2[0], &x2[1]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x1[3], x1[2], &x2[2], &x2[3]);

  // stage 3: reorder into the final output sequence.
  output[0] = x2[0];
  output[1] = x2[2];
  output[2] = x2[1];
  output[3] = x2[3];
}
    613 
// 8-point forward DCT applied to four columns held in eight int16x4_t
// vectors, using the Q2.13 cospi constants selected by cos_bit.
static AOM_FORCE_INLINE void fdct4x8_neon(const int16x4_t *input,
                                          int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  // Each vld1q picks up two adjacent 4-element weight tuples.
  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);

  // stage 1: 8-wide mirrored sums/differences.
  int16x4_t x1[8];
  butterfly_dct_pre_s16_x4(input, x1, 8);

  // stage 2: recurse on the even half; weighted butterfly on x1[5]/x1[6].
  int16x4_t x2[8];
  butterfly_dct_pre_s16_x4(x1, x2, 4);
  butterfly_s16_s32_x4_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]);

  // stage 3: even outputs written directly; odd half post-processed into
  // x3[4..7] (elements 0..3 of x3 are unused).
  int16x4_t x3[8];
  butterfly_s16_s32_x4_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]);
  butterfly_s16_s32_x4_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]);
  butterfly_dct_post_s16_x4(x1 + 4, x2 + 4, x3 + 4, 4);

  // stage 4-5: odd outputs.
  butterfly_s16_s32_x4_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]);
  butterfly_s16_s32_x4_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
}
    645 
// 8-point forward DCT applied to eight columns (int16x8_t lanes); same
// structure as fdct4x8_neon but using the 8-lane helpers.
static AOM_FORCE_INLINE void fdct8x8_neon(const int16x8_t *input,
                                          int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  // Each vld1q picks up two adjacent 4-element weight tuples.
  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);

  // stage 1: 8-wide mirrored sums/differences.
  int16x8_t x1[8];
  butterfly_dct_pre_s16_x8(input, x1, 8);

  // stage 2: recurse on the even half; weighted butterfly on x1[5]/x1[6].
  int16x8_t x2[8];
  butterfly_dct_pre_s16_x8(x1, x2, 4);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]);

  // stage 3: even outputs written directly; odd half post-processed into
  // x3[4..7] (elements 0..3 of x3 are unused).
  int16x8_t x3[8];
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]);
  butterfly_dct_post_s16_x8(x1 + 4, x2 + 4, x3 + 4, 4);

  // stage 4-5: odd outputs.
  butterfly_s16_s32_x8_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
}
    677 
// Forward 16-point DCT applied to 4 independent columns held in int16x4_t
// lanes. `input`/`output` are arrays of 16 vectors; `cos_bit` selects the
// Q2.13 cosine weight table precision.
static AOM_FORCE_INLINE void fdct4x16_neon(const int16x4_t *input,
                                           int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  // Each load fetches two adjacent 4-element weight tuples.
  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);

  // stage 1
  int16x4_t x1[16];
  butterfly_dct_pre_s16_x4(input, x1, 16);

  // stage 2
  int16x4_t x2[16];
  butterfly_dct_pre_s16_x4(x1, x2, 8);
  butterfly_s16_s32_x4_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]);
  butterfly_s16_s32_x4_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]);

  // stage 3
  int16x4_t x3[16];
  butterfly_dct_pre_s16_x4(x2, x3, 4);
  butterfly_s16_s32_x4_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]);
  butterfly_dct_post_s16_x4(x1 + 8, x2 + 8, x3 + 8, 8);

  // stage 4: the DC/mid rotations can go straight to the output array.
  int16x4_t x4[16];
  butterfly_s16_s32_x4_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]);
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[3], x3[2], &output[4],
                                 &output[12]);
  butterfly_dct_post_s16_x4(x2 + 4, x3 + 4, x4 + 4, 4);
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]);
  butterfly_s16_s32_x4_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]);

  // stage 5
  int16x4_t x5[16];
  butterfly_s16_s32_x4_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]);
  butterfly_s16_s32_x4_1003_neon(cospi24, x4[6], x4[5], &output[10],
                                 &output[6]);
  butterfly_dct_post_s16_x4(x3 + 8, x4 + 8, x5 + 8, 4);
  butterfly_dct_post_s16_x4(x3 + 12, x4 + 12, x5 + 12, 4);

  // stage 6-7: final rotations write outputs in bit-reversed order.
  butterfly_s16_s32_x4_0112_neon(cospi4, x5[15], x5[8], &output[1],
                                 &output[15]);
  butterfly_s16_s32_x4_1003_neon(cospi28, x5[14], x5[9], &output[9],
                                 &output[7]);
  butterfly_s16_s32_x4_0112_neon(cospi20, x5[13], x5[10], &output[5],
                                 &output[11]);
  butterfly_s16_s32_x4_1003_neon(cospi12, x5[12], x5[11], &output[13],
                                 &output[3]);
}
    739 
// Forward 16-point DCT applied to 8 independent columns held in int16x8_t
// lanes. Identical dataflow to fdct4x16_neon at twice the width.
// `cos_bit` selects the Q2.13 cosine weight table precision.
static AOM_FORCE_INLINE void fdct8x16_neon(const int16x8_t *input,
                                           int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  // Each load fetches two adjacent 4-element weight tuples.
  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);

  // stage 1
  int16x8_t x1[16];
  butterfly_dct_pre_s16_x8(input, x1, 16);

  // stage 2
  int16x8_t x2[16];
  butterfly_dct_pre_s16_x8(x1, x2, 8);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]);

  // stage 3
  int16x8_t x3[16];
  butterfly_dct_pre_s16_x8(x2, x3, 4);
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]);
  butterfly_dct_post_s16_x8(x1 + 8, x2 + 8, x3 + 8, 8);

  // stage 4: the DC/mid rotations can go straight to the output array.
  int16x8_t x4[16];
  butterfly_s16_s32_x8_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[3], x3[2], &output[4],
                                 &output[12]);
  butterfly_dct_post_s16_x8(x2 + 4, x3 + 4, x4 + 4, 4);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]);

  // stage 5
  int16x8_t x5[16];
  butterfly_s16_s32_x8_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x4[6], x4[5], &output[10],
                                 &output[6]);
  butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 4);
  butterfly_dct_post_s16_x8(x3 + 12, x4 + 12, x5 + 12, 4);

  // stage 6-7: final rotations write outputs in bit-reversed order.
  butterfly_s16_s32_x8_0112_neon(cospi4, x5[15], x5[8], &output[1],
                                 &output[15]);
  butterfly_s16_s32_x8_1003_neon(cospi28, x5[14], x5[9], &output[9],
                                 &output[7]);
  butterfly_s16_s32_x8_0112_neon(cospi20, x5[13], x5[10], &output[5],
                                 &output[11]);
  butterfly_s16_s32_x8_1003_neon(cospi12, x5[12], x5[11], &output[13],
                                 &output[3]);
}
    801 
// Forward 32-point DCT applied to 8 independent columns held in int16x8_t
// lanes. `input`/`output` are arrays of 32 vectors; `cos_bit` selects the
// Q2.13 cosine weight table precision. Even outputs come from the recursive
// length-16 half; odd outputs from the final stage-7 rotations, all written
// in bit-reversed order.
static AOM_FORCE_INLINE void fdct8x32_neon(const int16x8_t *input,
                                           int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  // Each load fetches two adjacent 4-element weight tuples.
  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
  const int16x4_t cospi30 = vget_high_s16(cospi26_30);

  // stage 1
  int16x8_t x1[32];
  butterfly_dct_pre_s16_x8(input, x1, 32);

  // stage 2
  int16x8_t x2[32];
  butterfly_dct_pre_s16_x8(x1, x2, 16);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[27], x1[20], &x2[27], &x2[20]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[26], x1[21], &x2[26], &x2[21]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[25], x1[22], &x2[25], &x2[22]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[24], x1[23], &x2[24], &x2[23]);

  // stage 3
  int16x8_t x3[32];
  butterfly_dct_pre_s16_x8(x2, x3, 8);
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[13], x2[10], &x3[13], &x3[10]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[12], x2[11], &x3[12], &x3[11]);
  butterfly_dct_post_s16_x8(x1 + 16, x2 + 16, x3 + 16, 16);

  // stage 4
  int16x8_t x4[32];
  butterfly_dct_pre_s16_x8(x3, x4, 4);
  butterfly_s16_s32_x8_0112_neon(cospi32, x3[6], x3[5], &x4[6], &x4[5]);
  butterfly_dct_post_s16_x8(x2 + 8, x3 + 8, x4 + 8, 8);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[29], x3[18], &x4[29], &x4[18]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[28], x3[19], &x4[28], &x4[19]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x3[27], x3[20], &x4[27], &x4[20]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x3[26], x3[21], &x4[26], &x4[21]);

  // stage 5: DC/mid rotations go straight to the output array.
  int16x8_t x5[32];
  butterfly_s16_s32_x8_0112_neon(cospi32, x4[0], x4[1], &output[0],
                                 &output[16]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x4[3], x4[2], &output[8],
                                 &output[24]);
  butterfly_dct_post_s16_x8(x3 + 4, x4 + 4, x5 + 4, 4);
  butterfly_s16_s32_x8_0112_neon(cospi16, x4[14], x4[9], &x5[14], &x5[9]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x4[13], x4[10], &x5[13], &x5[10]);
  butterfly_dct_post_s16_x8(x3 + 16, x4 + 16, x5 + 16, 8);
  butterfly_dct_post_s16_x8(x3 + 24, x4 + 24, x5 + 24, 8);

  // stage 6
  int16x8_t x6[32];
  butterfly_s16_s32_x8_0112_neon(cospi8, x5[7], x5[4], &output[4], &output[28]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[6], x5[5], &output[20],
                                 &output[12]);
  butterfly_dct_post_s16_x8(x4 + 8, x5 + 8, x6 + 8, 4);
  butterfly_dct_post_s16_x8(x4 + 12, x5 + 12, x6 + 12, 4);
  butterfly_s16_s32_x8_0112_neon(cospi8, x5[30], x5[17], &x6[30], &x6[17]);
  butterfly_s16_s32_x8_1223_neon(cospi8, x5[29], x5[18], &x6[29], &x6[18]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[26], x5[21], &x6[26], &x6[21]);
  butterfly_s16_s32_x8_0332_neon(cospi24, x5[25], x5[22], &x6[25], &x6[22]);

  // stage 7
  int16x8_t x7[32];
  butterfly_s16_s32_x8_0112_neon(cospi4, x6[15], x6[8], &output[2],
                                 &output[30]);
  butterfly_s16_s32_x8_1003_neon(cospi28, x6[14], x6[9], &output[18],
                                 &output[14]);
  butterfly_s16_s32_x8_0112_neon(cospi20, x6[13], x6[10], &output[10],
                                 &output[22]);
  butterfly_s16_s32_x8_1003_neon(cospi12, x6[12], x6[11], &output[26],
                                 &output[6]);
  butterfly_dct_post_s16_x8(x5 + 16, x6 + 16, x7 + 16, 4);
  butterfly_dct_post_s16_x8(x5 + 20, x6 + 20, x7 + 20, 4);
  butterfly_dct_post_s16_x8(x5 + 24, x6 + 24, x7 + 24, 4);
  butterfly_dct_post_s16_x8(x5 + 28, x6 + 28, x7 + 28, 4);

  // stage 8-9: odd outputs, written in bit-reversed order.
  butterfly_s16_s32_x8_0112_neon(cospi2, x7[31], x7[16], &output[1],
                                 &output[31]);
  butterfly_s16_s32_x8_1003_neon(cospi30, x7[30], x7[17], &output[17],
                                 &output[15]);
  butterfly_s16_s32_x8_0112_neon(cospi18, x7[29], x7[18], &output[9],
                                 &output[23]);
  butterfly_s16_s32_x8_1003_neon(cospi14, x7[28], x7[19], &output[25],
                                 &output[7]);
  butterfly_s16_s32_x8_0112_neon(cospi10, x7[27], x7[20], &output[5],
                                 &output[27]);
  butterfly_s16_s32_x8_1003_neon(cospi22, x7[26], x7[21], &output[21],
                                 &output[11]);
  butterfly_s16_s32_x8_0112_neon(cospi26, x7[25], x7[22], &output[13],
                                 &output[19]);
  butterfly_s16_s32_x8_1003_neon(cospi6, x7[24], x7[23], &output[29],
                                 &output[3]);
}
    917 
    918 static AOM_FORCE_INLINE void fdct8x64_neon(const int16x8_t *input,
    919                                           int16x8_t *output, int cos_bit) {
    920  const int16_t *cospi = cospi_arr_q13(cos_bit);
    921 
    922  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
    923  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
    924  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
    925  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
    926  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
    927  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
    928  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
    929  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
    930  const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
    931  const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
    932  const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
    933  const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
    934  const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
    935  const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
    936  const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
    937  const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);
    938 
    939  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
    940  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
    941  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
    942  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
    943  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
    944  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
    945  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
    946  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
    947  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
    948  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
    949  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
    950  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
    951  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
    952  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
    953  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
    954  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
    955  const int16x4_t cospi1 = vget_low_s16(cospi1_3);
    956  const int16x4_t cospi3 = vget_high_s16(cospi1_3);
    957  const int16x4_t cospi5 = vget_low_s16(cospi5_7);
    958  const int16x4_t cospi7 = vget_high_s16(cospi5_7);
    959  const int16x4_t cospi9 = vget_low_s16(cospi9_11);
    960  const int16x4_t cospi11 = vget_high_s16(cospi9_11);
    961  const int16x4_t cospi13 = vget_low_s16(cospi13_15);
    962  const int16x4_t cospi15 = vget_high_s16(cospi13_15);
    963  const int16x4_t cospi17 = vget_low_s16(cospi17_19);
    964  const int16x4_t cospi19 = vget_high_s16(cospi17_19);
    965  const int16x4_t cospi21 = vget_low_s16(cospi21_23);
    966  const int16x4_t cospi23 = vget_high_s16(cospi21_23);
    967  const int16x4_t cospi25 = vget_low_s16(cospi25_27);
    968  const int16x4_t cospi27 = vget_high_s16(cospi25_27);
    969  const int16x4_t cospi29 = vget_low_s16(cospi29_31);
    970  const int16x4_t cospi31 = vget_high_s16(cospi29_31);
    971 
    972  // stage 1
    973  int16x8_t x1[64];
    974  butterfly_dct_pre_s16_x8(input, x1, 64);
    975 
    976  // stage 2
    977  int16x8_t x2[64];
    978  butterfly_dct_pre_s16_x8(x1, x2, 32);
    979  butterfly_s16_s32_x8_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]);
    980  butterfly_s16_s32_x8_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]);
    981  butterfly_s16_s32_x8_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]);
    982  butterfly_s16_s32_x8_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]);
    983  butterfly_s16_s32_x8_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]);
    984  butterfly_s16_s32_x8_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]);
    985  butterfly_s16_s32_x8_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]);
    986  butterfly_s16_s32_x8_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]);
    987 
    988  // stage 3
    989  int16x8_t x3[64];
    990  butterfly_dct_pre_s16_x8(x2, x3, 16);
    991  x3[16] = x2[16];
    992  x3[17] = x2[17];
    993  x3[18] = x2[18];
    994  x3[19] = x2[19];
    995  butterfly_s16_s32_x8_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]);
    996  butterfly_s16_s32_x8_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]);
    997  butterfly_s16_s32_x8_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]);
    998  butterfly_s16_s32_x8_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]);
    999  x3[28] = x2[28];
   1000  x3[29] = x2[29];
   1001  x3[30] = x2[30];
   1002  x3[31] = x2[31];
   1003  butterfly_dct_post_s16_x8(x1 + 32, x2 + 32, x3 + 32, 32);
   1004 
   1005  // stage 4
   1006  int16x8_t x4[64];
   1007  butterfly_dct_pre_s16_x8(x3, x4, 8);
   1008  butterfly_s16_s32_x8_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]);
   1009  butterfly_s16_s32_x8_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]);
   1010  butterfly_dct_post_s16_x8(x3 + 16, x3 + 16, x4 + 16, 16);
   1011  butterfly_s16_s32_x8_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]);
   1012  butterfly_s16_s32_x8_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]);
   1013  butterfly_s16_s32_x8_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]);
   1014  butterfly_s16_s32_x8_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]);
   1015  butterfly_s16_s32_x8_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]);
   1016  butterfly_s16_s32_x8_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]);
   1017  butterfly_s16_s32_x8_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]);
   1018  butterfly_s16_s32_x8_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]);
   1019 
   1020  // stage 5
   1021  int16x8_t x5[64];
   1022  butterfly_dct_pre_s16_x8(x4, x5, 4);
   1023  butterfly_s16_s32_x8_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]);
   1024  butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 8);
   1025  butterfly_s16_s32_x8_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]);
   1026  butterfly_s16_s32_x8_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]);
   1027  butterfly_s16_s32_x8_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]);
   1028  butterfly_s16_s32_x8_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]);
   1029  butterfly_dct_post_s16_x8(x3 + 32, x4 + 32, x5 + 32, 16);
   1030  butterfly_dct_post_s16_x8(x3 + 48, x4 + 48, x5 + 48, 16);
   1031 
   1032  // stage 6
   1033  int16x8_t x6[64];
   1034  butterfly_s16_s32_x8_0112_neon(cospi32, x5[1], x5[0], &x6[0], &x6[1]);
   1035  butterfly_s16_s32_x8_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]);
   1036  butterfly_dct_post_s16_x8(x4 + 4, x5 + 4, x6 + 4, 4);
   1037  butterfly_s16_s32_x8_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]);
   1038  butterfly_s16_s32_x8_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]);
   1039  butterfly_dct_post_s16_x8(x4 + 16, x5 + 16, x6 + 16, 8);
   1040  butterfly_dct_post_s16_x8(x4 + 24, x5 + 24, x6 + 24, 8);
   1041  butterfly_s16_s32_x8_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]);
   1042  butterfly_s16_s32_x8_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]);
   1043  butterfly_s16_s32_x8_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]);
   1044  butterfly_s16_s32_x8_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]);
   1045  butterfly_s16_s32_x8_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]);
   1046  butterfly_s16_s32_x8_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]);
   1047  butterfly_s16_s32_x8_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]);
   1048  butterfly_s16_s32_x8_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]);
   1049 
   1050  // stage 7
   1051  int16x8_t x7[64];
   1052  butterfly_s16_s32_x8_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]);
   1053  butterfly_s16_s32_x8_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]);
   1054  butterfly_dct_post_s16_x8(x5 + 8, x6 + 8, x7 + 8, 4);
   1055  butterfly_dct_post_s16_x8(x5 + 12, x6 + 12, x7 + 12, 4);
   1056  butterfly_s16_s32_x8_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]);
   1057  butterfly_s16_s32_x8_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]);
   1058  butterfly_s16_s32_x8_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]);
   1059  butterfly_s16_s32_x8_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]);
   1060  butterfly_dct_post_s16_x8(x5 + 32, x6 + 32, x7 + 32, 8);
   1061  butterfly_dct_post_s16_x8(x5 + 40, x6 + 40, x7 + 40, 8);
   1062  butterfly_dct_post_s16_x8(x5 + 48, x6 + 48, x7 + 48, 8);
   1063  butterfly_dct_post_s16_x8(x5 + 56, x6 + 56, x7 + 56, 8);
   1064 
   1065  // stage 8
   1066  int16x8_t x8[64];
   1067  butterfly_s16_s32_x8_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]);
   1068  butterfly_s16_s32_x8_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]);
   1069  butterfly_s16_s32_x8_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]);
   1070  butterfly_s16_s32_x8_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]);
   1071  butterfly_dct_post_s16_x8(x6 + 16, x7 + 16, x8 + 16, 4);
   1072  butterfly_dct_post_s16_x8(x6 + 20, x7 + 20, x8 + 20, 4);
   1073  butterfly_dct_post_s16_x8(x6 + 24, x7 + 24, x8 + 24, 4);
   1074  butterfly_dct_post_s16_x8(x6 + 28, x7 + 28, x8 + 28, 4);
   1075  butterfly_s16_s32_x8_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]);
   1076  butterfly_s16_s32_x8_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]);
   1077  butterfly_s16_s32_x8_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]);
   1078  butterfly_s16_s32_x8_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]);
   1079  butterfly_s16_s32_x8_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]);
   1080  butterfly_s16_s32_x8_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]);
   1081  butterfly_s16_s32_x8_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]);
   1082  butterfly_s16_s32_x8_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]);
   1083 
   1084  // stage 9
   1085  int16x8_t x9[64];
   1086  butterfly_s16_s32_x8_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]);
   1087  butterfly_s16_s32_x8_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]);
   1088  butterfly_s16_s32_x8_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]);
   1089  butterfly_s16_s32_x8_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]);
   1090  butterfly_s16_s32_x8_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]);
   1091  butterfly_s16_s32_x8_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]);
   1092  butterfly_s16_s32_x8_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]);
   1093  butterfly_s16_s32_x8_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]);
   1094  butterfly_dct_post_s16_x8(x7 + 32, x8 + 32, x9 + 32, 4);
   1095  butterfly_dct_post_s16_x8(x7 + 36, x8 + 36, x9 + 36, 4);
   1096  butterfly_dct_post_s16_x8(x7 + 40, x8 + 40, x9 + 40, 4);
   1097  butterfly_dct_post_s16_x8(x7 + 44, x8 + 44, x9 + 44, 4);
   1098  butterfly_dct_post_s16_x8(x7 + 48, x8 + 48, x9 + 48, 4);
   1099  butterfly_dct_post_s16_x8(x7 + 52, x8 + 52, x9 + 52, 4);
   1100  butterfly_dct_post_s16_x8(x7 + 56, x8 + 56, x9 + 56, 4);
   1101  butterfly_dct_post_s16_x8(x7 + 60, x8 + 60, x9 + 60, 4);
   1102 
   1103  // stage 10
   1104  butterfly_s16_s32_x8_0112_neon(cospi1, x9[63], x9[32], &output[1],
   1105                                 &output[63]);
   1106  butterfly_s16_s32_x8_1003_neon(cospi31, x9[62], x9[33], &output[33],
   1107                                 &output[31]);
   1108  butterfly_s16_s32_x8_0112_neon(cospi17, x9[61], x9[34], &output[17],
   1109                                 &output[47]);
   1110  butterfly_s16_s32_x8_1003_neon(cospi15, x9[60], x9[35], &output[49],
   1111                                 &output[15]);
   1112  butterfly_s16_s32_x8_0112_neon(cospi9, x9[59], x9[36], &output[9],
   1113                                 &output[55]);
   1114  butterfly_s16_s32_x8_1003_neon(cospi23, x9[58], x9[37], &output[41],
   1115                                 &output[23]);
   1116  butterfly_s16_s32_x8_0112_neon(cospi25, x9[57], x9[38], &output[25],
   1117                                 &output[39]);
   1118  butterfly_s16_s32_x8_1003_neon(cospi7, x9[56], x9[39], &output[57],
   1119                                 &output[7]);
   1120  butterfly_s16_s32_x8_0112_neon(cospi5, x9[55], x9[40], &output[5],
   1121                                 &output[59]);
   1122  butterfly_s16_s32_x8_1003_neon(cospi27, x9[54], x9[41], &output[37],
   1123                                 &output[27]);
   1124  butterfly_s16_s32_x8_0112_neon(cospi21, x9[53], x9[42], &output[21],
   1125                                 &output[43]);
   1126  butterfly_s16_s32_x8_1003_neon(cospi11, x9[52], x9[43], &output[53],
   1127                                 &output[11]);
   1128  butterfly_s16_s32_x8_0112_neon(cospi13, x9[51], x9[44], &output[13],
   1129                                 &output[51]);
   1130  butterfly_s16_s32_x8_1003_neon(cospi19, x9[50], x9[45], &output[45],
   1131                                 &output[19]);
   1132  butterfly_s16_s32_x8_0112_neon(cospi29, x9[49], x9[46], &output[29],
   1133                                 &output[35]);
   1134  butterfly_s16_s32_x8_1003_neon(cospi3, x9[48], x9[47], &output[61],
   1135                                 &output[3]);
   1136 
   1137  // stage 11
   1138  output[0] = x6[0];
   1139  output[2] = x9[16];
   1140  output[4] = x8[8];
   1141  output[6] = x9[24];
   1142  output[8] = x7[4];
   1143  output[10] = x9[20];
   1144  output[12] = x8[12];
   1145  output[14] = x9[28];
   1146  output[16] = x6[2];
   1147  output[18] = x9[18];
   1148  output[20] = x8[10];
   1149  output[22] = x9[26];
   1150  output[24] = x7[6];
   1151  output[26] = x9[22];
   1152  output[28] = x8[14];
   1153  output[30] = x9[30];
   1154  output[32] = x6[1];
   1155  output[34] = x9[17];
   1156  output[36] = x8[9];
   1157  output[38] = x9[25];
   1158  output[40] = x7[5];
   1159  output[42] = x9[21];
   1160  output[44] = x8[13];
   1161  output[46] = x9[29];
   1162  output[48] = x6[3];
   1163  output[52] = x8[11];
   1164  output[54] = x9[27];
   1165  output[56] = x7[7];
   1166  output[58] = x9[23];
   1167  output[60] = x8[15];
   1168  output[62] = x9[31];
   1169 }
   1170 
// Forward 8-point ADST (asymmetric DST) applied to 8 independent columns
// held in int16x8_t lanes. `input`/`output` are arrays of 8 vectors;
// `cos_bit` selects the Q2.13 cosine weight table precision. Intermediate
// sums use saturating add/sub (vqaddq/vqsubq) to stay within int16 range.
static AOM_FORCE_INLINE void fadst8x8_neon(const int16x8_t *input,
                                           int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  // Each load fetches two adjacent 4-element weight tuples.
  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);

  // stage 2 (stage 1 of the reference ADST is just a reordering/negation,
  // folded into the index/sign choices below)
  int16x8_t x2[8];
  butterfly_s16_s32_x8_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
  butterfly_s16_s32_x8_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);

  // stage 3: saturating butterflies combining raw inputs with stage-2 terms.
  int16x8_t x3[8];
  x3[0] = vqaddq_s16(input[0], x2[2]);
  x3[1] = vqsubq_s16(x2[3], input[7]);
  x3[2] = vqsubq_s16(input[0], x2[2]);
  x3[3] = vqaddq_s16(input[7], x2[3]);
  x3[4] = vqsubq_s16(x2[6], input[1]);
  x3[5] = vqaddq_s16(input[6], x2[7]);
  x3[6] = vqaddq_s16(input[1], x2[6]);
  x3[7] = vqsubq_s16(input[6], x2[7]);

  // stage 4: in-place +/-16 rotations.
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);

  // stage 5
  int16x8_t x5[8];
  x5[0] = vqaddq_s16(x3[0], x3[4]);
  x5[1] = vqaddq_s16(x3[1], x3[5]);
  x5[2] = vqaddq_s16(x3[2], x3[6]);
  x5[3] = vqsubq_s16(x3[7], x3[3]);
  x5[4] = vqsubq_s16(x3[0], x3[4]);
  x5[5] = vqsubq_s16(x3[1], x3[5]);
  x5[6] = vqsubq_s16(x3[2], x3[6]);
  x5[7] = vqaddq_s16(x3[3], x3[7]);

  // stage 6: final rotations; output order interleaves low/high frequencies.
  butterfly_s16_s32_x8_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
  butterfly_s16_s32_x8_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
  butterfly_s16_s32_x8_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
  butterfly_s16_s32_x8_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
}
   1223 
// 16-point forward ADST over columns, four lanes wide: each int16x4_t in
// `input`/`output` holds one transform row across four columns of the block.
// Implemented as the AV1 ADST16 butterfly network using Q2.13 cosine weights
// from cospi_arr_q13(cos_bit); vqadd/vqsub saturate so intermediate sums
// cannot wrap around in int16.
// NOTE(review): stage numbering starts at 2 — stage 1 (input reordering /
// sign flips) is presumably folded into the operand order and the
// _0112/_1003/_0332 weight-variant choices below; confirm against the
// reference av1_fadst16 C implementation.
static AOM_FORCE_INLINE void fadst4x16_neon(const int16x4_t *input,
                                            int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  // Each 128-bit load picks up two adjacent (w, 1-w, -w, w-1) weight
  // 4-tuples from the Q2.13 table; vget_low/high split them apart.
  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
  const int16x4_t cospi30 = vget_high_s16(cospi26_30);

  // stage 2
  int16x4_t x2[8];
  butterfly_s16_s32_x4_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
  butterfly_s16_s32_x4_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
  butterfly_s16_s32_x4_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
  butterfly_s16_s32_x4_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);

  // stage 3
  int16x4_t x3[16];
  x3[0] = vqadd_s16(input[0], x2[0]);
  x3[1] = vqsub_s16(x2[1], input[15]);
  x3[2] = vqsub_s16(input[0], x2[0]);
  x3[3] = vqadd_s16(input[15], x2[1]);
  x3[4] = vqsub_s16(x2[2], input[3]);
  x3[5] = vqadd_s16(input[12], x2[3]);
  x3[6] = vqadd_s16(input[3], x2[2]);
  x3[7] = vqsub_s16(input[12], x2[3]);
  x3[8] = vqsub_s16(x2[4], input[1]);
  x3[9] = vqadd_s16(input[14], x2[5]);
  x3[10] = vqadd_s16(input[1], x2[4]);
  x3[11] = vqsub_s16(input[14], x2[5]);
  x3[12] = vqadd_s16(input[2], x2[6]);
  x3[13] = vqsub_s16(x2[7], input[13]);
  x3[14] = vqsub_s16(input[2], x2[6]);
  x3[15] = vqadd_s16(input[13], x2[7]);

  // stage 4 (in-place: butterflies overwrite their own inputs)
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
  butterfly_s16_s32_x4_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);

  // stage 5
  int16x4_t x5[16];
  x5[0] = vqadd_s16(x3[0], x3[4]);
  x5[1] = vqadd_s16(x3[1], x3[5]);
  x5[2] = vqadd_s16(x3[2], x3[6]);
  x5[3] = vqsub_s16(x3[7], x3[3]);
  x5[4] = vqsub_s16(x3[0], x3[4]);
  x5[5] = vqsub_s16(x3[1], x3[5]);
  x5[6] = vqsub_s16(x3[2], x3[6]);
  x5[7] = vqadd_s16(x3[3], x3[7]);
  x5[8] = vqadd_s16(x3[8], x3[12]);
  x5[9] = vqadd_s16(x3[9], x3[13]);
  x5[10] = vqsub_s16(x3[14], x3[10]);
  x5[11] = vqadd_s16(x3[11], x3[15]);
  x5[12] = vqsub_s16(x3[8], x3[12]);
  x5[13] = vqsub_s16(x3[9], x3[13]);
  x5[14] = vqadd_s16(x3[10], x3[14]);
  x5[15] = vqsub_s16(x3[11], x3[15]);

  // stage 6
  butterfly_s16_s32_x4_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
  butterfly_s16_s32_x4_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
  butterfly_s16_s32_x4_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
  butterfly_s16_s32_x4_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);

  // stage 7
  int16x4_t x7[16];
  x7[0] = vqadd_s16(x5[0], x5[8]);
  x7[1] = vqadd_s16(x5[1], x5[9]);
  x7[2] = vqadd_s16(x5[2], x5[10]);
  x7[3] = vqadd_s16(x5[3], x5[11]);
  x7[4] = vqadd_s16(x5[4], x5[12]);
  x7[5] = vqadd_s16(x5[5], x5[13]);
  x7[6] = vqadd_s16(x5[6], x5[14]);
  x7[7] = vqsub_s16(x5[15], x5[7]);
  x7[8] = vqsub_s16(x5[0], x5[8]);
  x7[9] = vqsub_s16(x5[1], x5[9]);
  x7[10] = vqsub_s16(x5[2], x5[10]);
  x7[11] = vqsub_s16(x5[3], x5[11]);
  x7[12] = vqsub_s16(x5[4], x5[12]);
  x7[13] = vqsub_s16(x5[5], x5[13]);
  x7[14] = vqsub_s16(x5[6], x5[14]);
  x7[15] = vqadd_s16(x5[7], x5[15]);

  // stage 8: final butterflies write directly into the permuted output
  // positions required by the ADST16 ordering.
  butterfly_s16_s32_x4_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
  butterfly_s16_s32_x4_0112_neon(cospi10, x7[2], x7[3], &output[13],
                                 &output[2]);
  butterfly_s16_s32_x4_0112_neon(cospi18, x7[4], x7[5], &output[11],
                                 &output[4]);
  butterfly_s16_s32_x4_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
  butterfly_s16_s32_x4_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
  butterfly_s16_s32_x4_1003_neon(cospi22, x7[10], x7[11], &output[5],
                                 &output[10]);
  butterfly_s16_s32_x4_1003_neon(cospi14, x7[12], x7[13], &output[3],
                                 &output[12]);
  butterfly_s16_s32_x4_0112_neon(cospi6, x7[14], x7[15], &output[14],
                                 &output[1]);
}
   1339 
// 16-point forward ADST over columns, eight lanes wide: identical butterfly
// network to fadst4x16_neon above, but each int16x8_t covers eight columns
// at once (q-register intrinsics and _x8 butterfly helpers). Saturating
// vqaddq/vqsubq guard the inter-stage sums against int16 wrap-around.
static AOM_FORCE_INLINE void fadst8x16_neon(const int16x8_t *input,
                                            int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  // Pairs of Q2.13 (w, 1-w, -w, w-1) weight 4-tuples, split below.
  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
  const int16x4_t cospi30 = vget_high_s16(cospi26_30);

  // stage 2
  int16x8_t x2[8];
  butterfly_s16_s32_x8_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
  butterfly_s16_s32_x8_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
  butterfly_s16_s32_x8_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
  butterfly_s16_s32_x8_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);

  // stage 3
  int16x8_t x3[16];
  x3[0] = vqaddq_s16(input[0], x2[0]);
  x3[1] = vqsubq_s16(x2[1], input[15]);
  x3[2] = vqsubq_s16(input[0], x2[0]);
  x3[3] = vqaddq_s16(input[15], x2[1]);
  x3[4] = vqsubq_s16(x2[2], input[3]);
  x3[5] = vqaddq_s16(input[12], x2[3]);
  x3[6] = vqaddq_s16(input[3], x2[2]);
  x3[7] = vqsubq_s16(input[12], x2[3]);
  x3[8] = vqsubq_s16(x2[4], input[1]);
  x3[9] = vqaddq_s16(input[14], x2[5]);
  x3[10] = vqaddq_s16(input[1], x2[4]);
  x3[11] = vqsubq_s16(input[14], x2[5]);
  x3[12] = vqaddq_s16(input[2], x2[6]);
  x3[13] = vqsubq_s16(x2[7], input[13]);
  x3[14] = vqsubq_s16(input[2], x2[6]);
  x3[15] = vqaddq_s16(input[13], x2[7]);

  // stage 4 (in-place: butterflies overwrite their own inputs)
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
  butterfly_s16_s32_x8_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);

  // stage 5
  int16x8_t x5[16];
  x5[0] = vqaddq_s16(x3[0], x3[4]);
  x5[1] = vqaddq_s16(x3[1], x3[5]);
  x5[2] = vqaddq_s16(x3[2], x3[6]);
  x5[3] = vqsubq_s16(x3[7], x3[3]);
  x5[4] = vqsubq_s16(x3[0], x3[4]);
  x5[5] = vqsubq_s16(x3[1], x3[5]);
  x5[6] = vqsubq_s16(x3[2], x3[6]);
  x5[7] = vqaddq_s16(x3[3], x3[7]);
  x5[8] = vqaddq_s16(x3[8], x3[12]);
  x5[9] = vqaddq_s16(x3[9], x3[13]);
  x5[10] = vqsubq_s16(x3[14], x3[10]);
  x5[11] = vqaddq_s16(x3[11], x3[15]);
  x5[12] = vqsubq_s16(x3[8], x3[12]);
  x5[13] = vqsubq_s16(x3[9], x3[13]);
  x5[14] = vqaddq_s16(x3[10], x3[14]);
  x5[15] = vqsubq_s16(x3[11], x3[15]);

  // stage 6
  butterfly_s16_s32_x8_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
  butterfly_s16_s32_x8_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);

  // stage 7
  int16x8_t x7[16];
  x7[0] = vqaddq_s16(x5[0], x5[8]);
  x7[1] = vqaddq_s16(x5[1], x5[9]);
  x7[2] = vqaddq_s16(x5[2], x5[10]);
  x7[3] = vqaddq_s16(x5[3], x5[11]);
  x7[4] = vqaddq_s16(x5[4], x5[12]);
  x7[5] = vqaddq_s16(x5[5], x5[13]);
  x7[6] = vqaddq_s16(x5[6], x5[14]);
  x7[7] = vqsubq_s16(x5[15], x5[7]);
  x7[8] = vqsubq_s16(x5[0], x5[8]);
  x7[9] = vqsubq_s16(x5[1], x5[9]);
  x7[10] = vqsubq_s16(x5[2], x5[10]);
  x7[11] = vqsubq_s16(x5[3], x5[11]);
  x7[12] = vqsubq_s16(x5[4], x5[12]);
  x7[13] = vqsubq_s16(x5[5], x5[13]);
  x7[14] = vqsubq_s16(x5[6], x5[14]);
  x7[15] = vqaddq_s16(x5[7], x5[15]);

  // stage 8: final butterflies write directly into the permuted output
  // positions required by the ADST16 ordering.
  butterfly_s16_s32_x8_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
  butterfly_s16_s32_x8_0112_neon(cospi10, x7[2], x7[3], &output[13],
                                 &output[2]);
  butterfly_s16_s32_x8_0112_neon(cospi18, x7[4], x7[5], &output[11],
                                 &output[4]);
  butterfly_s16_s32_x8_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
  butterfly_s16_s32_x8_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
  butterfly_s16_s32_x8_1003_neon(cospi22, x7[10], x7[11], &output[5],
                                 &output[10]);
  butterfly_s16_s32_x8_1003_neon(cospi14, x7[12], x7[13], &output[3],
                                 &output[12]);
  butterfly_s16_s32_x8_0112_neon(cospi6, x7[14], x7[15], &output[14],
                                 &output[1]);
}
   1455 
// Identity (IDTX) 4-point transform, 4 lanes wide: no butterfly work, just
// the identity-transform scaling for N=4 — judging by the helper name, a
// rounded multiply by sqrt(2) over 4 vectors (TODO confirm in shift_neon.h).
static AOM_FORCE_INLINE void fidentity4x4_neon(const int16x4_t *const input,
                                               int16x4_t *const output,
                                               const int cos_bit) {
  (void)cos_bit;  // identity needs no cosine table
  round_shift_sqrt2_s16_s16_4xn_neon(input, output, 4);
}
   1462 
// Identity (IDTX) 4-point transform, 8 lanes wide: same sqrt(2)-style
// scaling as fidentity4x4_neon but on int16x8_t vectors.
static AOM_FORCE_INLINE void fidentity8x4_neon(const int16x8_t *const input,
                                               int16x8_t *const output,
                                               const int cos_bit) {
  (void)cos_bit;  // identity needs no cosine table
  round_shift_sqrt2_s16_s16_8xn_neon(input, output, 4);
}
   1469 
// Identity (IDTX) 8-point transform, 4 lanes wide: the N=8 identity scale
// is exactly 2, so a plain left shift by 1 over the 8 vectors suffices.
static AOM_FORCE_INLINE void fidentity4x8_neon(const int16x4_t *input,
                                               int16x4_t *output, int cos_bit) {
  (void)cos_bit;  // identity needs no cosine table
  shift_left_1_s16_x4(input, output, 8);
}
   1475 
// Identity (IDTX) 8-point transform, 8 lanes wide: scale by 2 (<< 1)
// across the 8 vectors.
static AOM_FORCE_INLINE void fidentity8x8_neon(const int16x8_t *input,
                                               int16x8_t *output, int cos_bit) {
  (void)cos_bit;  // identity needs no cosine table
  shift_left_1_s16_x8(input, output, 8);
}
   1481 
// Identity (IDTX) 16-point transform, 4 lanes wide: per the helper name,
// a rounded multiply by 2*sqrt(2) over 16 vectors (TODO confirm helper).
static AOM_FORCE_INLINE void fidentity4x16_neon(const int16x4_t *input,
                                                int16x4_t *output,
                                                int cos_bit) {
  (void)cos_bit;  // identity needs no cosine table
  round_shift_2sqrt2_s16_s16_4xn_neon(input, output, 16);
}
   1488 
// Identity (IDTX) 16-point transform, 8 lanes wide: same 2*sqrt(2)-style
// scaling as fidentity4x16_neon but on int16x8_t vectors.
static AOM_FORCE_INLINE void fidentity8x16_neon(const int16x8_t *input,
                                                int16x8_t *output,
                                                int cos_bit) {
  (void)cos_bit;  // identity needs no cosine table
  round_shift_2sqrt2_s16_s16_8xn_neon(input, output, 16);
}
   1495 
// Identity (IDTX) 32-point transform, 8 lanes wide: the N=32 identity
// scale is exactly 4, so a plain left shift by 2 over 32 vectors suffices.
static AOM_FORCE_INLINE void fidentity8x32_neon(const int16x8_t *input,
                                                int16x8_t *output,
                                                int cos_bit) {
  (void)cos_bit;  // identity needs no cosine table
  shift_left_2_s16_x8(input, output, 32);
}
   1502 
// Generates the column-pass (vertical) entry point <name>_col_neon for a
// block `tw` lanes wide and `n` rows tall: load n rows of int16 pixels from
// `input` at `stride`, pre-scale them by 4 (<< 2, the forward-transform
// input up-shift), then run the core <name>_neon 1-D transform into
// `output`, which stays in registers for the subsequent transpose/row pass.
#define TRANSFORM_COL(name, tw, n)                                          \
  static void name##_col_neon(const int16_t *input, int16x##tw##_t *output, \
                              int stride, int cos_bit) {                    \
    int16x##tw##_t buf0[n];                                                 \
    load_buffer_s16_x##tw(input, stride, buf0, n);                          \
    shift_left_2_s16_x##tw(buf0, buf0, n);                                  \
    name##_neon(buf0, output, cos_bit);                                     \
  }

// Column-pass instantiations for every (lane-width, length) combination
// the 2-D wrappers below need.
TRANSFORM_COL(fadst4x4, 4, 4)
TRANSFORM_COL(fadst4x8, 4, 8)
TRANSFORM_COL(fadst4x16, 4, 16)
TRANSFORM_COL(fadst8x4, 8, 4)
TRANSFORM_COL(fadst8x8, 8, 8)
TRANSFORM_COL(fadst8x16, 8, 16)
TRANSFORM_COL(fdct4x4, 4, 4)
TRANSFORM_COL(fdct4x8, 4, 8)
TRANSFORM_COL(fdct4x16, 4, 16)
TRANSFORM_COL(fdct8x4, 8, 4)
TRANSFORM_COL(fdct8x8, 8, 8)
TRANSFORM_COL(fdct8x16, 8, 16)
TRANSFORM_COL(fdct8x32, 8, 32)
TRANSFORM_COL(fidentity4x4, 4, 4)
TRANSFORM_COL(fidentity4x8, 4, 8)
TRANSFORM_COL(fidentity4x16, 4, 16)
TRANSFORM_COL(fidentity8x4, 8, 4)
TRANSFORM_COL(fidentity8x8, 8, 8)
TRANSFORM_COL(fidentity8x16, 8, 16)
TRANSFORM_COL(fidentity8x32, 8, 32)
   1532 
// Generates the row-pass (horizontal) entry point <name>_row_neon: run the
// core <name>_neon 1-D transform on `n` vectors already in registers (the
// transposed column-pass results), then store the int16 results widened to
// int32 coefficients at `output` with the given stride.
#define TRANSFORM_ROW(name, tw, n)                                          \
  static void name##_row_neon(const int16x##tw##_t *input, int32_t *output, \
                              int stride, int cos_bit) {                    \
    int16x##tw##_t buf0[n];                                                 \
    name##_neon(input, buf0, cos_bit);                                      \
    store_buffer_s16_x##tw(buf0, output, stride, n);                        \
  }

// Same as TRANSFORM_ROW but for rectangular (non-square) blocks; the
// store_rect_* helper presumably applies the extra 1/sqrt(2) rectangular
// rescale during the store — TODO confirm in mem_neon.h.
#define TRANSFORM_ROW_RECT(name, tw, n)                                        \
  static void name##_row_rect_neon(const int16x##tw##_t *input,                \
                                   int32_t *output, int stride, int cos_bit) { \
    int16x##tw##_t buf0[n];                                                    \
    name##_neon(input, buf0, cos_bit);                                         \
    store_rect_buffer_s16_x##tw(buf0, output, stride, n);                      \
  }

// Row-pass instantiations (square blocks).
TRANSFORM_ROW(fadst4x4, 4, 4)
TRANSFORM_ROW(fadst4x16, 4, 16)
TRANSFORM_ROW(fadst8x4, 8, 4)
TRANSFORM_ROW(fadst8x8, 8, 8)
TRANSFORM_ROW(fadst8x16, 8, 16)
TRANSFORM_ROW(fdct4x4, 4, 4)
TRANSFORM_ROW(fdct4x16, 4, 16)
TRANSFORM_ROW(fdct8x4, 8, 4)
TRANSFORM_ROW(fdct8x8, 8, 8)
TRANSFORM_ROW(fdct8x16, 8, 16)
TRANSFORM_ROW(fdct8x32, 8, 32)
TRANSFORM_ROW(fidentity4x4, 4, 4)
TRANSFORM_ROW(fidentity4x16, 4, 16)
TRANSFORM_ROW(fidentity8x4, 8, 4)
TRANSFORM_ROW(fidentity8x8, 8, 8)
TRANSFORM_ROW(fidentity8x16, 8, 16)
TRANSFORM_ROW(fidentity8x32, 8, 32)

// Row-pass instantiations (rectangular blocks).
TRANSFORM_ROW_RECT(fadst4x8, 4, 8)
TRANSFORM_ROW_RECT(fadst8x4, 8, 4)
TRANSFORM_ROW_RECT(fadst8x8, 8, 8)
TRANSFORM_ROW_RECT(fadst8x16, 8, 16)
TRANSFORM_ROW_RECT(fdct4x8, 4, 8)
TRANSFORM_ROW_RECT(fdct8x4, 8, 4)
TRANSFORM_ROW_RECT(fdct8x8, 8, 8)
TRANSFORM_ROW_RECT(fdct8x16, 8, 16)
TRANSFORM_ROW_RECT(fdct8x32, 8, 32)
TRANSFORM_ROW_RECT(fidentity4x8, 4, 8)
TRANSFORM_ROW_RECT(fidentity8x4, 8, 4)
TRANSFORM_ROW_RECT(fidentity8x8, 8, 8)
TRANSFORM_ROW_RECT(fidentity8x16, 8, 16)
TRANSFORM_ROW_RECT(fidentity8x32, 8, 32)
   1581 
// Function-pointer types for the 1-D transform stages, in 4-lane and
// 8-lane flavors.
// transform_1d_lbd_*: core register-to-register transform.
typedef void (*transform_1d_lbd_4_neon)(const int16x4_t *input,
                                        int16x4_t *output, int cos_bit);
typedef void (*transform_1d_lbd_8_neon)(const int16x8_t *input,
                                        int16x8_t *output, int cos_bit);

// col_transform_*: column pass — reads int16 pixels from memory,
// leaves results in registers.
typedef void (*col_transform_1d_lbd_4_neon)(const int16_t *input,
                                            int16x4_t *output, int stride,
                                            int cos_bit);
typedef void (*col_transform_1d_lbd_8_neon)(const int16_t *input,
                                            int16x8_t *output, int stride,
                                            int cos_bit);

// row_transform_*: row pass — takes registers, writes int32 coefficients
// to memory.
typedef void (*row_transform_1d_lbd_4_neon)(const int16x4_t *input,
                                            int32_t *output, int stride,
                                            int cos_bit);
typedef void (*row_transform_1d_lbd_8_neon)(const int16x8_t *input,
                                            int32_t *output, int stride,
                                            int cos_bit);
   1600 
// TX_TYPE-indexed dispatch tables. A "col" table selects by the vertical
// component of the 2-D transform type and a "row" table by the horizontal
// component (e.g. ADST_DCT -> ADST column pass, DCT row pass). FLIPADST
// maps to the plain ADST kernels: the up/down and left/right flips are
// applied by the 2-D wrappers (ud_adjust_input_and_stride / flip_buf).

// Column pass, 4 lanes wide, length 8.
static const col_transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = {
  fdct4x8_col_neon,       // DCT_DCT
  fadst4x8_col_neon,      // ADST_DCT
  fdct4x8_col_neon,       // DCT_ADST
  fadst4x8_col_neon,      // ADST_ADST
  fadst4x8_col_neon,      // FLIPADST_DCT
  fdct4x8_col_neon,       // DCT_FLIPADST
  fadst4x8_col_neon,      // FLIPADST_FLIPADST
  fadst4x8_col_neon,      // ADST_FLIPADST
  fadst4x8_col_neon,      // FLIPADST_ADST
  fidentity4x8_col_neon,  // IDTX
  fdct4x8_col_neon,       // V_DCT
  fidentity4x8_col_neon,  // H_DCT
  fadst4x8_col_neon,      // V_ADST
  fidentity4x8_col_neon,  // H_ADST
  fadst4x8_col_neon,      // V_FLIPADST
  fidentity4x8_col_neon   // H_FLIPADST
};

// Row pass, 8 lanes wide, length 4.
static const row_transform_1d_lbd_8_neon row_txfm8x4_arr[TX_TYPES] = {
  fdct8x4_row_neon,       // DCT_DCT
  fdct8x4_row_neon,       // ADST_DCT
  fadst8x4_row_neon,      // DCT_ADST
  fadst8x4_row_neon,      // ADST_ADST
  fdct8x4_row_neon,       // FLIPADST_DCT
  fadst8x4_row_neon,      // DCT_FLIPADST
  fadst8x4_row_neon,      // FLIPADST_FLIPADST
  fadst8x4_row_neon,      // ADST_FLIPADST
  fadst8x4_row_neon,      // FLIPADST_ADST
  fidentity8x4_row_neon,  // IDTX
  fidentity8x4_row_neon,  // V_DCT
  fdct8x4_row_neon,       // H_DCT
  fidentity8x4_row_neon,  // V_ADST
  fadst8x4_row_neon,      // H_ADST
  fidentity8x4_row_neon,  // V_FLIPADST
  fadst8x4_row_neon       // H_FLIPADST
};

// Row pass with rectangular-block scaling, 8 lanes wide, length 4.
static const row_transform_1d_lbd_8_neon row_rect_txfm8x4_arr[TX_TYPES] = {
  fdct8x4_row_rect_neon,       // DCT_DCT
  fdct8x4_row_rect_neon,       // ADST_DCT
  fadst8x4_row_rect_neon,      // DCT_ADST
  fadst8x4_row_rect_neon,      // ADST_ADST
  fdct8x4_row_rect_neon,       // FLIPADST_DCT
  fadst8x4_row_rect_neon,      // DCT_FLIPADST
  fadst8x4_row_rect_neon,      // FLIPADST_FLIPADST
  fadst8x4_row_rect_neon,      // ADST_FLIPADST
  fadst8x4_row_rect_neon,      // FLIPADST_ADST
  fidentity8x4_row_rect_neon,  // IDTX
  fidentity8x4_row_rect_neon,  // V_DCT
  fdct8x4_row_rect_neon,       // H_DCT
  fidentity8x4_row_rect_neon,  // V_ADST
  fadst8x4_row_rect_neon,      // H_ADST
  fidentity8x4_row_rect_neon,  // V_FLIPADST
  fadst8x4_row_rect_neon       // H_FLIPADST
};

// Column pass, 8 lanes wide, length 4.
static const col_transform_1d_lbd_8_neon col_txfm8x4_arr[TX_TYPES] = {
  fdct8x4_col_neon,       // DCT_DCT
  fadst8x4_col_neon,      // ADST_DCT
  fdct8x4_col_neon,       // DCT_ADST
  fadst8x4_col_neon,      // ADST_ADST
  fadst8x4_col_neon,      // FLIPADST_DCT
  fdct8x4_col_neon,       // DCT_FLIPADST
  fadst8x4_col_neon,      // FLIPADST_FLIPADST
  fadst8x4_col_neon,      // ADST_FLIPADST
  fadst8x4_col_neon,      // FLIPADST_ADST
  fidentity8x4_col_neon,  // IDTX
  fdct8x4_col_neon,       // V_DCT
  fidentity8x4_col_neon,  // H_DCT
  fadst8x4_col_neon,      // V_ADST
  fidentity8x4_col_neon,  // H_ADST
  fadst8x4_col_neon,      // V_FLIPADST
  fidentity8x4_col_neon   // H_FLIPADST
};

// Row pass with rectangular-block scaling, 4 lanes wide, length 8.
static const row_transform_1d_lbd_4_neon row_rect_txfm4x8_arr[TX_TYPES] = {
  fdct4x8_row_rect_neon,       // DCT_DCT
  fdct4x8_row_rect_neon,       // ADST_DCT
  fadst4x8_row_rect_neon,      // DCT_ADST
  fadst4x8_row_rect_neon,      // ADST_ADST
  fdct4x8_row_rect_neon,       // FLIPADST_DCT
  fadst4x8_row_rect_neon,      // DCT_FLIPADST
  fadst4x8_row_rect_neon,      // FLIPADST_FLIPADST
  fadst4x8_row_rect_neon,      // ADST_FLIPADST
  fadst4x8_row_rect_neon,      // FLIPADST_ADST
  fidentity4x8_row_rect_neon,  // IDTX
  fidentity4x8_row_rect_neon,  // V_DCT
  fdct4x8_row_rect_neon,       // H_DCT
  fidentity4x8_row_rect_neon,  // V_ADST
  fadst4x8_row_rect_neon,      // H_ADST
  fidentity4x8_row_rect_neon,  // V_FLIPADST
  fadst4x8_row_rect_neon       // H_FLIPADST
};
   1695 
// TX_TYPE-indexed dispatch tables for length-8, 8-lane transforms.
// "col" selects by the vertical component of the transform type, "row" by
// the horizontal one; FLIPADST entries reuse the ADST kernels (flips are
// applied by the 2-D wrappers).
static const col_transform_1d_lbd_8_neon col_txfm8x8_arr[TX_TYPES] = {
  fdct8x8_col_neon,       // DCT_DCT
  fadst8x8_col_neon,      // ADST_DCT
  fdct8x8_col_neon,       // DCT_ADST
  fadst8x8_col_neon,      // ADST_ADST
  fadst8x8_col_neon,      // FLIPADST_DCT
  fdct8x8_col_neon,       // DCT_FLIPADST
  fadst8x8_col_neon,      // FLIPADST_FLIPADST
  fadst8x8_col_neon,      // ADST_FLIPADST
  fadst8x8_col_neon,      // FLIPADST_ADST
  fidentity8x8_col_neon,  // IDTX
  fdct8x8_col_neon,       // V_DCT
  fidentity8x8_col_neon,  // H_DCT
  fadst8x8_col_neon,      // V_ADST
  fidentity8x8_col_neon,  // H_ADST
  fadst8x8_col_neon,      // V_FLIPADST
  fidentity8x8_col_neon,  // H_FLIPADST
};

// Row pass, 8 lanes wide, length 8.
static const row_transform_1d_lbd_8_neon row_txfm8x8_arr[TX_TYPES] = {
  fdct8x8_row_neon,       // DCT_DCT
  fdct8x8_row_neon,       // ADST_DCT
  fadst8x8_row_neon,      // DCT_ADST
  fadst8x8_row_neon,      // ADST_ADST
  fdct8x8_row_neon,       // FLIPADST_DCT
  fadst8x8_row_neon,      // DCT_FLIPADST
  fadst8x8_row_neon,      // FLIPADST_FLIPADST
  fadst8x8_row_neon,      // ADST_FLIPADST
  fadst8x8_row_neon,      // FLIPADST_ADST
  fidentity8x8_row_neon,  // IDTX
  fidentity8x8_row_neon,  // V_DCT
  fdct8x8_row_neon,       // H_DCT
  fidentity8x8_row_neon,  // V_ADST
  fadst8x8_row_neon,      // H_ADST
  fidentity8x8_row_neon,  // V_FLIPADST
  fadst8x8_row_neon       // H_FLIPADST
};

// Row pass with rectangular-block scaling, 8 lanes wide, length 8.
static const row_transform_1d_lbd_8_neon row_rect_txfm8x8_arr[TX_TYPES] = {
  fdct8x8_row_rect_neon,       // DCT_DCT
  fdct8x8_row_rect_neon,       // ADST_DCT
  fadst8x8_row_rect_neon,      // DCT_ADST
  fadst8x8_row_rect_neon,      // ADST_ADST
  fdct8x8_row_rect_neon,       // FLIPADST_DCT
  fadst8x8_row_rect_neon,      // DCT_FLIPADST
  fadst8x8_row_rect_neon,      // FLIPADST_FLIPADST
  fadst8x8_row_rect_neon,      // ADST_FLIPADST
  fadst8x8_row_rect_neon,      // FLIPADST_ADST
  fidentity8x8_row_rect_neon,  // IDTX
  fidentity8x8_row_rect_neon,  // V_DCT
  fdct8x8_row_rect_neon,       // H_DCT
  fidentity8x8_row_rect_neon,  // V_ADST
  fadst8x8_row_rect_neon,      // H_ADST
  fidentity8x8_row_rect_neon,  // V_FLIPADST
  fadst8x8_row_rect_neon       // H_FLIPADST
};
   1752 
// TX_TYPE-indexed dispatch tables for length-16 transforms. Same layout
// convention as the tables above: "col" picks the vertical kernel, "row"
// the horizontal one; FLIPADST reuses ADST (flips handled by callers).

// Column pass, 4 lanes wide, length 16.
static const col_transform_1d_lbd_4_neon col_txfm4x16_arr[TX_TYPES] = {
  fdct4x16_col_neon,       // DCT_DCT
  fadst4x16_col_neon,      // ADST_DCT
  fdct4x16_col_neon,       // DCT_ADST
  fadst4x16_col_neon,      // ADST_ADST
  fadst4x16_col_neon,      // FLIPADST_DCT
  fdct4x16_col_neon,       // DCT_FLIPADST
  fadst4x16_col_neon,      // FLIPADST_FLIPADST
  fadst4x16_col_neon,      // ADST_FLIPADST
  fadst4x16_col_neon,      // FLIPADST_ADST
  fidentity4x16_col_neon,  // IDTX
  fdct4x16_col_neon,       // V_DCT
  fidentity4x16_col_neon,  // H_DCT
  fadst4x16_col_neon,      // V_ADST
  fidentity4x16_col_neon,  // H_ADST
  fadst4x16_col_neon,      // V_FLIPADST
  fidentity4x16_col_neon   // H_FLIPADST
};

// Row pass, 4 lanes wide, length 16.
static const row_transform_1d_lbd_4_neon row_txfm4x16_arr[TX_TYPES] = {
  fdct4x16_row_neon,       // DCT_DCT
  fdct4x16_row_neon,       // ADST_DCT
  fadst4x16_row_neon,      // DCT_ADST
  fadst4x16_row_neon,      // ADST_ADST
  fdct4x16_row_neon,       // FLIPADST_DCT
  fadst4x16_row_neon,      // DCT_FLIPADST
  fadst4x16_row_neon,      // FLIPADST_FLIPADST
  fadst4x16_row_neon,      // ADST_FLIPADST
  fadst4x16_row_neon,      // FLIPADST_ADST
  fidentity4x16_row_neon,  // IDTX
  fidentity4x16_row_neon,  // V_DCT
  fdct4x16_row_neon,       // H_DCT
  fidentity4x16_row_neon,  // V_ADST
  fadst4x16_row_neon,      // H_ADST
  fidentity4x16_row_neon,  // V_FLIPADST
  fadst4x16_row_neon       // H_FLIPADST
};

// Column pass, 8 lanes wide, length 16.
static const col_transform_1d_lbd_8_neon col_txfm8x16_arr[TX_TYPES] = {
  fdct8x16_col_neon,       // DCT_DCT
  fadst8x16_col_neon,      // ADST_DCT
  fdct8x16_col_neon,       // DCT_ADST
  fadst8x16_col_neon,      // ADST_ADST
  fadst8x16_col_neon,      // FLIPADST_DCT
  fdct8x16_col_neon,       // DCT_FLIPADST
  fadst8x16_col_neon,      // FLIPADST_FLIPADST
  fadst8x16_col_neon,      // ADST_FLIPADST
  fadst8x16_col_neon,      // FLIPADST_ADST
  fidentity8x16_col_neon,  // IDTX
  fdct8x16_col_neon,       // V_DCT
  fidentity8x16_col_neon,  // H_DCT
  fadst8x16_col_neon,      // V_ADST
  fidentity8x16_col_neon,  // H_ADST
  fadst8x16_col_neon,      // V_FLIPADST
  fidentity8x16_col_neon   // H_FLIPADST
};

// Row pass, 8 lanes wide, length 16.
static const row_transform_1d_lbd_8_neon row_txfm8x16_arr[TX_TYPES] = {
  fdct8x16_row_neon,       // DCT_DCT
  fdct8x16_row_neon,       // ADST_DCT
  fadst8x16_row_neon,      // DCT_ADST
  fadst8x16_row_neon,      // ADST_ADST
  fdct8x16_row_neon,       // FLIPADST_DCT
  fadst8x16_row_neon,      // DCT_FLIPADST
  fadst8x16_row_neon,      // FLIPADST_FLIPADST
  fadst8x16_row_neon,      // ADST_FLIPADST
  fadst8x16_row_neon,      // FLIPADST_ADST
  fidentity8x16_row_neon,  // IDTX
  fidentity8x16_row_neon,  // V_DCT
  fdct8x16_row_neon,       // H_DCT
  fidentity8x16_row_neon,  // V_ADST
  fadst8x16_row_neon,      // H_ADST
  fidentity8x16_row_neon,  // V_FLIPADST
  fadst8x16_row_neon       // H_FLIPADST
};

// Row pass with rectangular-block scaling, 8 lanes wide, length 16.
static const row_transform_1d_lbd_8_neon row_rect_txfm8x16_arr[TX_TYPES] = {
  fdct8x16_row_rect_neon,       // DCT_DCT
  fdct8x16_row_rect_neon,       // ADST_DCT
  fadst8x16_row_rect_neon,      // DCT_ADST
  fadst8x16_row_rect_neon,      // ADST_ADST
  fdct8x16_row_rect_neon,       // FLIPADST_DCT
  fadst8x16_row_rect_neon,      // DCT_FLIPADST
  fadst8x16_row_rect_neon,      // FLIPADST_FLIPADST
  fadst8x16_row_rect_neon,      // ADST_FLIPADST
  fadst8x16_row_rect_neon,      // FLIPADST_ADST
  fidentity8x16_row_rect_neon,  // IDTX
  fidentity8x16_row_rect_neon,  // V_DCT
  fdct8x16_row_rect_neon,       // H_DCT
  fidentity8x16_row_rect_neon,  // V_ADST
  fadst8x16_row_rect_neon,      // H_ADST
  fidentity8x16_row_rect_neon,  // V_FLIPADST
  fadst8x16_row_rect_neon       // H_FLIPADST
};
   1847 
// TX_TYPE-indexed dispatch tables for length-32 transforms. AV1 defines no
// 32-point ADST, so only DCT and identity kernels exist here; the NULL
// entries correspond to TX_TYPEs that cannot occur for 32-point dimensions
// and must never be dereferenced.
static const row_transform_1d_lbd_8_neon row_txfm8x32_arr[TX_TYPES] = {
  fdct8x32_row_neon,       // DCT_DCT
  NULL,                    // ADST_DCT
  NULL,                    // DCT_ADST
  NULL,                    // ADST_ADST
  NULL,                    // FLIPADST_DCT
  NULL,                    // DCT_FLIPADST
  NULL,                    // FLIPADST_FLIPADST
  NULL,                    // ADST_FLIPADST
  NULL,                    // FLIPADST_ADST
  fidentity8x32_row_neon,  // IDTX
  fidentity8x32_row_neon,  // V_DCT
  fdct8x32_row_neon,       // H_DCT
  NULL,                    // V_ADST
  NULL,                    // H_ADST
  NULL,                    // V_FLIPADST
  NULL                     // H_FLIPADST
};

// Row pass with rectangular-block scaling, 8 lanes wide, length 32.
static const row_transform_1d_lbd_8_neon row_rect_txfm8x32_arr[TX_TYPES] = {
  fdct8x32_row_rect_neon,       // DCT_DCT
  NULL,                         // ADST_DCT
  NULL,                         // DCT_ADST
  NULL,                         // ADST_ADST
  NULL,                         // FLIPADST_DCT
  NULL,                         // DCT_FLIPADST
  NULL,                         // FLIPADST_FLIPADST
  NULL,                         // ADST_FLIPADST
  NULL,                         // FLIPADST_ADST
  fidentity8x32_row_rect_neon,  // IDTX
  fidentity8x32_row_rect_neon,  // V_DCT
  fdct8x32_row_rect_neon,       // H_DCT
  NULL,                         // V_ADST
  NULL,                         // H_ADST
  NULL,                         // V_FLIPADST
  NULL                          // H_FLIPADST
};

// Column pass, 8 lanes wide, length 32.
static const col_transform_1d_lbd_8_neon col_txfm8x32_arr[TX_TYPES] = {
  fdct8x32_col_neon,       // DCT_DCT
  NULL,                    // ADST_DCT
  NULL,                    // DCT_ADST
  NULL,                    // ADST_ADST
  NULL,                    // FLIPADST_DCT
  NULL,                    // DCT_FLIPADST
  NULL,                    // FLIPADST_FLIPADST
  NULL,                    // ADST_FLIPADST
  NULL,                    // FLIPADST_ADST
  fidentity8x32_col_neon,  // IDTX
  fdct8x32_col_neon,       // V_DCT
  fidentity8x32_col_neon,  // H_DCT
  NULL,                    // V_ADST
  NULL,                    // H_ADST
  NULL,                    // V_FLIPADST
  NULL                     // H_FLIPADST
};
   1904 
// 4x4 low-bitdepth 2D forward transform: column pass, 4x4 transpose, row
// pass. Vertical (up/down) flips are applied up front by
// ud_adjust_input_and_stride(), which is why the FLIPADST_* column cases call
// the plain fadst4x4 column kernel. Horizontal (left/right) flips are applied
// between the transpose and the row pass via flip_buf_4_neon() in the
// *_FLIPADST row cases (except V_FLIPADST, whose row transform is identity).
static void lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
                                      int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);

  // buf0 holds the column-pass result, buf1 its transpose.
  int16x4_t buf0[4], buf1[4];
  switch (tx_type) {
    case DCT_DCT:
      fdct4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fdct4x4_row_neon(buf1, output, 4, 13);
      break;
    case ADST_DCT:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fdct4x4_row_neon(buf1, output, 4, 13);
      break;
    case DCT_ADST:
      fdct4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fadst4x4_row_neon(buf1, output, 4, 13);
      break;
    case ADST_ADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fadst4x4_row_neon(buf1, output, 4, 13);
      break;
    case FLIPADST_DCT:
      // Input already flipped vertically above, so plain ADST columns.
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fdct4x4_row_neon(buf1, output, 4, 13);
      break;
    case DCT_FLIPADST:
      fdct4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      flip_buf_4_neon(buf1, buf0, 4);
      fadst4x4_row_neon(buf0, output, 4, 13);
      break;
    case FLIPADST_FLIPADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      flip_buf_4_neon(buf1, buf0, 4);
      fadst4x4_row_neon(buf0, output, 4, 13);
      break;
    case ADST_FLIPADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      flip_buf_4_neon(buf1, buf0, 4);
      fadst4x4_row_neon(buf0, output, 4, 13);
      break;
    case FLIPADST_ADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fadst4x4_row_neon(buf1, output, 4, 13);
      break;
    case IDTX:
      fidentity4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fidentity4x4_row_neon(buf1, output, 4, 13);
      break;
    case V_DCT:
      fdct4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fidentity4x4_row_neon(buf1, output, 4, 13);
      break;
    case H_DCT:
      fidentity4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fdct4x4_row_neon(buf1, output, 4, 13);
      break;
    case V_ADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fidentity4x4_row_neon(buf1, output, 4, 13);
      break;
    case H_ADST:
      fidentity4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fadst4x4_row_neon(buf1, output, 4, 13);
      break;
    case V_FLIPADST:
      // Row pass is identity, so no horizontal flip is needed here.
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fidentity4x4_row_neon(buf1, output, 4, 13);
      break;
    case H_FLIPADST:
      fidentity4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      flip_buf_4_neon(buf1, buf0, 4);
      fadst4x4_row_neon(buf0, output, 4, 13);
      break;
  }
}
   2000 
   2001 static void lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
   2002                                      int stride, TX_TYPE tx_type, int bd) {
   2003  (void)bd;
   2004  int16x4_t buf0[8];
   2005  int16x8_t buf1[8];
   2006  const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x8_arr[tx_type];
   2007  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x4_arr[tx_type];
   2008 
   2009  int ud_flip, lr_flip;
   2010  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2011  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
   2012  col_txfm(input, buf0, stride, 13);
   2013  shift_right_1_round_s16_x4(buf0, buf0, 8);
   2014  transpose_arrays_s16_4x8(buf0, buf1);
   2015 
   2016  if (lr_flip) {
   2017    int16x8_t buf2[8];
   2018    flip_buf_8_neon(buf1, buf2, 4);
   2019    row_txfm(buf2, output, 8, 13);
   2020  } else {
   2021    row_txfm(buf1, output, 8, 13);
   2022  }
   2023 }
   2024 
   2025 static void lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
   2026                                       int stride, TX_TYPE tx_type, int bd) {
   2027  (void)bd;
   2028  int16x4_t buf0[16];
   2029  int16x8_t buf1[16];
   2030  const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x16_arr[tx_type];
   2031  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x4_arr[tx_type];
   2032  int ud_flip, lr_flip;
   2033 
   2034  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2035  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
   2036  col_txfm(input, buf0, stride, 13);
   2037  shift_right_1_round_s16_x4(buf0, buf0, 16);
   2038  transpose_arrays_s16_4x8(buf0, buf1);
   2039  transpose_arrays_s16_4x8(buf0 + 8, buf1 + 8);
   2040 
   2041  for (int i = 0; i < 2; i++) {
   2042    if (lr_flip) {
   2043      int16x8_t buf2[16];
   2044      flip_buf_8_neon(buf1 + 8 * i, buf2, 4);
   2045      row_txfm(buf2, output + 8 * i, 16, 12);
   2046    } else {
   2047      int16x8_t *buf = buf1 + 8 * i;
   2048      row_txfm(buf, output + 8 * i, 16, 12);
   2049    }
   2050  }
   2051 }
   2052 
   2053 static void lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
   2054                                      int stride, TX_TYPE tx_type, int bd) {
   2055  (void)bd;
   2056  int16x8_t buf0[8];
   2057  int16x4_t buf1[8];
   2058  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
   2059  const row_transform_1d_lbd_4_neon row_txfm = row_rect_txfm4x8_arr[tx_type];
   2060  int ud_flip, lr_flip;
   2061 
   2062  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2063  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
   2064  col_txfm(input, buf0, stride, 13);
   2065  shift_right_1_round_s16_x8(buf0, buf0, 4);
   2066  transpose_arrays_s16_8x4(buf0, buf1);
   2067 
   2068  if (lr_flip) {
   2069    int16x4_t buf2[8];
   2070    flip_buf_4_neon(buf1, buf2, 8);
   2071    row_txfm(buf2, output, 4, 13);
   2072  } else {
   2073    row_txfm(buf1, output, 4, 13);
   2074  }
   2075 }
   2076 
// 8x8 low-bitdepth 2D forward transform: column pass, one rounding shift,
// 8x8 transpose, row pass. As in the 4x4 variant, vertical flips are handled
// up front by ud_adjust_input_and_stride() (so FLIPADST_* columns use the
// plain fadst kernel) and horizontal flips via flip_buf_8_neon() before the
// row pass.
static void lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
                                      int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);

  // buf0 holds the column-pass result, buf1 its transpose.
  int16x8_t buf0[8], buf1[8];

  switch (tx_type) {
    case DCT_DCT:
      fdct8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fdct8x8_row_neon(buf1, output, 8, 13);
      break;
    case ADST_DCT:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fdct8x8_row_neon(buf1, output, 8, 13);
      break;
    case DCT_ADST:
      fdct8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fadst8x8_row_neon(buf1, output, 8, 13);
      break;
    case ADST_ADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fadst8x8_row_neon(buf1, output, 8, 13);
      break;
    case FLIPADST_DCT:
      // Input already flipped vertically above, so plain ADST columns.
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fdct8x8_row_neon(buf1, output, 8, 13);
      break;
    case DCT_FLIPADST:
      fdct8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      flip_buf_8_neon(buf1, buf0, 8);
      fadst8x8_row_neon(buf0, output, 8, 13);
      break;
    case FLIPADST_FLIPADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      flip_buf_8_neon(buf1, buf0, 8);
      fadst8x8_row_neon(buf0, output, 8, 13);
      break;
    case ADST_FLIPADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      flip_buf_8_neon(buf1, buf0, 8);
      fadst8x8_row_neon(buf0, output, 8, 13);
      break;
    case FLIPADST_ADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fadst8x8_row_neon(buf1, output, 8, 13);
      break;
    case IDTX:
      fidentity8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fidentity8x8_row_neon(buf1, output, 8, 13);
      break;
    case V_DCT:
      fdct8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fidentity8x8_row_neon(buf1, output, 8, 13);
      break;
    case H_DCT:
      fidentity8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fdct8x8_row_neon(buf1, output, 8, 13);
      break;
    case V_ADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fidentity8x8_row_neon(buf1, output, 8, 13);
      break;
    case H_ADST:
      fidentity8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fadst8x8_row_neon(buf1, output, 8, 13);
      break;
    case V_FLIPADST:
      // Row pass is identity, so no horizontal flip is needed here.
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fidentity8x8_row_neon(buf1, output, 8, 13);
      break;
    case H_FLIPADST:
      fidentity8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      flip_buf_8_neon(buf1, buf0, 8);
      fadst8x8_row_neon(buf0, output, 8, 13);
      break;
  }
}
   2189 
   2190 static void lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
   2191                                       int stride, TX_TYPE tx_type, int bd) {
   2192  (void)bd;
   2193  int16x8_t buf0[16], buf1[16];
   2194  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
   2195  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x8_arr[tx_type];
   2196  int ud_flip, lr_flip;
   2197 
   2198  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2199  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
   2200  col_txfm(input, buf0, stride, 13);
   2201  shift_right_2_round_s16_x8(buf0, buf0, 16);
   2202  transpose_arrays_s16_8x8(buf0, buf1);
   2203  transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
   2204 
   2205  for (int i = 0; i < 2; i++) {
   2206    if (lr_flip) {
   2207      flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
   2208      row_txfm(buf0, output + 8 * i, 16, 13);
   2209    } else {
   2210      int16x8_t *buf = buf1 + 8 * i;
   2211      row_txfm(buf, output + 8 * i, 16, 13);
   2212    }
   2213  }
   2214 }
   2215 
   2216 static void lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
   2217                                       int stride, TX_TYPE tx_type, int bd) {
   2218  (void)bd;
   2219  int16x8_t buf0[32], buf1[32];
   2220  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
   2221  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
   2222  int ud_flip, lr_flip;
   2223 
   2224  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2225  ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
   2226  col_txfm(input, buf0, stride, 12);
   2227  shift_right_2_round_s16_x8(buf0, buf0, 32);
   2228  transpose_arrays_s16_8x8(buf0, buf1);
   2229  transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
   2230  transpose_arrays_s16_8x8(buf0 + 16, buf1 + 16);
   2231  transpose_arrays_s16_8x8(buf0 + 24, buf1 + 24);
   2232 
   2233  for (int i = 0; i < 4; i++) {
   2234    if (lr_flip) {
   2235      flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
   2236      row_txfm(buf0, output + 8 * i, 32, 12);
   2237    } else {
   2238      int16x8_t *buf = buf1 + 8 * i;
   2239      row_txfm(buf, output + 8 * i, 32, 12);
   2240    }
   2241  }
   2242 }
   2243 
   2244 static void lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
   2245                                       int stride, TX_TYPE tx_type, int bd) {
   2246  (void)bd;
   2247  int16x8_t buf0[16];
   2248  int16x4_t buf1[16];
   2249  int16x4_t buf2[16];
   2250  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
   2251  const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x16_arr[tx_type];
   2252  int ud_flip, lr_flip;
   2253 
   2254  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2255  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
   2256  for (int i = 0; i < 2; i++) {
   2257    col_txfm(input + 8 * i, buf0, stride, 13);
   2258    shift_right_1_round_s16_x8(buf0, buf0, 4);
   2259    transpose_arrays_s16_8x4(buf0, buf1 + 8 * i);
   2260  }
   2261 
   2262  if (lr_flip) {
   2263    flip_buf_4_neon(buf1, buf2, 16);
   2264    row_txfm(buf2, output, 4, 13);
   2265  } else {
   2266    row_txfm(buf1, output, 4, 13);
   2267  }
   2268 }
   2269 
   2270 static void lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
   2271                                       int stride, TX_TYPE tx_type, int bd) {
   2272  (void)bd;
   2273  int16x8_t buf0[16], buf1[16];
   2274  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
   2275  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
   2276  int ud_flip, lr_flip;
   2277 
   2278  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2279  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
   2280  for (int i = 0; i < 2; i++) {
   2281    col_txfm(input + 8 * i, buf0, stride, 13);
   2282    shift_right_2_round_s16_x8(buf0, buf0, 8);
   2283    transpose_arrays_s16_8x8(buf0, buf1 + 8 * i);
   2284  }
   2285 
   2286  if (lr_flip) {
   2287    flip_buf_8_neon(buf1, buf0, 16);
   2288    row_txfm(buf0, output, 8, 13);
   2289  } else {
   2290    row_txfm(buf1, output, 8, 13);
   2291  }
   2292 }
   2293 
   2294 static void lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
   2295                                        int stride, TX_TYPE tx_type, int bd) {
   2296  (void)bd;
   2297  int16x8_t buf0[16], buf1[32];
   2298  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
   2299  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x16_arr[tx_type];
   2300  int ud_flip, lr_flip;
   2301 
   2302  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2303  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
   2304  for (int i = 0; i < 2; i++) {
   2305    col_txfm(input + 8 * i, buf0, stride, 13);
   2306    shift_right_2_round_s16_x8(buf0, buf0, 16);
   2307    transpose_arrays_s16_8x8(buf0, buf1 + 0 * 16 + 8 * i);
   2308    transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 16 + 8 * i);
   2309  }
   2310 
   2311  for (int i = 0; i < 2; i++) {
   2312    if (lr_flip) {
   2313      flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
   2314      row_txfm(buf0, output + 8 * i, 16, 12);
   2315    } else {
   2316      int16x8_t *buf = buf1 + 16 * i;
   2317      row_txfm(buf, output + 8 * i, 16, 12);
   2318    }
   2319  }
   2320 }
   2321 
   2322 static void lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
   2323                                        int stride, TX_TYPE tx_type, int bd) {
   2324  (void)bd;
   2325  int16x8_t buf0[32], buf1[64];
   2326  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
   2327  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
   2328 
   2329  if (col_txfm == NULL || row_txfm == NULL) {
   2330    av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
   2331    return;
   2332  }
   2333 
   2334  int ud_flip, lr_flip;
   2335  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2336  ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
   2337  for (int i = 0; i < 2; i++) {
   2338    col_txfm(input + 8 * i, buf0, stride, 12);
   2339    shift_right_4_round_s16_x8(buf0, buf0, 32);
   2340    transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 16 + 8 * i);
   2341    transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 16 + 8 * i);
   2342    transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 16 + 8 * i);
   2343    transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 16 + 8 * i);
   2344  }
   2345 
   2346  for (int i = 0; i < 4; i++) {
   2347    if (lr_flip) {
   2348      flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
   2349      row_txfm(buf0, output + 8 * i, 32, 13);
   2350    } else {
   2351      int16x8_t *buf = buf1 + 16 * i;
   2352      row_txfm(buf, output + 8 * i, 32, 13);
   2353    }
   2354  }
   2355 }
   2356 
   2357 static void lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
   2358                                       int stride, TX_TYPE tx_type, int bd) {
   2359  (void)bd;
   2360  int16x8_t buf0[32], buf1[32];
   2361  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
   2362  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
   2363 
   2364  if (col_txfm == NULL || row_txfm == NULL) {
   2365    av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
   2366    return;
   2367  }
   2368 
   2369  int ud_flip, lr_flip;
   2370  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2371  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
   2372  for (int i = 0; i < 4; i++) {
   2373    col_txfm(input + 8 * i, buf0, stride, 13);
   2374    shift_right_2_round_s16_x8(buf0, buf0, 8);
   2375    transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
   2376  }
   2377 
   2378  if (lr_flip) {
   2379    flip_buf_8_neon(buf1, buf0, 32);
   2380    row_txfm(buf0, output, 8, 12);
   2381  } else {
   2382    row_txfm(buf1, output, 8, 12);
   2383  }
   2384 }
   2385 
   2386 static void lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
   2387                                        int stride, TX_TYPE tx_type, int bd) {
   2388  (void)bd;
   2389  int16x8_t buf0[32], buf1[64];
   2390  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
   2391  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x32_arr[tx_type];
   2392 
   2393  if (col_txfm == NULL || row_txfm == NULL) {
   2394    av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
   2395    return;
   2396  }
   2397 
   2398  int ud_flip, lr_flip;
   2399  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2400  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
   2401  for (int i = 0; i < 4; i++) {
   2402    col_txfm(input + 8 * i, buf0, stride, 13);
   2403    shift_right_4_round_s16_x8(buf0, buf0, 16);
   2404    transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
   2405    transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 32 + 8 * i);
   2406  }
   2407 
   2408  for (int i = 0; i < 2; i++) {
   2409    if (lr_flip) {
   2410      flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
   2411      row_txfm(buf0, output + 8 * i, 16, 13);
   2412    } else {
   2413      int16x8_t *buf = buf1 + 32 * i;
   2414      row_txfm(buf, output + 8 * i, 16, 13);
   2415    }
   2416  }
   2417 }
   2418 
   2419 static void lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
   2420                                        int stride, TX_TYPE tx_type, int bd) {
   2421  (void)bd;
   2422  int16x8_t buf0[32], buf1[128];
   2423  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
   2424  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
   2425 
   2426  if (col_txfm == NULL || row_txfm == NULL) {
   2427    av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
   2428    return;
   2429  }
   2430 
   2431  int ud_flip, lr_flip;
   2432  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2433  ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
   2434  for (int i = 0; i < 4; i++) {
   2435    col_txfm(input + 8 * i, buf0, stride, 12);
   2436    shift_right_4_round_s16_x8(buf0, buf0, 32);
   2437    transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 32 + 8 * i);
   2438    transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 32 + 8 * i);
   2439    transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 32 + 8 * i);
   2440    transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 32 + 8 * i);
   2441  }
   2442 
   2443  for (int i = 0; i < 4; i++) {
   2444    if (lr_flip) {
   2445      flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
   2446      row_txfm(buf0, output + 8 * i, 32, 12);
   2447    } else {
   2448      int16x8_t *buf = buf1 + 32 * i;
   2449      row_txfm(buf, output + 8 * i, 32, 12);
   2450    }
   2451  }
   2452 }
   2453 
   2454 static void lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output,
   2455                                        int stride, TX_TYPE tx_type, int bd) {
   2456  (void)bd;
   2457  (void)tx_type;
   2458  assert(tx_type == DCT_DCT);
   2459  int16x8_t buf0[64], buf1[128];
   2460  const transform_1d_lbd_8_neon col_txfm = fdct8x16_neon;
   2461  const transform_1d_lbd_8_neon row_txfm = fdct8x64_neon;
   2462 
   2463  for (int i = 0; i < 8; i++) {
   2464    load_buffer_s16_x8(input + 8 * i, stride, buf0, 16);
   2465    shift_left_2_s16_x8(buf0, buf0, 16);
   2466    col_txfm(buf0, buf0, 13);
   2467    shift_right_4_round_s16_x8(buf0, buf0, 16);
   2468    for (int j = 0; j < 2; ++j) {
   2469      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
   2470    }
   2471  }
   2472 
   2473  for (int i = 0; i < 2; i++) {
   2474    int16x8_t *buf = buf1 + 64 * i;
   2475    row_txfm(buf, buf, 12);
   2476    store_buffer_s16_x8(buf, output + 8 * i, 16, 32);
   2477  }
   2478  // Zero out the bottom 16x32 area.
   2479  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
   2480 }
   2481 
   2482 static void lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output,
   2483                                        int stride, TX_TYPE tx_type, int bd) {
   2484  (void)bd;
   2485  (void)tx_type;
   2486  assert(tx_type == DCT_DCT);
   2487  int16x8_t buf0[64], buf1[128];
   2488  const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
   2489  const transform_1d_lbd_8_neon row_txfm = fdct8x16_neon;
   2490 
   2491  for (int i = 0; i < 2; i++) {
   2492    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
   2493    col_txfm(buf0, buf0, 13);
   2494    shift_right_2_round_s16_x8(buf0, buf0, 64);
   2495    for (int j = 0; j < 8; ++j) {
   2496      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 16 + 8 * i);
   2497    }
   2498  }
   2499 
   2500  for (int i = 0; i < 4; i++) {
   2501    int16x8_t *buf = buf1 + 16 * i;
   2502    row_txfm(buf, buf, 12);
   2503    store_buffer_s16_x8(buf, output + 8 * i, 32, 16);
   2504  }
   2505 }
   2506 
// 32-point forward DCT kernel operating on 4-lane int32 vectors (input and
// output are arrays of 32 int32x4_t, one vector per coefficient row). The
// cosine weights are loaded in Q2.13 form from the table selected by cos_bit;
// see the butterfly helper naming note at the top of this file for the
// 0112/1223/1003/0332 suffix convention. Processing ping-pongs between buf0
// and buf1 through nine stages; the final stage permutes the butterfly
// ordering into the output coefficient order.
static void fdct32_neon(const int32x4_t *input, int32x4_t *output,
                        int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  // Each vld1q_s16 picks up two adjacent 4-tuples of weights.
  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
  const int16x4_t cospi30 = vget_high_s16(cospi26_30);

  // Ping-pong buffers for the stage pipeline.
  int32x4_t buf0[32];
  int32x4_t buf1[32];

  // stage 1
  butterfly_dct_pre_s32_x4(input, buf1, 32);

  // stage 2
  butterfly_dct_pre_s32_x4(buf1, buf0, 16);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  buf0[18] = buf1[18];
  buf0[19] = buf1[19];
  butterfly_s32_s32_x4_0112_neon(cospi32, buf1[27], buf1[20], &buf0[27],
                                 &buf0[20]);
  butterfly_s32_s32_x4_0112_neon(cospi32, buf1[26], buf1[21], &buf0[26],
                                 &buf0[21]);
  butterfly_s32_s32_x4_0112_neon(cospi32, buf1[25], buf1[22], &buf0[25],
                                 &buf0[22]);
  butterfly_s32_s32_x4_0112_neon(cospi32, buf1[24], buf1[23], &buf0[24],
                                 &buf0[23]);
  buf0[28] = buf1[28];
  buf0[29] = buf1[29];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 3
  butterfly_dct_pre_s32_x4(buf0, buf1, 8);
  buf1[8] = buf0[8];
  buf1[9] = buf0[9];
  butterfly_s32_s32_x4_0112_neon(cospi32, buf0[13], buf0[10], &buf1[13],
                                 &buf1[10]);
  butterfly_s32_s32_x4_0112_neon(cospi32, buf0[12], buf0[11], &buf1[12],
                                 &buf1[11]);
  buf1[14] = buf0[14];
  buf1[15] = buf0[15];
  butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 16);

  // stage 4
  butterfly_dct_pre_s32_x4(buf1, buf0, 4);
  buf0[4] = buf1[4];
  butterfly_s32_s32_x4_0112_neon(cospi32, buf1[6], buf1[5], &buf0[6], &buf0[5]);
  buf0[7] = buf1[7];
  butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 8);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  butterfly_s32_s32_x4_0112_neon(cospi16, buf1[29], buf1[18], &buf0[29],
                                 &buf0[18]);
  butterfly_s32_s32_x4_0112_neon(cospi16, buf1[28], buf1[19], &buf0[28],
                                 &buf0[19]);
  butterfly_s32_s32_x4_1223_neon(cospi16, buf1[27], buf1[20], &buf0[27],
                                 &buf0[20]);
  butterfly_s32_s32_x4_1223_neon(cospi16, buf1[26], buf1[21], &buf0[26],
                                 &buf0[21]);
  buf0[22] = buf1[22];
  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[25] = buf1[25];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 5
  butterfly_s32_s32_x4_0112_neon(cospi32, buf0[0], buf0[1], &buf1[0], &buf1[1]);
  butterfly_s32_s32_x4_0112_neon(cospi16, buf0[3], buf0[2], &buf1[2], &buf1[3]);
  butterfly_dct_post_s32_x4(buf0 + 4, buf0 + 4, buf1 + 4, 4);
  buf1[8] = buf0[8];
  butterfly_s32_s32_x4_0112_neon(cospi16, buf0[14], buf0[9], &buf1[14],
                                 &buf1[9]);
  butterfly_s32_s32_x4_1223_neon(cospi16, buf0[13], buf0[10], &buf1[13],
                                 &buf1[10]);
  buf1[11] = buf0[11];
  buf1[12] = buf0[12];
  buf1[15] = buf0[15];
  butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 8);
  butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 8);

  // stage 6
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];
  butterfly_s32_s32_x4_0112_neon(cospi8, buf1[7], buf1[4], &buf0[4], &buf0[7]);
  butterfly_s32_s32_x4_1003_neon(cospi24, buf1[6], buf1[5], &buf0[5], &buf0[6]);
  butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 4);
  butterfly_dct_post_s32_x4(buf1 + 12, buf1 + 12, buf0 + 12, 4);
  buf0[16] = buf1[16];
  butterfly_s32_s32_x4_0112_neon(cospi8, buf1[30], buf1[17], &buf0[30],
                                 &buf0[17]);
  butterfly_s32_s32_x4_1223_neon(cospi8, buf1[29], buf1[18], &buf0[29],
                                 &buf0[18]);
  buf0[19] = buf1[19];
  buf0[20] = buf1[20];
  butterfly_s32_s32_x4_1003_neon(cospi24, buf1[26], buf1[21], &buf0[26],
                                 &buf0[21]);
  butterfly_s32_s32_x4_0332_neon(cospi24, buf1[25], buf1[22], &buf0[25],
                                 &buf0[22]);
  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[27] = buf1[27];
  buf0[28] = buf1[28];
  buf0[31] = buf1[31];

  // stage 7
  buf1[0] = buf0[0];
  buf1[1] = buf0[1];
  buf1[2] = buf0[2];
  buf1[3] = buf0[3];
  buf1[4] = buf0[4];
  buf1[5] = buf0[5];
  buf1[6] = buf0[6];
  buf1[7] = buf0[7];
  butterfly_s32_s32_x4_0112_neon(cospi4, buf0[15], buf0[8], &buf1[8],
                                 &buf1[15]);
  butterfly_s32_s32_x4_1003_neon(cospi28, buf0[14], buf0[9], &buf1[9],
                                 &buf1[14]);
  butterfly_s32_s32_x4_0112_neon(cospi20, buf0[13], buf0[10], &buf1[10],
                                 &buf1[13]);
  butterfly_s32_s32_x4_1003_neon(cospi12, buf0[12], buf0[11], &buf1[11],
                                 &buf1[12]);
  butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 4);
  butterfly_dct_post_s32_x4(buf0 + 20, buf0 + 20, buf1 + 20, 4);
  butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 4);
  butterfly_dct_post_s32_x4(buf0 + 28, buf0 + 28, buf1 + 28, 4);

  // stage 8
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];
  buf0[4] = buf1[4];
  buf0[5] = buf1[5];
  buf0[6] = buf1[6];
  buf0[7] = buf1[7];
  buf0[8] = buf1[8];
  buf0[9] = buf1[9];
  buf0[10] = buf1[10];
  buf0[11] = buf1[11];
  buf0[12] = buf1[12];
  buf0[13] = buf1[13];
  buf0[14] = buf1[14];
  buf0[15] = buf1[15];
  butterfly_s32_s32_x4_0112_neon(cospi2, buf1[31], buf1[16], &buf0[16],
                                 &buf0[31]);
  butterfly_s32_s32_x4_1003_neon(cospi30, buf1[30], buf1[17], &buf0[17],
                                 &buf0[30]);
  butterfly_s32_s32_x4_0112_neon(cospi18, buf1[29], buf1[18], &buf0[18],
                                 &buf0[29]);
  butterfly_s32_s32_x4_1003_neon(cospi14, buf1[28], buf1[19], &buf0[19],
                                 &buf0[28]);
  butterfly_s32_s32_x4_0112_neon(cospi10, buf1[27], buf1[20], &buf0[20],
                                 &buf0[27]);
  butterfly_s32_s32_x4_1003_neon(cospi22, buf1[26], buf1[21], &buf0[21],
                                 &buf0[26]);
  butterfly_s32_s32_x4_0112_neon(cospi26, buf1[25], buf1[22], &buf0[22],
                                 &buf0[25]);
  butterfly_s32_s32_x4_1003_neon(cospi6, buf1[24], buf1[23], &buf0[23],
                                 &buf0[24]);

  // stage 9: permute the internal butterfly ordering into the final
  // coefficient order.
  output[0] = buf0[0];
  output[1] = buf0[16];
  output[2] = buf0[8];
  output[3] = buf0[24];
  output[4] = buf0[4];
  output[5] = buf0[20];
  output[6] = buf0[12];
  output[7] = buf0[28];
  output[8] = buf0[2];
  output[9] = buf0[18];
  output[10] = buf0[10];
  output[11] = buf0[26];
  output[12] = buf0[6];
  output[13] = buf0[22];
  output[14] = buf0[14];
  output[15] = buf0[30];
  output[16] = buf0[1];
  output[17] = buf0[17];
  output[18] = buf0[9];
  output[19] = buf0[25];
  output[20] = buf0[5];
  output[21] = buf0[21];
  output[22] = buf0[13];
  output[23] = buf0[29];
  output[24] = buf0[3];
  output[25] = buf0[19];
  output[26] = buf0[11];
  output[27] = buf0[27];
  output[28] = buf0[7];
  output[29] = buf0[23];
  output[30] = buf0[15];
  output[31] = buf0[31];
}
   2728 
// Forward 64-point DCT operating on columns of four packed 32-bit
// coefficients. `input` and `output` each point at 64 int32x4_t vectors;
// `cos_bit` selects the precision of the Q2.13 cosine table returned by
// cospi_arr_q13(). Note that only output[0..31] are written (see stage 11
// below); callers in this file keep only the low 32 coefficients of the
// 64-point transform.
static void fdct64_neon(const int32x4_t *input, int32x4_t *output,
                        int cos_bit) {
  // Cosine weights are stored as 4-tuples (w, 1-w, -w, w-1) in Q2.13 — see
  // the butterfly helper naming note at the top of this file.
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  // Each vld1q loads two adjacent 4-tuples (two cosine constants) at once.
  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
  const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
  const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
  const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
  const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
  const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
  const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
  const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
  const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);

  // Split each pair back into individual 4-tuple constants.
  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
  const int16x4_t cospi1 = vget_low_s16(cospi1_3);
  const int16x4_t cospi3 = vget_high_s16(cospi1_3);
  const int16x4_t cospi5 = vget_low_s16(cospi5_7);
  const int16x4_t cospi7 = vget_high_s16(cospi5_7);
  const int16x4_t cospi9 = vget_low_s16(cospi9_11);
  const int16x4_t cospi11 = vget_high_s16(cospi9_11);
  const int16x4_t cospi13 = vget_low_s16(cospi13_15);
  const int16x4_t cospi15 = vget_high_s16(cospi13_15);
  const int16x4_t cospi17 = vget_low_s16(cospi17_19);
  const int16x4_t cospi19 = vget_high_s16(cospi17_19);
  const int16x4_t cospi21 = vget_low_s16(cospi21_23);
  const int16x4_t cospi23 = vget_high_s16(cospi21_23);
  const int16x4_t cospi25 = vget_low_s16(cospi25_27);
  const int16x4_t cospi27 = vget_high_s16(cospi25_27);
  const int16x4_t cospi29 = vget_low_s16(cospi29_31);
  const int16x4_t cospi31 = vget_high_s16(cospi29_31);

  // stage 1: initial add/sub butterflies over all 64 inputs.
  int32x4_t x1[64];
  butterfly_dct_pre_s32_x4(input, x1, 64);

  // stage 2: add/sub on the low half, rotations on the 40..55 band.
  int32x4_t x2[64];
  butterfly_dct_pre_s32_x4(x1, x2, 32);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]);

  // stage 3
  int32x4_t x3[64];
  butterfly_dct_pre_s32_x4(x2, x3, 16);
  butterfly_s32_s32_x4_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]);
  butterfly_dct_post_s32_x4(x1 + 32, x2 + 32, x3 + 32, 32);

  // stage 4
  int32x4_t x4[64];
  butterfly_dct_pre_s32_x4(x3, x4, 8);
  butterfly_s32_s32_x4_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]);
  butterfly_dct_post_s32_x4(x2 + 16, x3 + 16, x4 + 16, 16);
  butterfly_s32_s32_x4_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]);
  butterfly_s32_s32_x4_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]);
  butterfly_s32_s32_x4_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]);
  butterfly_s32_s32_x4_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]);

  // stage 5
  int32x4_t x5[64];
  butterfly_dct_pre_s32_x4(x4, x5, 4);
  butterfly_s32_s32_x4_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]);
  butterfly_dct_post_s32_x4(x3 + 8, x4 + 8, x5 + 8, 8);
  butterfly_s32_s32_x4_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]);
  butterfly_s32_s32_x4_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]);
  butterfly_dct_post_s32_x4(x3 + 32, x4 + 32, x5 + 32, 16);
  butterfly_dct_post_s32_x4(x3 + 48, x4 + 48, x5 + 48, 16);

  // stage 6: x6[0..3] are final DC/low-frequency outputs (read in stage 11).
  int32x4_t x6[64];
  butterfly_s32_s32_x4_0112_neon(cospi32, x5[0], x5[1], &x6[0], &x6[1]);
  butterfly_s32_s32_x4_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]);
  butterfly_dct_post_s32_x4(x4 + 4, x5 + 4, x6 + 4, 4);
  butterfly_s32_s32_x4_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]);
  butterfly_dct_post_s32_x4(x4 + 16, x5 + 16, x6 + 16, 8);
  butterfly_dct_post_s32_x4(x4 + 24, x5 + 24, x6 + 24, 8);
  butterfly_s32_s32_x4_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]);
  butterfly_s32_s32_x4_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]);
  butterfly_s32_s32_x4_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]);
  butterfly_s32_s32_x4_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]);
  butterfly_s32_s32_x4_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]);
  butterfly_s32_s32_x4_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]);
  butterfly_s32_s32_x4_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]);
  butterfly_s32_s32_x4_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]);

  // stage 7
  int32x4_t x7[64];
  butterfly_s32_s32_x4_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]);
  butterfly_s32_s32_x4_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]);
  butterfly_dct_post_s32_x4(x5 + 8, x6 + 8, x7 + 8, 4);
  butterfly_dct_post_s32_x4(x5 + 12, x6 + 12, x7 + 12, 4);
  butterfly_s32_s32_x4_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]);
  butterfly_s32_s32_x4_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]);
  butterfly_s32_s32_x4_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]);
  butterfly_s32_s32_x4_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]);
  butterfly_dct_post_s32_x4(x5 + 32, x6 + 32, x7 + 32, 8);
  butterfly_dct_post_s32_x4(x5 + 40, x6 + 40, x7 + 40, 8);
  butterfly_dct_post_s32_x4(x5 + 48, x6 + 48, x7 + 48, 8);
  butterfly_dct_post_s32_x4(x5 + 56, x6 + 56, x7 + 56, 8);

  // stage 8
  int32x4_t x8[64];
  butterfly_s32_s32_x4_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]);
  butterfly_s32_s32_x4_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]);
  butterfly_s32_s32_x4_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]);
  butterfly_s32_s32_x4_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]);
  butterfly_dct_post_s32_x4(x6 + 16, x7 + 16, x8 + 16, 4);
  butterfly_dct_post_s32_x4(x6 + 20, x7 + 20, x8 + 20, 4);
  butterfly_dct_post_s32_x4(x6 + 24, x7 + 24, x8 + 24, 4);
  butterfly_dct_post_s32_x4(x6 + 28, x7 + 28, x8 + 28, 4);
  butterfly_s32_s32_x4_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]);
  butterfly_s32_s32_x4_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]);
  butterfly_s32_s32_x4_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]);
  butterfly_s32_s32_x4_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]);
  butterfly_s32_s32_x4_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]);
  butterfly_s32_s32_x4_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]);
  butterfly_s32_s32_x4_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]);
  butterfly_s32_s32_x4_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]);

  // stage 9
  int32x4_t x9[64];
  butterfly_s32_s32_x4_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]);
  butterfly_s32_s32_x4_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]);
  butterfly_s32_s32_x4_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]);
  butterfly_s32_s32_x4_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]);
  butterfly_s32_s32_x4_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]);
  butterfly_s32_s32_x4_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]);
  butterfly_s32_s32_x4_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]);
  butterfly_s32_s32_x4_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]);
  butterfly_dct_post_s32_x4(x7 + 32, x8 + 32, x9 + 32, 4);
  butterfly_dct_post_s32_x4(x7 + 36, x8 + 36, x9 + 36, 4);
  butterfly_dct_post_s32_x4(x7 + 40, x8 + 40, x9 + 40, 4);
  butterfly_dct_post_s32_x4(x7 + 44, x8 + 44, x9 + 44, 4);
  butterfly_dct_post_s32_x4(x7 + 48, x8 + 48, x9 + 48, 4);
  butterfly_dct_post_s32_x4(x7 + 52, x8 + 52, x9 + 52, 4);
  butterfly_dct_post_s32_x4(x7 + 56, x8 + 56, x9 + 56, 4);
  butterfly_dct_post_s32_x4(x7 + 60, x8 + 60, x9 + 60, 4);

  // stage 10: final rotations for the odd-frequency outputs using the
  // odd-index cosine constants (cospi1..cospi31).
  int32x4_t x10[64];
  butterfly_s32_s32_x4_0112_neon(cospi1, x9[63], x9[32], &x10[32], &x10[63]);
  butterfly_s32_s32_x4_1003_neon(cospi31, x9[62], x9[33], &x10[33], &x10[62]);
  butterfly_s32_s32_x4_0112_neon(cospi17, x9[61], x9[34], &x10[34], &x10[61]);
  butterfly_s32_s32_x4_1003_neon(cospi15, x9[60], x9[35], &x10[35], &x10[60]);
  butterfly_s32_s32_x4_0112_neon(cospi9, x9[59], x9[36], &x10[36], &x10[59]);
  butterfly_s32_s32_x4_1003_neon(cospi23, x9[58], x9[37], &x10[37], &x10[58]);
  butterfly_s32_s32_x4_0112_neon(cospi25, x9[57], x9[38], &x10[38], &x10[57]);
  butterfly_s32_s32_x4_1003_neon(cospi7, x9[56], x9[39], &x10[39], &x10[56]);
  butterfly_s32_s32_x4_0112_neon(cospi5, x9[55], x9[40], &x10[40], &x10[55]);
  butterfly_s32_s32_x4_1003_neon(cospi27, x9[54], x9[41], &x10[41], &x10[54]);
  butterfly_s32_s32_x4_0112_neon(cospi21, x9[53], x9[42], &x10[42], &x10[53]);
  butterfly_s32_s32_x4_1003_neon(cospi11, x9[52], x9[43], &x10[43], &x10[52]);
  butterfly_s32_s32_x4_0112_neon(cospi13, x9[51], x9[44], &x10[44], &x10[51]);
  butterfly_s32_s32_x4_1003_neon(cospi19, x9[50], x9[45], &x10[45], &x10[50]);
  butterfly_s32_s32_x4_0112_neon(cospi29, x9[49], x9[46], &x10[46], &x10[49]);
  butterfly_s32_s32_x4_1003_neon(cospi3, x9[48], x9[47], &x10[47], &x10[48]);

  // stage 11, only store into the low 32 output indices.
  // Outputs are gathered in bit-reversed-style frequency order from the
  // stage buffers in which each coefficient was finalized.
  output[0] = x6[0];
  output[1] = x10[32];
  output[2] = x9[16];
  output[3] = x10[48];
  output[4] = x8[8];
  output[5] = x10[40];
  output[6] = x9[24];
  output[7] = x10[56];
  output[8] = x7[4];
  output[9] = x10[36];
  output[10] = x9[20];
  output[11] = x10[52];
  output[12] = x8[12];
  output[13] = x10[44];
  output[14] = x9[28];
  output[15] = x10[60];
  output[16] = x6[2];
  output[17] = x10[34];
  output[18] = x9[18];
  output[19] = x10[50];
  output[20] = x8[10];
  output[21] = x10[42];
  output[22] = x9[26];
  output[23] = x10[58];
  output[24] = x7[6];
  output[25] = x10[38];
  output[26] = x9[22];
  output[27] = x10[54];
  output[28] = x8[14];
  output[29] = x10[46];
  output[30] = x9[30];
  output[31] = x10[62];
}
   2959 
   2960 static void lowbd_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
   2961                                        int stride, TX_TYPE tx_type, int bd) {
   2962  (void)bd;
   2963  (void)tx_type;
   2964  assert(tx_type == DCT_DCT);
   2965  int16x8_t buf0[64], buf1[512];
   2966  const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
   2967 
   2968  for (int i = 0; i < 8; i++) {
   2969    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
   2970    col_txfm(buf0, buf0, 13);
   2971    shift_right_2_round_s16_x8(buf0, buf0, 64);
   2972    for (int j = 0; j < 4; ++j) {
   2973      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
   2974    }
   2975  }
   2976  for (int i = 0; i < 4; i++) {
   2977    int32x4_t bufA[64];
   2978    int32x4_t bufB[64];
   2979    int16x8_t *buf = buf1 + 64 * i;
   2980    for (int j = 0; j < 64; ++j) {
   2981      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
   2982      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
   2983    }
   2984    fdct64_neon(bufA, bufA, 10);
   2985    fdct64_neon(bufB, bufB, 10);
   2986    shift_right_2_round_s32_x4(bufA, bufA, 32);
   2987    shift_right_2_round_s32_x4(bufB, bufB, 32);
   2988    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
   2989  }
   2990 }
   2991 
   2992 static void lowbd_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *output,
   2993                                        int stride, TX_TYPE tx_type, int bd) {
   2994  (void)bd;
   2995  int16x8_t buf0[64], buf1[256];
   2996  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
   2997 
   2998  for (int i = 0; i < 8; i++) {
   2999    col_txfm(input + 8 * i, buf0, stride, 12);
   3000    shift_right_4_round_s16_x8(buf0, buf0, 32);
   3001    for (int j = 0; j < 4; ++j) {
   3002      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
   3003    }
   3004  }
   3005  assert(tx_type == DCT_DCT);
   3006  for (int i = 0; i < 4; i++) {
   3007    int32x4_t bufA[64];
   3008    int32x4_t bufB[64];
   3009    int16x8_t *buf = buf1 + 64 * i;
   3010    for (int j = 0; j < 64; ++j) {
   3011      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
   3012      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
   3013    }
   3014    fdct64_neon(bufA, bufA, 11);
   3015    fdct64_neon(bufB, bufB, 11);
   3016    shift_right_2_round_s32_x4(bufA, bufA, 32);
   3017    shift_right_2_round_s32_x4(bufB, bufB, 32);
   3018    round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
   3019    round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
   3020    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
   3021  }
   3022 }
   3023 
   3024 static void lowbd_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *output,
   3025                                        int stride, TX_TYPE tx_type, int bd) {
   3026  (void)bd;
   3027  (void)tx_type;
   3028  assert(tx_type == DCT_DCT);
   3029  int16x8_t buf0[64], buf1[256];
   3030  const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
   3031 
   3032  for (int i = 0; i < 4; i++) {
   3033    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
   3034    col_txfm(buf0, buf0, 13);
   3035    shift_right_2_round_s16_x8(buf0, buf0, 64);
   3036    for (int j = 0; j < 4; ++j) {
   3037      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 32 + 8 * i);
   3038    }
   3039  }
   3040 
   3041  for (int i = 0; i < 4; i++) {
   3042    int32x4_t bufA[32];
   3043    int32x4_t bufB[32];
   3044    int16x8_t *buf = buf1 + 32 * i;
   3045    for (int j = 0; j < 32; ++j) {
   3046      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
   3047      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
   3048    }
   3049    fdct32_neon(bufA, bufA, 11);
   3050    fdct32_neon(bufB, bufB, 11);
   3051    shift_right_2_round_s32_x4(bufA, bufA, 32);
   3052    shift_right_2_round_s32_x4(bufB, bufB, 32);
   3053    round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
   3054    round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
   3055    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
   3056  }
   3057 }
   3058 
// Dispatch table of 2D forward-transform kernels, indexed by tx_size in
// av1_lowbd_fwd_txfm_neon() below; entry order must match the TX_SIZES_ALL
// enumeration.
static FwdTxfm2dFunc lowbd_fwd_txfm_func_ls[TX_SIZES_ALL] = {
  lowbd_fwd_txfm2d_4x4_neon,    // 4x4 transform
  lowbd_fwd_txfm2d_8x8_neon,    // 8x8 transform
  lowbd_fwd_txfm2d_16x16_neon,  // 16x16 transform
  lowbd_fwd_txfm2d_32x32_neon,  // 32x32 transform
  lowbd_fwd_txfm2d_64x64_neon,  // 64x64 transform
  lowbd_fwd_txfm2d_4x8_neon,    // 4x8 transform
  lowbd_fwd_txfm2d_8x4_neon,    // 8x4 transform
  lowbd_fwd_txfm2d_8x16_neon,   // 8x16 transform
  lowbd_fwd_txfm2d_16x8_neon,   // 16x8 transform
  lowbd_fwd_txfm2d_16x32_neon,  // 16x32 transform
  lowbd_fwd_txfm2d_32x16_neon,  // 32x16 transform
  lowbd_fwd_txfm2d_32x64_neon,  // 32x64 transform
  lowbd_fwd_txfm2d_64x32_neon,  // 64x32 transform
  lowbd_fwd_txfm2d_4x16_neon,   // 4x16 transform
  lowbd_fwd_txfm2d_16x4_neon,   // 16x4 transform
  lowbd_fwd_txfm2d_8x32_neon,   // 8x32 transform
  lowbd_fwd_txfm2d_32x8_neon,   // 32x8 transform
  lowbd_fwd_txfm2d_16x64_neon,  // 16x64 transform
  lowbd_fwd_txfm2d_64x16_neon,  // 64x16 transform
};
   3080 
   3081 void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff,
   3082                             int diff_stride, TxfmParam *txfm_param) {
   3083  FwdTxfm2dFunc fwd_txfm2d_func = lowbd_fwd_txfm_func_ls[txfm_param->tx_size];
   3084  if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
   3085    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
   3086  } else {
   3087    fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
   3088                    txfm_param->bd);
   3089  }
   3090 }