tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_fwd_txfm_neon.c (98194B)


      1 /*
      2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 #include <assert.h>
     14 
     15 #include "aom_dsp/arm/transpose_neon.h"
     16 #include "aom_dsp/txfm_common.h"
     17 #include "aom_ports/mem.h"
     18 #include "av1/common/av1_txfm.h"
     19 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
     20 #include "config/aom_config.h"
     21 #include "config/av1_rtcd.h"
     22 #include "shift_neon.h"
     23 #include "txfm_neon.h"
     24 
     25 static AOM_FORCE_INLINE void transpose_arrays_s32_64x64(const int32x4_t *in,
     26                                                        int32x4_t *out) {
     27  // This is not quite the same as the other transposes defined in
     28  // transpose_neon.h: We only write the low 64x32 sub-matrix since the rest is
     29  // unused by the following row transform.
     30  for (int j = 0; j < 8; ++j) {
     31    for (int i = 0; i < 16; ++i) {
     32      transpose_arrays_s32_4x4(in + 64 * i + 4 * j, out + 64 * j + 4 * i);
     33    }
     34  }
     35 }
     36 
     37 // A note on butterfly helper naming:
     38 //
     39 // butterfly_[weight_indices]_neon
// e.g. butterfly_0321_neon
     41 //                ^ Weights are applied as indices 0, 3, 2, 1
     42 //                  (see more detail below)
     43 //
     44 // Weight indices are treated as an index into the 4-tuple of the weight
     45 // itself, plus related and negated constants: w=(w0, 1-w0, -w0, w0-1).
     46 // This is then represented in the helper naming by referring to the lane index
     47 // in the loaded tuple that each multiply is performed with:
     48 //
     49 //         in0   in1
     50 //      /------------
     51 // out0 |  w[0]  w[1]   ==>  out0 = in0 * w[0] + in1 * w[1]
     52 // out1 |  w[2]  w[3]   ==>  out1 = in0 * w[2] + in1 * w[3]
     53 //
     54 // So for indices 0321 from the earlier example, we end up with:
     55 //
     56 //          in0       in1
     57 //      /------------------
     58 // out0 | (lane 0) (lane 3)   ==>  out0 = in0 *  w0 + in1 * (w0-1)
     59 // out1 | (lane 2) (lane 1)   ==>  out1 = in0 * -w0 + in1 * (1-w0)
     60 
// Core butterfly step: multiply in0/in1 by the lanes of the 4-tuple
// (w0, 1-w0, -w0, w0-1) selected by lane0/lane1 (see the naming note
// above), accumulate, and round-shift the sum by v_bit (a vector of
// negative shift amounts, so vrshlq performs a rounding right shift).
// lane0/lane1 must be compile-time constants in [0, 3].
#define butterfly_half_neon(wvec, lane0, lane1, in0, in1, out, v_bit)   \
  do {                                                                  \
    int32x2x2_t wvecs = { { wvec, vneg_s32(wvec) } };                   \
    int32x4_t x = vmulq_lane_s32(in0, wvecs.val[lane0 / 2], lane0 % 2); \
    x = vmlaq_lane_s32(x, in1, wvecs.val[lane1 / 2], lane1 % 2);        \
    *out = vrshlq_s32(x, v_bit);                                        \
  } while (false)
     68 
     69 static AOM_FORCE_INLINE void butterfly_0112_neon(
     70    const int32_t *cospi, const int widx0, const int32x4_t n0,
     71    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
     72    const int32x4_t v_bit) {
     73  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
     74  butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
     75  butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
     76 }
     77 
// Butterfly with lane indices 2, 3, 1, 2 (lane tuple (w0, 1-w0, -w0, w0-1)):
//   out0 = n0 * -w0    + n1 * (w0-1)
//   out1 = n0 * (1-w0) + n1 * -w0
static AOM_FORCE_INLINE void butterfly_2312_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(w01, 2, 3, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
}
     86 
// Butterfly with lane indices 0, 3, 3, 2 (lane tuple (w0, 1-w0, -w0, w0-1)):
//   out0 = n0 * w0     + n1 * (w0-1)
//   out1 = n0 * (w0-1) + n1 * -w0
static AOM_FORCE_INLINE void butterfly_0332_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(w01, 0, 3, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 3, 2, n0, n1, out1, v_bit);
}
     95 
// Butterfly with lane indices 0, 1, 3, 0 (lane tuple (w0, 1-w0, -w0, w0-1)):
//   out0 = n0 * w0     + n1 * (1-w0)
//   out1 = n0 * (w0-1) + n1 * w0
static AOM_FORCE_INLINE void butterfly_0130_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 3, 0, n0, n1, out1, v_bit);
}
    104 
// Fixed-weight butterfly at cospi index 32, lane indices 0, 0, 0, 2:
//   out0 = (n0 + n1) * w0
//   out1 = n0 * w0 - n1 * w0
static AOM_FORCE_INLINE void butterfly_cospi32_0002_neon(
    const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
    int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * 32);
  butterfly_half_neon(w01, 0, 0, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 0, 2, n0, n1, out1, v_bit);
}
    112 
// Fixed-weight butterfly at cospi index 32, lane indices 0, 2, 2, 2:
//   out0 = n0 * w0 - n1 * w0
//   out1 = -(n0 + n1) * w0
static AOM_FORCE_INLINE void butterfly_cospi32_0222_neon(
    const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
    int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * 32);
  butterfly_half_neon(w01, 0, 2, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 2, 2, n0, n1, out1, v_bit);
}
    120 
    121 static AOM_FORCE_INLINE void round_rect_array_s32_neon(const int32x4_t *input,
    122                                                       int32x4_t *output,
    123                                                       const int size) {
    124  const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2);
    125  int i = 0;
    126  do {
    127    const int32x4_t r1 = vmulq_s32(input[i], sqrt2);
    128    output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
    129  } while (++i < size);
    130 }
    131 
    132 static AOM_FORCE_INLINE void round_shift2_rect_array_s32_neon(
    133    const int32x4_t *input, int32x4_t *output, const int size) {
    134  const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2);
    135  int i = 0;
    136  do {
    137    const int32x4_t r0 = vrshrq_n_s32(input[i], 2);
    138    const int32x4_t r1 = vmulq_s32(r0, sqrt2);
    139    output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
    140  } while (++i < size);
    141 }
    142 
// Defines load_buffer_4x<h>: loads h rows of 4 int16 samples from `input`
// (row pitch `stride`), reverses each row when `fliplr` is non-zero, and
// widens to int32 with a left shift by 2 (the transform input up-shift).
#define LOAD_BUFFER_4XH(h)                                           \
  static AOM_FORCE_INLINE void load_buffer_4x##h(                    \
      const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
    if (fliplr) {                                                    \
      for (int i = 0; i < (h); ++i) {                                \
        int16x4_t a = vld1_s16(input + i * stride);                  \
        a = vrev64_s16(a);                                           \
        in[i] = vshll_n_s16(a, 2);                                   \
      }                                                              \
    } else {                                                         \
      for (int i = 0; i < (h); ++i) {                                \
        int16x4_t a = vld1_s16(input + i * stride);                  \
        in[i] = vshll_n_s16(a, 2);                                   \
      }                                                              \
    }                                                                \
  }
    159 
// Widen an int16x4_t to int32x4_t, shifting left by `shift`.
// AArch32 does not permit the immediate argument of vshll_n_s16 to be zero,
// so shift == 0 is routed through vmovl_s16; the inner
// `(shift) == 0 ? 1 : (shift)` keeps the vshll immediate valid even though
// the compiler can prove that branch is never taken when `shift == 0`.
#define shift_left_long_s16(a, shift) \
  ((shift) == 0 ? vmovl_s16(a) : vshll_n_s16((a), (shift) == 0 ? 1 : (shift)))
    165 
// Defines load_buffer_<w>x<h> (w >= 8): loads h rows of w int16 samples,
// reverses each row when `fliplr` is non-zero, and widens to int32 shifted
// left by `shift`. Output is laid out as 4-wide column strips: element
// `i + h * strip` holds row i of strip `strip`, so each int32x4_t covers 4
// horizontally adjacent samples of one row.
#define LOAD_BUFFER_WXH(w, h, shift)                                 \
  static AOM_FORCE_INLINE void load_buffer_##w##x##h(                \
      const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
    assert(w >= 8);                                                  \
    if (fliplr) {                                                    \
      for (int i = 0; i < (h); ++i) {                                \
        for (int j = 0; j < (w) / 8; ++j) {                          \
          int16x8_t a = vld1q_s16(input + i * stride + j * 8);       \
          a = vrev64q_s16(a);                                        \
          int j2 = (w) / 8 - j - 1;                                  \
          in[i + (h) * (2 * j2 + 0)] =                               \
              shift_left_long_s16(vget_high_s16(a), (shift));        \
          in[i + (h) * (2 * j2 + 1)] =                               \
              shift_left_long_s16(vget_low_s16(a), (shift));         \
        }                                                            \
      }                                                              \
    } else {                                                         \
      for (int i = 0; i < (h); ++i) {                                \
        for (int j = 0; j < (w) / 8; ++j) {                          \
          int16x8_t a = vld1q_s16(input + i * stride + j * 8);       \
          in[i + (h) * (2 * j + 0)] =                                \
              shift_left_long_s16(vget_low_s16(a), (shift));         \
          in[i + (h) * (2 * j + 1)] =                                \
              shift_left_long_s16(vget_high_s16(a), (shift));        \
        }                                                            \
      }                                                              \
    }                                                                \
  }
    194 
// Instantiate the input loaders used by the transforms in this file. For
// the WxH variants the trailing argument is the left shift applied while
// widening the input samples.
LOAD_BUFFER_4XH(4)
LOAD_BUFFER_4XH(8)
LOAD_BUFFER_4XH(16)
LOAD_BUFFER_4XH(32)
LOAD_BUFFER_WXH(8, 8, 2)
LOAD_BUFFER_WXH(16, 16, 2)
LOAD_BUFFER_WXH(32, 64, 0)
LOAD_BUFFER_WXH(64, 32, 2)
LOAD_BUFFER_WXH(64, 64, 0)

#if !CONFIG_REALTIME_ONLY
LOAD_BUFFER_WXH(16, 64, 0)
LOAD_BUFFER_WXH(64, 16, 2)
#endif  // !CONFIG_REALTIME_ONLY
    209 
// Defines store_buffer_<w>x<h>: writes a transform buffer laid out in
// 4-wide strips (element `i + j * w` is the j-th 4-lane group of row i)
// out to a row-major int32 coefficient array with pitch `stride`.
#define STORE_BUFFER_WXH(w, h)                                \
  static AOM_FORCE_INLINE void store_buffer_##w##x##h(        \
      const int32x4_t *in, int32_t *out, int stride) {        \
    for (int i = 0; i < (w); ++i) {                           \
      for (int j = 0; j < (h) / 4; ++j) {                     \
        vst1q_s32(&out[i * stride + j * 4], in[i + j * (w)]); \
      }                                                       \
    }                                                         \
  }
    219 
// Instantiate the coefficient writers for each output size used below.
STORE_BUFFER_WXH(4, 4)
STORE_BUFFER_WXH(8, 4)
STORE_BUFFER_WXH(8, 8)
STORE_BUFFER_WXH(16, 4)
STORE_BUFFER_WXH(16, 16)
STORE_BUFFER_WXH(32, 4)
STORE_BUFFER_WXH(32, 32)
STORE_BUFFER_WXH(64, 32)

#if !CONFIG_REALTIME_ONLY
STORE_BUFFER_WXH(16, 32)
STORE_BUFFER_WXH(64, 16)
#endif  // !CONFIG_REALTIME_ONLY
    233 
// Forward 4-point DCT applied to four columns at once (one int32x4_t per
// row). `bit` selects the cospi table precision and the final rounding
// right shift.
static AOM_FORCE_INLINE void highbd_fdct4_x4_neon(const int32x4_t *in,
                                                  int32x4_t *out, int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t cospi32 = vdupq_n_s32(cospi[2 * 32]);
  // Two adjacent table entries; per the variable name these are
  // presumably {cospi16, cospi48} — paired cospi table layout.
  const int32x2_t cospi16_48 = vld1_s32(&cospi[2 * 16]);

  // Stage 1: butterfly of mirrored input pairs.
  const int32x4_t a0 = vaddq_s32(in[0], in[3]);
  const int32x4_t a1 = vsubq_s32(in[0], in[3]);
  const int32x4_t a2 = vaddq_s32(in[1], in[2]);
  const int32x4_t a3 = vsubq_s32(in[1], in[2]);

  // Stage 2: weight the butterfly outputs.
  const int32x4_t b0 = vmulq_s32(a0, cospi32);
  const int32x4_t b1 = vmulq_lane_s32(a1, cospi16_48, 1);
  const int32x4_t b2 = vmulq_s32(a2, cospi32);
  const int32x4_t b3 = vmulq_lane_s32(a3, cospi16_48, 1);

  // Stage 3: combine into the four coefficients (still unrounded).
  const int32x4_t c0 = vaddq_s32(b0, b2);
  const int32x4_t c1 = vsubq_s32(b0, b2);
  const int32x4_t c2 = vmlaq_lane_s32(b3, a1, cospi16_48, 0);
  const int32x4_t c3 = vmlsq_lane_s32(b1, a3, cospi16_48, 0);

  // Round right by `bit` (vrshlq with a negative shift is a rounding
  // right shift).
  const int32x4_t v_bit = vdupq_n_s32(-bit);
  const int32x4_t d0 = vrshlq_s32(c0, v_bit);
  const int32x4_t d1 = vrshlq_s32(c1, v_bit);
  const int32x4_t d2 = vrshlq_s32(c2, v_bit);
  const int32x4_t d3 = vrshlq_s32(c3, v_bit);

  // Reorder into output coefficient order.
  out[0] = d0;
  out[1] = d2;
  out[2] = d1;
  out[3] = d3;
}
    266 
// Forward 4-point ADST applied to four columns at once, using the sinpi
// constants (`sinpi_arr(bit) + 1` skips the unused index-0 entry so lanes
// hold sinpi[1..4]). Results are rounded right by `bit`.
static AOM_FORCE_INLINE void highbd_fadst4_x4_neon(const int32x4_t *in,
                                                   int32x4_t *out, int bit) {
  const int32x4_t sinpi = vld1q_s32(sinpi_arr(bit) + 1);

  // Partial products / sums of the inputs against sinpi lanes.
  const int32x4_t a0 = vaddq_s32(in[0], in[1]);
  const int32x4_t a1 = vmulq_lane_s32(in[0], vget_low_s32(sinpi), 0);
  const int32x4_t a2 = vmulq_lane_s32(in[0], vget_high_s32(sinpi), 1);
  const int32x4_t a3 = vmulq_lane_s32(in[2], vget_high_s32(sinpi), 0);

  // Accumulate the cross terms.
  const int32x4_t b0 = vmlaq_lane_s32(a1, in[1], vget_low_s32(sinpi), 1);
  const int32x4_t b1 = vmlsq_lane_s32(a2, in[1], vget_low_s32(sinpi), 0);
  const int32x4_t b2 = vsubq_s32(a0, in[3]);

  const int32x4_t c0 = vmlaq_lane_s32(b0, in[3], vget_high_s32(sinpi), 1);
  const int32x4_t c1 = vmlaq_lane_s32(b1, in[3], vget_low_s32(sinpi), 1);
  const int32x4_t c2 = vmulq_lane_s32(b2, vget_high_s32(sinpi), 0);

  const int32x4_t d0 = vaddq_s32(c0, a3);
  const int32x4_t d1 = vsubq_s32(c1, a3);
  // d2 + a3 reuses the shared c1 - c0 term for the last output.
  const int32x4_t d2 = vsubq_s32(c1, c0);

  const int32x4_t e0 = vaddq_s32(d2, a3);

  // Round right by `bit` and write in output order.
  const int32x4_t v_bit = vdupq_n_s32(-bit);
  out[0] = vrshlq_s32(d0, v_bit);
  out[1] = vrshlq_s32(c2, v_bit);
  out[2] = vrshlq_s32(d1, v_bit);
  out[3] = vrshlq_s32(e0, v_bit);
}
    296 
    297 static AOM_FORCE_INLINE void highbd_fidentity4_x4_neon(const int32x4_t *in,
    298                                                       int32x4_t *out,
    299                                                       int bit) {
    300  (void)bit;
    301  int32x4_t fact = vdupq_n_s32(NewSqrt2);
    302 
    303  for (int i = 0; i < 4; i++) {
    304    const int32x4_t a_low = vmulq_s32(in[i], fact);
    305    out[i] = vrshrq_n_s32(a_low, NewSqrt2Bits);
    306  }
    307 }
    308 
// 2D 4x4 forward transform: column transform, 4x4 transpose, row
// transform, then store. Vertical flips are handled up front by
// ud_adjust_input_and_stride; horizontal flips by the loader's `fliplr`.
// `bd` is unused. NOTE(review): some cases mix av1_fwd_cos_bit_col and
// av1_fwd_cos_bit_row between the two passes (e.g. H_DCT, V_FLIPADST) —
// harmless only if both tables agree at [0][0]; confirm against the
// scalar reference.
void av1_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *coeff,
                             int input_stride, TX_TYPE tx_type, int bd) {
  (void)bd;

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &input_stride, 4);

  // Workspace for column/row-wise transforms.
  int32x4_t buf[4];

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case ADST_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case DCT_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case ADST_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case FLIPADST_DCT:
      // Vertical flip already applied via ud_adjust_input_and_stride.
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case DCT_FLIPADST:
      // Horizontal flip handled by the loader (fliplr=1).
      load_buffer_4x4(input, buf, input_stride, 1);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 1);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case ADST_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 1);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case FLIPADST_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case IDTX:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case V_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case H_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case V_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case H_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case V_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case H_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 1);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    default: assert(0);
  }
}
    436 
    437 // Butterfly pre-processing:
    438 // e.g. n=4:
    439 //   out[0] = in[0] + in[3]
    440 //   out[1] = in[1] + in[2]
    441 //   out[2] = in[1] - in[2]
    442 //   out[3] = in[0] - in[3]
    443 
    444 static AOM_FORCE_INLINE void butterfly_dct_pre(const int32x4_t *input,
    445                                               int32x4_t *output, int n) {
    446  for (int i = 0; i < n / 2; ++i) {
    447    output[i] = vaddq_s32(input[i], input[n - i - 1]);
    448  }
    449  for (int i = 0; i < n / 2; ++i) {
    450    output[n / 2 + i] = vsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]);
    451  }
    452 }
    453 
    454 // Butterfly post-processing:
    455 // e.g. n=8:
    456 //   out[0] = in0[0] + in1[3];
    457 //   out[1] = in0[1] + in1[2];
    458 //   out[2] = in0[1] - in1[2];
    459 //   out[3] = in0[0] - in1[3];
    460 //   out[4] = in0[7] - in1[4];
    461 //   out[5] = in0[6] - in1[5];
    462 //   out[6] = in0[6] + in1[5];
    463 //   out[7] = in0[7] + in1[4];
    464 
// Later-stage DCT butterfly combining two half-buffers (see the n=8
// expansion above). Each quarter of `output` mixes mirrored elements of
// `in0` and `in1` with the sign pattern +, -, -, +.
static AOM_FORCE_INLINE void butterfly_dct_post(const int32x4_t *in0,
                                                const int32x4_t *in1,
                                                int32x4_t *output, int n) {
  // Quarter 1: mirrored sums.
  for (int i = 0; i < n / 4; ++i) {
    output[i] = vaddq_s32(in0[i], in1[n / 2 - i - 1]);
  }
  // Quarter 2: differences, reading in0 in reverse.
  for (int i = 0; i < n / 4; ++i) {
    output[n / 4 + i] = vsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]);
  }
  // Quarter 3: differences against the top of in0.
  for (int i = 0; i < n / 4; ++i) {
    output[n / 2 + i] = vsubq_s32(in0[n - i - 1], in1[n / 2 + i]);
  }
  // Quarter 4: mirrored sums of the upper halves.
  for (int i = 0; i < n / 4; ++i) {
    output[(3 * n) / 4 + i] =
        vaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
  }
}
    482 
// Forward 8-point DCT applied to four columns at once (one int32x4_t per
// row). `bit` selects the cospi table precision and the rounding applied
// inside each butterfly.
static AOM_FORCE_INLINE void highbd_fdct8_x4_neon(const int32x4_t *in,
                                                  int32x4_t *out, int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  // stage 1: mirrored sums/differences of the 8 inputs.
  int32x4_t a[8];
  butterfly_dct_pre(in, a, 8);

  // stage 2: 4-point pre-butterfly on the even half, cospi32 rotation on
  // the odd half.
  int32x4_t b[8];
  butterfly_dct_pre(a, b, 4);
  butterfly_0130_neon(cospi, 32, a[5], a[6], &b[6], &b[5], v_bit);

  // stage 3
  int32x4_t c[8];
  butterfly_0130_neon(cospi, 32, b[1], b[0], &c[0], &c[1], v_bit);
  butterfly_0112_neon(cospi, 16, b[3], b[2], &c[2], &c[3], v_bit);
  butterfly_dct_post(a + 4, b + 4, c + 4, 4);

  // stage 4-5: final rotations producing the odd-index outputs directly.
  butterfly_0112_neon(cospi, 8, c[7], c[4], &out[1], &out[7], v_bit);
  butterfly_0130_neon(cospi, 24, c[5], c[6], &out[5], &out[3], v_bit);

  // Even-index outputs come straight from stage 3.
  out[0] = c[0];
  out[2] = c[2];
  out[4] = c[1];
  out[6] = c[3];
}
    512 
// Forward 8-point ADST applied to four columns at once. Follows the
// standard libaom fadst8 stage structure; `bit` selects the cospi table
// precision and per-butterfly rounding.
static AOM_FORCE_INLINE void highbd_fadst8_x4_neon(const int32x4_t *in,
                                                   int32x4_t *out, int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
  int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;

  // stage 0-1: input permutation.
  u0 = in[0];
  u1 = in[7];
  u2 = in[3];
  u3 = in[4];
  u4 = in[1];
  u5 = in[6];
  u6 = in[2];
  u7 = in[5];

  // stage 2: cospi32 rotations on two of the four pairs.
  v0 = u0;
  v1 = u1;
  butterfly_cospi32_0222_neon(cospi, u3, u2, &v2, &v3, v_bit);
  v4 = u4;
  v5 = u5;
  butterfly_cospi32_0002_neon(cospi, u6, u7, &v7, &v6, v_bit);

  // stage 3: add/sub butterfly.
  u0 = vaddq_s32(v0, v2);
  u1 = vsubq_s32(v3, v1);
  u2 = vsubq_s32(v0, v2);
  u3 = vaddq_s32(v1, v3);
  u4 = vsubq_s32(v6, v4);
  u5 = vaddq_s32(v5, v7);
  u6 = vaddq_s32(v4, v6);
  u7 = vsubq_s32(v5, v7);

  // stage 4: rotate the upper half by cospi16.
  v0 = u0;
  v1 = u1;
  v2 = u2;
  v3 = u3;

  butterfly_0112_neon(cospi, 16, u4, u5, &v4, &v5, v_bit);
  butterfly_0112_neon(cospi, 16, u7, u6, &v6, &v7, v_bit);

  // stage 5: add/sub butterfly.
  u0 = vaddq_s32(v0, v4);
  u1 = vaddq_s32(v1, v5);
  u2 = vaddq_s32(v2, v6);
  u3 = vsubq_s32(v7, v3);
  u4 = vsubq_s32(v0, v4);
  u5 = vsubq_s32(v1, v5);
  u6 = vsubq_s32(v2, v6);
  u7 = vaddq_s32(v3, v7);

  // stage 6: final rotations at cospi indices 4, 20, 28, 12.
  butterfly_0112_neon(cospi, 4, u0, u1, &v0, &v1, v_bit);
  butterfly_0112_neon(cospi, 20, u2, u3, &v2, &v3, v_bit);
  butterfly_0130_neon(cospi, 28, u5, u4, &v4, &v5, v_bit);
  butterfly_0112_neon(cospi, 12, u6, u7, &v7, &v6, v_bit);

  // stage 7: output permutation.
  out[0] = v1;
  out[1] = v6;
  out[2] = v3;
  out[3] = v4;
  out[4] = v5;
  out[5] = v2;
  out[6] = v7;
  out[7] = v0;
}
    584 
    585 static AOM_FORCE_INLINE void highbd_fidentity8_x4_neon(const int32x4_t *in,
    586                                                       int32x4_t *out,
    587                                                       int bit) {
    588  (void)bit;
    589  out[0] = vshlq_n_s32(in[0], 1);
    590  out[1] = vshlq_n_s32(in[1], 1);
    591  out[2] = vshlq_n_s32(in[2], 1);
    592  out[3] = vshlq_n_s32(in[3], 1);
    593  out[4] = vshlq_n_s32(in[4], 1);
    594  out[5] = vshlq_n_s32(in[5], 1);
    595  out[6] = vshlq_n_s32(in[6], 1);
    596  out[7] = vshlq_n_s32(in[7], 1);
    597 }
    598 
    599 static AOM_FORCE_INLINE void highbd_fdct8_xn_neon(const int32x4_t *in,
    600                                                  int32x4_t *out, int bit,
    601                                                  int howmany) {
    602  const int stride = 8;
    603  int i = 0;
    604  do {
    605    highbd_fdct8_x4_neon(in + i * stride, out + i * stride, bit);
    606  } while (++i < howmany);
    607 }
    608 
    609 static AOM_FORCE_INLINE void highbd_fadst8_xn_neon(const int32x4_t *in,
    610                                                   int32x4_t *out, int bit,
    611                                                   int howmany) {
    612  const int stride = 8;
    613  int i = 0;
    614  do {
    615    highbd_fadst8_x4_neon(in + i * stride, out + i * stride, bit);
    616  } while (++i < howmany);
    617 }
    618 
    619 static AOM_FORCE_INLINE void highbd_fidentity8_xn_neon(const int32x4_t *in,
    620                                                       int32x4_t *out, int bit,
    621                                                       int howmany) {
    622  (void)bit;
    623  const int stride = 8;
    624  int i = 0;
    625  do {
    626    highbd_fidentity8_x4_neon(in + i * stride, out + i * stride, bit);
    627  } while (++i < howmany);
    628 }
    629 
    630 void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride,
    631                             TX_TYPE tx_type, int bd) {
    632  (void)bd;
    633 
    634  int ud_flip, lr_flip;
    635  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    636  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
    637 
    638  // Workspaces for column/row-wise transforms.
    639  int32x4_t buf0[16], buf1[16];
    640 
    641  switch (tx_type) {
    642    case DCT_DCT:
    643      load_buffer_8x8(input, buf0, stride, 0);
    644      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
    645      shift_right_1_round_s32_x4(buf0, buf0, 16);
    646      transpose_arrays_s32_8x8(buf0, buf1);
    647      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
    648      store_buffer_8x8(buf1, coeff, /*stride=*/8);
    649      break;
    650    case ADST_DCT:
    651      load_buffer_8x8(input, buf0, stride, 0);
    652      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
    653      shift_right_1_round_s32_x4(buf0, buf0, 16);
    654      transpose_arrays_s32_8x8(buf0, buf1);
    655      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
    656      store_buffer_8x8(buf1, coeff, /*stride=*/8);
    657      break;
    658    case DCT_ADST:
    659      load_buffer_8x8(input, buf0, stride, 0);
    660      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
    661      shift_right_1_round_s32_x4(buf0, buf0, 16);
    662      transpose_arrays_s32_8x8(buf0, buf1);
    663      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
    664      store_buffer_8x8(buf1, coeff, /*stride=*/8);
    665      break;
    666    case ADST_ADST:
    667      load_buffer_8x8(input, buf0, stride, 0);
    668      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
    669      shift_right_1_round_s32_x4(buf0, buf0, 16);
    670      transpose_arrays_s32_8x8(buf0, buf1);
    671      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
    672      store_buffer_8x8(buf1, coeff, /*stride=*/8);
    673      break;
    674    case FLIPADST_DCT:
    675      load_buffer_8x8(input, buf0, stride, 0);
    676      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
    677      shift_right_1_round_s32_x4(buf0, buf0, 16);
    678      transpose_arrays_s32_8x8(buf0, buf1);
    679      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
    680      store_buffer_8x8(buf1, coeff, /*stride=*/8);
    681      break;
    682    case DCT_FLIPADST:
    683      load_buffer_8x8(input, buf0, stride, 1);
    684      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
    685      shift_right_1_round_s32_x4(buf0, buf0, 16);
    686      transpose_arrays_s32_8x8(buf0, buf1);
    687      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
    688      store_buffer_8x8(buf1, coeff, /*stride=*/8);
    689      break;
    690    case FLIPADST_FLIPADST:
    691      load_buffer_8x8(input, buf0, stride, 1);
    692      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
    693      shift_right_1_round_s32_x4(buf0, buf0, 16);
    694      transpose_arrays_s32_8x8(buf0, buf1);
    695      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
    696      store_buffer_8x8(buf1, coeff, /*stride=*/8);
    697      break;
    698    case ADST_FLIPADST:
    699      load_buffer_8x8(input, buf0, stride, 1);
    700      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
    701      shift_right_1_round_s32_x4(buf0, buf0, 16);
    702      transpose_arrays_s32_8x8(buf0, buf1);
    703      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
    704      store_buffer_8x8(buf1, coeff, /*stride=*/8);
    705      break;
    706    case FLIPADST_ADST:
    707      load_buffer_8x8(input, buf0, stride, 0);
    708      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
    709      shift_right_1_round_s32_x4(buf0, buf0, 16);
    710      transpose_arrays_s32_8x8(buf0, buf1);
    711      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
    712      store_buffer_8x8(buf1, coeff, /*stride=*/8);
    713      break;
    714    case IDTX:
    715      load_buffer_8x8(input, buf0, stride, 0);
    716      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
    717      shift_right_1_round_s32_x4(buf0, buf0, 16);
    718      transpose_arrays_s32_8x8(buf0, buf1);
    719      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
    720      store_buffer_8x8(buf1, coeff, /*stride=*/8);
    721      break;
    722    case V_DCT:
    723      load_buffer_8x8(input, buf0, stride, 0);
    724      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
    725      shift_right_1_round_s32_x4(buf0, buf0, 16);
    726      transpose_arrays_s32_8x8(buf0, buf1);
    727      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
    728      store_buffer_8x8(buf1, coeff, /*stride=*/8);
    729      break;
    730    case H_DCT:
    731      load_buffer_8x8(input, buf0, stride, 0);
    732      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
    733      shift_right_1_round_s32_x4(buf0, buf0, 16);
    734      transpose_arrays_s32_8x8(buf0, buf1);
    735      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
    736      store_buffer_8x8(buf1, coeff, /*stride=*/8);
    737      break;
    738    case V_ADST:
    739      load_buffer_8x8(input, buf0, stride, 0);
    740      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
    741      shift_right_1_round_s32_x4(buf0, buf0, 16);
    742      transpose_arrays_s32_8x8(buf0, buf1);
    743      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
    744      store_buffer_8x8(buf1, coeff, /*stride=*/8);
    745      break;
    746    case H_ADST:
    747      load_buffer_8x8(input, buf0, stride, 0);
    748      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
    749      shift_right_1_round_s32_x4(buf0, buf0, 16);
    750      transpose_arrays_s32_8x8(buf0, buf1);
    751      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
    752      store_buffer_8x8(buf1, coeff, /*stride=*/8);
    753      break;
    754    case V_FLIPADST:
    755      load_buffer_8x8(input, buf0, stride, 0);
    756      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
    757      shift_right_1_round_s32_x4(buf0, buf0, 16);
    758      transpose_arrays_s32_8x8(buf0, buf1);
    759      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
    760      store_buffer_8x8(buf1, coeff, /*stride=*/8);
    761      break;
    762    case H_FLIPADST:
    763      load_buffer_8x8(input, buf0, stride, 1);
    764      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
    765      shift_right_1_round_s32_x4(buf0, buf0, 16);
    766      transpose_arrays_s32_8x8(buf0, buf1);
    767      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
    768      store_buffer_8x8(buf1, coeff, /*stride=*/8);
    769      break;
    770    default: assert(0);
    771  }
    772 }
    773 
static void highbd_fdct16_x4_neon(const int32x4_t *in, int32x4_t *out,
                                  int bit) {
  // Forward 16-point DCT applied to 4 independent columns at once (one
  // column per lane of each int32x4_t). `bit` selects the precision of the
  // cosine table and the rounding shift performed inside each butterfly.
  const int32_t *const cospi = cospi_arr_s32(bit);
  // Negated shift amount, passed through to the butterfly helpers.
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  // u and v are ping-pong workspaces between successive butterfly stages.
  int32x4_t u[16], v[16];

  // stage 1
  butterfly_dct_pre(in, u, 16);

  // stage 2
  butterfly_dct_pre(u, v, 8);
  v[8] = u[8];
  v[9] = u[9];
  butterfly_cospi32_0002_neon(cospi, u[13], u[10], &v[13], &v[10], v_bit);
  butterfly_cospi32_0002_neon(cospi, u[12], u[11], &v[12], &v[11], v_bit);
  v[14] = u[14];
  v[15] = u[15];

  // stage 3
  butterfly_dct_pre(v, u, 4);
  u[4] = v[4];
  butterfly_cospi32_0002_neon(cospi, v[6], v[5], &u[6], &u[5], v_bit);
  u[7] = v[7];
  butterfly_dct_post(v + 8, v + 8, u + 8, 8);

  // stage 4
  butterfly_cospi32_0002_neon(cospi, u[0], u[1], &v[0], &v[1], v_bit);
  butterfly_0112_neon(cospi, 16, u[3], u[2], &v[2], &v[3], v_bit);
  butterfly_dct_post(u + 4, u + 4, v + 4, 4);
  v[8] = u[8];
  butterfly_0112_neon(cospi, 16, u[14], u[9], &v[14], &v[9], v_bit);
  butterfly_2312_neon(cospi, 16, u[13], u[10], &v[10], &v[13], v_bit);
  v[11] = u[11];
  v[12] = u[12];
  v[15] = u[15];

  // stage 5
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];
  butterfly_0112_neon(cospi, 8, v[7], v[4], &u[4], &u[7], v_bit);
  butterfly_0130_neon(cospi, 24, v[5], v[6], &u[5], &u[6], v_bit);
  butterfly_dct_post(v + 8, v + 8, u + 8, 4);
  butterfly_dct_post(v + 12, v + 12, u + 12, 4);

  // stage 6
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  v[4] = u[4];
  v[5] = u[5];
  v[6] = u[6];
  v[7] = u[7];
  butterfly_0112_neon(cospi, 4, u[15], u[8], &v[8], &v[15], v_bit);
  butterfly_0130_neon(cospi, 28, u[9], u[14], &v[9], &v[14], v_bit);
  butterfly_0112_neon(cospi, 20, u[13], u[10], &v[10], &v[13], v_bit);
  butterfly_0130_neon(cospi, 12, u[11], u[12], &v[11], &v[12], v_bit);

  // Final 4-bit bit-reversal permutation into natural frequency order:
  // out[i] = v[bitrev4(i)].
  out[0] = v[0];
  out[1] = v[8];
  out[2] = v[4];
  out[3] = v[12];
  out[4] = v[2];
  out[5] = v[10];
  out[6] = v[6];
  out[7] = v[14];
  out[8] = v[1];
  out[9] = v[9];
  out[10] = v[5];
  out[11] = v[13];
  out[12] = v[3];
  out[13] = v[11];
  out[14] = v[7];
  out[15] = v[15];
}
    852 
static void highbd_fadst16_x4_neon(const int32x4_t *in, int32x4_t *out,
                                   int bit) {
  // Forward 16-point ADST applied to 4 independent columns at once (one
  // column per lane). Standard AV1 fadst16 butterfly network; `bit`
  // selects the cosine table precision and the per-stage rounding shift.
  const int32_t *const cospi = cospi_arr_s32(bit);
  // Negated shift amount, passed through to the butterfly helpers.
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  // u and v are ping-pong workspaces between successive stages.
  int32x4_t u[16], v[16];

  // stage 0-1: input permutation specific to fadst16.
  u[0] = in[0];
  u[1] = in[15];
  u[2] = in[7];
  u[3] = in[8];
  u[4] = in[3];
  u[5] = in[12];
  u[6] = in[4];
  u[7] = in[11];
  u[8] = in[1];
  u[9] = in[14];
  u[10] = in[6];
  u[11] = in[9];
  u[12] = in[2];
  u[13] = in[13];
  u[14] = in[5];
  u[15] = in[10];

  // stage 2
  v[0] = u[0];
  v[1] = u[1];
  butterfly_cospi32_0222_neon(cospi, u[3], u[2], &v[2], &v[3], v_bit);
  v[4] = u[4];
  v[5] = u[5];
  butterfly_cospi32_0002_neon(cospi, u[6], u[7], &v[7], &v[6], v_bit);
  v[8] = u[8];
  v[9] = u[9];
  butterfly_cospi32_0002_neon(cospi, u[10], u[11], &v[11], &v[10], v_bit);
  v[12] = u[12];
  v[13] = u[13];
  butterfly_cospi32_0222_neon(cospi, u[15], u[14], &v[14], &v[15], v_bit);

  // stage 3: add/sub butterflies (no multiplies).
  u[0] = vaddq_s32(v[0], v[2]);
  u[1] = vsubq_s32(v[3], v[1]);
  u[2] = vsubq_s32(v[0], v[2]);
  u[3] = vaddq_s32(v[1], v[3]);
  u[4] = vsubq_s32(v[6], v[4]);
  u[5] = vaddq_s32(v[5], v[7]);
  u[6] = vaddq_s32(v[4], v[6]);
  u[7] = vsubq_s32(v[5], v[7]);
  u[8] = vsubq_s32(v[10], v[8]);
  u[9] = vaddq_s32(v[9], v[11]);
  u[10] = vaddq_s32(v[8], v[10]);
  u[11] = vsubq_s32(v[9], v[11]);
  u[12] = vaddq_s32(v[12], v[14]);
  u[13] = vsubq_s32(v[15], v[13]);
  u[14] = vsubq_s32(v[12], v[14]);
  u[15] = vaddq_s32(v[13], v[15]);

  // stage 4
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  butterfly_0112_neon(cospi, 16, u[4], u[5], &v[4], &v[5], v_bit);
  butterfly_0112_neon(cospi, 16, u[7], u[6], &v[6], &v[7], v_bit);

  v[8] = u[8];
  v[9] = u[9];
  v[10] = u[10];
  v[11] = u[11];

  butterfly_0112_neon(cospi, 16, u[12], u[13], &v[12], &v[13], v_bit);
  butterfly_0332_neon(cospi, 16, u[14], u[15], &v[15], &v[14], v_bit);

  // stage 5: add/sub butterflies.
  u[0] = vaddq_s32(v[0], v[4]);
  u[1] = vaddq_s32(v[1], v[5]);
  u[2] = vaddq_s32(v[2], v[6]);
  u[3] = vsubq_s32(v[7], v[3]);
  u[4] = vsubq_s32(v[0], v[4]);
  u[5] = vsubq_s32(v[1], v[5]);
  u[6] = vsubq_s32(v[2], v[6]);
  u[7] = vaddq_s32(v[3], v[7]);
  u[8] = vaddq_s32(v[8], v[12]);
  u[9] = vaddq_s32(v[9], v[13]);
  u[10] = vsubq_s32(v[14], v[10]);
  u[11] = vaddq_s32(v[11], v[15]);
  u[12] = vsubq_s32(v[8], v[12]);
  u[13] = vsubq_s32(v[9], v[13]);
  u[14] = vaddq_s32(v[10], v[14]);
  u[15] = vsubq_s32(v[11], v[15]);

  // stage 6
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  v[4] = u[4];
  v[5] = u[5];
  v[6] = u[6];
  v[7] = u[7];

  butterfly_0112_neon(cospi, 8, u[8], u[9], &v[8], &v[9], v_bit);
  butterfly_0130_neon(cospi, 8, u[12], u[13], &v[13], &v[12], v_bit);
  butterfly_0130_neon(cospi, 24, u[11], u[10], &v[10], &v[11], v_bit);
  butterfly_0130_neon(cospi, 24, u[14], u[15], &v[14], &v[15], v_bit);

  // stage 7: add/sub butterflies.
  u[0] = vaddq_s32(v[0], v[8]);
  u[1] = vaddq_s32(v[1], v[9]);
  u[2] = vaddq_s32(v[2], v[10]);
  u[3] = vaddq_s32(v[3], v[11]);
  u[4] = vaddq_s32(v[4], v[12]);
  u[5] = vaddq_s32(v[5], v[13]);
  u[6] = vaddq_s32(v[6], v[14]);
  u[7] = vsubq_s32(v[15], v[7]);
  u[8] = vsubq_s32(v[0], v[8]);
  u[9] = vsubq_s32(v[1], v[9]);
  u[10] = vsubq_s32(v[2], v[10]);
  u[11] = vsubq_s32(v[3], v[11]);
  u[12] = vsubq_s32(v[4], v[12]);
  u[13] = vsubq_s32(v[5], v[13]);
  u[14] = vsubq_s32(v[6], v[14]);
  u[15] = vaddq_s32(v[7], v[15]);

  // stage 8: final odd-cosine rotations.
  butterfly_0112_neon(cospi, 2, u[0], u[1], &v[0], &v[1], v_bit);
  butterfly_0112_neon(cospi, 10, u[2], u[3], &v[2], &v[3], v_bit);
  butterfly_0112_neon(cospi, 18, u[4], u[5], &v[4], &v[5], v_bit);
  butterfly_0112_neon(cospi, 26, u[6], u[7], &v[6], &v[7], v_bit);
  butterfly_0130_neon(cospi, 30, u[9], u[8], &v[8], &v[9], v_bit);
  butterfly_0130_neon(cospi, 22, u[11], u[10], &v[10], &v[11], v_bit);
  butterfly_0130_neon(cospi, 14, u[13], u[12], &v[12], &v[13], v_bit);
  butterfly_0112_neon(cospi, 6, u[14], u[15], &v[15], &v[14], v_bit);

  // stage 9: output permutation specific to fadst16.
  out[0] = v[1];
  out[1] = v[14];
  out[2] = v[3];
  out[3] = v[12];
  out[4] = v[5];
  out[5] = v[10];
  out[6] = v[7];
  out[7] = v[8];
  out[8] = v[9];
  out[9] = v[6];
  out[10] = v[11];
  out[11] = v[4];
  out[12] = v[13];
  out[13] = v[2];
  out[14] = v[15];
  out[15] = v[0];
}
   1005 
   1006 static void highbd_fidentity16_x4_neon(const int32x4_t *in, int32x4_t *out,
   1007                                       int bit) {
   1008  (void)bit;
   1009  const int32x4_t fact = vdupq_n_s32(2 * NewSqrt2);
   1010  const int32x4_t offset = vdupq_n_s32(1 << (NewSqrt2Bits - 1));
   1011 
   1012  for (int i = 0; i < 16; i++) {
   1013    int32x4_t a = vmulq_s32(in[i], fact);
   1014    a = vaddq_s32(a, offset);
   1015    out[i] = vshrq_n_s32(a, NewSqrt2Bits);
   1016  }
   1017 }
   1018 
   1019 static void highbd_fdct16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
   1020                                  const int howmany) {
   1021  const int stride = 16;
   1022  int i = 0;
   1023  do {
   1024    highbd_fdct16_x4_neon(in + i * stride, out + i * stride, bit);
   1025  } while (++i < howmany);
   1026 }
   1027 
   1028 static void highbd_fadst16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
   1029                                   int howmany) {
   1030  const int stride = 16;
   1031  int i = 0;
   1032  do {
   1033    highbd_fadst16_x4_neon(in + i * stride, out + i * stride, bit);
   1034  } while (++i < howmany);
   1035 }
   1036 
   1037 static void highbd_fidentity16_xn_neon(const int32x4_t *in, int32x4_t *out,
   1038                                       int bit, int howmany) {
   1039  const int stride = 16;
   1040  int i = 0;
   1041  do {
   1042    highbd_fidentity16_x4_neon(in + i * stride, out + i * stride, bit);
   1043  } while (++i < howmany);
   1044 }
   1045 
   1046 void av1_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *coeff, int stride,
   1047                               TX_TYPE tx_type, int bd) {
   1048  (void)bd;
   1049  int ud_flip, lr_flip;
   1050  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   1051  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
   1052 
   1053  // Workspaces for column/row-wise transforms.
   1054  int32x4_t buf0[64], buf1[64];
   1055 
   1056  switch (tx_type) {
   1057    case DCT_DCT:
   1058      load_buffer_16x16(input, buf0, stride, 0);
   1059      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
   1060      shift_right_2_round_s32_x4(buf0, buf0, 64);
   1061      transpose_arrays_s32_16x16(buf0, buf1);
   1062      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
   1063      store_buffer_16x16(buf1, coeff, /*stride=*/16);
   1064      break;
   1065    case ADST_DCT:
   1066      load_buffer_16x16(input, buf0, stride, 0);
   1067      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
   1068      shift_right_2_round_s32_x4(buf0, buf0, 64);
   1069      transpose_arrays_s32_16x16(buf0, buf1);
   1070      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
   1071      store_buffer_16x16(buf1, coeff, /*stride=*/16);
   1072      break;
   1073    case DCT_ADST:
   1074      load_buffer_16x16(input, buf0, stride, 0);
   1075      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
   1076      shift_right_2_round_s32_x4(buf0, buf0, 64);
   1077      transpose_arrays_s32_16x16(buf0, buf1);
   1078      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
   1079      store_buffer_16x16(buf1, coeff, /*stride=*/16);
   1080      break;
   1081    case ADST_ADST:
   1082      load_buffer_16x16(input, buf0, stride, 0);
   1083      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
   1084      shift_right_2_round_s32_x4(buf0, buf0, 64);
   1085      transpose_arrays_s32_16x16(buf0, buf1);
   1086      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
   1087      store_buffer_16x16(buf1, coeff, /*stride=*/16);
   1088      break;
   1089    case FLIPADST_DCT:
   1090      load_buffer_16x16(input, buf0, stride, 0);
   1091      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
   1092      shift_right_2_round_s32_x4(buf0, buf0, 64);
   1093      transpose_arrays_s32_16x16(buf0, buf1);
   1094      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
   1095      store_buffer_16x16(buf1, coeff, /*stride=*/16);
   1096      break;
   1097    case DCT_FLIPADST:
   1098      load_buffer_16x16(input, buf0, stride, 1);
   1099      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
   1100      shift_right_2_round_s32_x4(buf0, buf0, 64);
   1101      transpose_arrays_s32_16x16(buf0, buf1);
   1102      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
   1103      store_buffer_16x16(buf1, coeff, /*stride=*/16);
   1104      break;
   1105    case FLIPADST_FLIPADST:
   1106      load_buffer_16x16(input, buf0, stride, 1);
   1107      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
   1108      shift_right_2_round_s32_x4(buf0, buf0, 64);
   1109      transpose_arrays_s32_16x16(buf0, buf1);
   1110      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
   1111      store_buffer_16x16(buf1, coeff, /*stride=*/16);
   1112      break;
   1113    case ADST_FLIPADST:
   1114      load_buffer_16x16(input, buf0, stride, 1);
   1115      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
   1116      shift_right_2_round_s32_x4(buf0, buf0, 64);
   1117      transpose_arrays_s32_16x16(buf0, buf1);
   1118      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
   1119      store_buffer_16x16(buf1, coeff, /*stride=*/16);
   1120      break;
   1121    case FLIPADST_ADST:
   1122      load_buffer_16x16(input, buf0, stride, 0);
   1123      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
   1124      shift_right_2_round_s32_x4(buf0, buf0, 64);
   1125      transpose_arrays_s32_16x16(buf0, buf1);
   1126      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
   1127      store_buffer_16x16(buf1, coeff, /*stride=*/16);
   1128      break;
   1129    case IDTX:
   1130      load_buffer_16x16(input, buf0, stride, 0);
   1131      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
   1132      shift_right_2_round_s32_x4(buf0, buf0, 64);
   1133      transpose_arrays_s32_16x16(buf0, buf1);
   1134      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
   1135      store_buffer_16x16(buf1, coeff, /*stride=*/16);
   1136      break;
   1137    case V_DCT:
   1138      load_buffer_16x16(input, buf0, stride, 0);
   1139      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
   1140      shift_right_2_round_s32_x4(buf0, buf0, 64);
   1141      transpose_arrays_s32_16x16(buf0, buf1);
   1142      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
   1143      store_buffer_16x16(buf1, coeff, /*stride=*/16);
   1144      break;
   1145    case H_DCT:
   1146      load_buffer_16x16(input, buf0, stride, 0);
   1147      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
   1148      shift_right_2_round_s32_x4(buf0, buf0, 64);
   1149      transpose_arrays_s32_16x16(buf0, buf1);
   1150      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
   1151      store_buffer_16x16(buf1, coeff, /*stride=*/16);
   1152      break;
   1153    case V_ADST:
   1154      load_buffer_16x16(input, buf0, stride, 0);
   1155      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
   1156      shift_right_2_round_s32_x4(buf0, buf0, 64);
   1157      transpose_arrays_s32_16x16(buf0, buf1);
   1158      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
   1159      store_buffer_16x16(buf1, coeff, /*stride=*/16);
   1160      break;
   1161    case H_ADST:
   1162      load_buffer_16x16(input, buf0, stride, 0);
   1163      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
   1164      shift_right_2_round_s32_x4(buf0, buf0, 64);
   1165      transpose_arrays_s32_16x16(buf0, buf1);
   1166      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
   1167      store_buffer_16x16(buf1, coeff, /*stride=*/16);
   1168      break;
   1169    case V_FLIPADST:
   1170      load_buffer_16x16(input, buf0, stride, 0);
   1171      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
   1172      shift_right_2_round_s32_x4(buf0, buf0, 64);
   1173      transpose_arrays_s32_16x16(buf0, buf1);
   1174      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
   1175      store_buffer_16x16(buf1, coeff, /*stride=*/16);
   1176      break;
   1177    case H_FLIPADST:
   1178      load_buffer_16x16(input, buf0, stride, 1);
   1179      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
   1180      shift_right_2_round_s32_x4(buf0, buf0, 64);
   1181      transpose_arrays_s32_16x16(buf0, buf1);
   1182      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
   1183      store_buffer_16x16(buf1, coeff, /*stride=*/16);
   1184      break;
   1185    default: assert(0);
   1186  }
   1187 }
   1188 
// Function-pointer types for the 1D kernel dispatch tables below.

// Column transform over one group of 4 columns: reads int16 input with
// `stride` elements per row, optionally mirrored left/right (`lr_flip`),
// and writes int32x4 vectors to `out`.
typedef void (*fwd_transform_1d_col_neon)(const int16_t *in, int32x4_t *out,
                                          int stride, int bit, int lr_flip);
// Column transform over `howmany` adjacent 4-column groups; successive
// output groups are `hm_stride` vectors apart.
typedef void (*fwd_transform_1d_col_many_neon)(const int16_t *in,
                                               int32x4_t *out, int stride,
                                               int bit, int lr_flip,
                                               int howmany, int hm_stride);

// Row transform over one group, storing int32 coefficients with the given
// output `stride`.
typedef void (*fwd_transform_1d_row_neon)(const int32x4_t *in, int32_t *out,
                                          int bit, int stride);
// Row transform over `howmany` groups, `hm_stride` input vectors apart.
typedef void (*fwd_transform_1d_row_many_neon)(const int32x4_t *in,
                                               int32_t *out, int bit,
                                               int howmany, int hm_stride,
                                               int stride);
   1202 
   1203 // Construct component kernels that include the load_buffer and store_buffer
   1204 // stages to avoid the need to spill loaded data to the stack between these and
   1205 // the txfm kernel calls.
   1206 // The TRANSFORM_*_ONE cases are only ever called in situations where the
   1207 // howmany parameter would be one, so no need for the loop at all in these
   1208 // cases.
   1209 
// Column kernel for a single 4-wide group of `n` rows: load then transform.
#define TRANSFORM_COL_ONE(name, n)                                    \
  static void highbd_##name##_col_neon(const int16_t *input,          \
                                       int32x4_t *output, int stride, \
                                       int cos_bit, int lr_flip) {    \
    int32x4_t buf0[n];                                                \
    load_buffer_4x##n(input, buf0, stride, lr_flip);                  \
    highbd_##name##_x4_neon(buf0, output, cos_bit);                   \
  }

// Column kernel applied to `howmany` adjacent 4-wide groups; successive
// output groups are written `hm_stride` vectors apart.
#define TRANSFORM_COL_MANY(name, n)                                     \
  static void highbd_##name##_col_many_neon(                            \
      const int16_t *input, int32x4_t *output, int stride, int cos_bit, \
      int lr_flip, int howmany, int hm_stride) {                        \
    int i = 0;                                                          \
    do {                                                                \
      int32x4_t buf0[n];                                                \
      load_buffer_4x##n(input + 4 * i, buf0, stride, lr_flip);          \
      highbd_##name##_x4_neon(buf0, output + i * hm_stride, cos_bit);   \
    } while (++i < howmany);                                            \
  }

// Row kernel for a single group: transform then store `n`x4 coefficients.
#define TRANSFORM_ROW_ONE(name, n)                                        \
  static void highbd_##name##_row_neon(                                   \
      const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
    int32x4_t buf0[n];                                                    \
    highbd_##name##_x4_neon(input, buf0, cos_bit);                        \
    store_buffer_##n##x4(buf0, output, stride);                           \
  }

// As TRANSFORM_ROW_ONE, plus the extra rounding pass
// (round_rect_array_s32_neon) used by rectangular transform sizes.
#define TRANSFORM_ROW_RECT_ONE(name, n)                                   \
  static void highbd_##name##_row_rect_neon(                              \
      const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
    int32x4_t buf0[n];                                                    \
    highbd_##name##_x4_neon(input, buf0, cos_bit);                        \
    round_rect_array_s32_neon(buf0, buf0, (n));                           \
    store_buffer_##n##x4(buf0, output, stride);                           \
  }

// Row kernel applied to `howmany` groups, `hm_stride` input vectors apart
// and 4 output columns apart.
#define TRANSFORM_ROW_MANY(name, n)                                      \
  static void highbd_##name##_row_many_neon(                             \
      const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
      int hm_stride, int stride) {                                       \
    int i = 0;                                                           \
    do {                                                                 \
      int32x4_t buf0[n];                                                 \
      highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit);     \
      store_buffer_##n##x4(buf0, output + 4 * i, stride);                \
    } while (++i < howmany);                                             \
  }

// As TRANSFORM_ROW_MANY, plus the rectangular rounding pass.
#define TRANSFORM_ROW_RECT_MANY(name, n)                                 \
  static void highbd_##name##_row_rect_many_neon(                        \
      const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
      int hm_stride, int stride) {                                       \
    int i = 0;                                                           \
    do {                                                                 \
      int32x4_t buf0[n];                                                 \
      highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit);     \
      round_rect_array_s32_neon(buf0, buf0, (n));                        \
      store_buffer_##n##x4(buf0, output + 4 * i, stride);                \
    } while (++i < howmany);                                             \
  }
   1272 
// Instantiate the fused load/transform/store kernels for each 1D
// transform type and size used by the dispatch tables below.
TRANSFORM_COL_ONE(fdct8, 8)
TRANSFORM_COL_ONE(fadst8, 8)
TRANSFORM_COL_ONE(fidentity8, 8)

TRANSFORM_COL_MANY(fdct4, 4)
TRANSFORM_COL_MANY(fdct8, 8)
TRANSFORM_COL_MANY(fdct16, 16)
TRANSFORM_COL_MANY(fadst4, 4)
TRANSFORM_COL_MANY(fadst8, 8)
TRANSFORM_COL_MANY(fadst16, 16)
TRANSFORM_COL_MANY(fidentity4, 4)
TRANSFORM_COL_MANY(fidentity8, 8)
TRANSFORM_COL_MANY(fidentity16, 16)

TRANSFORM_ROW_ONE(fdct16, 16)
TRANSFORM_ROW_ONE(fadst16, 16)
TRANSFORM_ROW_ONE(fidentity16, 16)

TRANSFORM_ROW_RECT_ONE(fdct8, 8)
TRANSFORM_ROW_RECT_ONE(fadst8, 8)
TRANSFORM_ROW_RECT_ONE(fidentity8, 8)

// These non-rect row kernels are only referenced by code paths compiled
// when realtime-only builds are disabled.
#if !CONFIG_REALTIME_ONLY
TRANSFORM_ROW_MANY(fdct4, 4)
TRANSFORM_ROW_MANY(fdct8, 8)
TRANSFORM_ROW_MANY(fadst4, 4)
TRANSFORM_ROW_MANY(fadst8, 8)
TRANSFORM_ROW_MANY(fidentity4, 4)
TRANSFORM_ROW_MANY(fidentity8, 8)
#endif

TRANSFORM_ROW_RECT_MANY(fdct4, 4)
TRANSFORM_ROW_RECT_MANY(fdct8, 8)
TRANSFORM_ROW_RECT_MANY(fdct16, 16)
TRANSFORM_ROW_RECT_MANY(fadst4, 4)
TRANSFORM_ROW_RECT_MANY(fadst8, 8)
TRANSFORM_ROW_RECT_MANY(fadst16, 16)
TRANSFORM_ROW_RECT_MANY(fidentity4, 4)
TRANSFORM_ROW_RECT_MANY(fidentity8, 8)
TRANSFORM_ROW_RECT_MANY(fidentity16, 16)
   1313 
// TX_TYPE -> 8-point column (vertical) multi-group kernel. The vertical
// half of the TX_TYPE name selects the kernel; FLIPADST entries use the
// plain fadst kernel (the flip itself is applied via the load/input
// adjustments).
static const fwd_transform_1d_col_many_neon
    col_highbd_txfm8_xn_arr[TX_TYPES] = {
      highbd_fdct8_col_many_neon,       // DCT_DCT
      highbd_fadst8_col_many_neon,      // ADST_DCT
      highbd_fdct8_col_many_neon,       // DCT_ADST
      highbd_fadst8_col_many_neon,      // ADST_ADST
      highbd_fadst8_col_many_neon,      // FLIPADST_DCT
      highbd_fdct8_col_many_neon,       // DCT_FLIPADST
      highbd_fadst8_col_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst8_col_many_neon,      // ADST_FLIPADST
      highbd_fadst8_col_many_neon,      // FLIPADST_ADST
      highbd_fidentity8_col_many_neon,  // IDTX
      highbd_fdct8_col_many_neon,       // V_DCT
      highbd_fidentity8_col_many_neon,  // H_DCT
      highbd_fadst8_col_many_neon,      // V_ADST
      highbd_fidentity8_col_many_neon,  // H_ADST
      highbd_fadst8_col_many_neon,      // V_FLIPADST
      highbd_fidentity8_col_many_neon   // H_FLIPADST
    };
   1333 
// TX_TYPE -> 8-point column (vertical) single-group kernel; same mapping
// as col_highbd_txfm8_xn_arr but for the one-group variants.
static const fwd_transform_1d_col_neon col_highbd_txfm8_x4_arr[TX_TYPES] = {
  highbd_fdct8_col_neon,       // DCT_DCT
  highbd_fadst8_col_neon,      // ADST_DCT
  highbd_fdct8_col_neon,       // DCT_ADST
  highbd_fadst8_col_neon,      // ADST_ADST
  highbd_fadst8_col_neon,      // FLIPADST_DCT
  highbd_fdct8_col_neon,       // DCT_FLIPADST
  highbd_fadst8_col_neon,      // FLIPADST_FLIPADST
  highbd_fadst8_col_neon,      // ADST_FLIPADST
  highbd_fadst8_col_neon,      // FLIPADST_ADST
  highbd_fidentity8_col_neon,  // IDTX
  highbd_fdct8_col_neon,       // V_DCT
  highbd_fidentity8_col_neon,  // H_DCT
  highbd_fadst8_col_neon,      // V_ADST
  highbd_fidentity8_col_neon,  // H_ADST
  highbd_fadst8_col_neon,      // V_FLIPADST
  highbd_fidentity8_col_neon   // H_FLIPADST
};
   1352 
// TX_TYPE -> 16-point column (vertical) multi-group kernel; the vertical
// half of the TX_TYPE name selects the kernel.
static const fwd_transform_1d_col_many_neon
    col_highbd_txfm16_xn_arr[TX_TYPES] = {
      highbd_fdct16_col_many_neon,       // DCT_DCT
      highbd_fadst16_col_many_neon,      // ADST_DCT
      highbd_fdct16_col_many_neon,       // DCT_ADST
      highbd_fadst16_col_many_neon,      // ADST_ADST
      highbd_fadst16_col_many_neon,      // FLIPADST_DCT
      highbd_fdct16_col_many_neon,       // DCT_FLIPADST
      highbd_fadst16_col_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst16_col_many_neon,      // ADST_FLIPADST
      highbd_fadst16_col_many_neon,      // FLIPADST_ADST
      highbd_fidentity16_col_many_neon,  // IDTX
      highbd_fdct16_col_many_neon,       // V_DCT
      highbd_fidentity16_col_many_neon,  // H_DCT
      highbd_fadst16_col_many_neon,      // V_ADST
      highbd_fidentity16_col_many_neon,  // H_ADST
      highbd_fadst16_col_many_neon,      // V_FLIPADST
      highbd_fidentity16_col_many_neon   // H_FLIPADST
    };
   1372 
// TX_TYPE -> 4-point column (vertical) multi-group kernel; the vertical
// half of the TX_TYPE name selects the kernel.
static const fwd_transform_1d_col_many_neon
    col_highbd_txfm4_xn_arr[TX_TYPES] = {
      highbd_fdct4_col_many_neon,       // DCT_DCT
      highbd_fadst4_col_many_neon,      // ADST_DCT
      highbd_fdct4_col_many_neon,       // DCT_ADST
      highbd_fadst4_col_many_neon,      // ADST_ADST
      highbd_fadst4_col_many_neon,      // FLIPADST_DCT
      highbd_fdct4_col_many_neon,       // DCT_FLIPADST
      highbd_fadst4_col_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst4_col_many_neon,      // ADST_FLIPADST
      highbd_fadst4_col_many_neon,      // FLIPADST_ADST
      highbd_fidentity4_col_many_neon,  // IDTX
      highbd_fdct4_col_many_neon,       // V_DCT
      highbd_fidentity4_col_many_neon,  // H_DCT
      highbd_fadst4_col_many_neon,      // V_ADST
      highbd_fidentity4_col_many_neon,  // H_ADST
      highbd_fadst4_col_many_neon,      // V_FLIPADST
      highbd_fidentity4_col_many_neon   // H_FLIPADST
    };
   1392 
// Row (horizontal) pass dispatch for 16-point transforms, indexed by
// TX_TYPE. The row pass uses the *second* half of the TX_TYPE name
// (e.g. DCT_ADST applies ADST horizontally); V_* types are vertical-only
// and therefore use the identity transform here.
static const fwd_transform_1d_row_neon row_highbd_txfm16_xn_arr[TX_TYPES] = {
  highbd_fdct16_row_neon,       // DCT_DCT
  highbd_fdct16_row_neon,       // ADST_DCT
  highbd_fadst16_row_neon,      // DCT_ADST
  highbd_fadst16_row_neon,      // ADST_ADST
  highbd_fdct16_row_neon,       // FLIPADST_DCT
  highbd_fadst16_row_neon,      // DCT_FLIPADST
  highbd_fadst16_row_neon,      // FLIPADST_FLIPADST
  highbd_fadst16_row_neon,      // ADST_FLIPADST
  highbd_fadst16_row_neon,      // FLIPADST_ADST
  highbd_fidentity16_row_neon,  // IDTX
  highbd_fidentity16_row_neon,  // V_DCT
  highbd_fdct16_row_neon,       // H_DCT
  highbd_fidentity16_row_neon,  // V_ADST
  highbd_fadst16_row_neon,      // H_ADST
  highbd_fidentity16_row_neon,  // V_FLIPADST
  highbd_fadst16_row_neon       // H_FLIPADST
};
   1411 
// Row pass dispatch for 16-point transforms on rectangular blocks, indexed
// by TX_TYPE. The "_rect_" kernels fold in the extra 1/sqrt(2) scaling that
// rectangular transform sizes require.
static const fwd_transform_1d_row_many_neon
    row_rect_highbd_txfm16_xn_arr[TX_TYPES] = {
      highbd_fdct16_row_rect_many_neon,       // DCT_DCT
      highbd_fdct16_row_rect_many_neon,       // ADST_DCT
      highbd_fadst16_row_rect_many_neon,      // DCT_ADST
      highbd_fadst16_row_rect_many_neon,      // ADST_ADST
      highbd_fdct16_row_rect_many_neon,       // FLIPADST_DCT
      highbd_fadst16_row_rect_many_neon,      // DCT_FLIPADST
      highbd_fadst16_row_rect_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst16_row_rect_many_neon,      // ADST_FLIPADST
      highbd_fadst16_row_rect_many_neon,      // FLIPADST_ADST
      highbd_fidentity16_row_rect_many_neon,  // IDTX
      highbd_fidentity16_row_rect_many_neon,  // V_DCT
      highbd_fdct16_row_rect_many_neon,       // H_DCT
      highbd_fidentity16_row_rect_many_neon,  // V_ADST
      highbd_fadst16_row_rect_many_neon,      // H_ADST
      highbd_fidentity16_row_rect_many_neon,  // V_FLIPADST
      highbd_fadst16_row_rect_many_neon       // H_FLIPADST
    };
   1431 
#if !CONFIG_REALTIME_ONLY
// Row (horizontal) pass dispatch for 8-point transforms, indexed by TX_TYPE.
// Only needed by transform sizes that are compiled out in realtime-only
// builds, hence the guard.
static const fwd_transform_1d_row_many_neon
    row_highbd_txfm8_xn_arr[TX_TYPES] = {
      highbd_fdct8_row_many_neon,       // DCT_DCT
      highbd_fdct8_row_many_neon,       // ADST_DCT
      highbd_fadst8_row_many_neon,      // DCT_ADST
      highbd_fadst8_row_many_neon,      // ADST_ADST
      highbd_fdct8_row_many_neon,       // FLIPADST_DCT
      highbd_fadst8_row_many_neon,      // DCT_FLIPADST
      highbd_fadst8_row_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst8_row_many_neon,      // ADST_FLIPADST
      highbd_fadst8_row_many_neon,      // FLIPADST_ADST
      highbd_fidentity8_row_many_neon,  // IDTX
      highbd_fidentity8_row_many_neon,  // V_DCT
      highbd_fdct8_row_many_neon,       // H_DCT
      highbd_fidentity8_row_many_neon,  // V_ADST
      highbd_fadst8_row_many_neon,      // H_ADST
      highbd_fidentity8_row_many_neon,  // V_FLIPADST
      highbd_fadst8_row_many_neon       // H_FLIPADST
    };
#endif
   1453 
// Row pass dispatch for 8-point transforms on rectangular blocks, indexed
// by TX_TYPE. The "_rect_" kernels include the extra rectangular-block
// scaling.
static const fwd_transform_1d_row_many_neon
    row_rect_highbd_txfm8_xn_arr[TX_TYPES] = {
      highbd_fdct8_row_rect_many_neon,       // DCT_DCT
      highbd_fdct8_row_rect_many_neon,       // ADST_DCT
      highbd_fadst8_row_rect_many_neon,      // DCT_ADST
      highbd_fadst8_row_rect_many_neon,      // ADST_ADST
      highbd_fdct8_row_rect_many_neon,       // FLIPADST_DCT
      highbd_fadst8_row_rect_many_neon,      // DCT_FLIPADST
      highbd_fadst8_row_rect_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst8_row_rect_many_neon,      // ADST_FLIPADST
      highbd_fadst8_row_rect_many_neon,      // FLIPADST_ADST
      highbd_fidentity8_row_rect_many_neon,  // IDTX
      highbd_fidentity8_row_rect_many_neon,  // V_DCT
      highbd_fdct8_row_rect_many_neon,       // H_DCT
      highbd_fidentity8_row_rect_many_neon,  // V_ADST
      highbd_fadst8_row_rect_many_neon,      // H_ADST
      highbd_fidentity8_row_rect_many_neon,  // V_FLIPADST
      highbd_fadst8_row_rect_many_neon       // H_FLIPADST
    };
   1473 
// Row pass dispatch for 8-point transforms processing 4 rows at a time.
// NOTE: despite the plain array name, every entry is the "_rect_" variant —
// this table is used for rectangular block shapes that need the extra
// scaling in the row pass.
static const fwd_transform_1d_row_neon row_highbd_txfm8_x4_arr[TX_TYPES] = {
  highbd_fdct8_row_rect_neon,       // DCT_DCT
  highbd_fdct8_row_rect_neon,       // ADST_DCT
  highbd_fadst8_row_rect_neon,      // DCT_ADST
  highbd_fadst8_row_rect_neon,      // ADST_ADST
  highbd_fdct8_row_rect_neon,       // FLIPADST_DCT
  highbd_fadst8_row_rect_neon,      // DCT_FLIPADST
  highbd_fadst8_row_rect_neon,      // FLIPADST_FLIPADST
  highbd_fadst8_row_rect_neon,      // ADST_FLIPADST
  highbd_fadst8_row_rect_neon,      // FLIPADST_ADST
  highbd_fidentity8_row_rect_neon,  // IDTX
  highbd_fidentity8_row_rect_neon,  // V_DCT
  highbd_fdct8_row_rect_neon,       // H_DCT
  highbd_fidentity8_row_rect_neon,  // V_ADST
  highbd_fadst8_row_rect_neon,      // H_ADST
  highbd_fidentity8_row_rect_neon,  // V_FLIPADST
  highbd_fadst8_row_rect_neon       // H_FLIPADST
};
   1492 
#if !CONFIG_REALTIME_ONLY
// Row (horizontal) pass dispatch for 4-point transforms, indexed by TX_TYPE.
// Only required by transform sizes excluded from realtime-only builds.
static const fwd_transform_1d_row_many_neon
    row_highbd_txfm4_xn_arr[TX_TYPES] = {
      highbd_fdct4_row_many_neon,       // DCT_DCT
      highbd_fdct4_row_many_neon,       // ADST_DCT
      highbd_fadst4_row_many_neon,      // DCT_ADST
      highbd_fadst4_row_many_neon,      // ADST_ADST
      highbd_fdct4_row_many_neon,       // FLIPADST_DCT
      highbd_fadst4_row_many_neon,      // DCT_FLIPADST
      highbd_fadst4_row_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst4_row_many_neon,      // ADST_FLIPADST
      highbd_fadst4_row_many_neon,      // FLIPADST_ADST
      highbd_fidentity4_row_many_neon,  // IDTX
      highbd_fidentity4_row_many_neon,  // V_DCT
      highbd_fdct4_row_many_neon,       // H_DCT
      highbd_fidentity4_row_many_neon,  // V_ADST
      highbd_fadst4_row_many_neon,      // H_ADST
      highbd_fidentity4_row_many_neon,  // V_FLIPADST
      highbd_fadst4_row_many_neon       // H_FLIPADST
    };
#endif
   1514 
// Row pass dispatch for 4-point transforms on rectangular blocks, indexed
// by TX_TYPE. The "_rect_" kernels include the extra rectangular-block
// scaling.
static const fwd_transform_1d_row_many_neon
    row_rect_highbd_txfm4_xn_arr[TX_TYPES] = {
      highbd_fdct4_row_rect_many_neon,       // DCT_DCT
      highbd_fdct4_row_rect_many_neon,       // ADST_DCT
      highbd_fadst4_row_rect_many_neon,      // DCT_ADST
      highbd_fadst4_row_rect_many_neon,      // ADST_ADST
      highbd_fdct4_row_rect_many_neon,       // FLIPADST_DCT
      highbd_fadst4_row_rect_many_neon,      // DCT_FLIPADST
      highbd_fadst4_row_rect_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst4_row_rect_many_neon,      // ADST_FLIPADST
      highbd_fadst4_row_rect_many_neon,      // FLIPADST_ADST
      highbd_fidentity4_row_rect_many_neon,  // IDTX
      highbd_fidentity4_row_rect_many_neon,  // V_DCT
      highbd_fdct4_row_rect_many_neon,       // H_DCT
      highbd_fidentity4_row_rect_many_neon,  // V_ADST
      highbd_fadst4_row_rect_many_neon,      // H_ADST
      highbd_fidentity4_row_rect_many_neon,  // V_FLIPADST
      highbd_fadst4_row_rect_many_neon       // H_FLIPADST
    };
   1534 
// Forward 32-point DCT computed for four independent columns at once, one
// column per lane of each int32x4_t. The butterfly network runs in nine
// stages, ping-ponging results between the buf0/buf1 workspaces; the final
// stage writes the 32 coefficients out in permuted (bit-reversal style)
// order. `cos_bit` selects the fixed-point cosine table and the rounding
// shift applied inside the butterfly helpers.
static void highbd_fdct32_x4_neon(const int32x4_t *input, int32x4_t *output,
                                  int cos_bit) {
  const int32_t *const cospi = cospi_arr_s32(cos_bit);
  // Negative shift amount: vshlq with a negative count performs the
  // right-shift used for fixed-point rounding in the butterfly helpers.
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // Workspaces for intermediate transform steps.
  int32x4_t buf0[32];
  int32x4_t buf1[32];

  // stage 1
  butterfly_dct_pre(input, buf1, 32);

  // stage 2
  butterfly_dct_pre(buf1, buf0, 16);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  buf0[18] = buf1[18];
  buf0[19] = buf1[19];
  butterfly_0112_neon(cospi, 32, buf1[27], buf1[20], &buf0[27], &buf0[20],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 32, buf1[26], buf1[21], &buf0[26], &buf0[21],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 32, buf1[25], buf1[22], &buf0[25], &buf0[22],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 32, buf1[24], buf1[23], &buf0[24], &buf0[23],
                      v_cos_bit);
  buf0[28] = buf1[28];
  buf0[29] = buf1[29];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 3
  butterfly_dct_pre(buf0, buf1, 8);
  buf1[8] = buf0[8];
  buf1[9] = buf0[9];
  butterfly_0112_neon(cospi, 32, buf0[13], buf0[10], &buf1[13], &buf1[10],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 32, buf0[12], buf0[11], &buf1[12], &buf1[11],
                      v_cos_bit);
  buf1[14] = buf0[14];
  buf1[15] = buf0[15];
  butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 16);

  // stage 4
  butterfly_dct_pre(buf1, buf0, 4);
  buf0[4] = buf1[4];
  butterfly_0112_neon(cospi, 32, buf1[6], buf1[5], &buf0[6], &buf0[5],
                      v_cos_bit);
  buf0[7] = buf1[7];
  butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 8);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  butterfly_0112_neon(cospi, 16, buf1[29], buf1[18], &buf0[29], &buf0[18],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 16, buf1[28], buf1[19], &buf0[28], &buf0[19],
                      v_cos_bit);
  butterfly_2312_neon(cospi, 16, buf1[27], buf1[20], &buf0[20], &buf0[27],
                      v_cos_bit);
  butterfly_2312_neon(cospi, 16, buf1[26], buf1[21], &buf0[21], &buf0[26],
                      v_cos_bit);
  buf0[22] = buf1[22];
  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[25] = buf1[25];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 5
  butterfly_0112_neon(cospi, 32, buf0[0], buf0[1], &buf1[0], &buf1[1],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 16, buf0[3], buf0[2], &buf1[2], &buf1[3],
                      v_cos_bit);
  butterfly_dct_post(buf0 + 4, buf0 + 4, buf1 + 4, 4);
  buf1[8] = buf0[8];
  butterfly_0112_neon(cospi, 16, buf0[14], buf0[9], &buf1[14], &buf1[9],
                      v_cos_bit);
  butterfly_2312_neon(cospi, 16, buf0[13], buf0[10], &buf1[10], &buf1[13],
                      v_cos_bit);
  buf1[11] = buf0[11];
  buf1[12] = buf0[12];
  buf1[15] = buf0[15];
  butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 8);
  butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 8);

  // stage 6
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];

  butterfly_0112_neon(cospi, 8, buf1[7], buf1[4], &buf0[4], &buf0[7],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 8, buf1[30], buf1[17], &buf0[30], &buf0[17],
                      v_cos_bit);
  butterfly_2312_neon(cospi, 8, buf1[29], buf1[18], &buf0[18], &buf0[29],
                      v_cos_bit);
  butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 4);
  butterfly_dct_post(buf1 + 12, buf1 + 12, buf0 + 12, 4);
  buf0[16] = buf1[16];
  buf0[19] = buf1[19];
  buf0[20] = buf1[20];

  butterfly_0130_neon(cospi, 24, buf1[5], buf1[6], &buf0[5], &buf0[6],
                      v_cos_bit);
  butterfly_0130_neon(cospi, 24, buf1[21], buf1[26], &buf0[26], &buf0[21],
                      v_cos_bit);
  butterfly_0332_neon(cospi, 24, buf1[25], buf1[22], &buf0[25], &buf0[22],
                      v_cos_bit);

  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[27] = buf1[27];
  buf0[28] = buf1[28];
  buf0[31] = buf1[31];

  // stage 7
  buf1[0] = buf0[0];
  buf1[1] = buf0[1];
  buf1[2] = buf0[2];
  buf1[3] = buf0[3];
  buf1[4] = buf0[4];
  buf1[5] = buf0[5];
  buf1[6] = buf0[6];
  buf1[7] = buf0[7];
  butterfly_0112_neon(cospi, 4, buf0[15], buf0[8], &buf1[8], &buf1[15],
                      v_cos_bit);
  butterfly_0130_neon(cospi, 28, buf0[9], buf0[14], &buf1[9], &buf1[14],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 20, buf0[13], buf0[10], &buf1[10], &buf1[13],
                      v_cos_bit);
  butterfly_0130_neon(cospi, 12, buf0[11], buf0[12], &buf1[11], &buf1[12],
                      v_cos_bit);
  butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 4);
  butterfly_dct_post(buf0 + 20, buf0 + 20, buf1 + 20, 4);
  butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 4);
  butterfly_dct_post(buf0 + 28, buf0 + 28, buf1 + 28, 4);

  // stage 8
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];
  buf0[4] = buf1[4];
  buf0[5] = buf1[5];
  buf0[6] = buf1[6];
  buf0[7] = buf1[7];
  buf0[8] = buf1[8];
  buf0[9] = buf1[9];
  buf0[10] = buf1[10];
  buf0[11] = buf1[11];
  buf0[12] = buf1[12];
  buf0[13] = buf1[13];
  buf0[14] = buf1[14];
  buf0[15] = buf1[15];
  butterfly_0112_neon(cospi, 2, buf1[31], buf1[16], &buf0[16], &buf0[31],
                      v_cos_bit);
  butterfly_0130_neon(cospi, 30, buf1[17], buf1[30], &buf0[17], &buf0[30],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 18, buf1[29], buf1[18], &buf0[18], &buf0[29],
                      v_cos_bit);
  butterfly_0130_neon(cospi, 14, buf1[19], buf1[28], &buf0[19], &buf0[28],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 10, buf1[27], buf1[20], &buf0[20], &buf0[27],
                      v_cos_bit);
  butterfly_0130_neon(cospi, 22, buf1[21], buf1[26], &buf0[21], &buf0[26],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 26, buf1[25], buf1[22], &buf0[22], &buf0[25],
                      v_cos_bit);
  butterfly_0130_neon(cospi, 6, buf1[23], buf1[24], &buf0[23], &buf0[24],
                      v_cos_bit);

  // stage 9: write coefficients out in the final permuted order.
  output[0] = buf0[0];
  output[1] = buf0[16];
  output[2] = buf0[8];
  output[3] = buf0[24];
  output[4] = buf0[4];
  output[5] = buf0[20];
  output[6] = buf0[12];
  output[7] = buf0[28];
  output[8] = buf0[2];
  output[9] = buf0[18];
  output[10] = buf0[10];
  output[11] = buf0[26];
  output[12] = buf0[6];
  output[13] = buf0[22];
  output[14] = buf0[14];
  output[15] = buf0[30];
  output[16] = buf0[1];
  output[17] = buf0[17];
  output[18] = buf0[9];
  output[19] = buf0[25];
  output[20] = buf0[5];
  output[21] = buf0[21];
  output[22] = buf0[13];
  output[23] = buf0[29];
  output[24] = buf0[3];
  output[25] = buf0[19];
  output[26] = buf0[11];
  output[27] = buf0[27];
  output[28] = buf0[7];
  output[29] = buf0[23];
  output[30] = buf0[15];
  output[31] = buf0[31];
}
   1740 
// Forward 64-point DCT computed for four independent columns at once, one
// column per lane of each int32x4_t. The butterfly network runs in eleven
// stages, each writing a fresh workspace x1..x10; the final stage writes the
// 64 coefficients out in permuted (bit-reversal style) order. `cos_bit`
// selects the fixed-point cosine table and the rounding shift used inside
// the butterfly helpers. NOTE(review): unlike highbd_fdct32_x4_neon this
// takes `int8_t cos_bit`; both receive small positive shift counts.
static void highbd_fdct64_x4_neon(const int32x4_t *input, int32x4_t *output,
                                  int8_t cos_bit) {
  const int32_t *const cospi = cospi_arr_s32(cos_bit);
  // Negative shift amount: vshlq with a negative count performs the
  // right-shift used for fixed-point rounding in the butterfly helpers.
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int32x4_t x1[64];
  butterfly_dct_pre(input, x1, 64);

  // stage 2
  int32x4_t x2[64];
  butterfly_dct_pre(x1, x2, 32);
  x2[32] = x1[32];
  x2[33] = x1[33];
  x2[34] = x1[34];
  x2[35] = x1[35];
  x2[36] = x1[36];
  x2[37] = x1[37];
  x2[38] = x1[38];
  x2[39] = x1[39];
  butterfly_0112_neon(cospi, 32, x1[55], x1[40], &x2[55], &x2[40], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[54], x1[41], &x2[54], &x2[41], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[53], x1[42], &x2[53], &x2[42], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[52], x1[43], &x2[52], &x2[43], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[51], x1[44], &x2[51], &x2[44], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[50], x1[45], &x2[50], &x2[45], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[49], x1[46], &x2[49], &x2[46], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[48], x1[47], &x2[48], &x2[47], v_cos_bit);
  x2[56] = x1[56];
  x2[57] = x1[57];
  x2[58] = x1[58];
  x2[59] = x1[59];
  x2[60] = x1[60];
  x2[61] = x1[61];
  x2[62] = x1[62];
  x2[63] = x1[63];

  // stage 3
  int32x4_t x3[64];
  butterfly_dct_pre(x2, x3, 16);
  x3[16] = x2[16];
  x3[17] = x2[17];
  x3[18] = x2[18];
  x3[19] = x2[19];
  butterfly_0112_neon(cospi, 32, x2[27], x2[20], &x3[27], &x3[20], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x2[26], x2[21], &x3[26], &x3[21], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x2[25], x2[22], &x3[25], &x3[22], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x2[24], x2[23], &x3[24], &x3[23], v_cos_bit);
  x3[28] = x2[28];
  x3[29] = x2[29];
  x3[30] = x2[30];
  x3[31] = x2[31];
  butterfly_dct_post(x2 + 32, x2 + 32, x3 + 32, 32);

  // stage 4
  int32x4_t x4[64];
  butterfly_dct_pre(x3, x4, 8);
  x4[8] = x3[8];
  x4[9] = x3[9];
  butterfly_0112_neon(cospi, 32, x3[13], x3[10], &x4[13], &x4[10], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x3[12], x3[11], &x4[12], &x4[11], v_cos_bit);
  x4[14] = x3[14];
  x4[15] = x3[15];
  butterfly_dct_post(x3 + 16, x3 + 16, x4 + 16, 16);
  x4[32] = x3[32];
  x4[33] = x3[33];
  x4[34] = x3[34];
  x4[35] = x3[35];
  butterfly_0112_neon(cospi, 16, x3[59], x3[36], &x4[59], &x4[36], v_cos_bit);
  butterfly_0112_neon(cospi, 16, x3[58], x3[37], &x4[58], &x4[37], v_cos_bit);
  butterfly_0112_neon(cospi, 16, x3[57], x3[38], &x4[57], &x4[38], v_cos_bit);
  butterfly_0112_neon(cospi, 16, x3[56], x3[39], &x4[56], &x4[39], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x3[55], x3[40], &x4[40], &x4[55], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x3[54], x3[41], &x4[41], &x4[54], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x3[53], x3[42], &x4[42], &x4[53], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x3[52], x3[43], &x4[43], &x4[52], v_cos_bit);
  x4[44] = x3[44];
  x4[45] = x3[45];
  x4[46] = x3[46];
  x4[47] = x3[47];
  x4[48] = x3[48];
  x4[49] = x3[49];
  x4[50] = x3[50];
  x4[51] = x3[51];
  x4[60] = x3[60];
  x4[61] = x3[61];
  x4[62] = x3[62];
  x4[63] = x3[63];

  // stage 5
  int32x4_t x5[64];
  butterfly_dct_pre(x4, x5, 4);
  x5[4] = x4[4];
  butterfly_0112_neon(cospi, 32, x4[6], x4[5], &x5[6], &x5[5], v_cos_bit);
  x5[7] = x4[7];
  butterfly_dct_post(x4 + 8, x4 + 8, x5 + 8, 8);
  x5[16] = x4[16];
  x5[17] = x4[17];
  butterfly_0112_neon(cospi, 16, x4[29], x4[18], &x5[29], &x5[18], v_cos_bit);
  butterfly_0112_neon(cospi, 16, x4[28], x4[19], &x5[28], &x5[19], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x4[27], x4[20], &x5[20], &x5[27], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x4[26], x4[21], &x5[21], &x5[26], v_cos_bit);
  x5[22] = x4[22];
  x5[23] = x4[23];
  x5[24] = x4[24];
  x5[25] = x4[25];
  x5[30] = x4[30];
  x5[31] = x4[31];
  butterfly_dct_post(x4 + 32, x4 + 32, x5 + 32, 16);
  butterfly_dct_post(x4 + 48, x4 + 48, x5 + 48, 16);

  // stage 6
  int32x4_t x6[64];
  butterfly_0112_neon(cospi, 32, x5[0], x5[1], &x6[0], &x6[1], v_cos_bit);
  butterfly_0112_neon(cospi, 16, x5[3], x5[2], &x6[2], &x6[3], v_cos_bit);
  butterfly_dct_post(x5 + 4, x5 + 4, x6 + 4, 4);
  x6[8] = x5[8];
  butterfly_0112_neon(cospi, 16, x5[14], x5[9], &x6[14], &x6[9], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x5[13], x5[10], &x6[10], &x6[13], v_cos_bit);
  x6[11] = x5[11];
  x6[12] = x5[12];
  x6[15] = x5[15];
  butterfly_dct_post(x5 + 16, x5 + 16, x6 + 16, 8);
  butterfly_dct_post(x5 + 24, x5 + 24, x6 + 24, 8);
  x6[32] = x5[32];
  x6[33] = x5[33];
  butterfly_0112_neon(cospi, 8, x5[61], x5[34], &x6[61], &x6[34], v_cos_bit);
  butterfly_0112_neon(cospi, 8, x5[60], x5[35], &x6[60], &x6[35], v_cos_bit);
  butterfly_2312_neon(cospi, 8, x5[59], x5[36], &x6[36], &x6[59], v_cos_bit);
  butterfly_2312_neon(cospi, 8, x5[58], x5[37], &x6[37], &x6[58], v_cos_bit);
  x6[38] = x5[38];
  x6[39] = x5[39];
  x6[40] = x5[40];
  x6[41] = x5[41];
  butterfly_0130_neon(cospi, 24, x5[42], x5[53], &x6[53], &x6[42], v_cos_bit);
  butterfly_0130_neon(cospi, 24, x5[43], x5[52], &x6[52], &x6[43], v_cos_bit);
  butterfly_0332_neon(cospi, 24, x5[51], x5[44], &x6[51], &x6[44], v_cos_bit);
  butterfly_0332_neon(cospi, 24, x5[50], x5[45], &x6[50], &x6[45], v_cos_bit);
  x6[46] = x5[46];
  x6[47] = x5[47];
  x6[48] = x5[48];
  x6[49] = x5[49];
  x6[54] = x5[54];
  x6[55] = x5[55];
  x6[56] = x5[56];
  x6[57] = x5[57];
  x6[62] = x5[62];
  x6[63] = x5[63];

  // stage 7
  int32x4_t x7[64];
  x7[0] = x6[0];
  x7[1] = x6[1];
  x7[2] = x6[2];
  x7[3] = x6[3];
  butterfly_0112_neon(cospi, 8, x6[7], x6[4], &x7[4], &x7[7], v_cos_bit);
  butterfly_0130_neon(cospi, 24, x6[5], x6[6], &x7[5], &x7[6], v_cos_bit);
  butterfly_dct_post(x6 + 8, x6 + 8, x7 + 8, 4);
  butterfly_dct_post(x6 + 12, x6 + 12, x7 + 12, 4);
  x7[16] = x6[16];
  butterfly_0112_neon(cospi, 8, x6[30], x6[17], &x7[30], &x7[17], v_cos_bit);
  butterfly_2312_neon(cospi, 8, x6[29], x6[18], &x7[18], &x7[29], v_cos_bit);
  x7[19] = x6[19];
  x7[20] = x6[20];
  butterfly_0130_neon(cospi, 24, x6[21], x6[26], &x7[26], &x7[21], v_cos_bit);
  butterfly_0332_neon(cospi, 24, x6[25], x6[22], &x7[25], &x7[22], v_cos_bit);
  x7[23] = x6[23];
  x7[24] = x6[24];
  x7[27] = x6[27];
  x7[28] = x6[28];
  x7[31] = x6[31];
  butterfly_dct_post(x6 + 32, x6 + 32, x7 + 32, 8);
  butterfly_dct_post(x6 + 40, x6 + 40, x7 + 40, 8);
  butterfly_dct_post(x6 + 48, x6 + 48, x7 + 48, 8);
  butterfly_dct_post(x6 + 56, x6 + 56, x7 + 56, 8);

  // stage 8
  int32x4_t x8[64];
  x8[0] = x7[0];
  x8[1] = x7[1];
  x8[2] = x7[2];
  x8[3] = x7[3];
  x8[4] = x7[4];
  x8[5] = x7[5];
  x8[6] = x7[6];
  x8[7] = x7[7];

  butterfly_0112_neon(cospi, 4, x7[15], x7[8], &x8[8], &x8[15], v_cos_bit);
  butterfly_0130_neon(cospi, 28, x7[9], x7[14], &x8[9], &x8[14], v_cos_bit);
  butterfly_0112_neon(cospi, 20, x7[13], x7[10], &x8[10], &x8[13], v_cos_bit);
  butterfly_0130_neon(cospi, 12, x7[11], x7[12], &x8[11], &x8[12], v_cos_bit);
  butterfly_dct_post(x7 + 16, x7 + 16, x8 + 16, 4);
  butterfly_dct_post(x7 + 20, x7 + 20, x8 + 20, 4);
  butterfly_dct_post(x7 + 24, x7 + 24, x8 + 24, 4);
  butterfly_dct_post(x7 + 28, x7 + 28, x8 + 28, 4);
  x8[32] = x7[32];
  butterfly_0112_neon(cospi, 4, x7[62], x7[33], &x8[62], &x8[33], v_cos_bit);
  butterfly_2312_neon(cospi, 4, x7[61], x7[34], &x8[34], &x8[61], v_cos_bit);
  x8[35] = x7[35];
  x8[36] = x7[36];
  butterfly_0130_neon(cospi, 28, x7[37], x7[58], &x8[58], &x8[37], v_cos_bit);
  butterfly_0332_neon(cospi, 28, x7[57], x7[38], &x8[57], &x8[38], v_cos_bit);
  x8[39] = x7[39];
  x8[40] = x7[40];
  butterfly_0112_neon(cospi, 20, x7[54], x7[41], &x8[54], &x8[41], v_cos_bit);
  butterfly_2312_neon(cospi, 20, x7[53], x7[42], &x8[42], &x8[53], v_cos_bit);
  x8[43] = x7[43];
  x8[44] = x7[44];
  butterfly_0130_neon(cospi, 12, x7[45], x7[50], &x8[50], &x8[45], v_cos_bit);
  butterfly_0332_neon(cospi, 12, x7[49], x7[46], &x8[49], &x8[46], v_cos_bit);
  x8[47] = x7[47];
  x8[48] = x7[48];
  x8[51] = x7[51];
  x8[52] = x7[52];
  x8[55] = x7[55];
  x8[56] = x7[56];
  x8[59] = x7[59];
  x8[60] = x7[60];
  x8[63] = x7[63];

  // stage 9
  int32x4_t x9[64];
  x9[0] = x8[0];
  x9[1] = x8[1];
  x9[2] = x8[2];
  x9[3] = x8[3];
  x9[4] = x8[4];
  x9[5] = x8[5];
  x9[6] = x8[6];
  x9[7] = x8[7];
  x9[8] = x8[8];
  x9[9] = x8[9];
  x9[10] = x8[10];
  x9[11] = x8[11];
  x9[12] = x8[12];
  x9[13] = x8[13];
  x9[14] = x8[14];
  x9[15] = x8[15];
  butterfly_0112_neon(cospi, 2, x8[31], x8[16], &x9[16], &x9[31], v_cos_bit);
  butterfly_0130_neon(cospi, 30, x8[17], x8[30], &x9[17], &x9[30], v_cos_bit);
  butterfly_0112_neon(cospi, 18, x8[29], x8[18], &x9[18], &x9[29], v_cos_bit);
  butterfly_0130_neon(cospi, 14, x8[19], x8[28], &x9[19], &x9[28], v_cos_bit);
  butterfly_0112_neon(cospi, 10, x8[27], x8[20], &x9[20], &x9[27], v_cos_bit);
  butterfly_0130_neon(cospi, 22, x8[21], x8[26], &x9[21], &x9[26], v_cos_bit);
  butterfly_0112_neon(cospi, 26, x8[25], x8[22], &x9[22], &x9[25], v_cos_bit);
  butterfly_0130_neon(cospi, 6, x8[23], x8[24], &x9[23], &x9[24], v_cos_bit);
  butterfly_dct_post(x8 + 32, x8 + 32, x9 + 32, 4);
  butterfly_dct_post(x8 + 36, x8 + 36, x9 + 36, 4);
  butterfly_dct_post(x8 + 40, x8 + 40, x9 + 40, 4);
  butterfly_dct_post(x8 + 44, x8 + 44, x9 + 44, 4);
  butterfly_dct_post(x8 + 48, x8 + 48, x9 + 48, 4);
  butterfly_dct_post(x8 + 52, x8 + 52, x9 + 52, 4);
  butterfly_dct_post(x8 + 56, x8 + 56, x9 + 56, 4);
  butterfly_dct_post(x8 + 60, x8 + 60, x9 + 60, 4);

  // stage 10
  int32x4_t x10[64];
  x10[0] = x9[0];
  x10[1] = x9[1];
  x10[2] = x9[2];
  x10[3] = x9[3];
  x10[4] = x9[4];
  x10[5] = x9[5];
  x10[6] = x9[6];
  x10[7] = x9[7];
  x10[8] = x9[8];
  x10[9] = x9[9];
  x10[10] = x9[10];
  x10[11] = x9[11];
  x10[12] = x9[12];
  x10[13] = x9[13];
  x10[14] = x9[14];
  x10[15] = x9[15];
  x10[16] = x9[16];
  x10[17] = x9[17];
  x10[18] = x9[18];
  x10[19] = x9[19];
  x10[20] = x9[20];
  x10[21] = x9[21];
  x10[22] = x9[22];
  x10[23] = x9[23];
  x10[24] = x9[24];
  x10[25] = x9[25];
  x10[26] = x9[26];
  x10[27] = x9[27];
  x10[28] = x9[28];
  x10[29] = x9[29];
  x10[30] = x9[30];
  x10[31] = x9[31];
  butterfly_0112_neon(cospi, 1, x9[63], x9[32], &x10[32], &x10[63], v_cos_bit);
  butterfly_0130_neon(cospi, 31, x9[33], x9[62], &x10[33], &x10[62], v_cos_bit);
  butterfly_0112_neon(cospi, 17, x9[61], x9[34], &x10[34], &x10[61], v_cos_bit);
  butterfly_0130_neon(cospi, 15, x9[35], x9[60], &x10[35], &x10[60], v_cos_bit);
  butterfly_0112_neon(cospi, 9, x9[59], x9[36], &x10[36], &x10[59], v_cos_bit);
  butterfly_0130_neon(cospi, 23, x9[37], x9[58], &x10[37], &x10[58], v_cos_bit);
  butterfly_0112_neon(cospi, 25, x9[57], x9[38], &x10[38], &x10[57], v_cos_bit);
  butterfly_0130_neon(cospi, 7, x9[39], x9[56], &x10[39], &x10[56], v_cos_bit);
  butterfly_0112_neon(cospi, 5, x9[55], x9[40], &x10[40], &x10[55], v_cos_bit);
  butterfly_0130_neon(cospi, 27, x9[41], x9[54], &x10[41], &x10[54], v_cos_bit);
  butterfly_0112_neon(cospi, 21, x9[53], x9[42], &x10[42], &x10[53], v_cos_bit);
  butterfly_0130_neon(cospi, 11, x9[43], x9[52], &x10[43], &x10[52], v_cos_bit);
  butterfly_0112_neon(cospi, 13, x9[51], x9[44], &x10[44], &x10[51], v_cos_bit);
  butterfly_0130_neon(cospi, 19, x9[45], x9[50], &x10[45], &x10[50], v_cos_bit);
  butterfly_0112_neon(cospi, 29, x9[49], x9[46], &x10[46], &x10[49], v_cos_bit);
  butterfly_0130_neon(cospi, 3, x9[47], x9[48], &x10[47], &x10[48], v_cos_bit);

  // stage 11: write coefficients out in the final permuted order.
  output[0] = x10[0];
  output[1] = x10[32];
  output[2] = x10[16];
  output[3] = x10[48];
  output[4] = x10[8];
  output[5] = x10[40];
  output[6] = x10[24];
  output[7] = x10[56];
  output[8] = x10[4];
  output[9] = x10[36];
  output[10] = x10[20];
  output[11] = x10[52];
  output[12] = x10[12];
  output[13] = x10[44];
  output[14] = x10[28];
  output[15] = x10[60];
  output[16] = x10[2];
  output[17] = x10[34];
  output[18] = x10[18];
  output[19] = x10[50];
  output[20] = x10[10];
  output[21] = x10[42];
  output[22] = x10[26];
  output[23] = x10[58];
  output[24] = x10[6];
  output[25] = x10[38];
  output[26] = x10[22];
  output[27] = x10[54];
  output[28] = x10[14];
  output[29] = x10[46];
  output[30] = x10[30];
  output[31] = x10[62];
  output[32] = x10[1];
  output[33] = x10[33];
  output[34] = x10[17];
  output[35] = x10[49];
  output[36] = x10[9];
  output[37] = x10[41];
  output[38] = x10[25];
  output[39] = x10[57];
  output[40] = x10[5];
  output[41] = x10[37];
  output[42] = x10[21];
  output[43] = x10[53];
  output[44] = x10[13];
  output[45] = x10[45];
  output[46] = x10[29];
  output[47] = x10[61];
  output[48] = x10[3];
  output[49] = x10[35];
  output[50] = x10[19];
  output[51] = x10[51];
  output[52] = x10[11];
  output[53] = x10[43];
  output[54] = x10[27];
  output[55] = x10[59];
  output[56] = x10[7];
  output[57] = x10[39];
  output[58] = x10[23];
  output[59] = x10[55];
  output[60] = x10[15];
  output[61] = x10[47];
  output[62] = x10[31];
  output[63] = x10[63];
}
   2113 
   2114 static void highbd_fidentity32_x4_neon(const int32x4_t *input,
   2115                                       int32x4_t *output, int cos_bit) {
   2116  (void)cos_bit;
   2117  for (int i = 0; i < 32; i++) {
   2118    output[i] = vshlq_n_s32(input[i], 2);
   2119  }
   2120 }
   2121 
// Instantiate the multi-group column-transform wrappers for the 32-point
// DCT and identity transforms.
TRANSFORM_COL_MANY(fdct32, 32)
TRANSFORM_COL_MANY(fidentity32, 32)
   2124 
   2125 static const fwd_transform_1d_col_many_neon
   2126    col_highbd_txfm32_x4_arr[TX_TYPES] = {
   2127      highbd_fdct32_col_many_neon,       // DCT_DCT
   2128      NULL,                              // ADST_DCT
   2129      NULL,                              // DCT_ADST
   2130      NULL,                              // ADST_ADST
   2131      NULL,                              // FLIPADST_DCT
   2132      NULL,                              // DCT_FLIPADST
   2133      NULL,                              // FLIPADST_FLIPADST
   2134      NULL,                              // ADST_FLIPADST
   2135      NULL,                              // FLIPADST_ADST
   2136      highbd_fidentity32_col_many_neon,  // IDTX
   2137      NULL,                              // V_DCT
   2138      NULL,                              // H_DCT
   2139      NULL,                              // V_ADST
   2140      NULL,                              // H_ADST
   2141      NULL,                              // V_FLIPADST
   2142      NULL                               // H_FLIPADST
   2143    };
   2144 
// Instantiate the multi-group row-transform wrappers for the 32-point
// DCT and identity transforms.
TRANSFORM_ROW_MANY(fdct32, 32)
TRANSFORM_ROW_MANY(fidentity32, 32)
   2147 
   2148 static const fwd_transform_1d_row_many_neon
   2149    row_highbd_txfm32_x4_arr[TX_TYPES] = {
   2150      highbd_fdct32_row_many_neon,       // DCT_DCT
   2151      NULL,                              // ADST_DCT
   2152      NULL,                              // DCT_ADST
   2153      NULL,                              // ADST_ADST
   2154      NULL,                              // FLIPADST_DCT
   2155      NULL,                              // DCT_FLIPADST
   2156      NULL,                              // FLIPADST_FLIPADST
   2157      NULL,                              // ADST_FLIPADST
   2158      NULL,                              // FLIPADST_ADST
   2159      highbd_fidentity32_row_many_neon,  // IDTX
   2160      NULL,                              // V_DCT
   2161      NULL,                              // H_DCT
   2162      NULL,                              // V_ADST
   2163      NULL,                              // H_ADST
   2164      NULL,                              // V_FLIPADST
   2165      NULL                               // H_FLIPADST
   2166    };
   2167 
// Instantiate the multi-group row-transform wrappers (rectangular-block
// variant) for the 32-point DCT and identity transforms.
TRANSFORM_ROW_RECT_MANY(fdct32, 32)
TRANSFORM_ROW_RECT_MANY(fidentity32, 32)
   2170 
   2171 static const fwd_transform_1d_row_many_neon
   2172    row_rect_highbd_txfm32_x4_arr[TX_TYPES] = {
   2173      highbd_fdct32_row_rect_many_neon,       // DCT_DCT
   2174      NULL,                                   // ADST_DCT
   2175      NULL,                                   // DCT_ADST
   2176      NULL,                                   // ADST_ADST
   2177      NULL,                                   // FLIPADST_DCT
   2178      NULL,                                   // DCT_FLIPADST
   2179      NULL,                                   // FLIPADST_FLIPADST
   2180      NULL,                                   // ADST_FLIPADST
   2181      NULL,                                   // FLIPADST_ADST
   2182      highbd_fidentity32_row_rect_many_neon,  // IDTX
   2183      NULL,                                   // V_DCT
   2184      NULL,                                   // H_DCT
   2185      NULL,                                   // V_ADST
   2186      NULL,                                   // H_ADST
   2187      NULL,                                   // V_FLIPADST
   2188      NULL                                    // H_FLIPADST
   2189    };
   2190 
// 2-D forward transform for 16x8 blocks: 8-point column pass followed by a
// 16-point row pass (row-rect table). Output coefficients go to `coeff`.
void av1_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  (void)bd;
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm8_xn_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_rect_highbd_txfm16_xn_arr[tx_type];
  int bit = av1_fwd_cos_bit_col[2][1];

  // Up-down flip is applied by adjusting the input pointer and stride.
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);

  // Column-wise transform: 4 groups of 4 columns, 8 vectors per group.
  // For lr_flip the groups are written in reverse order: start at the last
  // group (offset 3 * 8) and step backwards via a negative group stride.
  int32x4_t buf0[32];
  if (lr_flip) {
    col_txfm(input, buf0 + 3 * 8, stride, bit, /*lr_flip=*/1, /*howmany=*/4,
             /*hm_stride=*/-8);
  } else {
    col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/4,
             /*hm_stride=*/8);
  }
  shift_right_2_round_s32_x4(buf0, buf0, 32);

  int32x4_t buf1[32];
  transpose_arrays_s32_16x8(buf0, buf1);

  // Row-wise transform (same cos_bit as the column pass).
  row_txfm(buf1, coeff, bit, /*howmany=*/2, /*hm_stride=*/16, /*stride=*/8);
}
   2221 
// 2-D forward transform for 8x16 blocks: 16-point column pass followed by an
// 8-point row pass (row-rect table). Output coefficients go to `coeff`.
void av1_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  (void)bd;
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm16_xn_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_rect_highbd_txfm8_xn_arr[tx_type];
  int bit = av1_fwd_cos_bit_col[1][2];

  // Up-down flip is applied by adjusting the input pointer and stride.
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);

  // Column-wise transform: 2 groups of 4 columns, 16 vectors per group.
  // For lr_flip the groups are written in reverse order: start at the second
  // group (offset 16) and step backwards via a negative group stride.
  int32x4_t buf0[32];
  if (lr_flip) {
    col_txfm(input, buf0 + 16, stride, bit, /*lr_flip=*/1, /*howmany=*/2,
             /*hm_stride=*/-16);
  } else {
    col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/2,
             /*hm_stride=*/16);
  }
  shift_right_2_round_s32_x4(buf0, buf0, 32);

  int32x4_t buf1[32];
  transpose_arrays_s32_8x16(buf0, buf1);

  // Row-wise transform (same cos_bit as the column pass).
  row_txfm(buf1, coeff, bit, /*howmany=*/4, /*hm_stride=*/8, /*stride=*/16);
}
   2252 
   2253 #if !CONFIG_REALTIME_ONLY
   2254 void av1_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *coeff, int stride,
   2255                              TX_TYPE tx_type, int bd) {
   2256  (void)bd;
   2257  int bitcol = av1_fwd_cos_bit_col[0][2];
   2258  int bitrow = av1_fwd_cos_bit_row[0][2];
   2259  const fwd_transform_1d_col_many_neon col_txfm =
   2260      col_highbd_txfm16_xn_arr[tx_type];
   2261  const fwd_transform_1d_row_many_neon row_txfm =
   2262      row_highbd_txfm4_xn_arr[tx_type];
   2263 
   2264  int ud_flip, lr_flip;
   2265  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2266  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
   2267 
   2268  // Column-wise transform.
   2269  int32x4_t buf0[16];
   2270  if (lr_flip) {
   2271    col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/1, /*howmany=*/1,
   2272             /*hm_stride=*/0);
   2273  } else {
   2274    col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/1,
   2275             /*hm_stride=*/0);
   2276  }
   2277  shift_right_1_round_s32_x4(buf0, buf0, 16);
   2278 
   2279  int32x4_t buf1[16];
   2280  transpose_arrays_s32_4x16(buf0, buf1);
   2281 
   2282  // Row-wise transform.
   2283  row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/4, /*stride=*/16);
   2284 }
   2285 #endif
   2286 
// 2-D forward transform for 16x4 blocks: 4-point column pass followed by a
// 16-point row pass. Output coefficients go to `coeff`.
void av1_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  (void)bd;
  int bitcol = av1_fwd_cos_bit_col[2][0];
  int bitrow = av1_fwd_cos_bit_row[2][0];
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm4_xn_arr[tx_type];
  const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm16_xn_arr[tx_type];

  // Up-down flip is applied by adjusting the input pointer and stride.
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);

  // Column-wise transform: 4 groups of 4 columns, 4 vectors per group.
  // For lr_flip the groups are written in reverse order: start at the last
  // group (offset 3 * 4) and step backwards via a negative group stride.
  int32x4_t buf0[16];
  if (lr_flip) {
    col_txfm(input, buf0 + 3 * 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/4,
             /*hm_stride=*/-4);
  } else {
    col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4,
             /*hm_stride=*/4);
  }

  shift_right_1_round_s32_x4(buf0, buf0, 16);
  transpose_arrays_s32_4x16(buf0, buf0);

  // Row-wise transform.
  row_txfm(buf0, coeff, bitrow, /*stride=*/4);
}
   2316 
   2317 void av1_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *coeff, int stride,
   2318                               TX_TYPE tx_type, int bd) {
   2319  (void)bd;
   2320  const fwd_transform_1d_col_many_neon col_txfm =
   2321      col_highbd_txfm32_x4_arr[tx_type];
   2322  const fwd_transform_1d_row_many_neon row_txfm =
   2323      row_rect_highbd_txfm16_xn_arr[tx_type];
   2324  int bitcol = av1_fwd_cos_bit_col[2][3];
   2325  int bitrow = av1_fwd_cos_bit_row[2][3];
   2326 
   2327  // Column-wise transform.
   2328  int32x4_t buf0[128];
   2329  col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4,
   2330           /*hm_stride=*/32);
   2331  shift_right_4_round_s32_x4(buf0, buf0, 128);
   2332 
   2333  int32x4_t buf1[128];
   2334  transpose_arrays_s32_16x32(buf0, buf1);
   2335 
   2336  // Row-wise transform.
   2337  row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/16, /*stride=*/32);
   2338 }
   2339 
   2340 void av1_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *coeff, int stride,
   2341                               TX_TYPE tx_type, int bd) {
   2342  (void)bd;
   2343  (void)tx_type;
   2344  int bitcol = av1_fwd_cos_bit_col[3][4];
   2345  int bitrow = av1_fwd_cos_bit_row[3][4];
   2346 
   2347  // Column-wise transform.
   2348  int32x4_t buf0[512];
   2349  load_buffer_32x64(input, buf0, stride, 0);
   2350  for (int i = 0; i < 8; i++) {
   2351    highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol);
   2352  }
   2353  shift_right_2_round_s32_x4(buf0, buf0, 512);
   2354 
   2355  int32x4_t buf1[512];
   2356  transpose_arrays_s32_32x64(buf0, buf1);
   2357 
   2358  // Row-wise transform.
   2359  for (int i = 0; i < 16; i++) {
   2360    highbd_fdct32_x4_neon(buf1 + i * 32, buf1 + i * 32, bitrow);
   2361  }
   2362  round_shift2_rect_array_s32_neon(buf1, buf1, 512);
   2363  store_buffer_32x32(buf1, coeff, /*stride=*/32);
   2364 }
   2365 
   2366 void av1_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *coeff, int stride,
   2367                               TX_TYPE tx_type, int bd) {
   2368  (void)bd;
   2369  (void)tx_type;
   2370  int bitcol = av1_fwd_cos_bit_col[4][3];
   2371  int bitrow = av1_fwd_cos_bit_row[4][3];
   2372 
   2373  // Column-wise transform.
   2374  int32x4_t buf0[512];
   2375  load_buffer_64x32(input, buf0, stride, 0);
   2376  for (int i = 0; i < 16; i++) {
   2377    highbd_fdct32_x4_neon(buf0 + i * 32, buf0 + i * 32, bitcol);
   2378  }
   2379  shift_right_4_round_s32_x4(buf0, buf0, 512);
   2380 
   2381  int32x4_t buf1[512];
   2382  transpose_arrays_s32_64x32(buf0, buf1);
   2383 
   2384  // Row-wise transform.
   2385  for (int i = 0; i < 8; i++) {
   2386    highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow);
   2387  }
   2388  round_shift2_rect_array_s32_neon(buf1, buf1, 512);
   2389  store_buffer_64x32(buf1, coeff, /*stride=*/32);
   2390 }
   2391 
   2392 void av1_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *coeff, int stride,
   2393                               TX_TYPE tx_type, int bd) {
   2394  (void)bd;
   2395  const fwd_transform_1d_col_many_neon col_txfm =
   2396      col_highbd_txfm16_xn_arr[tx_type];
   2397  const fwd_transform_1d_row_many_neon row_txfm =
   2398      row_rect_highbd_txfm32_x4_arr[tx_type];
   2399  int bitcol = av1_fwd_cos_bit_col[3][2];
   2400  int bitrow = av1_fwd_cos_bit_row[3][2];
   2401 
   2402  // Column-wise transform.
   2403  int32x4_t buf0[128];
   2404  col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8,
   2405           /*hm_stride=*/16);
   2406  shift_right_4_round_s32_x4(buf0, buf0, 128);
   2407 
   2408  int32x4_t buf1[128];
   2409  transpose_arrays_s32_32x16(buf0, buf1);
   2410 
   2411  // Row-wise transform.
   2412  row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/32, /*stride=*/16);
   2413 }
   2414 
   2415 #if !CONFIG_REALTIME_ONLY
   2416 void av1_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *coeff, int stride,
   2417                              TX_TYPE tx_type, int bd) {
   2418  (void)bd;
   2419  const fwd_transform_1d_col_many_neon col_txfm =
   2420      col_highbd_txfm32_x4_arr[tx_type];
   2421  const fwd_transform_1d_row_many_neon row_txfm =
   2422      row_highbd_txfm8_xn_arr[tx_type];
   2423  int bitcol = av1_fwd_cos_bit_col[1][3];
   2424  int bitrow = av1_fwd_cos_bit_row[1][3];
   2425 
   2426  // Column-wise transform.
   2427  int32x4_t buf0[64];
   2428  col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2,
   2429           /*hm_stride=*/32);
   2430  shift_right_2_round_s32_x4(buf0, buf0, 64);
   2431 
   2432  int32x4_t buf1[64];
   2433  transpose_arrays_s32_8x32(buf0, buf1);
   2434 
   2435  // Row-wise transform.
   2436  row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/8, /*stride=*/32);
   2437 }
   2438 
   2439 void av1_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *coeff, int stride,
   2440                              TX_TYPE tx_type, int bd) {
   2441  (void)bd;
   2442  const fwd_transform_1d_col_many_neon col_txfm =
   2443      col_highbd_txfm8_xn_arr[tx_type];
   2444  const fwd_transform_1d_row_many_neon row_txfm =
   2445      row_highbd_txfm32_x4_arr[tx_type];
   2446  int bitcol = av1_fwd_cos_bit_col[3][1];
   2447  int bitrow = av1_fwd_cos_bit_row[3][1];
   2448 
   2449  // Column-wise transform.
   2450  int32x4_t buf0[64];
   2451  col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8,
   2452           /*hm_stride=*/8);
   2453  shift_right_2_round_s32_x4(buf0, buf0, 64);
   2454 
   2455  int32x4_t buf1[64];
   2456  transpose_arrays_s32_32x8(buf0, buf1);
   2457 
   2458  // Row-wise transform.
   2459  row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/32, /*stride=*/8);
   2460 }
   2461 #endif
   2462 
   2463 void av1_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *coeff, int stride,
   2464                             TX_TYPE tx_type, int bd) {
   2465  (void)bd;
   2466  int bitcol = av1_fwd_cos_bit_col[0][1];
   2467  int bitrow = av1_fwd_cos_bit_row[0][1];
   2468  const fwd_transform_1d_col_neon col_txfm = col_highbd_txfm8_x4_arr[tx_type];
   2469  const fwd_transform_1d_row_many_neon row_txfm =
   2470      row_rect_highbd_txfm4_xn_arr[tx_type];
   2471 
   2472  int ud_flip, lr_flip;
   2473  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2474  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
   2475 
   2476  // Column-wise transform.
   2477  int32x4_t buf0[8];
   2478  col_txfm(input, buf0, stride, bitcol, lr_flip);
   2479  shift_right_1_round_s32_x4(buf0, buf0, 8);
   2480 
   2481  int32x4_t buf1[8];
   2482  transpose_arrays_s32_4x8(buf0, buf1);
   2483 
   2484  // Row-wise transform.
   2485  row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/4, /*stride=*/8);
   2486 }
   2487 
// 2-D forward transform for 8x4 blocks: 4-point column pass followed by an
// 8-point row pass. Output coefficients go to `coeff`.
void av1_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *coeff, int stride,
                             TX_TYPE tx_type, int bd) {
  (void)bd;
  const int bitcol = av1_fwd_cos_bit_col[1][0];
  const int bitrow = av1_fwd_cos_bit_row[1][0];
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm4_xn_arr[tx_type];
  const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm8_x4_arr[tx_type];

  // Up-down flip is applied by adjusting the input pointer and stride.
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);

  // Column-wise transform: 2 groups of 4 columns, 4 vectors per group.
  // For lr_flip the groups are written in reverse order: start at the second
  // group (offset 4) and step backwards via a negative group stride.
  int32x4_t buf0[8];
  if (lr_flip) {
    col_txfm(input, buf0 + 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/2,
             /*hm_stride=*/-4);
  } else {
    col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2,
             /*hm_stride=*/4);
  }

  shift_right_1_round_s32_x4(buf0, buf0, 8);

  int32x4_t buf1[8];
  transpose_arrays_s32_8x4(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bitrow, /*stride=*/4);
}
   2519 
   2520 #if !CONFIG_REALTIME_ONLY
   2521 void av1_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *coeff, int stride,
   2522                               TX_TYPE tx_type, int bd) {
   2523  (void)bd;
   2524  const int bitcol = av1_fwd_cos_bit_col[2][4];
   2525  const int bitrow = av1_fwd_cos_bit_row[2][4];
   2526 
   2527  int ud_flip, lr_flip;
   2528  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2529  ud_adjust_input_and_stride(ud_flip, &input, &stride, 64);
   2530 
   2531  // Column-wise transform.
   2532  int32x4_t buf0[256];
   2533  load_buffer_16x64(input, buf0, stride, lr_flip);
   2534  for (int i = 0; i < 4; i++) {
   2535    highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol);
   2536  }
   2537  shift_right_2_round_s32_x4(buf0, buf0, 256);
   2538 
   2539  int32x4_t buf1[256];
   2540  transpose_arrays_s32_16x64(buf0, buf1);
   2541 
   2542  // Row-wise transform.
   2543  highbd_fdct16_xn_neon(buf1, buf1, bitrow, 8);
   2544  store_buffer_16x32(buf1, coeff, /*stride=*/32);
   2545 }
   2546 
// 2-D forward transform for 64x16 blocks: 16-point DCT columns followed by
// 64-point DCT rows. The second half of the coefficient buffer is zeroed.
void av1_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  (void)bd;
  const int bitcol = av1_fwd_cos_bit_col[4][2];
  const int bitrow = av1_fwd_cos_bit_row[4][2];

  // Up-down flip is applied via the input pointer/stride; left-right flip is
  // handled inside the load.
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);

  // Column-wise transform: 16 groups through the 16-point DCT.
  int32x4_t buf0[256];
  load_buffer_64x16(input, buf0, stride, lr_flip);
  highbd_fdct16_xn_neon(buf0, buf0, bitcol, 16);
  shift_right_4_round_s32_x4(buf0, buf0, 256);

  int32x4_t buf1[256];
  transpose_arrays_s32_64x16(buf0, buf1);

  // Row-wise transform: 4 groups of 4 rows through the 64-point DCT.
  for (int i = 0; i < 4; i++) {
    highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow);
  }
  store_buffer_64x16(buf1, coeff, /*stride=*/16);
  // Zero the remaining 16*32 coefficients beyond what was stored above.
  memset(coeff + 16 * 32, 0, 16 * 32 * sizeof(*coeff));
}
   2573 #endif
   2574 
   2575 void av1_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
   2576                               int stride, TX_TYPE tx_type, int bd) {
   2577  (void)bd;
   2578  const fwd_transform_1d_col_many_neon col_txfm =
   2579      col_highbd_txfm32_x4_arr[tx_type];
   2580  const fwd_transform_1d_row_many_neon row_txfm =
   2581      row_highbd_txfm32_x4_arr[tx_type];
   2582 
   2583  // Column-wise transform.
   2584  int32x4_t buf0[256];
   2585  col_txfm(input, buf0, stride, /*cos_bit=*/12, /*lr_flip=*/0, /*howmany=*/8,
   2586           /*hm_stride=*/32);
   2587  shift_right_4_round_s32_x4(buf0, buf0, 256);
   2588 
   2589  int32x4_t buf1[256];
   2590  transpose_arrays_s32_32x32(buf0, buf1);
   2591 
   2592  // Row-wise transform.
   2593  row_txfm(buf1, output, /*cos_bit=*/12, /*howmany=*/8, /*hm_stride=*/32,
   2594           /*stride=*/32);
   2595 }
   2596 
   2597 void av1_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
   2598                               int stride, TX_TYPE tx_type, int bd) {
   2599  (void)bd;
   2600  (void)tx_type;
   2601 
   2602  // Column-wise transform.
   2603  int32x4_t buf0[1024];
   2604  load_buffer_64x64(input, buf0, stride, 0);
   2605  for (int col = 0; col < 16; col++) {
   2606    highbd_fdct64_x4_neon(buf0 + col * 64, buf0 + col * 64, 13);
   2607  }
   2608  shift_right_2_round_s32_x4(buf0, buf0, 1024);
   2609 
   2610  int32x4_t buf1[1024];
   2611  transpose_arrays_s32_64x64(buf0, buf1);
   2612 
   2613  // Row-wise transform.
   2614  for (int col = 0; col < 8; col++) {
   2615    highbd_fdct64_x4_neon(buf1 + col * 64, buf1 + col * 64, 10);
   2616  }
   2617  shift_right_2_round_s32_x4(buf1, buf1, 512);
   2618  store_buffer_64x32(buf1, output, /*stride=*/32);
   2619 }