tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_fwd_txfm_avx2.c (127814B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 #include <assert.h>
     12 #include <immintrin.h> /*AVX2*/
     13 
     14 #include "config/aom_config.h"
     15 #include "config/av1_rtcd.h"
     16 #include "av1/common/av1_txfm.h"
     17 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
     18 #include "aom_dsp/txfm_common.h"
     19 #include "aom_ports/mem.h"
     20 #include "aom_dsp/x86/txfm_common_sse2.h"
     21 #include "aom_dsp/x86/txfm_common_avx2.h"
     22 
     23 static inline void load_buffer_8x8_avx2(const int16_t *input, __m256i *out,
     24                                        int stride, int flipud, int fliplr,
     25                                        int shift) {
     26  __m128i out1[8];
     27  if (!flipud) {
     28    out1[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
     29    out1[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
     30    out1[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
     31    out1[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
     32    out1[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
     33    out1[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
     34    out1[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
     35    out1[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
     36 
     37  } else {
     38    out1[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
     39    out1[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
     40    out1[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
     41    out1[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
     42    out1[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
     43    out1[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
     44    out1[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
     45    out1[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
     46  }
     47  if (!fliplr) {
     48    out[0] = _mm256_cvtepi16_epi32(out1[0]);
     49    out[1] = _mm256_cvtepi16_epi32(out1[1]);
     50    out[2] = _mm256_cvtepi16_epi32(out1[2]);
     51    out[3] = _mm256_cvtepi16_epi32(out1[3]);
     52    out[4] = _mm256_cvtepi16_epi32(out1[4]);
     53    out[5] = _mm256_cvtepi16_epi32(out1[5]);
     54    out[6] = _mm256_cvtepi16_epi32(out1[6]);
     55    out[7] = _mm256_cvtepi16_epi32(out1[7]);
     56 
     57  } else {
     58    out[0] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[0]));
     59    out[1] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[1]));
     60    out[2] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[2]));
     61    out[3] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[3]));
     62    out[4] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[4]));
     63    out[5] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[5]));
     64    out[6] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[6]));
     65    out[7] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[7]));
     66  }
     67  out[0] = _mm256_slli_epi32(out[0], shift);
     68  out[1] = _mm256_slli_epi32(out[1], shift);
     69  out[2] = _mm256_slli_epi32(out[2], shift);
     70  out[3] = _mm256_slli_epi32(out[3], shift);
     71  out[4] = _mm256_slli_epi32(out[4], shift);
     72  out[5] = _mm256_slli_epi32(out[5], shift);
     73  out[6] = _mm256_slli_epi32(out[6], shift);
     74  out[7] = _mm256_slli_epi32(out[7], shift);
     75 }
     76 static inline void col_txfm_8x8_rounding(__m256i *in, int shift) {
     77  const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
     78 
     79  in[0] = _mm256_add_epi32(in[0], rounding);
     80  in[1] = _mm256_add_epi32(in[1], rounding);
     81  in[2] = _mm256_add_epi32(in[2], rounding);
     82  in[3] = _mm256_add_epi32(in[3], rounding);
     83  in[4] = _mm256_add_epi32(in[4], rounding);
     84  in[5] = _mm256_add_epi32(in[5], rounding);
     85  in[6] = _mm256_add_epi32(in[6], rounding);
     86  in[7] = _mm256_add_epi32(in[7], rounding);
     87 
     88  in[0] = _mm256_srai_epi32(in[0], shift);
     89  in[1] = _mm256_srai_epi32(in[1], shift);
     90  in[2] = _mm256_srai_epi32(in[2], shift);
     91  in[3] = _mm256_srai_epi32(in[3], shift);
     92  in[4] = _mm256_srai_epi32(in[4], shift);
     93  in[5] = _mm256_srai_epi32(in[5], shift);
     94  in[6] = _mm256_srai_epi32(in[6], shift);
     95  in[7] = _mm256_srai_epi32(in[7], shift);
     96 }
     97 static inline void load_buffer_8x16_avx2(const int16_t *input, __m256i *out,
     98                                         int stride, int flipud, int fliplr,
     99                                         int shift) {
    100  const int16_t *topL = input;
    101  const int16_t *botL = input + 8 * stride;
    102 
    103  const int16_t *tmp;
    104 
    105  if (flipud) {
    106    tmp = topL;
    107    topL = botL;
    108    botL = tmp;
    109  }
    110  load_buffer_8x8_avx2(topL, out, stride, flipud, fliplr, shift);
    111  load_buffer_8x8_avx2(botL, out + 8, stride, flipud, fliplr, shift);
    112 }
    113 static inline void load_buffer_16xn_avx2(const int16_t *input, __m256i *out,
    114                                         int stride, int height, int outstride,
    115                                         int flipud, int fliplr) {
    116  __m256i out1[64];
    117  if (!flipud) {
    118    for (int i = 0; i < height; i++) {
    119      out1[i] = _mm256_loadu_si256((const __m256i *)(input + i * stride));
    120    }
    121  } else {
    122    for (int i = 0; i < height; i++) {
    123      out1[(height - 1) - i] =
    124          _mm256_loadu_si256((const __m256i *)(input + i * stride));
    125    }
    126  }
    127  if (!fliplr) {
    128    for (int i = 0; i < height; i++) {
    129      out[i * outstride] =
    130          _mm256_cvtepi16_epi32(_mm256_castsi256_si128(out1[i]));
    131      out[i * outstride + 1] =
    132          _mm256_cvtepi16_epi32(_mm256_extractf128_si256(out1[i], 1));
    133    }
    134  } else {
    135    for (int i = 0; i < height; i++) {
    136      out[i * outstride + 1] = _mm256_cvtepi16_epi32(
    137          mm_reverse_epi16(_mm256_castsi256_si128(out1[i])));
    138      out[i * outstride + 0] = _mm256_cvtepi16_epi32(
    139          mm_reverse_epi16(_mm256_extractf128_si256(out1[i], 1)));
    140    }
    141  }
    142 }
    143 
// Transpose an 8x8 tile of 32-bit lanes held in eight __m256i, reading
// rows at in[i * instride] and writing columns to out[j * outstride].
// Standard three-stage network: 32-bit unpack pairs adjacent rows,
// 64-bit unpack pairs adjacent row-pairs, and permute2f128 stitches the
// 128-bit halves (selector 0x20 = both low halves, 0x31 = both high).
static void fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out,
                                        const int instride,
                                        const int outstride) {
  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  __m256i x0, x1;

  // Stage 1: interleave 32-bit lanes of row pairs (0,1), (2,3), (4,5), (6,7).
  u0 = _mm256_unpacklo_epi32(in[0 * instride], in[1 * instride]);
  u1 = _mm256_unpackhi_epi32(in[0 * instride], in[1 * instride]);

  u2 = _mm256_unpacklo_epi32(in[2 * instride], in[3 * instride]);
  u3 = _mm256_unpackhi_epi32(in[2 * instride], in[3 * instride]);

  u4 = _mm256_unpacklo_epi32(in[4 * instride], in[5 * instride]);
  u5 = _mm256_unpackhi_epi32(in[4 * instride], in[5 * instride]);

  u6 = _mm256_unpacklo_epi32(in[6 * instride], in[7 * instride]);
  u7 = _mm256_unpackhi_epi32(in[6 * instride], in[7 * instride]);

  // Stage 2 + 3: interleave 64-bit lanes, then select 128-bit halves.
  // Output rows j and j+4 come from the same x0/x1 pair.
  x0 = _mm256_unpacklo_epi64(u0, u2);
  x1 = _mm256_unpacklo_epi64(u4, u6);
  out[0 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
  out[4 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);

  x0 = _mm256_unpackhi_epi64(u0, u2);
  x1 = _mm256_unpackhi_epi64(u4, u6);
  out[1 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
  out[5 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);

  x0 = _mm256_unpacklo_epi64(u1, u3);
  x1 = _mm256_unpacklo_epi64(u5, u7);
  out[2 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
  out[6 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);

  x0 = _mm256_unpackhi_epi64(u1, u3);
  x1 = _mm256_unpackhi_epi64(u5, u7);
  out[3 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
  out[7 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
}
    182 static inline void round_shift_32_8xn_avx2(__m256i *in, int size, int bit,
    183                                           int stride) {
    184  if (bit < 0) {
    185    bit = -bit;
    186    __m256i round = _mm256_set1_epi32(1 << (bit - 1));
    187    for (int i = 0; i < size; ++i) {
    188      in[stride * i] = _mm256_add_epi32(in[stride * i], round);
    189      in[stride * i] = _mm256_srai_epi32(in[stride * i], bit);
    190    }
    191  } else if (bit > 0) {
    192    for (int i = 0; i < size; ++i) {
    193      in[stride * i] = _mm256_slli_epi32(in[stride * i], bit);
    194    }
    195  }
    196 }
    197 static inline void store_buffer_avx2(const __m256i *const in, int32_t *out,
    198                                     const int stride, const int out_size) {
    199  for (int i = 0; i < out_size; ++i) {
    200    _mm256_store_si256((__m256i *)(out), in[i]);
    201    out += stride;
    202  }
    203 }
    204 static inline void fwd_txfm_transpose_16x16_avx2(const __m256i *in,
    205                                                 __m256i *out) {
    206  fwd_txfm_transpose_8x8_avx2(&in[0], &out[0], 2, 2);
    207  fwd_txfm_transpose_8x8_avx2(&in[1], &out[16], 2, 2);
    208  fwd_txfm_transpose_8x8_avx2(&in[16], &out[1], 2, 2);
    209  fwd_txfm_transpose_8x8_avx2(&in[17], &out[17], 2, 2);
    210 }
    211 
    212 static inline __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0,
    213                                        const __m256i *w1, const __m256i *n1,
    214                                        const __m256i *rounding, int bit) {
    215  __m256i x, y;
    216 
    217  x = _mm256_mullo_epi32(*w0, *n0);
    218  y = _mm256_mullo_epi32(*w1, *n1);
    219  x = _mm256_add_epi32(x, y);
    220  x = _mm256_add_epi32(x, *rounding);
    221  x = _mm256_srai_epi32(x, bit);
    222  return x;
    223 }
// Full-vector type-0 butterfly:
//   out0 = round_shift(w0*in0 + w1*in1, bit)
//   out1 = round_shift(w1*in0 - w0*in1, bit)
// NOTE: in0/in1 are each expanded more than once — pass plain variables,
// not expressions with side effects.
#define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) \
  do {                                                       \
    const __m256i ww0 = _mm256_set1_epi32(w0);               \
    const __m256i ww1 = _mm256_set1_epi32(w1);               \
    const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0);     \
    const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1);     \
    out0 = _mm256_add_epi32(in0_w0, in1_w1);                 \
    round_shift_32_8xn_avx2(&out0, 1, -bit, 1);              \
    const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1);     \
    const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0);     \
    out1 = _mm256_sub_epi32(in0_w1, in1_w0);                 \
    round_shift_32_8xn_avx2(&out1, 1, -bit, 1);              \
  } while (0)
    237 
// Same type-0 butterfly as btf_32_avx2_type0, but the cosine weights are
// pre-broadcast vectors (ww0/ww1) and the rounding bias vector `r` is
// supplied by the caller; the round-shift is inlined rather than calling
// round_shift_32_8xn_avx2.
// NOTE: arguments are each expanded more than once — pass plain
// variables, not expressions with side effects.
#define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
  do {                                                                \
    const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0);              \
    const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1);              \
    out0 = _mm256_add_epi32(in0_w0, in1_w1);                          \
    out0 = _mm256_add_epi32(out0, r);                                 \
    out0 = _mm256_srai_epi32(out0, bit);                              \
    const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1);              \
    const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0);              \
    out1 = _mm256_sub_epi32(in0_w1, in1_w0);                          \
    out1 = _mm256_add_epi32(out1, r);                                 \
    out1 = _mm256_srai_epi32(out1, bit);                              \
  } while (0)
    251 
// Common signature for the 1-D transform kernels in this file
// (e.g. fdct8_avx2, fadst8_avx2, idtx8_avx2).
typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out,
                                  const int8_t cos_bit, int instride,
                                  int outstride);
// 8-point forward DCT over `col_num` columns of __m256i data (8 lanes of
// independent 32-bit transforms per vector). Reads in[row * col_num + col]
// and writes out[row * outstride + col] in the DCT's bit-reversed output
// order (see the buf0[] comments). `bit` selects the cospi table and the
// per-stage round-shift amount.
static void fdct8_avx2(__m256i *in, __m256i *out, const int8_t bit,
                       const int col_num, const int outstride) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  // Rounding bias for the arithmetic right shift by `bit`.
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  __m256i u[8], v[8];
  for (int col = 0; col < col_num; ++col) {
    // stage 1: butterfly of mirrored input pairs (0,7), (1,6), (2,5), (3,4).
    u[0] = _mm256_add_epi32(in[0 * col_num + col], in[7 * col_num + col]);
    v[7] = _mm256_sub_epi32(in[0 * col_num + col], in[7 * col_num + col]);
    u[1] = _mm256_add_epi32(in[1 * col_num + col], in[6 * col_num + col]);
    u[6] = _mm256_sub_epi32(in[1 * col_num + col], in[6 * col_num + col]);
    u[2] = _mm256_add_epi32(in[2 * col_num + col], in[5 * col_num + col]);
    u[5] = _mm256_sub_epi32(in[2 * col_num + col], in[5 * col_num + col]);
    u[3] = _mm256_add_epi32(in[3 * col_num + col], in[4 * col_num + col]);
    v[4] = _mm256_sub_epi32(in[3 * col_num + col], in[4 * col_num + col]);
    // stage 2: butterfly the even half (sums) again.
    v[0] = _mm256_add_epi32(u[0], u[3]);
    v[3] = _mm256_sub_epi32(u[0], u[3]);
    v[1] = _mm256_add_epi32(u[1], u[2]);
    v[2] = _mm256_sub_epi32(u[1], u[2]);

    // stage 2 (odd half): v5 = (-u5 + u6) * cospi32, rounded and shifted.
    v[5] = _mm256_mullo_epi32(u[5], cospim32);
    v[6] = _mm256_mullo_epi32(u[6], cospi32);
    v[5] = _mm256_add_epi32(v[5], v[6]);
    v[5] = _mm256_add_epi32(v[5], rnding);
    v[5] = _mm256_srai_epi32(v[5], bit);

    // v6 = (u5 + u6) * cospi32 (u[0] is reused as a scratch register).
    u[0] = _mm256_mullo_epi32(u[5], cospi32);
    v[6] = _mm256_mullo_epi32(u[6], cospim32);
    v[6] = _mm256_sub_epi32(u[0], v[6]);
    v[6] = _mm256_add_epi32(v[6], rnding);
    v[6] = _mm256_srai_epi32(v[6], bit);

    // stage 3
    // type 0: DC and Nyquist outputs, (v0 +/- v1) * cospi32.
    v[0] = _mm256_mullo_epi32(v[0], cospi32);
    v[1] = _mm256_mullo_epi32(v[1], cospi32);
    u[0] = _mm256_add_epi32(v[0], v[1]);
    u[0] = _mm256_add_epi32(u[0], rnding);
    u[0] = _mm256_srai_epi32(u[0], bit);

    u[1] = _mm256_sub_epi32(v[0], v[1]);
    u[1] = _mm256_add_epi32(u[1], rnding);
    u[1] = _mm256_srai_epi32(u[1], bit);

    // type 1: rotation of (v2, v3) by cospi16/cospi48.
    v[0] = _mm256_mullo_epi32(v[2], cospi48);
    v[1] = _mm256_mullo_epi32(v[3], cospi16);
    u[2] = _mm256_add_epi32(v[0], v[1]);
    u[2] = _mm256_add_epi32(u[2], rnding);
    u[2] = _mm256_srai_epi32(u[2], bit);

    v[0] = _mm256_mullo_epi32(v[2], cospi16);
    v[1] = _mm256_mullo_epi32(v[3], cospi48);
    u[3] = _mm256_sub_epi32(v[1], v[0]);
    u[3] = _mm256_add_epi32(u[3], rnding);
    u[3] = _mm256_srai_epi32(u[3], bit);

    // stage 3 (odd half): combine with the stage-2 rotation results.
    u[4] = _mm256_add_epi32(v[4], v[5]);
    u[5] = _mm256_sub_epi32(v[4], v[5]);
    u[6] = _mm256_sub_epi32(v[7], v[6]);
    u[7] = _mm256_add_epi32(v[7], v[6]);

    // stage 4
    // stage 5: final rotations of the odd half, written straight to out[].
    v[0] = _mm256_mullo_epi32(u[4], cospi56);
    v[1] = _mm256_mullo_epi32(u[7], cospi8);
    v[0] = _mm256_add_epi32(v[0], v[1]);
    v[0] = _mm256_add_epi32(v[0], rnding);
    out[1 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[4]

    v[0] = _mm256_mullo_epi32(u[4], cospi8);
    v[1] = _mm256_mullo_epi32(u[7], cospi56);
    v[0] = _mm256_sub_epi32(v[1], v[0]);
    v[0] = _mm256_add_epi32(v[0], rnding);
    out[7 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[7]

    v[0] = _mm256_mullo_epi32(u[5], cospi24);
    v[1] = _mm256_mullo_epi32(u[6], cospi40);
    v[0] = _mm256_add_epi32(v[0], v[1]);
    v[0] = _mm256_add_epi32(v[0], rnding);
    out[5 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[5]

    v[0] = _mm256_mullo_epi32(u[5], cospi40);
    v[1] = _mm256_mullo_epi32(u[6], cospi24);
    v[0] = _mm256_sub_epi32(v[1], v[0]);
    v[0] = _mm256_add_epi32(v[0], rnding);
    out[3 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[6]

    // Even-half outputs (already round-shifted in stage 3).
    out[0 * outstride + col] = u[0];  // buf0[0]
    out[4 * outstride + col] = u[1];  // buf0[1]
    out[2 * outstride + col] = u[2];  // buf0[2]
    out[6 * outstride + col] = u[3];  // buf0[3]
  }
}
// 8-point forward ADST over `col_num` columns of __m256i data. Reads
// in[row * col_num + col], writes out[row * outstirde + col] in the
// permuted/sign-adjusted order required by the ADST (stage 7 below).
// NOTE(review): `(void)col_num;` is redundant — col_num is used as the
// loop bound. The parameter name `outstirde` is a typo for "outstride"
// preserved here to keep the code byte-identical.
static void fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit,
                        const int col_num, const int outstirde) {
  (void)col_num;
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  // Rounding bias for the arithmetic right shift by `bit`.
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const __m256i zero = _mm256_setzero_si256();
  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
  __m256i x, y;
  for (int col = 0; col < col_num; ++col) {
    // stage 1: permute the inputs and negate rows 7, 3, 1, 5.
    u0 = in[0 * col_num + col];
    u1 = _mm256_sub_epi32(zero, in[7 * col_num + col]);
    u2 = _mm256_sub_epi32(zero, in[3 * col_num + col]);
    u3 = in[4 * col_num + col];
    u4 = _mm256_sub_epi32(zero, in[1 * col_num + col]);
    u5 = in[6 * col_num + col];
    u6 = in[2 * col_num + col];
    u7 = _mm256_sub_epi32(zero, in[5 * col_num + col]);

    // stage 2: 45-degree rotations of (u2,u3) and (u6,u7) by cospi32.
    v0 = u0;
    v1 = u1;

    x = _mm256_mullo_epi32(u2, cospi32);
    y = _mm256_mullo_epi32(u3, cospi32);
    v2 = _mm256_add_epi32(x, y);
    v2 = _mm256_add_epi32(v2, rnding);
    v2 = _mm256_srai_epi32(v2, bit);

    v3 = _mm256_sub_epi32(x, y);
    v3 = _mm256_add_epi32(v3, rnding);
    v3 = _mm256_srai_epi32(v3, bit);

    v4 = u4;
    v5 = u5;

    x = _mm256_mullo_epi32(u6, cospi32);
    y = _mm256_mullo_epi32(u7, cospi32);
    v6 = _mm256_add_epi32(x, y);
    v6 = _mm256_add_epi32(v6, rnding);
    v6 = _mm256_srai_epi32(v6, bit);

    v7 = _mm256_sub_epi32(x, y);
    v7 = _mm256_add_epi32(v7, rnding);
    v7 = _mm256_srai_epi32(v7, bit);

    // stage 3: butterflies (0,2), (1,3), (4,6), (5,7).
    u0 = _mm256_add_epi32(v0, v2);
    u1 = _mm256_add_epi32(v1, v3);
    u2 = _mm256_sub_epi32(v0, v2);
    u3 = _mm256_sub_epi32(v1, v3);
    u4 = _mm256_add_epi32(v4, v6);
    u5 = _mm256_add_epi32(v5, v7);
    u6 = _mm256_sub_epi32(v4, v6);
    u7 = _mm256_sub_epi32(v5, v7);

    // stage 4: rotate (u4,u5) by cospi16/cospi48 and (u6,u7) by the
    // complementary angle; first four terms pass through.
    v0 = u0;
    v1 = u1;
    v2 = u2;
    v3 = u3;

    x = _mm256_mullo_epi32(u4, cospi16);
    y = _mm256_mullo_epi32(u5, cospi48);
    v4 = _mm256_add_epi32(x, y);
    v4 = _mm256_add_epi32(v4, rnding);
    v4 = _mm256_srai_epi32(v4, bit);

    x = _mm256_mullo_epi32(u4, cospi48);
    y = _mm256_mullo_epi32(u5, cospim16);
    v5 = _mm256_add_epi32(x, y);
    v5 = _mm256_add_epi32(v5, rnding);
    v5 = _mm256_srai_epi32(v5, bit);

    x = _mm256_mullo_epi32(u6, cospim48);
    y = _mm256_mullo_epi32(u7, cospi16);
    v6 = _mm256_add_epi32(x, y);
    v6 = _mm256_add_epi32(v6, rnding);
    v6 = _mm256_srai_epi32(v6, bit);

    x = _mm256_mullo_epi32(u6, cospi16);
    y = _mm256_mullo_epi32(u7, cospi48);
    v7 = _mm256_add_epi32(x, y);
    v7 = _mm256_add_epi32(v7, rnding);
    v7 = _mm256_srai_epi32(v7, bit);

    // stage 5: butterflies (0,4), (1,5), (2,6), (3,7).
    u0 = _mm256_add_epi32(v0, v4);
    u1 = _mm256_add_epi32(v1, v5);
    u2 = _mm256_add_epi32(v2, v6);
    u3 = _mm256_add_epi32(v3, v7);
    u4 = _mm256_sub_epi32(v0, v4);
    u5 = _mm256_sub_epi32(v1, v5);
    u6 = _mm256_sub_epi32(v2, v6);
    u7 = _mm256_sub_epi32(v3, v7);

    // stage 6: final odd-angle rotations of each pair.
    x = _mm256_mullo_epi32(u0, cospi4);
    y = _mm256_mullo_epi32(u1, cospi60);
    v0 = _mm256_add_epi32(x, y);
    v0 = _mm256_add_epi32(v0, rnding);
    v0 = _mm256_srai_epi32(v0, bit);

    x = _mm256_mullo_epi32(u0, cospi60);
    y = _mm256_mullo_epi32(u1, cospim4);
    v1 = _mm256_add_epi32(x, y);
    v1 = _mm256_add_epi32(v1, rnding);
    v1 = _mm256_srai_epi32(v1, bit);

    x = _mm256_mullo_epi32(u2, cospi20);
    y = _mm256_mullo_epi32(u3, cospi44);
    v2 = _mm256_add_epi32(x, y);
    v2 = _mm256_add_epi32(v2, rnding);
    v2 = _mm256_srai_epi32(v2, bit);

    x = _mm256_mullo_epi32(u2, cospi44);
    y = _mm256_mullo_epi32(u3, cospim20);
    v3 = _mm256_add_epi32(x, y);
    v3 = _mm256_add_epi32(v3, rnding);
    v3 = _mm256_srai_epi32(v3, bit);

    x = _mm256_mullo_epi32(u4, cospi36);
    y = _mm256_mullo_epi32(u5, cospi28);
    v4 = _mm256_add_epi32(x, y);
    v4 = _mm256_add_epi32(v4, rnding);
    v4 = _mm256_srai_epi32(v4, bit);

    x = _mm256_mullo_epi32(u4, cospi28);
    y = _mm256_mullo_epi32(u5, cospim36);
    v5 = _mm256_add_epi32(x, y);
    v5 = _mm256_add_epi32(v5, rnding);
    v5 = _mm256_srai_epi32(v5, bit);

    x = _mm256_mullo_epi32(u6, cospi52);
    y = _mm256_mullo_epi32(u7, cospi12);
    v6 = _mm256_add_epi32(x, y);
    v6 = _mm256_add_epi32(v6, rnding);
    v6 = _mm256_srai_epi32(v6, bit);

    x = _mm256_mullo_epi32(u6, cospi12);
    y = _mm256_mullo_epi32(u7, cospim52);
    v7 = _mm256_add_epi32(x, y);
    v7 = _mm256_add_epi32(v7, rnding);
    v7 = _mm256_srai_epi32(v7, bit);

    // stage 7: write the permuted outputs.
    out[0 * outstirde + col] = v1;
    out[1 * outstirde + col] = v6;
    out[2 * outstirde + col] = v3;
    out[3 * outstirde + col] = v4;
    out[4 * outstirde + col] = v5;
    out[5 * outstirde + col] = v2;
    out[6 * outstirde + col] = v7;
    out[7 * outstirde + col] = v0;
  }
}
    529 static void idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit, int col_num,
    530                       int outstride) {
    531  (void)bit;
    532  (void)outstride;
    533  int num_iters = 8 * col_num;
    534  for (int i = 0; i < num_iters; i += 8) {
    535    out[i] = _mm256_add_epi32(in[i], in[i]);
    536    out[i + 1] = _mm256_add_epi32(in[i + 1], in[i + 1]);
    537    out[i + 2] = _mm256_add_epi32(in[i + 2], in[i + 2]);
    538    out[i + 3] = _mm256_add_epi32(in[i + 3], in[i + 3]);
    539    out[i + 4] = _mm256_add_epi32(in[i + 4], in[i + 4]);
    540    out[i + 5] = _mm256_add_epi32(in[i + 5], in[i + 5]);
    541    out[i + 6] = _mm256_add_epi32(in[i + 6], in[i + 6]);
    542    out[i + 7] = _mm256_add_epi32(in[i + 7], in[i + 7]);
    543  }
    544 }
    545 void av1_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *coeff, int stride,
    546                             TX_TYPE tx_type, int bd) {
    547  __m256i in[8], out[8];
    548  const TX_SIZE tx_size = TX_8X8;
    549  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
    550  const int txw_idx = get_txw_idx(tx_size);
    551  const int txh_idx = get_txh_idx(tx_size);
    552  const int width = tx_size_wide[tx_size];
    553  const int width_div8 = (width >> 3);
    554 
    555  switch (tx_type) {
    556    case DCT_DCT:
    557      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
    558      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    559                 width_div8);
    560      col_txfm_8x8_rounding(out, -shift[1]);
    561      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
    562      fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
    563                 width_div8);
    564      store_buffer_avx2(out, coeff, 8, 8);
    565      break;
    566    case ADST_DCT:
    567      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
    568      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    569                  width_div8);
    570      col_txfm_8x8_rounding(out, -shift[1]);
    571      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
    572      fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
    573                 width_div8);
    574      store_buffer_avx2(out, coeff, 8, 8);
    575      break;
    576    case DCT_ADST:
    577      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
    578      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    579                 width_div8);
    580      col_txfm_8x8_rounding(out, -shift[1]);
    581      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
    582      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
    583                  width_div8);
    584      store_buffer_avx2(out, coeff, 8, 8);
    585      break;
    586    case ADST_ADST:
    587      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
    588      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    589                  width_div8);
    590      col_txfm_8x8_rounding(out, -shift[1]);
    591      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
    592      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
    593                  width_div8);
    594      store_buffer_avx2(out, coeff, 8, 8);
    595      break;
    596    case FLIPADST_DCT:
    597      load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
    598      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    599                  width_div8);
    600      col_txfm_8x8_rounding(out, -shift[1]);
    601      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
    602      fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
    603                 width_div8);
    604      store_buffer_avx2(out, coeff, 8, 8);
    605      break;
    606    case DCT_FLIPADST:
    607      load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
    608      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    609                 width_div8);
    610      col_txfm_8x8_rounding(out, -shift[1]);
    611      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
    612      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
    613                  width_div8);
    614      store_buffer_avx2(out, coeff, 8, 8);
    615      break;
    616    case FLIPADST_FLIPADST:
    617      load_buffer_8x8_avx2(input, in, stride, 1, 1, shift[0]);
    618      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    619                  width_div8);
    620      col_txfm_8x8_rounding(out, -shift[1]);
    621      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
    622      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
    623                  width_div8);
    624      store_buffer_avx2(out, coeff, 8, 8);
    625      break;
    626    case ADST_FLIPADST:
    627      load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
    628      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    629                  width_div8);
    630      col_txfm_8x8_rounding(out, -shift[1]);
    631      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
    632      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
    633                  width_div8);
    634      store_buffer_avx2(out, coeff, 8, 8);
    635      break;
    636    case FLIPADST_ADST:
    637      load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
    638      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    639                  width_div8);
    640      col_txfm_8x8_rounding(out, -shift[1]);
    641      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
    642      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
    643                  width_div8);
    644      store_buffer_avx2(out, coeff, 8, 8);
    645      break;
    646    case IDTX:
    647      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
    648      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    649                 width_div8);
    650      col_txfm_8x8_rounding(out, -shift[1]);
    651      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
    652      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    653                 width_div8);
    654      store_buffer_avx2(out, coeff, 8, 8);
    655      break;
    656    case V_DCT:
    657      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
    658      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    659                 width_div8);
    660      col_txfm_8x8_rounding(out, -shift[1]);
    661      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
    662      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    663                 width_div8);
    664      store_buffer_avx2(out, coeff, 8, 8);
    665      break;
    666    case H_DCT:
    667      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
    668      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    669                 width_div8);
    670      col_txfm_8x8_rounding(out, -shift[1]);
    671      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
    672      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    673                 width_div8);
    674      store_buffer_avx2(out, coeff, 8, 8);
    675      break;
    676    case V_ADST:
    677      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
    678      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    679                  width_div8);
    680      col_txfm_8x8_rounding(out, -shift[1]);
    681      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
    682      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    683                 width_div8);
    684      store_buffer_avx2(out, coeff, 8, 8);
    685      break;
    686    case H_ADST:
    687      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
    688      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    689                 width_div8);
    690      col_txfm_8x8_rounding(out, -shift[1]);
    691      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
    692      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    693                  width_div8);
    694      store_buffer_avx2(out, coeff, 8, 8);
    695      break;
    696    case V_FLIPADST:
    697      load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
    698      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    699                  width_div8);
    700      col_txfm_8x8_rounding(out, -shift[1]);
    701      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
    702      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    703                 width_div8);
    704      store_buffer_avx2(out, coeff, 8, 8);
    705      break;
    706    case H_FLIPADST:
    707      load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
    708      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    709                 width_div8);
    710      col_txfm_8x8_rounding(out, -shift[1]);
    711      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
    712      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
    713                  width_div8);
    714      store_buffer_avx2(out, coeff, 8, 8);
    715      break;
    716    default: assert(0);
    717  }
    718  (void)bd;
    719 }
    720 
// 16-point forward DCT on 32-bit lanes (8 lanes per __m256i).
//
// in:        input coefficients laid out as 16 rows of col_num vectors, i.e.
//            row r of column-group c is in[r * col_num + c].
// out:       transform output written with row stride `outstride`, in the
//            bit-reversed order AV1 expects (see the final stores below).
// bit:       fixed-point precision of the cospi constants; every butterfly
//            product is rounded with `rnding` and arithmetic-shifted by `bit`.
// col_num:   number of 8-lane vector groups per row to process.
// outstride: vector stride between consecutive output rows.
static void fdct16_avx2(__m256i *in, __m256i *out, const int8_t bit,
                        const int col_num, const int outstride) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
  // Round-to-nearest bias for the >> bit shifts after each multiply.
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  __m256i u[16], v[16], x;
  int col;

  // Calculate the column 0, 1, 2, 3
  for (col = 0; col < col_num; ++col) {
    // stage 0
    // stage 1
    // Initial butterfly: sums feed the even half, differences the odd half.
    u[0] = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
    u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
    u[1] = _mm256_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
    u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
    u[2] = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
    u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
    u[3] = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
    u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
    u[4] = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
    u[11] = _mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
    u[5] = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
    u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
    u[6] = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
    u[9] = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
    u[7] = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
    u[8] = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);

    // stage 2
    // Even half recurses one butterfly level; odd lanes 10..13 are rotated
    // by +/-cos(pi/4) (the cospi32 pair).
    v[0] = _mm256_add_epi32(u[0], u[7]);
    v[7] = _mm256_sub_epi32(u[0], u[7]);
    v[1] = _mm256_add_epi32(u[1], u[6]);
    v[6] = _mm256_sub_epi32(u[1], u[6]);
    v[2] = _mm256_add_epi32(u[2], u[5]);
    v[5] = _mm256_sub_epi32(u[2], u[5]);
    v[3] = _mm256_add_epi32(u[3], u[4]);
    v[4] = _mm256_sub_epi32(u[3], u[4]);
    v[8] = u[8];
    v[9] = u[9];

    v[10] = _mm256_mullo_epi32(u[10], cospim32);
    x = _mm256_mullo_epi32(u[13], cospi32);
    v[10] = _mm256_add_epi32(v[10], x);
    v[10] = _mm256_add_epi32(v[10], rnding);
    v[10] = _mm256_srai_epi32(v[10], bit);

    v[13] = _mm256_mullo_epi32(u[10], cospi32);
    x = _mm256_mullo_epi32(u[13], cospim32);
    v[13] = _mm256_sub_epi32(v[13], x);
    v[13] = _mm256_add_epi32(v[13], rnding);
    v[13] = _mm256_srai_epi32(v[13], bit);

    v[11] = _mm256_mullo_epi32(u[11], cospim32);
    x = _mm256_mullo_epi32(u[12], cospi32);
    v[11] = _mm256_add_epi32(v[11], x);
    v[11] = _mm256_add_epi32(v[11], rnding);
    v[11] = _mm256_srai_epi32(v[11], bit);

    v[12] = _mm256_mullo_epi32(u[11], cospi32);
    x = _mm256_mullo_epi32(u[12], cospim32);
    v[12] = _mm256_sub_epi32(v[12], x);
    v[12] = _mm256_add_epi32(v[12], rnding);
    v[12] = _mm256_srai_epi32(v[12], bit);
    v[14] = u[14];
    v[15] = u[15];

    // stage 3
    u[0] = _mm256_add_epi32(v[0], v[3]);
    u[3] = _mm256_sub_epi32(v[0], v[3]);
    u[1] = _mm256_add_epi32(v[1], v[2]);
    u[2] = _mm256_sub_epi32(v[1], v[2]);
    u[4] = v[4];

    // Rotate lanes 5/6 by cospi32, same pattern as stage 2.
    u[5] = _mm256_mullo_epi32(v[5], cospim32);
    x = _mm256_mullo_epi32(v[6], cospi32);
    u[5] = _mm256_add_epi32(u[5], x);
    u[5] = _mm256_add_epi32(u[5], rnding);
    u[5] = _mm256_srai_epi32(u[5], bit);

    u[6] = _mm256_mullo_epi32(v[5], cospi32);
    x = _mm256_mullo_epi32(v[6], cospim32);
    u[6] = _mm256_sub_epi32(u[6], x);
    u[6] = _mm256_add_epi32(u[6], rnding);
    u[6] = _mm256_srai_epi32(u[6], bit);

    u[7] = v[7];
    u[8] = _mm256_add_epi32(v[8], v[11]);
    u[11] = _mm256_sub_epi32(v[8], v[11]);
    u[9] = _mm256_add_epi32(v[9], v[10]);
    u[10] = _mm256_sub_epi32(v[9], v[10]);
    u[12] = _mm256_sub_epi32(v[15], v[12]);
    u[15] = _mm256_add_epi32(v[15], v[12]);
    u[13] = _mm256_sub_epi32(v[14], v[13]);
    u[14] = _mm256_add_epi32(v[14], v[13]);

    // stage 4
    // DC/Nyquist pair: (u0 +/- u1) * cos(pi/4).
    u[0] = _mm256_mullo_epi32(u[0], cospi32);
    u[1] = _mm256_mullo_epi32(u[1], cospi32);
    v[0] = _mm256_add_epi32(u[0], u[1]);
    v[0] = _mm256_add_epi32(v[0], rnding);
    v[0] = _mm256_srai_epi32(v[0], bit);

    v[1] = _mm256_sub_epi32(u[0], u[1]);
    v[1] = _mm256_add_epi32(v[1], rnding);
    v[1] = _mm256_srai_epi32(v[1], bit);

    v[2] = _mm256_mullo_epi32(u[2], cospi48);
    x = _mm256_mullo_epi32(u[3], cospi16);
    v[2] = _mm256_add_epi32(v[2], x);
    v[2] = _mm256_add_epi32(v[2], rnding);
    v[2] = _mm256_srai_epi32(v[2], bit);

    v[3] = _mm256_mullo_epi32(u[2], cospi16);
    x = _mm256_mullo_epi32(u[3], cospi48);
    v[3] = _mm256_sub_epi32(x, v[3]);
    v[3] = _mm256_add_epi32(v[3], rnding);
    v[3] = _mm256_srai_epi32(v[3], bit);

    v[4] = _mm256_add_epi32(u[4], u[5]);
    v[5] = _mm256_sub_epi32(u[4], u[5]);
    v[6] = _mm256_sub_epi32(u[7], u[6]);
    v[7] = _mm256_add_epi32(u[7], u[6]);
    v[8] = u[8];

    v[9] = _mm256_mullo_epi32(u[9], cospim16);
    x = _mm256_mullo_epi32(u[14], cospi48);
    v[9] = _mm256_add_epi32(v[9], x);
    v[9] = _mm256_add_epi32(v[9], rnding);
    v[9] = _mm256_srai_epi32(v[9], bit);

    v[14] = _mm256_mullo_epi32(u[9], cospi48);
    x = _mm256_mullo_epi32(u[14], cospim16);
    v[14] = _mm256_sub_epi32(v[14], x);
    v[14] = _mm256_add_epi32(v[14], rnding);
    v[14] = _mm256_srai_epi32(v[14], bit);

    v[10] = _mm256_mullo_epi32(u[10], cospim48);
    x = _mm256_mullo_epi32(u[13], cospim16);
    v[10] = _mm256_add_epi32(v[10], x);
    v[10] = _mm256_add_epi32(v[10], rnding);
    v[10] = _mm256_srai_epi32(v[10], bit);

    v[13] = _mm256_mullo_epi32(u[10], cospim16);
    x = _mm256_mullo_epi32(u[13], cospim48);
    v[13] = _mm256_sub_epi32(v[13], x);
    v[13] = _mm256_add_epi32(v[13], rnding);
    v[13] = _mm256_srai_epi32(v[13], bit);

    v[11] = u[11];
    v[12] = u[12];
    v[15] = u[15];

    // stage 5
    u[0] = v[0];
    u[1] = v[1];
    u[2] = v[2];
    u[3] = v[3];

    u[4] = _mm256_mullo_epi32(v[4], cospi56);
    x = _mm256_mullo_epi32(v[7], cospi8);
    u[4] = _mm256_add_epi32(u[4], x);
    u[4] = _mm256_add_epi32(u[4], rnding);
    u[4] = _mm256_srai_epi32(u[4], bit);

    u[7] = _mm256_mullo_epi32(v[4], cospi8);
    x = _mm256_mullo_epi32(v[7], cospi56);
    u[7] = _mm256_sub_epi32(x, u[7]);
    u[7] = _mm256_add_epi32(u[7], rnding);
    u[7] = _mm256_srai_epi32(u[7], bit);

    u[5] = _mm256_mullo_epi32(v[5], cospi24);
    x = _mm256_mullo_epi32(v[6], cospi40);
    u[5] = _mm256_add_epi32(u[5], x);
    u[5] = _mm256_add_epi32(u[5], rnding);
    u[5] = _mm256_srai_epi32(u[5], bit);

    u[6] = _mm256_mullo_epi32(v[5], cospi40);
    x = _mm256_mullo_epi32(v[6], cospi24);
    u[6] = _mm256_sub_epi32(x, u[6]);
    u[6] = _mm256_add_epi32(u[6], rnding);
    u[6] = _mm256_srai_epi32(u[6], bit);

    u[8] = _mm256_add_epi32(v[8], v[9]);
    u[9] = _mm256_sub_epi32(v[8], v[9]);
    u[10] = _mm256_sub_epi32(v[11], v[10]);
    u[11] = _mm256_add_epi32(v[11], v[10]);
    u[12] = _mm256_add_epi32(v[12], v[13]);
    u[13] = _mm256_sub_epi32(v[12], v[13]);
    u[14] = _mm256_sub_epi32(v[15], v[14]);
    u[15] = _mm256_add_epi32(v[15], v[14]);

    // stage 6
    // Final rotations of the odd half with the narrow-angle constant pairs.
    v[0] = u[0];
    v[1] = u[1];
    v[2] = u[2];
    v[3] = u[3];
    v[4] = u[4];
    v[5] = u[5];
    v[6] = u[6];
    v[7] = u[7];

    v[8] = _mm256_mullo_epi32(u[8], cospi60);
    x = _mm256_mullo_epi32(u[15], cospi4);
    v[8] = _mm256_add_epi32(v[8], x);
    v[8] = _mm256_add_epi32(v[8], rnding);
    v[8] = _mm256_srai_epi32(v[8], bit);

    v[15] = _mm256_mullo_epi32(u[8], cospi4);
    x = _mm256_mullo_epi32(u[15], cospi60);
    v[15] = _mm256_sub_epi32(x, v[15]);
    v[15] = _mm256_add_epi32(v[15], rnding);
    v[15] = _mm256_srai_epi32(v[15], bit);

    v[9] = _mm256_mullo_epi32(u[9], cospi28);
    x = _mm256_mullo_epi32(u[14], cospi36);
    v[9] = _mm256_add_epi32(v[9], x);
    v[9] = _mm256_add_epi32(v[9], rnding);
    v[9] = _mm256_srai_epi32(v[9], bit);

    v[14] = _mm256_mullo_epi32(u[9], cospi36);
    x = _mm256_mullo_epi32(u[14], cospi28);
    v[14] = _mm256_sub_epi32(x, v[14]);
    v[14] = _mm256_add_epi32(v[14], rnding);
    v[14] = _mm256_srai_epi32(v[14], bit);

    v[10] = _mm256_mullo_epi32(u[10], cospi44);
    x = _mm256_mullo_epi32(u[13], cospi20);
    v[10] = _mm256_add_epi32(v[10], x);
    v[10] = _mm256_add_epi32(v[10], rnding);
    v[10] = _mm256_srai_epi32(v[10], bit);

    v[13] = _mm256_mullo_epi32(u[10], cospi20);
    x = _mm256_mullo_epi32(u[13], cospi44);
    v[13] = _mm256_sub_epi32(x, v[13]);
    v[13] = _mm256_add_epi32(v[13], rnding);
    v[13] = _mm256_srai_epi32(v[13], bit);

    v[11] = _mm256_mullo_epi32(u[11], cospi12);
    x = _mm256_mullo_epi32(u[12], cospi52);
    v[11] = _mm256_add_epi32(v[11], x);
    v[11] = _mm256_add_epi32(v[11], rnding);
    v[11] = _mm256_srai_epi32(v[11], bit);

    v[12] = _mm256_mullo_epi32(u[11], cospi52);
    x = _mm256_mullo_epi32(u[12], cospi12);
    v[12] = _mm256_sub_epi32(x, v[12]);
    v[12] = _mm256_add_epi32(v[12], rnding);
    v[12] = _mm256_srai_epi32(v[12], bit);

    // Store in bit-reversed index order (0,8,4,12,2,10,...), which places
    // the coefficients in the frequency order the rest of the codec expects.
    out[0 * outstride + col] = v[0];
    out[1 * outstride + col] = v[8];
    out[2 * outstride + col] = v[4];
    out[3 * outstride + col] = v[12];
    out[4 * outstride + col] = v[2];
    out[5 * outstride + col] = v[10];
    out[6 * outstride + col] = v[6];
    out[7 * outstride + col] = v[14];
    out[8 * outstride + col] = v[1];
    out[9 * outstride + col] = v[9];
    out[10 * outstride + col] = v[5];
    out[11 * outstride + col] = v[13];
    out[12 * outstride + col] = v[3];
    out[13 * outstride + col] = v[11];
    out[14 * outstride + col] = v[7];
    out[15 * outstride + col] = v[15];
  }
}
// 16-point forward ADST on 32-bit lanes (8 lanes per __m256i).
//
// in:        input laid out as 16 rows of num_cols vectors
//            (row r, column-group c at in[r * num_cols + c]).
// out:       transform output with row stride `outstride`; the final stores
//            apply the ADST's alternating/reversed output permutation.
// bit:       fixed-point precision of the cospi constants; av1_half_btf_avx2
//            performs the rounded (a*w0 + b*w1 + rnding) >> bit butterfly.
// num_cols:  number of 8-lane vector groups per row to process.
// outstride: vector stride between consecutive output rows.
static void fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit,
                         const int num_cols, const int outstride) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
  const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
  const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  // Round-to-nearest bias for the >> bit shifts after each multiply.
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const __m256i zero = _mm256_setzero_si256();

  __m256i u[16], v[16], x, y;
  int col;

  for (col = 0; col < num_cols; ++col) {
    // stage 0
    // stage 1
    // ADST input permutation with sign flips (negation via 0 - value).
    u[0] = in[0 * num_cols + col];
    u[1] = _mm256_sub_epi32(zero, in[15 * num_cols + col]);
    u[2] = _mm256_sub_epi32(zero, in[7 * num_cols + col]);
    u[3] = in[8 * num_cols + col];
    u[4] = _mm256_sub_epi32(zero, in[3 * num_cols + col]);
    u[5] = in[12 * num_cols + col];
    u[6] = in[4 * num_cols + col];
    u[7] = _mm256_sub_epi32(zero, in[11 * num_cols + col]);
    u[8] = _mm256_sub_epi32(zero, in[1 * num_cols + col]);
    u[9] = in[14 * num_cols + col];
    u[10] = in[6 * num_cols + col];
    u[11] = _mm256_sub_epi32(zero, in[9 * num_cols + col]);
    u[12] = in[2 * num_cols + col];
    u[13] = _mm256_sub_epi32(zero, in[13 * num_cols + col]);
    u[14] = _mm256_sub_epi32(zero, in[5 * num_cols + col]);
    u[15] = in[10 * num_cols + col];

    // stage 2
    // Pairs (2,3), (6,7), (10,11), (14,15) are rotated by cos(pi/4):
    // sum and difference of the cospi32-scaled inputs, rounded and shifted.
    v[0] = u[0];
    v[1] = u[1];

    x = _mm256_mullo_epi32(u[2], cospi32);
    y = _mm256_mullo_epi32(u[3], cospi32);
    v[2] = _mm256_add_epi32(x, y);
    v[2] = _mm256_add_epi32(v[2], rnding);
    v[2] = _mm256_srai_epi32(v[2], bit);

    v[3] = _mm256_sub_epi32(x, y);
    v[3] = _mm256_add_epi32(v[3], rnding);
    v[3] = _mm256_srai_epi32(v[3], bit);

    v[4] = u[4];
    v[5] = u[5];

    x = _mm256_mullo_epi32(u[6], cospi32);
    y = _mm256_mullo_epi32(u[7], cospi32);
    v[6] = _mm256_add_epi32(x, y);
    v[6] = _mm256_add_epi32(v[6], rnding);
    v[6] = _mm256_srai_epi32(v[6], bit);

    v[7] = _mm256_sub_epi32(x, y);
    v[7] = _mm256_add_epi32(v[7], rnding);
    v[7] = _mm256_srai_epi32(v[7], bit);

    v[8] = u[8];
    v[9] = u[9];

    x = _mm256_mullo_epi32(u[10], cospi32);
    y = _mm256_mullo_epi32(u[11], cospi32);
    v[10] = _mm256_add_epi32(x, y);
    v[10] = _mm256_add_epi32(v[10], rnding);
    v[10] = _mm256_srai_epi32(v[10], bit);

    v[11] = _mm256_sub_epi32(x, y);
    v[11] = _mm256_add_epi32(v[11], rnding);
    v[11] = _mm256_srai_epi32(v[11], bit);

    v[12] = u[12];
    v[13] = u[13];

    x = _mm256_mullo_epi32(u[14], cospi32);
    y = _mm256_mullo_epi32(u[15], cospi32);
    v[14] = _mm256_add_epi32(x, y);
    v[14] = _mm256_add_epi32(v[14], rnding);
    v[14] = _mm256_srai_epi32(v[14], bit);

    v[15] = _mm256_sub_epi32(x, y);
    v[15] = _mm256_add_epi32(v[15], rnding);
    v[15] = _mm256_srai_epi32(v[15], bit);

    // stage 3
    // Butterfly with distance-2 partners.
    u[0] = _mm256_add_epi32(v[0], v[2]);
    u[1] = _mm256_add_epi32(v[1], v[3]);
    u[2] = _mm256_sub_epi32(v[0], v[2]);
    u[3] = _mm256_sub_epi32(v[1], v[3]);
    u[4] = _mm256_add_epi32(v[4], v[6]);
    u[5] = _mm256_add_epi32(v[5], v[7]);
    u[6] = _mm256_sub_epi32(v[4], v[6]);
    u[7] = _mm256_sub_epi32(v[5], v[7]);
    u[8] = _mm256_add_epi32(v[8], v[10]);
    u[9] = _mm256_add_epi32(v[9], v[11]);
    u[10] = _mm256_sub_epi32(v[8], v[10]);
    u[11] = _mm256_sub_epi32(v[9], v[11]);
    u[12] = _mm256_add_epi32(v[12], v[14]);
    u[13] = _mm256_add_epi32(v[13], v[15]);
    u[14] = _mm256_sub_epi32(v[12], v[14]);
    u[15] = _mm256_sub_epi32(v[13], v[15]);

    // stage 4
    // Half-butterfly rotations with the +/-16/48 constant pairs.
    v[0] = u[0];
    v[1] = u[1];
    v[2] = u[2];
    v[3] = u[3];
    v[4] = av1_half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
    v[5] = av1_half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
    v[6] = av1_half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
    v[7] = av1_half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
    v[8] = u[8];
    v[9] = u[9];
    v[10] = u[10];
    v[11] = u[11];
    v[12] = av1_half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
    v[13] =
        av1_half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
    v[14] =
        av1_half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
    v[15] = av1_half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);

    // stage 5
    // Butterfly with distance-4 partners.
    u[0] = _mm256_add_epi32(v[0], v[4]);
    u[1] = _mm256_add_epi32(v[1], v[5]);
    u[2] = _mm256_add_epi32(v[2], v[6]);
    u[3] = _mm256_add_epi32(v[3], v[7]);
    u[4] = _mm256_sub_epi32(v[0], v[4]);
    u[5] = _mm256_sub_epi32(v[1], v[5]);
    u[6] = _mm256_sub_epi32(v[2], v[6]);
    u[7] = _mm256_sub_epi32(v[3], v[7]);
    u[8] = _mm256_add_epi32(v[8], v[12]);
    u[9] = _mm256_add_epi32(v[9], v[13]);
    u[10] = _mm256_add_epi32(v[10], v[14]);
    u[11] = _mm256_add_epi32(v[11], v[15]);
    u[12] = _mm256_sub_epi32(v[8], v[12]);
    u[13] = _mm256_sub_epi32(v[9], v[13]);
    u[14] = _mm256_sub_epi32(v[10], v[14]);
    u[15] = _mm256_sub_epi32(v[11], v[15]);

    // stage 6
    // Rotations with the +/-8/56 and +/-24/40 constant pairs.
    v[0] = u[0];
    v[1] = u[1];
    v[2] = u[2];
    v[3] = u[3];
    v[4] = u[4];
    v[5] = u[5];
    v[6] = u[6];
    v[7] = u[7];
    v[8] = av1_half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
    v[9] = av1_half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
    v[10] = av1_half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
    v[11] =
        av1_half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
    v[12] = av1_half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
    v[13] = av1_half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
    v[14] =
        av1_half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
    v[15] = av1_half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);

    // stage 7
    // Butterfly with distance-8 partners.
    u[0] = _mm256_add_epi32(v[0], v[8]);
    u[1] = _mm256_add_epi32(v[1], v[9]);
    u[2] = _mm256_add_epi32(v[2], v[10]);
    u[3] = _mm256_add_epi32(v[3], v[11]);
    u[4] = _mm256_add_epi32(v[4], v[12]);
    u[5] = _mm256_add_epi32(v[5], v[13]);
    u[6] = _mm256_add_epi32(v[6], v[14]);
    u[7] = _mm256_add_epi32(v[7], v[15]);
    u[8] = _mm256_sub_epi32(v[0], v[8]);
    u[9] = _mm256_sub_epi32(v[1], v[9]);
    u[10] = _mm256_sub_epi32(v[2], v[10]);
    u[11] = _mm256_sub_epi32(v[3], v[11]);
    u[12] = _mm256_sub_epi32(v[4], v[12]);
    u[13] = _mm256_sub_epi32(v[5], v[13]);
    u[14] = _mm256_sub_epi32(v[6], v[14]);
    u[15] = _mm256_sub_epi32(v[7], v[15]);

    // stage 8
    // Final odd-angle rotations (2/62, 10/54, ..., 58/6 pairs).
    v[0] = av1_half_btf_avx2(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
    v[1] = av1_half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
    v[2] = av1_half_btf_avx2(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
    v[3] = av1_half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
    v[4] = av1_half_btf_avx2(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
    v[5] = av1_half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
    v[6] = av1_half_btf_avx2(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
    v[7] = av1_half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
    v[8] = av1_half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
    v[9] = av1_half_btf_avx2(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
    v[10] = av1_half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
    v[11] =
        av1_half_btf_avx2(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
    v[12] = av1_half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
    v[13] =
        av1_half_btf_avx2(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
    v[14] = av1_half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
    v[15] = av1_half_btf_avx2(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);

    // stage 9
    // ADST output permutation.
    out[0 * outstride + col] = v[1];
    out[1 * outstride + col] = v[14];
    out[2 * outstride + col] = v[3];
    out[3 * outstride + col] = v[12];
    out[4 * outstride + col] = v[5];
    out[5 * outstride + col] = v[10];
    out[6 * outstride + col] = v[7];
    out[7 * outstride + col] = v[8];
    out[8 * outstride + col] = v[9];
    out[9 * outstride + col] = v[6];
    out[10 * outstride + col] = v[11];
    out[11 * outstride + col] = v[4];
    out[12 * outstride + col] = v[13];
    out[13 * outstride + col] = v[2];
    out[14 * outstride + col] = v[15];
    out[15 * outstride + col] = v[0];
  }
}
   1258 static void idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit,
   1259                        int col_num, const int outstride) {
   1260  (void)bit;
   1261  (void)outstride;
   1262  __m256i fact = _mm256_set1_epi32(2 * NewSqrt2);
   1263  __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1));
   1264  __m256i a_low;
   1265 
   1266  int num_iters = 16 * col_num;
   1267  for (int i = 0; i < num_iters; i++) {
   1268    a_low = _mm256_mullo_epi32(in[i], fact);
   1269    a_low = _mm256_add_epi32(a_low, offset);
   1270    out[i] = _mm256_srai_epi32(a_low, NewSqrt2Bits);
   1271  }
   1272 }
// Column (vertical) 1D transform kernels for 16-row blocks, indexed by the
// 2D tx_type; each entry is the vertical half of the corresponding 2D pair.
static const transform_1d_avx2 col_highbd_txfm8x16_arr[TX_TYPES] = {
  fdct16_avx2,   // DCT_DCT
  fadst16_avx2,  // ADST_DCT
  fdct16_avx2,   // DCT_ADST
  fadst16_avx2,  // ADST_ADST
  fadst16_avx2,  // FLIPADST_DCT
  fdct16_avx2,   // DCT_FLIPADST
  fadst16_avx2,  // FLIPADST_FLIPADST
  fadst16_avx2,  // ADST_FLIPADST
  fadst16_avx2,  // FLIPADST_ADST
  idtx16_avx2,   // IDTX
  fdct16_avx2,   // V_DCT
  idtx16_avx2,   // H_DCT
  fadst16_avx2,  // V_ADST
  idtx16_avx2,   // H_ADST
  fadst16_avx2,  // V_FLIPADST
  idtx16_avx2    // H_FLIPADST
};
// Row (horizontal) 1D transform kernels for 8-column blocks, indexed by the
// 2D tx_type; each entry is the horizontal half of the corresponding 2D pair.
static const transform_1d_avx2 row_highbd_txfm8x8_arr[TX_TYPES] = {
  fdct8_avx2,   // DCT_DCT
  fdct8_avx2,   // ADST_DCT
  fadst8_avx2,  // DCT_ADST
  fadst8_avx2,  // ADST_ADST
  fdct8_avx2,   // FLIPADST_DCT
  fadst8_avx2,  // DCT_FLIPADST
  fadst8_avx2,  // FLIPADST_FLIPADST
  fadst8_avx2,  // ADST_FLIPADST
  fadst8_avx2,  // FLIPADST_ADST
  idtx8_avx2,   // IDTX
  idtx8_avx2,   // V_DCT
  fdct8_avx2,   // H_DCT
  idtx8_avx2,   // V_ADST
  fadst8_avx2,  // H_ADST
  idtx8_avx2,   // V_FLIPADST
  fadst8_avx2   // H_FLIPADST
};
   1309 void av1_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *coeff, int stride,
   1310                              TX_TYPE tx_type, int bd) {
   1311  __m256i in[16], out[16];
   1312  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
   1313  const int txw_idx = get_txw_idx(TX_8X16);
   1314  const int txh_idx = get_txh_idx(TX_8X16);
   1315  const transform_1d_avx2 col_txfm = col_highbd_txfm8x16_arr[tx_type];
   1316  const transform_1d_avx2 row_txfm = row_highbd_txfm8x8_arr[tx_type];
   1317  const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   1318  int ud_flip, lr_flip;
   1319  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   1320 
   1321  load_buffer_8x16_avx2(input, in, stride, ud_flip, lr_flip, shift[0]);
   1322  col_txfm(in, out, bit, 1, 1);
   1323  col_txfm_8x8_rounding(out, -shift[1]);
   1324  col_txfm_8x8_rounding(&out[8], -shift[1]);
   1325  fwd_txfm_transpose_8x8_avx2(out, in, 1, 2);
   1326  fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2);
   1327  row_txfm(in, out, bit, 2, 2);
   1328  round_shift_rect_array_32_avx2(out, in, 16, -shift[2], NewSqrt2);
   1329  store_buffer_avx2(in, coeff, 8, 16);
   1330  (void)bd;
   1331 }
// Column (vertical) 1D transform kernels for 8-row blocks, indexed by the
// 2D tx_type; each entry is the vertical half of the corresponding 2D pair.
static const transform_1d_avx2 col_highbd_txfm8x8_arr[TX_TYPES] = {
  fdct8_avx2,   // DCT_DCT
  fadst8_avx2,  // ADST_DCT
  fdct8_avx2,   // DCT_ADST
  fadst8_avx2,  // ADST_ADST
  fadst8_avx2,  // FLIPADST_DCT
  fdct8_avx2,   // DCT_FLIPADST
  fadst8_avx2,  // FLIPADST_FLIPADST
  fadst8_avx2,  // ADST_FLIPADST
  fadst8_avx2,  // FLIPADST_ADST
  idtx8_avx2,   // IDTX
  fdct8_avx2,   // V_DCT
  idtx8_avx2,   // H_DCT
  fadst8_avx2,  // V_ADST
  idtx8_avx2,   // H_ADST
  fadst8_avx2,  // V_FLIPADST
  idtx8_avx2    // H_FLIPADST
};
// Row (horizontal) 1D transform kernels for 16-column blocks, indexed by the
// 2D tx_type; each entry is the horizontal half of the corresponding 2D pair.
static const transform_1d_avx2 row_highbd_txfm8x16_arr[TX_TYPES] = {
  fdct16_avx2,   // DCT_DCT
  fdct16_avx2,   // ADST_DCT
  fadst16_avx2,  // DCT_ADST
  fadst16_avx2,  // ADST_ADST
  fdct16_avx2,   // FLIPADST_DCT
  fadst16_avx2,  // DCT_FLIPADST
  fadst16_avx2,  // FLIPADST_FLIPADST
  fadst16_avx2,  // ADST_FLIPADST
  fadst16_avx2,  // FLIPADST_ADST
  idtx16_avx2,   // IDTX
  idtx16_avx2,   // V_DCT
  fdct16_avx2,   // H_DCT
  idtx16_avx2,   // V_ADST
  fadst16_avx2,  // H_ADST
  idtx16_avx2,   // V_FLIPADST
  fadst16_avx2   // H_FLIPADST
};
// 2-D forward transform of a 16x8 high-bitdepth residue block.
// input  : 16-wide, 8-high residue, row pitch `stride` (in int16_t units).
// coeff  : 128 output coefficients.
// bd     : bit depth; unused by these integer kernels.
void av1_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  // 16 registers hold the 16x8 block of 32-bit values: the load below uses
  // height 8 and 2 registers per row (8 lanes each).
  __m256i in[16], out[16];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
  const int txw_idx = get_txw_idx(TX_16X8);
  const int txh_idx = get_txh_idx(TX_16X8);
  const transform_1d_avx2 col_txfm = col_highbd_txfm8x8_arr[tx_type];
  const transform_1d_avx2 row_txfm = row_highbd_txfm8x16_arr[tx_type];
  // NOTE(review): the column cos-bit table is used for both passes;
  // presumably av1_fwd_cos_bit_row agrees for TX_16X8 -- verify before
  // changing either pass.
  const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Load 8 rows x 16 cols, applying any up/down or left/right flip.
  load_buffer_16xn_avx2(input, in, stride, 8, 2, ud_flip, lr_flip);
  round_shift_32_8xn_avx2(in, 16, shift[0], 1);
  // First pass: 8-point transform down the columns (register stride 2).
  col_txfm(in, out, bit, 2, 2);
  round_shift_32_8xn_avx2(out, 16, shift[1], 1);
  // Transpose the two interleaved 8x8 halves so each 16-sample row becomes
  // contiguous (in[0..7] and in[8..15]) for the second pass.
  fwd_txfm_transpose_8x8_avx2(out, in, 2, 1);
  fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1);
  // Second pass: 16-point transform along the rows (register stride 1).
  row_txfm(in, out, bit, 1, 1);
  // Rectangular (2:1) block: apply the NewSqrt2 fixed-point scaling with
  // rounding while shifting down by shift[2].
  round_shift_rect_array_32_avx2(out, out, 16, -shift[2], NewSqrt2);
  store_buffer_avx2(out, coeff, 8, 16);
  (void)bd;
}
   1391 void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride,
   1392                               TX_TYPE tx_type, int bd) {
   1393  __m256i in[32], out[32];
   1394  const TX_SIZE tx_size = TX_16X16;
   1395  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   1396  const int txw_idx = get_txw_idx(tx_size);
   1397  const int txh_idx = get_txh_idx(tx_size);
   1398  const int width = tx_size_wide[tx_size];
   1399  const int height = tx_size_high[tx_size];
   1400  const int width_div8 = (width >> 3);
   1401  const int width_div16 = (width >> 4);
   1402  const int size = (height << 1);
   1403  switch (tx_type) {
   1404    case DCT_DCT:
   1405      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
   1406      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
   1407      fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
   1408                  width_div8);
   1409      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
   1410      fwd_txfm_transpose_16x16_avx2(out, in);
   1411      fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
   1412                  width_div8);
   1413      store_buffer_avx2(out, coeff, 8, 32);
   1414      break;
   1415    case ADST_DCT:
   1416      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
   1417      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
   1418      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
   1419                   width_div8);
   1420      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
   1421      fwd_txfm_transpose_16x16_avx2(out, in);
   1422      fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
   1423                  width_div8);
   1424      store_buffer_avx2(out, coeff, 8, 32);
   1425      break;
   1426    case DCT_ADST:
   1427      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
   1428      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
   1429      fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
   1430                  width_div8);
   1431      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
   1432      fwd_txfm_transpose_16x16_avx2(out, in);
   1433      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
   1434                   width_div8);
   1435      store_buffer_avx2(out, coeff, 8, 32);
   1436      break;
   1437    case ADST_ADST:
   1438      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
   1439      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
   1440      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
   1441                   width_div8);
   1442      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
   1443      fwd_txfm_transpose_16x16_avx2(out, in);
   1444      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
   1445                   width_div8);
   1446      store_buffer_avx2(out, coeff, 8, 32);
   1447      break;
   1448    case FLIPADST_DCT:
   1449      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
   1450      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
   1451      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
   1452                   width_div8);
   1453      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
   1454      fwd_txfm_transpose_16x16_avx2(out, in);
   1455      fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
   1456                  width_div8);
   1457      store_buffer_avx2(out, coeff, 8, 32);
   1458      break;
   1459    case DCT_FLIPADST:
   1460      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
   1461      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
   1462      fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
   1463                  width_div8);
   1464      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
   1465      fwd_txfm_transpose_16x16_avx2(out, in);
   1466      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
   1467                   width_div8);
   1468      store_buffer_avx2(out, coeff, 8, 32);
   1469      break;
   1470    case FLIPADST_FLIPADST:
   1471      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 1);
   1472      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
   1473      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
   1474                   width_div8);
   1475      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
   1476      fwd_txfm_transpose_16x16_avx2(out, in);
   1477      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
   1478                   width_div8);
   1479      store_buffer_avx2(out, coeff, 8, 32);
   1480      break;
   1481    case ADST_FLIPADST:
   1482      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
   1483      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
   1484      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
   1485                   width_div8);
   1486      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
   1487      fwd_txfm_transpose_16x16_avx2(out, in);
   1488      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
   1489                   width_div8);
   1490      store_buffer_avx2(out, coeff, 8, 32);
   1491      break;
   1492    case FLIPADST_ADST:
   1493      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
   1494      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
   1495      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
   1496                   width_div8);
   1497      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
   1498      fwd_txfm_transpose_16x16_avx2(out, in);
   1499      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
   1500                   width_div8);
   1501      store_buffer_avx2(out, coeff, 8, 32);
   1502      break;
   1503    case IDTX:
   1504      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
   1505      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
   1506      idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
   1507                  width_div8);
   1508      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
   1509      fwd_txfm_transpose_16x16_avx2(out, in);
   1510      idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
   1511                  width_div8);
   1512      store_buffer_avx2(out, coeff, 8, 32);
   1513      break;
   1514    case V_DCT:
   1515      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
   1516      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
   1517      fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
   1518                  width_div8);
   1519      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
   1520      fwd_txfm_transpose_16x16_avx2(out, in);
   1521      idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
   1522                  width_div8);
   1523      store_buffer_avx2(out, coeff, 8, 32);
   1524      break;
   1525    case H_DCT:
   1526      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
   1527      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
   1528      idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
   1529                  width_div8);
   1530      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
   1531      fwd_txfm_transpose_16x16_avx2(out, in);
   1532      fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
   1533                  width_div8);
   1534      store_buffer_avx2(out, coeff, 8, 32);
   1535      break;
   1536    case V_ADST:
   1537      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
   1538      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
   1539      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
   1540                   width_div8);
   1541      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
   1542      fwd_txfm_transpose_16x16_avx2(out, in);
   1543      idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
   1544                  width_div8);
   1545      store_buffer_avx2(out, coeff, 8, 32);
   1546      break;
   1547    case H_ADST:
   1548      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
   1549      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
   1550      idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
   1551                  width_div8);
   1552      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
   1553      fwd_txfm_transpose_16x16_avx2(out, in);
   1554      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
   1555                   width_div8);
   1556      store_buffer_avx2(out, coeff, 8, 32);
   1557      break;
   1558    case V_FLIPADST:
   1559      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
   1560      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
   1561      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
   1562                   width_div8);
   1563      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
   1564      fwd_txfm_transpose_16x16_avx2(out, in);
   1565      idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
   1566                  width_div8);
   1567      store_buffer_avx2(out, coeff, 8, 32);
   1568      break;
   1569    case H_FLIPADST:
   1570      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
   1571      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
   1572      idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
   1573                  width_div8);
   1574      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
   1575      fwd_txfm_transpose_16x16_avx2(out, in);
   1576      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
   1577                   width_div8);
   1578      store_buffer_avx2(out, coeff, 8, 32);
   1579      break;
   1580    default: assert(0);
   1581  }
   1582  (void)bd;
   1583 }
// 1-D 32-point forward DCT on 32-bit lanes.  Each __m256i register carries
// eight independent columns, so one call transforms eight columns at once.
// `input`/`output` are register arrays accessed with strides
// `instride`/`outstride`, letting callers keep several interleaved register
// columns in one buffer.  buf0/buf1 ping-pong between butterfly stages;
// btf_32_avx2_type0 applies a cospi-weighted 2-point butterfly rounded at
// `cos_bit` precision.
static inline void fdct32_avx2(__m256i *input, __m256i *output,
                               const int8_t cos_bit, const int instride,
                               const int outstride) {
  __m256i buf0[32];
  __m256i buf1[32];
  const int32_t *cospi;
  int startidx = 0 * instride;
  int endidx = 31 * instride;
  // stage 0
  // stage 1: add/sub butterflies pairing input i with its mirror input 31-i.
  buf1[0] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[31] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[1] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[30] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[2] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[29] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[3] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[28] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[4] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[27] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[5] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[26] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[6] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[25] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[7] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[24] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[8] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[23] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[9] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[22] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[10] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[21] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[11] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[20] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[12] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[19] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[13] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[18] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[14] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[17] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[15] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[16] = _mm256_sub_epi32(input[startidx], input[endidx]);

  // stage 2: 16-point butterflies on the even half; rotations on [20..27].
  cospi = cospi_arr(cos_bit);  // fixed-point cosine table at this precision
  buf0[0] = _mm256_add_epi32(buf1[0], buf1[15]);
  buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]);
  buf0[1] = _mm256_add_epi32(buf1[1], buf1[14]);
  buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]);
  buf0[2] = _mm256_add_epi32(buf1[2], buf1[13]);
  buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]);
  buf0[3] = _mm256_add_epi32(buf1[3], buf1[12]);
  buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]);
  buf0[4] = _mm256_add_epi32(buf1[4], buf1[11]);
  buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]);
  buf0[5] = _mm256_add_epi32(buf1[5], buf1[10]);
  buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]);
  buf0[6] = _mm256_add_epi32(buf1[6], buf1[9]);
  buf0[9] = _mm256_sub_epi32(buf1[6], buf1[9]);
  buf0[7] = _mm256_add_epi32(buf1[7], buf1[8]);
  buf0[8] = _mm256_sub_epi32(buf1[7], buf1[8]);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  buf0[18] = buf1[18];
  buf0[19] = buf1[19];
  btf_32_avx2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
                    buf0[27], cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
                    buf0[26], cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
                    buf0[25], cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
                    buf0[24], cos_bit);
  buf0[28] = buf1[28];
  buf0[29] = buf1[29];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 3: 8-point butterflies on [0..7]; rotations on [10..13];
  // add/sub merge of the odd half [16..31].
  cospi = cospi_arr(cos_bit);
  buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]);
  buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]);
  buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]);
  buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]);
  buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]);
  buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]);
  buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]);
  buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]);
  buf1[8] = buf0[8];
  buf1[9] = buf0[9];
  btf_32_avx2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
                    buf1[13], cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
                    buf1[12], cos_bit);
  buf1[14] = buf0[14];
  buf1[15] = buf0[15];
  buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]);
  buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]);
  buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]);
  buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]);
  buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]);
  buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]);
  buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]);
  buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]);
  buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]);
  buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]);
  buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]);
  buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]);
  buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]);
  buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]);
  buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]);
  buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]);

  // stage 4: 4-point butterflies on [0..3]; rotations on (5,6), [18..21]
  // and their mirrors.
  cospi = cospi_arr(cos_bit);
  buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]);
  buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]);
  buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]);
  buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]);
  buf0[4] = buf1[4];
  btf_32_avx2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
                    cos_bit);
  buf0[7] = buf1[7];
  buf0[8] = _mm256_add_epi32(buf1[8], buf1[11]);
  buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]);
  buf0[9] = _mm256_add_epi32(buf1[9], buf1[10]);
  buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]);
  buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]);
  buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]);
  buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]);
  buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  btf_32_avx2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
                    buf0[29], cos_bit);
  btf_32_avx2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
                    buf0[28], cos_bit);
  btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
                    buf0[27], cos_bit);
  btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
                    buf0[26], cos_bit);
  buf0[22] = buf1[22];
  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[25] = buf1[25];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 5: final rotations for the DC/low-frequency pair and length-4
  // outputs; add/sub merges in the [16..31] tree.
  cospi = cospi_arr(cos_bit);
  btf_32_avx2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
                    cos_bit);
  btf_32_avx2_type0(cospi[16], cospi[48], buf0[3], buf0[2], buf1[2], buf1[3],
                    cos_bit);
  buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]);
  buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]);
  buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]);
  buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]);
  buf1[8] = buf0[8];
  btf_32_avx2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
                    cos_bit);
  btf_32_avx2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
                    buf1[13], cos_bit);
  buf1[11] = buf0[11];
  buf1[12] = buf0[12];
  buf1[15] = buf0[15];
  buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]);
  buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]);
  buf1[17] = _mm256_add_epi32(buf0[17], buf0[18]);
  buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]);
  buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]);
  buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]);
  buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]);
  buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]);
  buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]);
  buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]);
  buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]);
  buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]);
  buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]);
  buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]);
  buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]);
  buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]);

  // stage 6: rotations producing the length-8 outputs; add/sub merges in
  // the [8..15] tree; rotations inside the [16..31] tree.
  cospi = cospi_arr(cos_bit);
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];
  btf_32_avx2_type0(cospi[8], cospi[56], buf1[7], buf1[4], buf0[4], buf0[7],
                    cos_bit);
  btf_32_avx2_type0(cospi[40], cospi[24], buf1[6], buf1[5], buf0[5], buf0[6],
                    cos_bit);
  buf0[8] = _mm256_add_epi32(buf1[8], buf1[9]);
  buf0[9] = _mm256_sub_epi32(buf1[8], buf1[9]);
  buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]);
  buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]);
  buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]);
  buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]);
  buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]);
  buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]);
  buf0[16] = buf1[16];
  btf_32_avx2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
                    buf0[30], cos_bit);
  btf_32_avx2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
                    buf0[29], cos_bit);
  buf0[19] = buf1[19];
  buf0[20] = buf1[20];
  btf_32_avx2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
                    buf0[26], cos_bit);
  btf_32_avx2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
                    buf0[25], cos_bit);
  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[27] = buf1[27];
  buf0[28] = buf1[28];
  buf0[31] = buf1[31];

  // stage 7: rotations producing the length-16 outputs; final add/sub
  // merges in the [16..31] tree.
  cospi = cospi_arr(cos_bit);
  buf1[0] = buf0[0];
  buf1[1] = buf0[1];
  buf1[2] = buf0[2];
  buf1[3] = buf0[3];
  buf1[4] = buf0[4];
  buf1[5] = buf0[5];
  buf1[6] = buf0[6];
  buf1[7] = buf0[7];
  btf_32_avx2_type0(cospi[4], cospi[60], buf0[15], buf0[8], buf1[8], buf1[15],
                    cos_bit);
  btf_32_avx2_type0(cospi[36], cospi[28], buf0[14], buf0[9], buf1[9], buf1[14],
                    cos_bit);
  btf_32_avx2_type0(cospi[20], cospi[44], buf0[13], buf0[10], buf1[10],
                    buf1[13], cos_bit);
  btf_32_avx2_type0(cospi[52], cospi[12], buf0[12], buf0[11], buf1[11],
                    buf1[12], cos_bit);
  buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]);
  buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]);
  buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]);
  buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]);
  buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]);
  buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]);
  buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]);
  buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]);
  buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]);
  buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]);
  buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]);
  buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]);
  buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]);
  buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]);
  buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]);
  buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]);

  // stage 8: rotations producing the length-32 (odd-frequency) outputs.
  cospi = cospi_arr(cos_bit);
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];
  buf0[4] = buf1[4];
  buf0[5] = buf1[5];
  buf0[6] = buf1[6];
  buf0[7] = buf1[7];
  buf0[8] = buf1[8];
  buf0[9] = buf1[9];
  buf0[10] = buf1[10];
  buf0[11] = buf1[11];
  buf0[12] = buf1[12];
  buf0[13] = buf1[13];
  buf0[14] = buf1[14];
  buf0[15] = buf1[15];
  btf_32_avx2_type0(cospi[2], cospi[62], buf1[31], buf1[16], buf0[16], buf0[31],
                    cos_bit);
  btf_32_avx2_type0(cospi[34], cospi[30], buf1[30], buf1[17], buf0[17],
                    buf0[30], cos_bit);
  btf_32_avx2_type0(cospi[18], cospi[46], buf1[29], buf1[18], buf0[18],
                    buf0[29], cos_bit);
  btf_32_avx2_type0(cospi[50], cospi[14], buf1[28], buf1[19], buf0[19],
                    buf0[28], cos_bit);
  btf_32_avx2_type0(cospi[10], cospi[54], buf1[27], buf1[20], buf0[20],
                    buf0[27], cos_bit);
  btf_32_avx2_type0(cospi[42], cospi[22], buf1[26], buf1[21], buf0[21],
                    buf0[26], cos_bit);
  btf_32_avx2_type0(cospi[26], cospi[38], buf1[25], buf1[22], buf0[22],
                    buf0[25], cos_bit);
  btf_32_avx2_type0(cospi[58], cospi[6], buf1[24], buf1[23], buf0[23], buf0[24],
                    cos_bit);

  startidx = 0 * outstride;
  endidx = 31 * outstride;
  // stage 9: store in bit-reversed frequency order, i.e.
  // output[k * outstride] = buf0[bitrev5(k)], written from both ends inward.
  output[startidx] = buf0[0];
  output[endidx] = buf0[31];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[16];
  output[endidx] = buf0[15];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[8];
  output[endidx] = buf0[23];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[24];
  output[endidx] = buf0[7];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[4];
  output[endidx] = buf0[27];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[20];
  output[endidx] = buf0[11];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[12];
  output[endidx] = buf0[19];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[28];
  output[endidx] = buf0[3];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[2];
  output[endidx] = buf0[29];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[18];
  output[endidx] = buf0[13];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[10];
  output[endidx] = buf0[21];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[26];
  output[endidx] = buf0[5];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[6];
  output[endidx] = buf0[25];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[22];
  output[endidx] = buf0[9];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[14];
  output[endidx] = buf0[17];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[30];
  output[endidx] = buf0[1];
}
   1968 static inline void idtx32x32_avx2(__m256i *input, __m256i *output,
   1969                                  const int8_t cos_bit, int instride,
   1970                                  int outstride) {
   1971  (void)cos_bit;
   1972  for (int i = 0; i < 32; i += 8) {
   1973    output[i * outstride] = _mm256_slli_epi32(input[i * instride], 2);
   1974    output[(i + 1) * outstride] =
   1975        _mm256_slli_epi32(input[(i + 1) * instride], 2);
   1976    output[(i + 2) * outstride] =
   1977        _mm256_slli_epi32(input[(i + 2) * instride], 2);
   1978    output[(i + 3) * outstride] =
   1979        _mm256_slli_epi32(input[(i + 3) * instride], 2);
   1980    output[(i + 4) * outstride] =
   1981        _mm256_slli_epi32(input[(i + 4) * instride], 2);
   1982    output[(i + 5) * outstride] =
   1983        _mm256_slli_epi32(input[(i + 5) * instride], 2);
   1984    output[(i + 6) * outstride] =
   1985        _mm256_slli_epi32(input[(i + 6) * instride], 2);
   1986    output[(i + 7) * outstride] =
   1987        _mm256_slli_epi32(input[(i + 7) * instride], 2);
   1988  }
   1989 }
// 32-point column-direction 1-D transform kernel per tx_type.  Only DCT and
// identity kernels exist at this length; entries are NULL for tx_types that
// are never dispatched with 32-point columns.
static const transform_1d_avx2 col_txfm8x32_arr[TX_TYPES] = {
  fdct32_avx2,     // DCT_DCT
  NULL,            // ADST_DCT
  NULL,            // DCT_ADST
  NULL,            // ADST_ADST
  NULL,            // FLIPADST_DCT
  NULL,            // DCT_FLIPADST
  NULL,            // FLIPADST_FLIPADST
  NULL,            // ADST_FLIPADST
  NULL,            // FLIPADST_ADST
  idtx32x32_avx2,  // IDTX
  NULL,            // V_DCT
  NULL,            // H_DCT
  NULL,            // V_ADST
  NULL,            // H_ADST
  NULL,            // V_FLIPADST
  NULL             // H_FLIPADST
};
// 32-point row-direction 1-D transform kernel per tx_type.  Mirrors
// col_txfm8x32_arr: only DCT and identity are available at this length, and
// NULL marks tx_types never dispatched with 32-point rows.
static const transform_1d_avx2 row_txfm8x32_arr[TX_TYPES] = {
  fdct32_avx2,     // DCT_DCT
  NULL,            // ADST_DCT
  NULL,            // DCT_ADST
  NULL,            // ADST_ADST
  NULL,            // FLIPADST_DCT
  NULL,            // DCT_FLIPADST
  NULL,            // FLIPADST_FLIPADST
  NULL,            // ADST_FLIPADST
  NULL,            // FLIPADST_ADST
  idtx32x32_avx2,  // IDTX
  NULL,            // V_DCT
  NULL,            // H_DCT
  NULL,            // V_ADST
  NULL,            // H_ADST
  NULL,            // V_FLIPADST
  NULL             // H_FLIPADST
};
   2026 void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
   2027                               int stride, TX_TYPE tx_type, int bd) {
   2028  (void)bd;
   2029  __m256i buf0[128], buf1[128];
   2030  const int tx_size = TX_32X32;
   2031  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   2032  const int txw_idx = get_txw_idx(tx_size);
   2033  const int txh_idx = get_txh_idx(tx_size);
   2034  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   2035  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   2036  const int width = tx_size_wide[tx_size];
   2037  const int height = tx_size_high[tx_size];
   2038  const transform_1d_avx2 col_txfm = col_txfm8x32_arr[tx_type];
   2039  const transform_1d_avx2 row_txfm = row_txfm8x32_arr[tx_type];
   2040  int r, c;
   2041  const int width_div16 = (width >> 4);
   2042  const int width_div8 = (width >> 3);
   2043 
   2044  for (int i = 0; i < width_div16; i++) {
   2045    load_buffer_16xn_avx2(input + (i << 4), &buf0[(i << 1)], stride, height,
   2046                          width_div8, 0, 0);
   2047    round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[0], width_div8);
   2048    round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8);
   2049    col_txfm(&buf0[(i << 1)], &buf0[(i << 1)], cos_bit_col, width_div8,
   2050             width_div8);
   2051    col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8,
   2052             width_div8);
   2053    round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[1], width_div8);
   2054    round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8);
   2055  }
   2056 
   2057  for (r = 0; r < height; r += 8) {
   2058    for (c = 0; c < width_div8; c++) {
   2059      fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
   2060                                  &buf1[c * 8 * width_div8 + (r >> 3)],
   2061                                  width_div8, width_div8);
   2062    }
   2063  }
   2064 
   2065  for (int i = 0; i < width_div16; i++) {
   2066    row_txfm(&buf1[(i << 1)], &buf1[(i << 1)], cos_bit_row, width_div8,
   2067             width_div8);
   2068    row_txfm(&buf1[(i << 1) + 1], &buf1[(i << 1) + 1], cos_bit_row, width_div8,
   2069             width_div8);
   2070    round_shift_32_8xn_avx2(&buf1[(i << 1)], height, shift[2], width_div8);
   2071    round_shift_32_8xn_avx2(&buf1[(i << 1) + 1], height, shift[2], width_div8);
   2072  }
   2073 
   2074  store_buffer_avx2(buf1, output, 8, 128);
   2075 }
   2076 static inline void fdct64_stage2_avx2(__m256i *x1, __m256i *x2,
   2077                                      __m256i *cospi_m32, __m256i *cospi_p32,
   2078                                      const __m256i *__rounding,
   2079                                      int8_t cos_bit) {
   2080  x2[0] = _mm256_add_epi32(x1[0], x1[31]);
   2081  x2[31] = _mm256_sub_epi32(x1[0], x1[31]);
   2082  x2[1] = _mm256_add_epi32(x1[1], x1[30]);
   2083  x2[30] = _mm256_sub_epi32(x1[1], x1[30]);
   2084  x2[2] = _mm256_add_epi32(x1[2], x1[29]);
   2085  x2[29] = _mm256_sub_epi32(x1[2], x1[29]);
   2086  x2[3] = _mm256_add_epi32(x1[3], x1[28]);
   2087  x2[28] = _mm256_sub_epi32(x1[3], x1[28]);
   2088  x2[4] = _mm256_add_epi32(x1[4], x1[27]);
   2089  x2[27] = _mm256_sub_epi32(x1[4], x1[27]);
   2090  x2[5] = _mm256_add_epi32(x1[5], x1[26]);
   2091  x2[26] = _mm256_sub_epi32(x1[5], x1[26]);
   2092  x2[6] = _mm256_add_epi32(x1[6], x1[25]);
   2093  x2[25] = _mm256_sub_epi32(x1[6], x1[25]);
   2094  x2[7] = _mm256_add_epi32(x1[7], x1[24]);
   2095  x2[24] = _mm256_sub_epi32(x1[7], x1[24]);
   2096  x2[8] = _mm256_add_epi32(x1[8], x1[23]);
   2097  x2[23] = _mm256_sub_epi32(x1[8], x1[23]);
   2098  x2[9] = _mm256_add_epi32(x1[9], x1[22]);
   2099  x2[22] = _mm256_sub_epi32(x1[9], x1[22]);
   2100  x2[10] = _mm256_add_epi32(x1[10], x1[21]);
   2101  x2[21] = _mm256_sub_epi32(x1[10], x1[21]);
   2102  x2[11] = _mm256_add_epi32(x1[11], x1[20]);
   2103  x2[20] = _mm256_sub_epi32(x1[11], x1[20]);
   2104  x2[12] = _mm256_add_epi32(x1[12], x1[19]);
   2105  x2[19] = _mm256_sub_epi32(x1[12], x1[19]);
   2106  x2[13] = _mm256_add_epi32(x1[13], x1[18]);
   2107  x2[18] = _mm256_sub_epi32(x1[13], x1[18]);
   2108  x2[14] = _mm256_add_epi32(x1[14], x1[17]);
   2109  x2[17] = _mm256_sub_epi32(x1[14], x1[17]);
   2110  x2[15] = _mm256_add_epi32(x1[15], x1[16]);
   2111  x2[16] = _mm256_sub_epi32(x1[15], x1[16]);
   2112  x2[32] = x1[32];
   2113  x2[33] = x1[33];
   2114  x2[34] = x1[34];
   2115  x2[35] = x1[35];
   2116  x2[36] = x1[36];
   2117  x2[37] = x1[37];
   2118  x2[38] = x1[38];
   2119  x2[39] = x1[39];
   2120  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[40], x1[55], x2[40], x2[55],
   2121                        *__rounding, cos_bit);
   2122  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[41], x1[54], x2[41], x2[54],
   2123                        *__rounding, cos_bit);
   2124  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[42], x1[53], x2[42], x2[53],
   2125                        *__rounding, cos_bit);
   2126  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[43], x1[52], x2[43], x2[52],
   2127                        *__rounding, cos_bit);
   2128  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[44], x1[51], x2[44], x2[51],
   2129                        *__rounding, cos_bit);
   2130  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[45], x1[50], x2[45], x2[50],
   2131                        *__rounding, cos_bit);
   2132  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[46], x1[49], x2[46], x2[49],
   2133                        *__rounding, cos_bit);
   2134  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[47], x1[48], x2[47], x2[48],
   2135                        *__rounding, cos_bit);
   2136  x2[56] = x1[56];
   2137  x2[57] = x1[57];
   2138  x2[58] = x1[58];
   2139  x2[59] = x1[59];
   2140  x2[60] = x1[60];
   2141  x2[61] = x1[61];
   2142  x2[62] = x1[62];
   2143  x2[63] = x1[63];
   2144 }
   2145 static inline void fdct64_stage3_avx2(__m256i *x2, __m256i *x3,
   2146                                      __m256i *cospi_m32, __m256i *cospi_p32,
   2147                                      const __m256i *__rounding,
   2148                                      int8_t cos_bit) {
   2149  x3[0] = _mm256_add_epi32(x2[0], x2[15]);
   2150  x3[15] = _mm256_sub_epi32(x2[0], x2[15]);
   2151  x3[1] = _mm256_add_epi32(x2[1], x2[14]);
   2152  x3[14] = _mm256_sub_epi32(x2[1], x2[14]);
   2153  x3[2] = _mm256_add_epi32(x2[2], x2[13]);
   2154  x3[13] = _mm256_sub_epi32(x2[2], x2[13]);
   2155  x3[3] = _mm256_add_epi32(x2[3], x2[12]);
   2156  x3[12] = _mm256_sub_epi32(x2[3], x2[12]);
   2157  x3[4] = _mm256_add_epi32(x2[4], x2[11]);
   2158  x3[11] = _mm256_sub_epi32(x2[4], x2[11]);
   2159  x3[5] = _mm256_add_epi32(x2[5], x2[10]);
   2160  x3[10] = _mm256_sub_epi32(x2[5], x2[10]);
   2161  x3[6] = _mm256_add_epi32(x2[6], x2[9]);
   2162  x3[9] = _mm256_sub_epi32(x2[6], x2[9]);
   2163  x3[7] = _mm256_add_epi32(x2[7], x2[8]);
   2164  x3[8] = _mm256_sub_epi32(x2[7], x2[8]);
   2165  x3[16] = x2[16];
   2166  x3[17] = x2[17];
   2167  x3[18] = x2[18];
   2168  x3[19] = x2[19];
   2169  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[20], x2[27], x3[20], x3[27],
   2170                        *__rounding, cos_bit);
   2171  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[21], x2[26], x3[21], x3[26],
   2172                        *__rounding, cos_bit);
   2173  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[22], x2[25], x3[22], x3[25],
   2174                        *__rounding, cos_bit);
   2175  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[23], x2[24], x3[23], x3[24],
   2176                        *__rounding, cos_bit);
   2177  x3[28] = x2[28];
   2178  x3[29] = x2[29];
   2179  x3[30] = x2[30];
   2180  x3[31] = x2[31];
   2181  x3[32] = _mm256_add_epi32(x2[32], x2[47]);
   2182  x3[47] = _mm256_sub_epi32(x2[32], x2[47]);
   2183  x3[33] = _mm256_add_epi32(x2[33], x2[46]);
   2184  x3[46] = _mm256_sub_epi32(x2[33], x2[46]);
   2185  x3[34] = _mm256_add_epi32(x2[34], x2[45]);
   2186  x3[45] = _mm256_sub_epi32(x2[34], x2[45]);
   2187  x3[35] = _mm256_add_epi32(x2[35], x2[44]);
   2188  x3[44] = _mm256_sub_epi32(x2[35], x2[44]);
   2189  x3[36] = _mm256_add_epi32(x2[36], x2[43]);
   2190  x3[43] = _mm256_sub_epi32(x2[36], x2[43]);
   2191  x3[37] = _mm256_add_epi32(x2[37], x2[42]);
   2192  x3[42] = _mm256_sub_epi32(x2[37], x2[42]);
   2193  x3[38] = _mm256_add_epi32(x2[38], x2[41]);
   2194  x3[41] = _mm256_sub_epi32(x2[38], x2[41]);
   2195  x3[39] = _mm256_add_epi32(x2[39], x2[40]);
   2196  x3[40] = _mm256_sub_epi32(x2[39], x2[40]);
   2197  x3[48] = _mm256_sub_epi32(x2[63], x2[48]);
   2198  x3[63] = _mm256_add_epi32(x2[63], x2[48]);
   2199  x3[49] = _mm256_sub_epi32(x2[62], x2[49]);
   2200  x3[62] = _mm256_add_epi32(x2[62], x2[49]);
   2201  x3[50] = _mm256_sub_epi32(x2[61], x2[50]);
   2202  x3[61] = _mm256_add_epi32(x2[61], x2[50]);
   2203  x3[51] = _mm256_sub_epi32(x2[60], x2[51]);
   2204  x3[60] = _mm256_add_epi32(x2[60], x2[51]);
   2205  x3[52] = _mm256_sub_epi32(x2[59], x2[52]);
   2206  x3[59] = _mm256_add_epi32(x2[59], x2[52]);
   2207  x3[53] = _mm256_sub_epi32(x2[58], x2[53]);
   2208  x3[58] = _mm256_add_epi32(x2[58], x2[53]);
   2209  x3[54] = _mm256_sub_epi32(x2[57], x2[54]);
   2210  x3[57] = _mm256_add_epi32(x2[57], x2[54]);
   2211  x3[55] = _mm256_sub_epi32(x2[56], x2[55]);
   2212  x3[56] = _mm256_add_epi32(x2[56], x2[55]);
   2213 }
   2214 static inline void fdct64_stage4_avx2(__m256i *x3, __m256i *x4,
   2215                                      __m256i *cospi_m32, __m256i *cospi_p32,
   2216                                      __m256i *cospi_m16, __m256i *cospi_p48,
   2217                                      __m256i *cospi_m48,
   2218                                      const __m256i *__rounding,
   2219                                      int8_t cos_bit) {
   2220  x4[0] = _mm256_add_epi32(x3[0], x3[7]);
   2221  x4[7] = _mm256_sub_epi32(x3[0], x3[7]);
   2222  x4[1] = _mm256_add_epi32(x3[1], x3[6]);
   2223  x4[6] = _mm256_sub_epi32(x3[1], x3[6]);
   2224  x4[2] = _mm256_add_epi32(x3[2], x3[5]);
   2225  x4[5] = _mm256_sub_epi32(x3[2], x3[5]);
   2226  x4[3] = _mm256_add_epi32(x3[3], x3[4]);
   2227  x4[4] = _mm256_sub_epi32(x3[3], x3[4]);
   2228  x4[8] = x3[8];
   2229  x4[9] = x3[9];
   2230  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[10], x3[13], x4[10], x4[13],
   2231                        *__rounding, cos_bit);
   2232  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[11], x3[12], x4[11], x4[12],
   2233                        *__rounding, cos_bit);
   2234  x4[14] = x3[14];
   2235  x4[15] = x3[15];
   2236  x4[16] = _mm256_add_epi32(x3[16], x3[23]);
   2237  x4[23] = _mm256_sub_epi32(x3[16], x3[23]);
   2238  x4[17] = _mm256_add_epi32(x3[17], x3[22]);
   2239  x4[22] = _mm256_sub_epi32(x3[17], x3[22]);
   2240  x4[18] = _mm256_add_epi32(x3[18], x3[21]);
   2241  x4[21] = _mm256_sub_epi32(x3[18], x3[21]);
   2242  x4[19] = _mm256_add_epi32(x3[19], x3[20]);
   2243  x4[20] = _mm256_sub_epi32(x3[19], x3[20]);
   2244  x4[24] = _mm256_sub_epi32(x3[31], x3[24]);
   2245  x4[31] = _mm256_add_epi32(x3[31], x3[24]);
   2246  x4[25] = _mm256_sub_epi32(x3[30], x3[25]);
   2247  x4[30] = _mm256_add_epi32(x3[30], x3[25]);
   2248  x4[26] = _mm256_sub_epi32(x3[29], x3[26]);
   2249  x4[29] = _mm256_add_epi32(x3[29], x3[26]);
   2250  x4[27] = _mm256_sub_epi32(x3[28], x3[27]);
   2251  x4[28] = _mm256_add_epi32(x3[28], x3[27]);
   2252  x4[32] = x3[32];
   2253  x4[33] = x3[33];
   2254  x4[34] = x3[34];
   2255  x4[35] = x3[35];
   2256  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[36], x3[59], x4[36], x4[59],
   2257                        *__rounding, cos_bit);
   2258  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[37], x3[58], x4[37], x4[58],
   2259                        *__rounding, cos_bit);
   2260  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[38], x3[57], x4[38], x4[57],
   2261                        *__rounding, cos_bit);
   2262  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[39], x3[56], x4[39], x4[56],
   2263                        *__rounding, cos_bit);
   2264  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[40], x3[55], x4[40], x4[55],
   2265                        *__rounding, cos_bit);
   2266  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[41], x3[54], x4[41], x4[54],
   2267                        *__rounding, cos_bit);
   2268  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[42], x3[53], x4[42], x4[53],
   2269                        *__rounding, cos_bit);
   2270  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[43], x3[52], x4[43], x4[52],
   2271                        *__rounding, cos_bit);
   2272  x4[44] = x3[44];
   2273  x4[45] = x3[45];
   2274  x4[46] = x3[46];
   2275  x4[47] = x3[47];
   2276  x4[48] = x3[48];
   2277  x4[49] = x3[49];
   2278  x4[50] = x3[50];
   2279  x4[51] = x3[51];
   2280  x4[60] = x3[60];
   2281  x4[61] = x3[61];
   2282  x4[62] = x3[62];
   2283  x4[63] = x3[63];
   2284 }
   2285 static inline void fdct64_stage5_avx2(__m256i *x4, __m256i *x5,
   2286                                      __m256i *cospi_m32, __m256i *cospi_p32,
   2287                                      __m256i *cospi_m16, __m256i *cospi_p48,
   2288                                      __m256i *cospi_m48,
   2289                                      const __m256i *__rounding,
   2290                                      int8_t cos_bit) {
   2291  x5[0] = _mm256_add_epi32(x4[0], x4[3]);
   2292  x5[3] = _mm256_sub_epi32(x4[0], x4[3]);
   2293  x5[1] = _mm256_add_epi32(x4[1], x4[2]);
   2294  x5[2] = _mm256_sub_epi32(x4[1], x4[2]);
   2295  x5[4] = x4[4];
   2296  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x4[5], x4[6], x5[5], x5[6],
   2297                        *__rounding, cos_bit);
   2298  x5[7] = x4[7];
   2299  x5[8] = _mm256_add_epi32(x4[8], x4[11]);
   2300  x5[11] = _mm256_sub_epi32(x4[8], x4[11]);
   2301  x5[9] = _mm256_add_epi32(x4[9], x4[10]);
   2302  x5[10] = _mm256_sub_epi32(x4[9], x4[10]);
   2303  x5[12] = _mm256_sub_epi32(x4[15], x4[12]);
   2304  x5[15] = _mm256_add_epi32(x4[15], x4[12]);
   2305  x5[13] = _mm256_sub_epi32(x4[14], x4[13]);
   2306  x5[14] = _mm256_add_epi32(x4[14], x4[13]);
   2307  x5[16] = x4[16];
   2308  x5[17] = x4[17];
   2309  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[18], x4[29], x5[18], x5[29],
   2310                        *__rounding, cos_bit);
   2311  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[19], x4[28], x5[19], x5[28],
   2312                        *__rounding, cos_bit);
   2313  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[20], x4[27], x5[20], x5[27],
   2314                        *__rounding, cos_bit);
   2315  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[21], x4[26], x5[21], x5[26],
   2316                        *__rounding, cos_bit);
   2317  x5[22] = x4[22];
   2318  x5[23] = x4[23];
   2319  x5[24] = x4[24];
   2320  x5[25] = x4[25];
   2321  x5[30] = x4[30];
   2322  x5[31] = x4[31];
   2323  x5[32] = _mm256_add_epi32(x4[32], x4[39]);
   2324  x5[39] = _mm256_sub_epi32(x4[32], x4[39]);
   2325  x5[33] = _mm256_add_epi32(x4[33], x4[38]);
   2326  x5[38] = _mm256_sub_epi32(x4[33], x4[38]);
   2327  x5[34] = _mm256_add_epi32(x4[34], x4[37]);
   2328  x5[37] = _mm256_sub_epi32(x4[34], x4[37]);
   2329  x5[35] = _mm256_add_epi32(x4[35], x4[36]);
   2330  x5[36] = _mm256_sub_epi32(x4[35], x4[36]);
   2331  x5[40] = _mm256_sub_epi32(x4[47], x4[40]);
   2332  x5[47] = _mm256_add_epi32(x4[47], x4[40]);
   2333  x5[41] = _mm256_sub_epi32(x4[46], x4[41]);
   2334  x5[46] = _mm256_add_epi32(x4[46], x4[41]);
   2335  x5[42] = _mm256_sub_epi32(x4[45], x4[42]);
   2336  x5[45] = _mm256_add_epi32(x4[45], x4[42]);
   2337  x5[43] = _mm256_sub_epi32(x4[44], x4[43]);
   2338  x5[44] = _mm256_add_epi32(x4[44], x4[43]);
   2339  x5[48] = _mm256_add_epi32(x4[48], x4[55]);
   2340  x5[55] = _mm256_sub_epi32(x4[48], x4[55]);
   2341  x5[49] = _mm256_add_epi32(x4[49], x4[54]);
   2342  x5[54] = _mm256_sub_epi32(x4[49], x4[54]);
   2343  x5[50] = _mm256_add_epi32(x4[50], x4[53]);
   2344  x5[53] = _mm256_sub_epi32(x4[50], x4[53]);
   2345  x5[51] = _mm256_add_epi32(x4[51], x4[52]);
   2346  x5[52] = _mm256_sub_epi32(x4[51], x4[52]);
   2347  x5[56] = _mm256_sub_epi32(x4[63], x4[56]);
   2348  x5[63] = _mm256_add_epi32(x4[63], x4[56]);
   2349  x5[57] = _mm256_sub_epi32(x4[62], x4[57]);
   2350  x5[62] = _mm256_add_epi32(x4[62], x4[57]);
   2351  x5[58] = _mm256_sub_epi32(x4[61], x4[58]);
   2352  x5[61] = _mm256_add_epi32(x4[61], x4[58]);
   2353  x5[59] = _mm256_sub_epi32(x4[60], x4[59]);
   2354  x5[60] = _mm256_add_epi32(x4[60], x4[59]);
   2355 }
/*
 * fdct64 stage 6: final rotations for rows 0-3, butterflies/rotations on
 * rows 4-15, add/sub butterflies on rows 16-31, and four rotation groups
 * (cos08/56 and cos40/24 pairs) interleaved with pass-throughs on rows
 * 32-63. Coefficients are passed in as pre-broadcast __m256i vectors.
 */
static inline void fdct64_stage6_avx2(
   __m256i *x5, __m256i *x6, __m256i *cospi_p16, __m256i *cospi_p32,
   __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48,
   __m256i *cospi_m08, __m256i *cospi_p56, __m256i *cospi_m56,
   __m256i *cospi_m40, __m256i *cospi_p24, __m256i *cospi_m24,
   const __m256i *__rounding, int8_t cos_bit) {
 /* Rows 0-3: rotations. */
 btf_32_type0_avx2_new(*cospi_p32, *cospi_p32, x5[0], x5[1], x6[0], x6[1],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(*cospi_p16, *cospi_p48, x5[3], x5[2], x6[2], x6[3],
                       *__rounding, cos_bit);
 /* Rows 4-7: add/sub butterflies. */
 x6[4] = _mm256_add_epi32(x5[4], x5[5]);
 x6[5] = _mm256_sub_epi32(x5[4], x5[5]);
 x6[6] = _mm256_sub_epi32(x5[7], x5[6]);
 x6[7] = _mm256_add_epi32(x5[7], x5[6]);
 /* Rows 8-15: middle pairs rotated, ends pass through. */
 x6[8] = x5[8];
 btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x5[9], x5[14], x6[9], x6[14],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x5[10], x5[13], x6[10], x6[13],
                       *__rounding, cos_bit);
 x6[11] = x5[11];
 x6[12] = x5[12];
 x6[15] = x5[15];
 /* Rows 16-31: add/sub butterflies in groups of four. */
 x6[16] = _mm256_add_epi32(x5[16], x5[19]);
 x6[19] = _mm256_sub_epi32(x5[16], x5[19]);
 x6[17] = _mm256_add_epi32(x5[17], x5[18]);
 x6[18] = _mm256_sub_epi32(x5[17], x5[18]);
 x6[20] = _mm256_sub_epi32(x5[23], x5[20]);
 x6[23] = _mm256_add_epi32(x5[23], x5[20]);
 x6[21] = _mm256_sub_epi32(x5[22], x5[21]);
 x6[22] = _mm256_add_epi32(x5[22], x5[21]);
 x6[24] = _mm256_add_epi32(x5[24], x5[27]);
 x6[27] = _mm256_sub_epi32(x5[24], x5[27]);
 x6[25] = _mm256_add_epi32(x5[25], x5[26]);
 x6[26] = _mm256_sub_epi32(x5[25], x5[26]);
 x6[28] = _mm256_sub_epi32(x5[31], x5[28]);
 x6[31] = _mm256_add_epi32(x5[31], x5[28]);
 x6[29] = _mm256_sub_epi32(x5[30], x5[29]);
 x6[30] = _mm256_add_epi32(x5[30], x5[29]);
 /* Rows 32-63: rotation groups on mirrored pairs, rest pass through. */
 x6[32] = x5[32];
 x6[33] = x5[33];
 btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[34], x5[61], x6[34], x6[61],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[35], x5[60], x6[35], x6[60],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[36], x5[59], x6[36], x6[59],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[37], x5[58], x6[37], x6[58],
                       *__rounding, cos_bit);
 x6[38] = x5[38];
 x6[39] = x5[39];
 x6[40] = x5[40];
 x6[41] = x5[41];
 btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[42], x5[53], x6[42], x6[53],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[43], x5[52], x6[43], x6[52],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[44], x5[51], x6[44], x6[51],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[45], x5[50], x6[45], x6[50],
                       *__rounding, cos_bit);
 x6[46] = x5[46];
 x6[47] = x5[47];
 x6[48] = x5[48];
 x6[49] = x5[49];
 x6[54] = x5[54];
 x6[55] = x5[55];
 x6[56] = x5[56];
 x6[57] = x5[57];
 x6[62] = x5[62];
 x6[63] = x5[63];
}
/*
 * fdct64 stage 7: rows 0-3 pass through; rows 4-7 get final rotations;
 * rows 8-15 are add/sub butterflies; rows 16-31 mix pass-throughs with
 * rotation pairs; rows 32-63 are add/sub butterflies in groups of four.
 */
static inline void fdct64_stage7_avx2(__m256i *x6, __m256i *x7,
                                     __m256i *cospi_p08, __m256i *cospi_p56,
                                     __m256i *cospi_p40, __m256i *cospi_p24,
                                     __m256i *cospi_m08, __m256i *cospi_m56,
                                     __m256i *cospi_m40, __m256i *cospi_m24,
                                     const __m256i *__rounding,
                                     int8_t cos_bit) {
 x7[0] = x6[0];
 x7[1] = x6[1];
 x7[2] = x6[2];
 x7[3] = x6[3];
 /* Rows 4-7: rotations (note swapped input order x6[7]/x6[4] etc.). */
 btf_32_type0_avx2_new(*cospi_p08, *cospi_p56, x6[7], x6[4], x7[4], x7[7],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(*cospi_p40, *cospi_p24, x6[6], x6[5], x7[5], x7[6],
                       *__rounding, cos_bit);
 /* Rows 8-15: add/sub butterflies on adjacent pairs. */
 x7[8] = _mm256_add_epi32(x6[8], x6[9]);
 x7[9] = _mm256_sub_epi32(x6[8], x6[9]);
 x7[10] = _mm256_sub_epi32(x6[11], x6[10]);
 x7[11] = _mm256_add_epi32(x6[11], x6[10]);
 x7[12] = _mm256_add_epi32(x6[12], x6[13]);
 x7[13] = _mm256_sub_epi32(x6[12], x6[13]);
 x7[14] = _mm256_sub_epi32(x6[15], x6[14]);
 x7[15] = _mm256_add_epi32(x6[15], x6[14]);
 /* Rows 16-31: rotation pairs interleaved with pass-throughs. */
 x7[16] = x6[16];
 btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x6[17], x6[30], x7[17], x7[30],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x6[18], x6[29], x7[18], x7[29],
                       *__rounding, cos_bit);
 x7[19] = x6[19];
 x7[20] = x6[20];
 btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x6[21], x6[26], x7[21], x7[26],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x6[22], x6[25], x7[22], x7[25],
                       *__rounding, cos_bit);
 x7[23] = x6[23];
 x7[24] = x6[24];
 x7[27] = x6[27];
 x7[28] = x6[28];
 x7[31] = x6[31];
 /* Rows 32-63: add/sub butterflies, alternating orientation per group. */
 x7[32] = _mm256_add_epi32(x6[32], x6[35]);
 x7[35] = _mm256_sub_epi32(x6[32], x6[35]);
 x7[33] = _mm256_add_epi32(x6[33], x6[34]);
 x7[34] = _mm256_sub_epi32(x6[33], x6[34]);
 x7[36] = _mm256_sub_epi32(x6[39], x6[36]);
 x7[39] = _mm256_add_epi32(x6[39], x6[36]);
 x7[37] = _mm256_sub_epi32(x6[38], x6[37]);
 x7[38] = _mm256_add_epi32(x6[38], x6[37]);
 x7[40] = _mm256_add_epi32(x6[40], x6[43]);
 x7[43] = _mm256_sub_epi32(x6[40], x6[43]);
 x7[41] = _mm256_add_epi32(x6[41], x6[42]);
 x7[42] = _mm256_sub_epi32(x6[41], x6[42]);
 x7[44] = _mm256_sub_epi32(x6[47], x6[44]);
 x7[47] = _mm256_add_epi32(x6[47], x6[44]);
 x7[45] = _mm256_sub_epi32(x6[46], x6[45]);
 x7[46] = _mm256_add_epi32(x6[46], x6[45]);
 x7[48] = _mm256_add_epi32(x6[48], x6[51]);
 x7[51] = _mm256_sub_epi32(x6[48], x6[51]);
 x7[49] = _mm256_add_epi32(x6[49], x6[50]);
 x7[50] = _mm256_sub_epi32(x6[49], x6[50]);
 x7[52] = _mm256_sub_epi32(x6[55], x6[52]);
 x7[55] = _mm256_add_epi32(x6[55], x6[52]);
 x7[53] = _mm256_sub_epi32(x6[54], x6[53]);
 x7[54] = _mm256_add_epi32(x6[54], x6[53]);
 x7[56] = _mm256_add_epi32(x6[56], x6[59]);
 x7[59] = _mm256_sub_epi32(x6[56], x6[59]);
 x7[57] = _mm256_add_epi32(x6[57], x6[58]);
 x7[58] = _mm256_sub_epi32(x6[57], x6[58]);
 x7[60] = _mm256_sub_epi32(x6[63], x6[60]);
 x7[63] = _mm256_add_epi32(x6[63], x6[60]);
 x7[61] = _mm256_sub_epi32(x6[62], x6[61]);
 x7[62] = _mm256_add_epi32(x6[62], x6[61]);
}
/*
 * fdct64 stage 8: rows 0-7 pass through; rows 8-15 get their final
 * rotations (cos04/60, cos36/28, cos20/44, cos52/12); rows 16-31 are
 * add/sub butterflies; rows 32-63 mix pass-throughs with rotation pairs.
 * Unlike earlier stages, coefficients are broadcast locally from the
 * cospi table rather than passed in.
 */
static inline void fdct64_stage8_avx2(__m256i *x7, __m256i *x8,
                                     const int32_t *cospi,
                                     const __m256i *__rounding,
                                     int8_t cos_bit) {
 __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
 __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
 __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
 __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
 __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
 __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
 __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
 __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
 __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
 __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
 __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
 __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
 __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
 __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
 __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
 __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);

 x8[0] = x7[0];
 x8[1] = x7[1];
 x8[2] = x7[2];
 x8[3] = x7[3];
 x8[4] = x7[4];
 x8[5] = x7[5];
 x8[6] = x7[6];
 x8[7] = x7[7];

 /* Rows 8-15: rotations (input order swapped: x7[15]/x7[8], etc.). */
 btf_32_type0_avx2_new(cospi_p04, cospi_p60, x7[15], x7[8], x8[8], x8[15],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(cospi_p36, cospi_p28, x7[14], x7[9], x8[9], x8[14],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(cospi_p20, cospi_p44, x7[13], x7[10], x8[10], x8[13],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(cospi_p52, cospi_p12, x7[12], x7[11], x8[11], x8[12],
                       *__rounding, cos_bit);
 /* Rows 16-31: add/sub butterflies on adjacent pairs. */
 x8[16] = _mm256_add_epi32(x7[16], x7[17]);
 x8[17] = _mm256_sub_epi32(x7[16], x7[17]);
 x8[18] = _mm256_sub_epi32(x7[19], x7[18]);
 x8[19] = _mm256_add_epi32(x7[19], x7[18]);
 x8[20] = _mm256_add_epi32(x7[20], x7[21]);
 x8[21] = _mm256_sub_epi32(x7[20], x7[21]);
 x8[22] = _mm256_sub_epi32(x7[23], x7[22]);
 x8[23] = _mm256_add_epi32(x7[23], x7[22]);
 x8[24] = _mm256_add_epi32(x7[24], x7[25]);
 x8[25] = _mm256_sub_epi32(x7[24], x7[25]);
 x8[26] = _mm256_sub_epi32(x7[27], x7[26]);
 x8[27] = _mm256_add_epi32(x7[27], x7[26]);
 x8[28] = _mm256_add_epi32(x7[28], x7[29]);
 x8[29] = _mm256_sub_epi32(x7[28], x7[29]);
 x8[30] = _mm256_sub_epi32(x7[31], x7[30]);
 x8[31] = _mm256_add_epi32(x7[31], x7[30]);
 /* Rows 32-63: rotation pairs interleaved with pass-throughs. */
 x8[32] = x7[32];
 btf_32_type0_avx2_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61],
                       *__rounding, cos_bit);
 x8[35] = x7[35];
 x8[36] = x7[36];
 btf_32_type0_avx2_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57],
                       *__rounding, cos_bit);
 x8[39] = x7[39];
 x8[40] = x7[40];
 btf_32_type0_avx2_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53],
                       *__rounding, cos_bit);
 x8[43] = x7[43];
 x8[44] = x7[44];
 btf_32_type0_avx2_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49],
                       *__rounding, cos_bit);
 x8[47] = x7[47];
 x8[48] = x7[48];
 x8[51] = x7[51];
 x8[52] = x7[52];
 x8[55] = x7[55];
 x8[56] = x7[56];
 x8[59] = x7[59];
 x8[60] = x7[60];
 x8[63] = x7[63];
}
/*
 * fdct64 stage 9: rows 0-15 pass through; rows 16-31 get their final
 * rotations using the odd cospi indices 2..62; rows 32-63 are add/sub
 * butterflies on adjacent pairs. Coefficients are broadcast locally from
 * the cospi table.
 */
static inline void fdct64_stage9_avx2(__m256i *x8, __m256i *x9,
                                     const int32_t *cospi,
                                     const __m256i *__rounding,
                                     int8_t cos_bit) {
 __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
 __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
 __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
 __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
 __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
 __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
 __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
 __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
 __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
 __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
 __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
 __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
 __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
 __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
 __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
 __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);

 x9[0] = x8[0];
 x9[1] = x8[1];
 x9[2] = x8[2];
 x9[3] = x8[3];
 x9[4] = x8[4];
 x9[5] = x8[5];
 x9[6] = x8[6];
 x9[7] = x8[7];
 x9[8] = x8[8];
 x9[9] = x8[9];
 x9[10] = x8[10];
 x9[11] = x8[11];
 x9[12] = x8[12];
 x9[13] = x8[13];
 x9[14] = x8[14];
 x9[15] = x8[15];
 /* Rows 16-31: rotations (input order swapped: x8[31]/x8[16], etc.). */
 btf_32_type0_avx2_new(cospi_p02, cospi_p62, x8[31], x8[16], x9[16], x9[31],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(cospi_p34, cospi_p30, x8[30], x8[17], x9[17], x9[30],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(cospi_p18, cospi_p46, x8[29], x8[18], x9[18], x9[29],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(cospi_p50, cospi_p14, x8[28], x8[19], x9[19], x9[28],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(cospi_p10, cospi_p54, x8[27], x8[20], x9[20], x9[27],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(cospi_p42, cospi_p22, x8[26], x8[21], x9[21], x9[26],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(cospi_p26, cospi_p38, x8[25], x8[22], x9[22], x9[25],
                       *__rounding, cos_bit);
 btf_32_type0_avx2_new(cospi_p58, cospi_p06, x8[24], x8[23], x9[23], x9[24],
                       *__rounding, cos_bit);
 /* Rows 32-63: add/sub butterflies, alternating orientation per pair. */
 x9[32] = _mm256_add_epi32(x8[32], x8[33]);
 x9[33] = _mm256_sub_epi32(x8[32], x8[33]);
 x9[34] = _mm256_sub_epi32(x8[35], x8[34]);
 x9[35] = _mm256_add_epi32(x8[35], x8[34]);
 x9[36] = _mm256_add_epi32(x8[36], x8[37]);
 x9[37] = _mm256_sub_epi32(x8[36], x8[37]);
 x9[38] = _mm256_sub_epi32(x8[39], x8[38]);
 x9[39] = _mm256_add_epi32(x8[39], x8[38]);
 x9[40] = _mm256_add_epi32(x8[40], x8[41]);
 x9[41] = _mm256_sub_epi32(x8[40], x8[41]);
 x9[42] = _mm256_sub_epi32(x8[43], x8[42]);
 x9[43] = _mm256_add_epi32(x8[43], x8[42]);
 x9[44] = _mm256_add_epi32(x8[44], x8[45]);
 x9[45] = _mm256_sub_epi32(x8[44], x8[45]);
 x9[46] = _mm256_sub_epi32(x8[47], x8[46]);
 x9[47] = _mm256_add_epi32(x8[47], x8[46]);
 x9[48] = _mm256_add_epi32(x8[48], x8[49]);
 x9[49] = _mm256_sub_epi32(x8[48], x8[49]);
 x9[50] = _mm256_sub_epi32(x8[51], x8[50]);
 x9[51] = _mm256_add_epi32(x8[51], x8[50]);
 x9[52] = _mm256_add_epi32(x8[52], x8[53]);
 x9[53] = _mm256_sub_epi32(x8[52], x8[53]);
 x9[54] = _mm256_sub_epi32(x8[55], x8[54]);
 x9[55] = _mm256_add_epi32(x8[55], x8[54]);
 x9[56] = _mm256_add_epi32(x8[56], x8[57]);
 x9[57] = _mm256_sub_epi32(x8[56], x8[57]);
 x9[58] = _mm256_sub_epi32(x8[59], x8[58]);
 x9[59] = _mm256_add_epi32(x8[59], x8[58]);
 x9[60] = _mm256_add_epi32(x8[60], x8[61]);
 x9[61] = _mm256_sub_epi32(x8[60], x8[61]);
 x9[62] = _mm256_sub_epi32(x8[63], x8[62]);
 x9[63] = _mm256_add_epi32(x8[63], x8[62]);
}
   2672 static inline void fdct64_stage10_avx2(__m256i *x9, __m256i *x10,
   2673                                       const int32_t *cospi,
   2674                                       const __m256i *__rounding,
   2675                                       int8_t cos_bit) {
   2676  __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
   2677  __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
   2678  __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
   2679  __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
   2680  __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
   2681  __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
   2682  __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
   2683  __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
   2684  __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
   2685  __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
   2686  __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
   2687  __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
   2688  __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
   2689  __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
   2690  __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
   2691  __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
   2692  __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
   2693  __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
   2694  __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
   2695  __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
   2696  __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
   2697  __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
   2698  __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
   2699  __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
   2700  __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
   2701  __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
   2702  __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
   2703  __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
   2704  __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
   2705  __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
   2706  __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
   2707  __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
   2708 
   2709  x10[0] = x9[0];
   2710  x10[1] = x9[1];
   2711  x10[2] = x9[2];
   2712  x10[3] = x9[3];
   2713  x10[4] = x9[4];
   2714  x10[5] = x9[5];
   2715  x10[6] = x9[6];
   2716  x10[7] = x9[7];
   2717  x10[8] = x9[8];
   2718  x10[9] = x9[9];
   2719  x10[10] = x9[10];
   2720  x10[11] = x9[11];
   2721  x10[12] = x9[12];
   2722  x10[13] = x9[13];
   2723  x10[14] = x9[14];
   2724  x10[15] = x9[15];
   2725  x10[16] = x9[16];
   2726  x10[17] = x9[17];
   2727  x10[18] = x9[18];
   2728  x10[19] = x9[19];
   2729  x10[20] = x9[20];
   2730  x10[21] = x9[21];
   2731  x10[22] = x9[22];
   2732  x10[23] = x9[23];
   2733  x10[24] = x9[24];
   2734  x10[25] = x9[25];
   2735  x10[26] = x9[26];
   2736  x10[27] = x9[27];
   2737  x10[28] = x9[28];
   2738  x10[29] = x9[29];
   2739  x10[30] = x9[30];
   2740  x10[31] = x9[31];
   2741  btf_32_type0_avx2_new(cospi_p01, cospi_p63, x9[63], x9[32], x10[32], x10[63],
   2742                        *__rounding, cos_bit);
   2743  btf_32_type0_avx2_new(cospi_p33, cospi_p31, x9[62], x9[33], x10[33], x10[62],
   2744                        *__rounding, cos_bit);
   2745  btf_32_type0_avx2_new(cospi_p17, cospi_p47, x9[61], x9[34], x10[34], x10[61],
   2746                        *__rounding, cos_bit);
   2747  btf_32_type0_avx2_new(cospi_p49, cospi_p15, x9[60], x9[35], x10[35], x10[60],
   2748                        *__rounding, cos_bit);
   2749  btf_32_type0_avx2_new(cospi_p09, cospi_p55, x9[59], x9[36], x10[36], x10[59],
   2750                        *__rounding, cos_bit);
   2751  btf_32_type0_avx2_new(cospi_p41, cospi_p23, x9[58], x9[37], x10[37], x10[58],
   2752                        *__rounding, cos_bit);
   2753  btf_32_type0_avx2_new(cospi_p25, cospi_p39, x9[57], x9[38], x10[38], x10[57],
   2754                        *__rounding, cos_bit);
   2755  btf_32_type0_avx2_new(cospi_p57, cospi_p07, x9[56], x9[39], x10[39], x10[56],
   2756                        *__rounding, cos_bit);
   2757  btf_32_type0_avx2_new(cospi_p05, cospi_p59, x9[55], x9[40], x10[40], x10[55],
   2758                        *__rounding, cos_bit);
   2759  btf_32_type0_avx2_new(cospi_p37, cospi_p27, x9[54], x9[41], x10[41], x10[54],
   2760                        *__rounding, cos_bit);
   2761  btf_32_type0_avx2_new(cospi_p21, cospi_p43, x9[53], x9[42], x10[42], x10[53],
   2762                        *__rounding, cos_bit);
   2763  btf_32_type0_avx2_new(cospi_p53, cospi_p11, x9[52], x9[43], x10[43], x10[52],
   2764                        *__rounding, cos_bit);
   2765  btf_32_type0_avx2_new(cospi_p13, cospi_p51, x9[51], x9[44], x10[44], x10[51],
   2766                        *__rounding, cos_bit);
   2767  btf_32_type0_avx2_new(cospi_p45, cospi_p19, x9[50], x9[45], x10[45], x10[50],
   2768                        *__rounding, cos_bit);
   2769  btf_32_type0_avx2_new(cospi_p29, cospi_p35, x9[49], x9[46], x10[46], x10[49],
   2770                        *__rounding, cos_bit);
   2771  btf_32_type0_avx2_new(cospi_p61, cospi_p03, x9[48], x9[47], x10[47], x10[48],
   2772                        *__rounding, cos_bit);
   2773 }
   2774 static void fdct64_avx2(__m256i *input, __m256i *output, int8_t cos_bit,
   2775                        const int instride, const int outstride) {
   2776  const int32_t *cospi = cospi_arr(cos_bit);
   2777  const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
   2778  __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
   2779  __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
   2780  __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
   2781  __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
   2782  __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
   2783  __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
   2784  __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
   2785  __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
   2786  __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
   2787  __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
   2788  __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
   2789  __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
   2790  __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
   2791  __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
   2792 
   2793  int startidx = 0 * instride;
   2794  int endidx = 63 * instride;
   2795  // stage 1
   2796  __m256i x1[64];
   2797  x1[0] = _mm256_add_epi32(input[startidx], input[endidx]);
   2798  x1[63] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2799  startidx += instride;
   2800  endidx -= instride;
   2801  x1[1] = _mm256_add_epi32(input[startidx], input[endidx]);
   2802  x1[62] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2803  startidx += instride;
   2804  endidx -= instride;
   2805  x1[2] = _mm256_add_epi32(input[startidx], input[endidx]);
   2806  x1[61] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2807  startidx += instride;
   2808  endidx -= instride;
   2809  x1[3] = _mm256_add_epi32(input[startidx], input[endidx]);
   2810  x1[60] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2811  startidx += instride;
   2812  endidx -= instride;
   2813  x1[4] = _mm256_add_epi32(input[startidx], input[endidx]);
   2814  x1[59] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2815  startidx += instride;
   2816  endidx -= instride;
   2817  x1[5] = _mm256_add_epi32(input[startidx], input[endidx]);
   2818  x1[58] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2819  startidx += instride;
   2820  endidx -= instride;
   2821  x1[6] = _mm256_add_epi32(input[startidx], input[endidx]);
   2822  x1[57] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2823  startidx += instride;
   2824  endidx -= instride;
   2825  x1[7] = _mm256_add_epi32(input[startidx], input[endidx]);
   2826  x1[56] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2827  startidx += instride;
   2828  endidx -= instride;
   2829  x1[8] = _mm256_add_epi32(input[startidx], input[endidx]);
   2830  x1[55] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2831  startidx += instride;
   2832  endidx -= instride;
   2833  x1[9] = _mm256_add_epi32(input[startidx], input[endidx]);
   2834  x1[54] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2835  startidx += instride;
   2836  endidx -= instride;
   2837  x1[10] = _mm256_add_epi32(input[startidx], input[endidx]);
   2838  x1[53] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2839  startidx += instride;
   2840  endidx -= instride;
   2841  x1[11] = _mm256_add_epi32(input[startidx], input[endidx]);
   2842  x1[52] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2843  startidx += instride;
   2844  endidx -= instride;
   2845  x1[12] = _mm256_add_epi32(input[startidx], input[endidx]);
   2846  x1[51] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2847  startidx += instride;
   2848  endidx -= instride;
   2849  x1[13] = _mm256_add_epi32(input[startidx], input[endidx]);
   2850  x1[50] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2851  startidx += instride;
   2852  endidx -= instride;
   2853  x1[14] = _mm256_add_epi32(input[startidx], input[endidx]);
   2854  x1[49] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2855  startidx += instride;
   2856  endidx -= instride;
   2857  x1[15] = _mm256_add_epi32(input[startidx], input[endidx]);
   2858  x1[48] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2859  startidx += instride;
   2860  endidx -= instride;
   2861  x1[16] = _mm256_add_epi32(input[startidx], input[endidx]);
   2862  x1[47] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2863  startidx += instride;
   2864  endidx -= instride;
   2865  x1[17] = _mm256_add_epi32(input[startidx], input[endidx]);
   2866  x1[46] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2867  startidx += instride;
   2868  endidx -= instride;
   2869  x1[18] = _mm256_add_epi32(input[startidx], input[endidx]);
   2870  x1[45] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2871  startidx += instride;
   2872  endidx -= instride;
   2873  x1[19] = _mm256_add_epi32(input[startidx], input[endidx]);
   2874  x1[44] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2875  startidx += instride;
   2876  endidx -= instride;
   2877  x1[20] = _mm256_add_epi32(input[startidx], input[endidx]);
   2878  x1[43] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2879  startidx += instride;
   2880  endidx -= instride;
   2881  x1[21] = _mm256_add_epi32(input[startidx], input[endidx]);
   2882  x1[42] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2883  startidx += instride;
   2884  endidx -= instride;
   2885  x1[22] = _mm256_add_epi32(input[startidx], input[endidx]);
   2886  x1[41] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2887  startidx += instride;
   2888  endidx -= instride;
   2889  x1[23] = _mm256_add_epi32(input[startidx], input[endidx]);
   2890  x1[40] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2891  startidx += instride;
   2892  endidx -= instride;
   2893  x1[24] = _mm256_add_epi32(input[startidx], input[endidx]);
   2894  x1[39] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2895  startidx += instride;
   2896  endidx -= instride;
   2897  x1[25] = _mm256_add_epi32(input[startidx], input[endidx]);
   2898  x1[38] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2899  startidx += instride;
   2900  endidx -= instride;
   2901  x1[26] = _mm256_add_epi32(input[startidx], input[endidx]);
   2902  x1[37] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2903  startidx += instride;
   2904  endidx -= instride;
   2905  x1[27] = _mm256_add_epi32(input[startidx], input[endidx]);
   2906  x1[36] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2907  startidx += instride;
   2908  endidx -= instride;
   2909  x1[28] = _mm256_add_epi32(input[startidx], input[endidx]);
   2910  x1[35] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2911  startidx += instride;
   2912  endidx -= instride;
   2913  x1[29] = _mm256_add_epi32(input[startidx], input[endidx]);
   2914  x1[34] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2915  startidx += instride;
   2916  endidx -= instride;
   2917  x1[30] = _mm256_add_epi32(input[startidx], input[endidx]);
   2918  x1[33] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2919  startidx += instride;
   2920  endidx -= instride;
   2921  x1[31] = _mm256_add_epi32(input[startidx], input[endidx]);
   2922  x1[32] = _mm256_sub_epi32(input[startidx], input[endidx]);
   2923 
   2924  // stage 2
   2925  __m256i x2[64];
   2926  fdct64_stage2_avx2(x1, x2, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
   2927  // stage 3
   2928  fdct64_stage3_avx2(x2, x1, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
   2929  // stage 4
   2930  fdct64_stage4_avx2(x1, x2, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
   2931                     &cospi_m48, &__rounding, cos_bit);
   2932  // stage 5
   2933  fdct64_stage5_avx2(x2, x1, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
   2934                     &cospi_m48, &__rounding, cos_bit);
   2935  // stage 6
   2936  fdct64_stage6_avx2(x1, x2, &cospi_p16, &cospi_p32, &cospi_m16, &cospi_p48,
   2937                     &cospi_m48, &cospi_m08, &cospi_p56, &cospi_m56, &cospi_m40,
   2938                     &cospi_p24, &cospi_m24, &__rounding, cos_bit);
   2939  // stage 7
   2940  fdct64_stage7_avx2(x2, x1, &cospi_p08, &cospi_p56, &cospi_p40, &cospi_p24,
   2941                     &cospi_m08, &cospi_m56, &cospi_m40, &cospi_m24,
   2942                     &__rounding, cos_bit);
   2943  // stage 8
   2944  fdct64_stage8_avx2(x1, x2, cospi, &__rounding, cos_bit);
   2945  // stage 9
   2946  fdct64_stage9_avx2(x2, x1, cospi, &__rounding, cos_bit);
   2947  // stage 10
   2948  fdct64_stage10_avx2(x1, x2, cospi, &__rounding, cos_bit);
   2949 
   2950  startidx = 0 * outstride;
   2951  endidx = 63 * outstride;
   2952 
   2953  // stage 11
   2954  output[startidx] = x2[0];
   2955  output[endidx] = x2[63];
   2956  startidx += outstride;
   2957  endidx -= outstride;
   2958  output[startidx] = x2[32];
   2959  output[endidx] = x2[31];
   2960  startidx += outstride;
   2961  endidx -= outstride;
   2962  output[startidx] = x2[16];
   2963  output[endidx] = x2[47];
   2964  startidx += outstride;
   2965  endidx -= outstride;
   2966  output[startidx] = x2[48];
   2967  output[endidx] = x2[15];
   2968  startidx += outstride;
   2969  endidx -= outstride;
   2970  output[startidx] = x2[8];
   2971  output[endidx] = x2[55];
   2972  startidx += outstride;
   2973  endidx -= outstride;
   2974  output[startidx] = x2[40];
   2975  output[endidx] = x2[23];
   2976  startidx += outstride;
   2977  endidx -= outstride;
   2978  output[startidx] = x2[24];
   2979  output[endidx] = x2[39];
   2980  startidx += outstride;
   2981  endidx -= outstride;
   2982  output[startidx] = x2[56];
   2983  output[endidx] = x2[7];
   2984  startidx += outstride;
   2985  endidx -= outstride;
   2986  output[startidx] = x2[4];
   2987  output[endidx] = x2[59];
   2988  startidx += outstride;
   2989  endidx -= outstride;
   2990  output[startidx] = x2[36];
   2991  output[endidx] = x2[27];
   2992  startidx += outstride;
   2993  endidx -= outstride;
   2994  output[startidx] = x2[20];
   2995  output[endidx] = x2[43];
   2996  startidx += outstride;
   2997  endidx -= outstride;
   2998  output[startidx] = x2[52];
   2999  output[endidx] = x2[11];
   3000  startidx += outstride;
   3001  endidx -= outstride;
   3002  output[startidx] = x2[12];
   3003  output[endidx] = x2[51];
   3004  startidx += outstride;
   3005  endidx -= outstride;
   3006  output[startidx] = x2[44];
   3007  output[endidx] = x2[19];
   3008  startidx += outstride;
   3009  endidx -= outstride;
   3010  output[startidx] = x2[28];
   3011  output[endidx] = x2[35];
   3012  startidx += outstride;
   3013  endidx -= outstride;
   3014  output[startidx] = x2[60];
   3015  output[endidx] = x2[3];
   3016  startidx += outstride;
   3017  endidx -= outstride;
   3018  output[startidx] = x2[2];
   3019  output[endidx] = x2[61];
   3020  startidx += outstride;
   3021  endidx -= outstride;
   3022  output[startidx] = x2[34];
   3023  output[endidx] = x2[29];
   3024  startidx += outstride;
   3025  endidx -= outstride;
   3026  output[startidx] = x2[18];
   3027  output[endidx] = x2[45];
   3028  startidx += outstride;
   3029  endidx -= outstride;
   3030  output[startidx] = x2[50];
   3031  output[endidx] = x2[13];
   3032  startidx += outstride;
   3033  endidx -= outstride;
   3034  output[startidx] = x2[10];
   3035  output[endidx] = x2[53];
   3036  startidx += outstride;
   3037  endidx -= outstride;
   3038  output[startidx] = x2[42];
   3039  output[endidx] = x2[21];
   3040  startidx += outstride;
   3041  endidx -= outstride;
   3042  output[startidx] = x2[26];
   3043  output[endidx] = x2[37];
   3044  startidx += outstride;
   3045  endidx -= outstride;
   3046  output[startidx] = x2[58];
   3047  output[endidx] = x2[5];
   3048  startidx += outstride;
   3049  endidx -= outstride;
   3050  output[startidx] = x2[6];
   3051  output[endidx] = x2[57];
   3052  startidx += outstride;
   3053  endidx -= outstride;
   3054  output[startidx] = x2[38];
   3055  output[endidx] = x2[25];
   3056  startidx += outstride;
   3057  endidx -= outstride;
   3058  output[startidx] = x2[22];
   3059  output[endidx] = x2[41];
   3060  startidx += outstride;
   3061  endidx -= outstride;
   3062  output[startidx] = x2[54];
   3063  output[endidx] = x2[9];
   3064  startidx += outstride;
   3065  endidx -= outstride;
   3066  output[startidx] = x2[14];
   3067  output[endidx] = x2[49];
   3068  startidx += outstride;
   3069  endidx -= outstride;
   3070  output[startidx] = x2[46];
   3071  output[endidx] = x2[17];
   3072  startidx += outstride;
   3073  endidx -= outstride;
   3074  output[startidx] = x2[30];
   3075  output[endidx] = x2[33];
   3076  startidx += outstride;
   3077  endidx -= outstride;
   3078  output[startidx] = x2[62];
   3079  output[endidx] = x2[1];
   3080 }
   3081 void av1_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
   3082                               int stride, TX_TYPE tx_type, int bd) {
   3083  (void)bd;
   3084  (void)tx_type;
   3085  assert(tx_type == DCT_DCT);
   3086  const TX_SIZE tx_size = TX_64X64;
   3087  __m256i buf0[512], buf1[512];
   3088  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   3089  const int txw_idx = get_txw_idx(tx_size);
   3090  const int txh_idx = get_txh_idx(tx_size);
   3091  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   3092  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   3093  const int width = tx_size_wide[tx_size];
   3094  const int height = tx_size_high[tx_size];
   3095  const transform_1d_avx2 col_txfm = fdct64_avx2;
   3096  const transform_1d_avx2 row_txfm = fdct64_avx2;
   3097  const int width_div16 = (width >> 4);
   3098  const int width_div8 = (width >> 3);
   3099  int r, c;
   3100  for (int i = 0; i < width_div16; i++) {
   3101    load_buffer_16xn_avx2(input + (i << 4), &buf0[i << 1], stride, height,
   3102                          width_div8, 0, 0);
   3103    round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[0], width_div8);
   3104    round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8);
   3105    col_txfm(&buf0[i << 1], &buf0[i << 1], cos_bit_col, width_div8, width_div8);
   3106    col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8,
   3107             width_div8);
   3108    round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[1], width_div8);
   3109    round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8);
   3110  }
   3111 
   3112  for (r = 0; r < height; r += 8) {
   3113    for (c = 0; c < width_div8; c++) {
   3114      fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
   3115                                  &buf1[c * 8 * width_div8 + (r >> 3)],
   3116                                  width_div8, width_div8);
   3117    }
   3118  }
   3119 
   3120  for (int i = 0; i < 2; i++) {
   3121    row_txfm(&buf1[i << 1], &buf0[i << 1], cos_bit_row, width_div8,
   3122             width_div16);
   3123    row_txfm(&buf1[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_row, width_div8,
   3124             width_div16);
   3125    round_shift_32_8xn_avx2(&buf0[i << 1], (height >> 1), shift[2],
   3126                            width_div16);
   3127    round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], (height >> 1), shift[2],
   3128                            width_div16);
   3129  }
   3130 
   3131  store_buffer_avx2(buf0, output, 8, 128);
   3132 }