tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

av1_fwd_txfm2d_avx2.c (133780B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include "config/av1_rtcd.h"
     13 
     14 #include "av1/common/enums.h"
     15 #include "av1/common/av1_txfm.h"
     16 #include "av1/encoder/x86/av1_fwd_txfm_avx2.h"
     17 #include "av1/common/x86/av1_txfm_sse2.h"
     18 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
     19 #include "av1/encoder/x86/av1_txfm1d_sse4.h"
     20 #include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
     21 #include "aom_dsp/x86/txfm_common_avx2.h"
     22 
     23 static inline void fdct16x16_new_avx2(const __m256i *input, __m256i *output,
     24                                      int8_t cos_bit) {
         // 16-point forward-DCT stage network (per the function name) over sixteen
         // __m256i vectors. All of input[0..15] is consumed in stage 1, the work
         // happens in place in the local array x1[], and output[0..15] is only
         // written in the last stage.
         // cos_bit selects the table returned by cospi_arr() and is forwarded to
         // every btf_16_w16_avx2() call.
         // NOTE(review): cospi_arr, pair_set_w16_epi16 and the btf_16_* helpers are
         // defined outside this view; their exact arithmetic (saturation, rounding)
         // should be confirmed there.
     25  const int32_t *cospi = cospi_arr(cos_bit);
         // 1 << (cos_bit - 1) in every 32-bit lane; handed to btf_16_w16_avx2()
         // together with cos_bit - presumably the rounding offset for a right shift
         // by cos_bit (TODO confirm). Assumes cos_bit >= 1 so the shift is defined.
     26  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
     27 
         // Constant naming: cospi_<s><a>_<s><b> is built from the pair
         // (+/-cospi[a], +/-cospi[b]) where s = 'p' keeps the sign and 'm' negates.
     28  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
     29  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
     30  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
     31  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
     32  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
     33  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
     34  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
     35  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
     36  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
     37  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
     38  __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
     39  __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
     40  __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
     41  __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
     42  __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
     43  __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
     44  __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
     45  __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
     46 
     47  // stage 1: combine input[i] with input[15 - i]; results land in x1[i], x1[15 - i]
     48  __m256i x1[16];
     49  btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]);
     50  btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]);
     51  btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]);
     52  btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]);
     53  btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]);
     54  btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]);
     55  btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]);
     56  btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]);
     57 
     58  // stage 2: in-place add/sub on pairs (x1[i], x1[7 - i]); cospi[32]-weighted
         // butterflies on (x1[10], x1[13]) and (x1[11], x1[12]).
         // x1[8], x1[9], x1[14], x1[15] are not touched in this stage.
     59  btf_16_adds_subs_avx2(&x1[0], &x1[7]);
     60  btf_16_adds_subs_avx2(&x1[1], &x1[6]);
     61  btf_16_adds_subs_avx2(&x1[2], &x1[5]);
     62  btf_16_adds_subs_avx2(&x1[3], &x1[4]);
     63  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
     64  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
     65 
     66  // stage 3: add/sub on x1[0..3] and x1[8..15]; weighted butterfly on (x1[5], x1[6])
         // Note the operand order for the upper pairs: (15, 12) and (14, 13) pass the
         // higher index first.
     67  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
     68  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
     69  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
     70  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
     71  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
     72  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
     73  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
     74 
     75  // stage 4: weighted butterflies finish x1[0..3]; add/sub on x1[4..7];
         // cospi[16]/cospi[48]-weighted butterflies on (x1[9], x1[14]), (x1[10], x1[13])
     76  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
     77  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
     78  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
     79  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
     80  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
     81  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
     82 
     83  // stage 5: weighted butterflies finish x1[4..7]; add/sub on adjacent pairs of x1[8..15]
     84  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
     85  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
     86  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
     87  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
     88  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
     89  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
     90 
     91  // stage 6: weighted butterflies on (x1[8 + k], x1[15 - k]) for k = 0..3
     92  btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
     93  btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
     94  btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
     95  btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
     96 
     97  // stage 7: reorder into output; output[i] takes x1[r] where r is i with its
         // four index bits reversed (e.g. 1 -> 8, 3 -> 12, 5 -> 10).
     98  output[0] = x1[0];
     99  output[1] = x1[8];
    100  output[2] = x1[4];
    101  output[3] = x1[12];
    102  output[4] = x1[2];
    103  output[5] = x1[10];
    104  output[6] = x1[6];
    105  output[7] = x1[14];
    106  output[8] = x1[1];
    107  output[9] = x1[9];
    108  output[10] = x1[5];
    109  output[11] = x1[13];
    110  output[12] = x1[3];
    111  output[13] = x1[11];
    112  output[14] = x1[7];
    113  output[15] = x1[15];
    114 }
    115 
    116 static inline void fdct16x32_avx2(const __m256i *input, __m256i *output,
    117                                  int8_t cos_bit) {
         // 32-point forward-DCT stage network (per the function name) over thirty-two
         // __m256i vectors. All of input[0..31] is consumed in stage 1, the work
         // happens in place in the local array x1[], and output[0..31] is only
         // written in the last stage.
         // For x1[0..15], stages 3-7 below issue the same call sequence as stages
         // 2-6 of fdct16x16_new_avx2(); x1[16..31] carries the additional 32-point
         // terms.
         // NOTE(review): cospi_arr, pair_set_w16_epi16 and the btf_16_* helpers are
         // defined outside this view; confirm their exact arithmetic there.
    118  const int32_t *cospi = cospi_arr(cos_bit);
         // 1 << (cos_bit - 1) in every 32-bit lane; handed to btf_16_w16_avx2()
         // together with cos_bit - presumably the rounding offset for a right shift
         // by cos_bit (TODO confirm). Assumes cos_bit >= 1 so the shift is defined.
    119  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
    120 
         // Constant naming: cospi_<s><a>_<s><b> is built from the pair
         // (+/-cospi[a], +/-cospi[b]) where s = 'p' keeps the sign and 'm' negates.
    121  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
    122  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
    123  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
    124  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
    125  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
    126  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
    127  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
    128  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
    129  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
    130  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
    131  __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
    132  __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
    133  __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
    134  __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
    135  __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
    136  __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
    137  __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
    138  __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
    139  __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
    140  __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
    141  __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
    142  __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
    143  __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
    144  __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
    145  __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
    146  __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
    147  __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
    148  __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
    149  __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
    150  __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
    151  __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
    152  __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
    153  __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
    154  __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
    155  __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
    156  __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
    157 
    158  // stage 1: combine input[i] with input[31 - i]; results land in x1[i], x1[31 - i]
    159  __m256i x1[32];
    160  btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]);
    161  btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]);
    162  btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]);
    163  btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]);
    164  btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]);
    165  btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]);
    166  btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]);
    167  btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]);
    168  btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]);
    169  btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]);
    170  btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]);
    171  btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]);
    172  btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]);
    173  btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]);
    174  btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]);
    175  btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]);
    176 
    177  // stage 2: in-place add/sub on pairs (x1[i], x1[15 - i]); cospi[32]-weighted
         // butterflies on (x1[20 + k], x1[27 - k]) for k = 0..3.
         // x1[16..19] and x1[28..31] are not touched in this stage.
    178  btf_16_adds_subs_avx2(&x1[0], &x1[15]);
    179  btf_16_adds_subs_avx2(&x1[1], &x1[14]);
    180  btf_16_adds_subs_avx2(&x1[2], &x1[13]);
    181  btf_16_adds_subs_avx2(&x1[3], &x1[12]);
    182  btf_16_adds_subs_avx2(&x1[4], &x1[11]);
    183  btf_16_adds_subs_avx2(&x1[5], &x1[10]);
    184  btf_16_adds_subs_avx2(&x1[6], &x1[9]);
    185  btf_16_adds_subs_avx2(&x1[7], &x1[8]);
    186  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
    187  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
    188  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
    189  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
    190 
    191  // stage 3: add/sub on x1[0..7] and x1[16..31]; cospi[32]-weighted butterflies
         // on (x1[10], x1[13]) and (x1[11], x1[12]). For the x1[24..31] pairs the
         // higher index is passed first.
    192  btf_16_adds_subs_avx2(&x1[0], &x1[7]);
    193  btf_16_adds_subs_avx2(&x1[1], &x1[6]);
    194  btf_16_adds_subs_avx2(&x1[2], &x1[5]);
    195  btf_16_adds_subs_avx2(&x1[3], &x1[4]);
    196  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
    197  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
    198  btf_16_adds_subs_avx2(&x1[16], &x1[23]);
    199  btf_16_adds_subs_avx2(&x1[17], &x1[22]);
    200  btf_16_adds_subs_avx2(&x1[18], &x1[21]);
    201  btf_16_adds_subs_avx2(&x1[19], &x1[20]);
    202  btf_16_adds_subs_avx2(&x1[31], &x1[24]);
    203  btf_16_adds_subs_avx2(&x1[30], &x1[25]);
    204  btf_16_adds_subs_avx2(&x1[29], &x1[26]);
    205  btf_16_adds_subs_avx2(&x1[28], &x1[27]);
    206 
    207  // stage 4: add/sub on x1[0..3] and x1[8..15]; weighted butterflies on
         // (x1[5], x1[6]) and on (x1[18 + k], x1[29 - k]) for k = 0..3
    208  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
    209  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
    210  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
    211  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
    212  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
    213  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
    214  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
    215  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
    216  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
    217  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
    218  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
    219 
    220  // stage 5: weighted butterflies finish x1[0..3]; add/sub on x1[4..7] and
         // x1[16..31]; weighted butterflies on (x1[9], x1[14]), (x1[10], x1[13])
    221  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
    222  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
    223  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
    224  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
    225  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
    226  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
    227  btf_16_adds_subs_avx2(&x1[16], &x1[19]);
    228  btf_16_adds_subs_avx2(&x1[17], &x1[18]);
    229  btf_16_adds_subs_avx2(&x1[23], &x1[20]);
    230  btf_16_adds_subs_avx2(&x1[22], &x1[21]);
    231  btf_16_adds_subs_avx2(&x1[24], &x1[27]);
    232  btf_16_adds_subs_avx2(&x1[25], &x1[26]);
    233  btf_16_adds_subs_avx2(&x1[31], &x1[28]);
    234  btf_16_adds_subs_avx2(&x1[30], &x1[29]);
    235 
    236  // stage 6: weighted butterflies finish x1[4..7]; add/sub on adjacent pairs of
         // x1[8..15]; weighted butterflies on (17, 30), (18, 29), (21, 26), (22, 25)
    237  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
    238  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
    239  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
    240  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
    241  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
    242  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
    243  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
    244  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
    245  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
    246  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
    247 
    248  // stage 7: weighted butterflies on (x1[8 + k], x1[15 - k]) for k = 0..3;
         // add/sub on adjacent pairs of x1[16..31]
    249  btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
    250  btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
    251  btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
    252  btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
    253  btf_16_adds_subs_avx2(&x1[16], &x1[17]);
    254  btf_16_adds_subs_avx2(&x1[19], &x1[18]);
    255  btf_16_adds_subs_avx2(&x1[20], &x1[21]);
    256  btf_16_adds_subs_avx2(&x1[23], &x1[22]);
    257  btf_16_adds_subs_avx2(&x1[24], &x1[25]);
    258  btf_16_adds_subs_avx2(&x1[27], &x1[26]);
    259  btf_16_adds_subs_avx2(&x1[28], &x1[29]);
    260  btf_16_adds_subs_avx2(&x1[31], &x1[30]);
    261 
    262  // stage 8: weighted butterflies on (x1[16 + k], x1[31 - k]) for k = 0..7
    263  btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
    264  btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
    265  btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
    266  btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
    267  btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
    268  btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
    269  btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
    270  btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
    271 
    272  // stage 9: reorder into output; output[i] takes x1[r] where r is i with its
         // five index bits reversed (e.g. 1 -> 16, 3 -> 24, 5 -> 20).
    273  output[0] = x1[0];
    274  output[1] = x1[16];
    275  output[2] = x1[8];
    276  output[3] = x1[24];
    277  output[4] = x1[4];
    278  output[5] = x1[20];
    279  output[6] = x1[12];
    280  output[7] = x1[28];
    281  output[8] = x1[2];
    282  output[9] = x1[18];
    283  output[10] = x1[10];
    284  output[11] = x1[26];
    285  output[12] = x1[6];
    286  output[13] = x1[22];
    287  output[14] = x1[14];
    288  output[15] = x1[30];
    289  output[16] = x1[1];
    290  output[17] = x1[17];
    291  output[18] = x1[9];
    292  output[19] = x1[25];
    293  output[20] = x1[5];
    294  output[21] = x1[21];
    295  output[22] = x1[13];
    296  output[23] = x1[29];
    297  output[24] = x1[3];
    298  output[25] = x1[19];
    299  output[26] = x1[11];
    300  output[27] = x1[27];
    301  output[28] = x1[7];
    302  output[29] = x1[23];
    303  output[30] = x1[15];
    304  output[31] = x1[31];
    305 }
    306 
    307 static inline void fdct16x64_new_avx2(const __m256i *input, __m256i *output,
    308                                      int8_t cos_bit) {
    309  const int32_t *cospi = cospi_arr(cos_bit);
    310  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
    311 
    312  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
    313  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
    314  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
    315  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
    316  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
    317  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
    318  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
    319  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
    320  __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
    321  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
    322  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
    323  __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
    324  __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
    325  __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
    326  __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
    327  __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
    328  __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
    329  __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
    330  __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
    331  __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
    332  __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
    333  __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
    334  __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
    335  __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
    336  __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
    337  __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
    338  __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
    339  __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
    340  __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
    341  __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
    342  __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
    343  __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
    344  __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
    345  __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
    346  __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
    347  __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
    348  __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
    349  __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
    350  __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
    351  __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
    352  __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]);
    353  __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]);
    354  __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]);
    355  __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]);
    356  __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]);
    357  __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]);
    358  __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]);
    359  __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]);
    360  __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]);
    361  __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]);
    362  __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]);
    363  __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]);
    364  __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]);
    365  __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]);
    366  __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]);
    367  __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]);
    368  __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]);
    369  __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]);
    370  __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]);
    371  __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]);
    372  __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]);
    373  __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]);
    374  __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]);
    375  __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]);
    376  __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]);
    377  __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]);
    378  __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]);
    379  __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]);
    380  __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]);
    381  __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]);
    382  __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]);
    383  __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]);
    384 
    385  // stage 1
    386  __m256i x1[64];
    387  btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]);
    388  btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]);
    389  btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]);
    390  btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]);
    391  btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]);
    392  btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]);
    393  btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]);
    394  btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]);
    395  btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]);
    396  btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]);
    397  btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]);
    398  btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]);
    399  btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]);
    400  btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]);
    401  btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]);
    402  btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]);
    403  btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]);
    404  btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]);
    405  btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]);
    406  btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]);
    407  btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]);
    408  btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]);
    409  btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]);
    410  btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]);
    411  btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]);
    412  btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]);
    413  btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]);
    414  btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]);
    415  btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]);
    416  btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]);
    417  btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]);
    418  btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]);
    419 
    420  // stage 2
    421  btf_16_adds_subs_avx2(&x1[0], &x1[31]);
    422  btf_16_adds_subs_avx2(&x1[1], &x1[30]);
    423  btf_16_adds_subs_avx2(&x1[2], &x1[29]);
    424  btf_16_adds_subs_avx2(&x1[3], &x1[28]);
    425  btf_16_adds_subs_avx2(&x1[4], &x1[27]);
    426  btf_16_adds_subs_avx2(&x1[5], &x1[26]);
    427  btf_16_adds_subs_avx2(&x1[6], &x1[25]);
    428  btf_16_adds_subs_avx2(&x1[7], &x1[24]);
    429  btf_16_adds_subs_avx2(&x1[8], &x1[23]);
    430  btf_16_adds_subs_avx2(&x1[9], &x1[22]);
    431  btf_16_adds_subs_avx2(&x1[10], &x1[21]);
    432  btf_16_adds_subs_avx2(&x1[11], &x1[20]);
    433  btf_16_adds_subs_avx2(&x1[12], &x1[19]);
    434  btf_16_adds_subs_avx2(&x1[13], &x1[18]);
    435  btf_16_adds_subs_avx2(&x1[14], &x1[17]);
    436  btf_16_adds_subs_avx2(&x1[15], &x1[16]);
    437  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit);
    438  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit);
    439  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit);
    440  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit);
    441  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit);
    442  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit);
    443  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit);
    444  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit);
    445 
    446  // stage 3
    447  btf_16_adds_subs_avx2(&x1[0], &x1[15]);
    448  btf_16_adds_subs_avx2(&x1[1], &x1[14]);
    449  btf_16_adds_subs_avx2(&x1[2], &x1[13]);
    450  btf_16_adds_subs_avx2(&x1[3], &x1[12]);
    451  btf_16_adds_subs_avx2(&x1[4], &x1[11]);
    452  btf_16_adds_subs_avx2(&x1[5], &x1[10]);
    453  btf_16_adds_subs_avx2(&x1[6], &x1[9]);
    454  btf_16_adds_subs_avx2(&x1[7], &x1[8]);
    455  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
    456  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
    457  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
    458  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
    459  btf_16_adds_subs_avx2(&x1[32], &x1[47]);
    460  btf_16_adds_subs_avx2(&x1[33], &x1[46]);
    461  btf_16_adds_subs_avx2(&x1[34], &x1[45]);
    462  btf_16_adds_subs_avx2(&x1[35], &x1[44]);
    463  btf_16_adds_subs_avx2(&x1[36], &x1[43]);
    464  btf_16_adds_subs_avx2(&x1[37], &x1[42]);
    465  btf_16_adds_subs_avx2(&x1[38], &x1[41]);
    466  btf_16_adds_subs_avx2(&x1[39], &x1[40]);
    467  btf_16_adds_subs_avx2(&x1[63], &x1[48]);
    468  btf_16_adds_subs_avx2(&x1[62], &x1[49]);
    469  btf_16_adds_subs_avx2(&x1[61], &x1[50]);
    470  btf_16_adds_subs_avx2(&x1[60], &x1[51]);
    471  btf_16_adds_subs_avx2(&x1[59], &x1[52]);
    472  btf_16_adds_subs_avx2(&x1[58], &x1[53]);
    473  btf_16_adds_subs_avx2(&x1[57], &x1[54]);
    474  btf_16_adds_subs_avx2(&x1[56], &x1[55]);
    475 
    476  // stage 4
    477  btf_16_adds_subs_avx2(&x1[0], &x1[7]);
    478  btf_16_adds_subs_avx2(&x1[1], &x1[6]);
    479  btf_16_adds_subs_avx2(&x1[2], &x1[5]);
    480  btf_16_adds_subs_avx2(&x1[3], &x1[4]);
    481  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
    482  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
    483  btf_16_adds_subs_avx2(&x1[16], &x1[23]);
    484  btf_16_adds_subs_avx2(&x1[17], &x1[22]);
    485  btf_16_adds_subs_avx2(&x1[18], &x1[21]);
    486  btf_16_adds_subs_avx2(&x1[19], &x1[20]);
    487  btf_16_adds_subs_avx2(&x1[31], &x1[24]);
    488  btf_16_adds_subs_avx2(&x1[30], &x1[25]);
    489  btf_16_adds_subs_avx2(&x1[29], &x1[26]);
    490  btf_16_adds_subs_avx2(&x1[28], &x1[27]);
    491  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit);
    492  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit);
    493  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit);
    494  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit);
    495  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit);
    496  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit);
    497  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit);
    498  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit);
    499 
    500  // stage 5
    501  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
    502  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
    503  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
    504  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
    505  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
    506  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
    507  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
    508  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
    509  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
    510  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
    511  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
    512  btf_16_adds_subs_avx2(&x1[32], &x1[39]);
    513  btf_16_adds_subs_avx2(&x1[33], &x1[38]);
    514  btf_16_adds_subs_avx2(&x1[34], &x1[37]);
    515  btf_16_adds_subs_avx2(&x1[35], &x1[36]);
    516  btf_16_adds_subs_avx2(&x1[47], &x1[40]);
    517  btf_16_adds_subs_avx2(&x1[46], &x1[41]);
    518  btf_16_adds_subs_avx2(&x1[45], &x1[42]);
    519  btf_16_adds_subs_avx2(&x1[44], &x1[43]);
    520  btf_16_adds_subs_avx2(&x1[48], &x1[55]);
    521  btf_16_adds_subs_avx2(&x1[49], &x1[54]);
    522  btf_16_adds_subs_avx2(&x1[50], &x1[53]);
    523  btf_16_adds_subs_avx2(&x1[51], &x1[52]);
    524  btf_16_adds_subs_avx2(&x1[63], &x1[56]);
    525  btf_16_adds_subs_avx2(&x1[62], &x1[57]);
    526  btf_16_adds_subs_avx2(&x1[61], &x1[58]);
    527  btf_16_adds_subs_avx2(&x1[60], &x1[59]);
    528 
    529  // stage 6
    530  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
    531  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
    532  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
    533  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
    534  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
    535  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
    536  btf_16_adds_subs_avx2(&x1[16], &x1[19]);
    537  btf_16_adds_subs_avx2(&x1[17], &x1[18]);
    538  btf_16_adds_subs_avx2(&x1[23], &x1[20]);
    539  btf_16_adds_subs_avx2(&x1[22], &x1[21]);
    540  btf_16_adds_subs_avx2(&x1[24], &x1[27]);
    541  btf_16_adds_subs_avx2(&x1[25], &x1[26]);
    542  btf_16_adds_subs_avx2(&x1[31], &x1[28]);
    543  btf_16_adds_subs_avx2(&x1[30], &x1[29]);
    544  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit);
    545  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit);
    546  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit);
    547  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit);
    548  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit);
    549  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit);
    550  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit);
    551  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit);
    552 
    553  // stage 7
    554  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
    555  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
    556  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
    557  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
    558  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
    559  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
    560  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
    561  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
    562  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
    563  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
    564  btf_16_adds_subs_avx2(&x1[32], &x1[35]);
    565  btf_16_adds_subs_avx2(&x1[33], &x1[34]);
    566  btf_16_adds_subs_avx2(&x1[39], &x1[36]);
    567  btf_16_adds_subs_avx2(&x1[38], &x1[37]);
    568  btf_16_adds_subs_avx2(&x1[40], &x1[43]);
    569  btf_16_adds_subs_avx2(&x1[41], &x1[42]);
    570  btf_16_adds_subs_avx2(&x1[47], &x1[44]);
    571  btf_16_adds_subs_avx2(&x1[46], &x1[45]);
    572  btf_16_adds_subs_avx2(&x1[48], &x1[51]);
    573  btf_16_adds_subs_avx2(&x1[49], &x1[50]);
    574  btf_16_adds_subs_avx2(&x1[55], &x1[52]);
    575  btf_16_adds_subs_avx2(&x1[54], &x1[53]);
    576  btf_16_adds_subs_avx2(&x1[56], &x1[59]);
    577  btf_16_adds_subs_avx2(&x1[57], &x1[58]);
    578  btf_16_adds_subs_avx2(&x1[63], &x1[60]);
    579  btf_16_adds_subs_avx2(&x1[62], &x1[61]);
    580 
    581  // stage 8
    582  btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
    583  btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
    584  btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
    585  btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
    586  btf_16_adds_subs_avx2(&x1[16], &x1[17]);
    587  btf_16_adds_subs_avx2(&x1[19], &x1[18]);
    588  btf_16_adds_subs_avx2(&x1[20], &x1[21]);
    589  btf_16_adds_subs_avx2(&x1[23], &x1[22]);
    590  btf_16_adds_subs_avx2(&x1[24], &x1[25]);
    591  btf_16_adds_subs_avx2(&x1[27], &x1[26]);
    592  btf_16_adds_subs_avx2(&x1[28], &x1[29]);
    593  btf_16_adds_subs_avx2(&x1[31], &x1[30]);
    594  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit);
    595  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit);
    596  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit);
    597  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit);
    598  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit);
    599  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit);
    600  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit);
    601  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit);
    602 
    603  // stage 9
    604  btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
    605  btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
    606  btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
    607  btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
    608  btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
    609  btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
    610  btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
    611  btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
    612  btf_16_adds_subs_avx2(&x1[32], &x1[33]);
    613  btf_16_adds_subs_avx2(&x1[35], &x1[34]);
    614  btf_16_adds_subs_avx2(&x1[36], &x1[37]);
    615  btf_16_adds_subs_avx2(&x1[39], &x1[38]);
    616  btf_16_adds_subs_avx2(&x1[40], &x1[41]);
    617  btf_16_adds_subs_avx2(&x1[43], &x1[42]);
    618  btf_16_adds_subs_avx2(&x1[44], &x1[45]);
    619  btf_16_adds_subs_avx2(&x1[47], &x1[46]);
    620  btf_16_adds_subs_avx2(&x1[48], &x1[49]);
    621  btf_16_adds_subs_avx2(&x1[51], &x1[50]);
    622  btf_16_adds_subs_avx2(&x1[52], &x1[53]);
    623  btf_16_adds_subs_avx2(&x1[55], &x1[54]);
    624  btf_16_adds_subs_avx2(&x1[56], &x1[57]);
    625  btf_16_adds_subs_avx2(&x1[59], &x1[58]);
    626  btf_16_adds_subs_avx2(&x1[60], &x1[61]);
    627  btf_16_adds_subs_avx2(&x1[63], &x1[62]);
    628 
    629  // stage 10
    630  btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit);
    631  btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit);
    632  btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit);
    633  btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit);
    634  btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit);
    635  btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit);
    636  btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit);
    637  btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit);
    638  btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit);
    639  btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit);
    640  btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit);
    641  btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit);
    642  btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit);
    643  btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit);
    644  btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit);
    645  btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit);
    646 
    647  // stage 11
    648  output[0] = x1[0];
    649  output[1] = x1[32];
    650  output[2] = x1[16];
    651  output[3] = x1[48];
    652  output[4] = x1[8];
    653  output[5] = x1[40];
    654  output[6] = x1[24];
    655  output[7] = x1[56];
    656  output[8] = x1[4];
    657  output[9] = x1[36];
    658  output[10] = x1[20];
    659  output[11] = x1[52];
    660  output[12] = x1[12];
    661  output[13] = x1[44];
    662  output[14] = x1[28];
    663  output[15] = x1[60];
    664  output[16] = x1[2];
    665  output[17] = x1[34];
    666  output[18] = x1[18];
    667  output[19] = x1[50];
    668  output[20] = x1[10];
    669  output[21] = x1[42];
    670  output[22] = x1[26];
    671  output[23] = x1[58];
    672  output[24] = x1[6];
    673  output[25] = x1[38];
    674  output[26] = x1[22];
    675  output[27] = x1[54];
    676  output[28] = x1[14];
    677  output[29] = x1[46];
    678  output[30] = x1[30];
    679  output[31] = x1[62];
    680  output[32] = x1[1];
    681  output[33] = x1[33];
    682  output[34] = x1[17];
    683  output[35] = x1[49];
    684  output[36] = x1[9];
    685  output[37] = x1[41];
    686  output[38] = x1[25];
    687  output[39] = x1[57];
    688  output[40] = x1[5];
    689  output[41] = x1[37];
    690  output[42] = x1[21];
    691  output[43] = x1[53];
    692  output[44] = x1[13];
    693  output[45] = x1[45];
    694  output[46] = x1[29];
    695  output[47] = x1[61];
    696  output[48] = x1[3];
    697  output[49] = x1[35];
    698  output[50] = x1[19];
    699  output[51] = x1[51];
    700  output[52] = x1[11];
    701  output[53] = x1[43];
    702  output[54] = x1[27];
    703  output[55] = x1[59];
    704  output[56] = x1[7];
    705  output[57] = x1[39];
    706  output[58] = x1[23];
    707  output[59] = x1[55];
    708  output[60] = x1[15];
    709  output[61] = x1[47];
    710  output[62] = x1[31];
    711  output[63] = x1[63];
    712 }
    713 
    714 static inline void fdct32_avx2(const __m256i *input, __m256i *output,
    715                               int8_t cos_bit) {
    716  __m256i x1[32];
    717  const int32_t *cospi = cospi_arr(cos_bit);
    718  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
    719  // stage 0
    720  // stage 1
    721  btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]);
    722  btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]);
    723  btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]);
    724  btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]);
    725  btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]);
    726  btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]);
    727  btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]);
    728  btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]);
    729  btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]);
    730  btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]);
    731  btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]);
    732  btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]);
    733  btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]);
    734  btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]);
    735  btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]);
    736  btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]);
    737 
    738  // stage 2
    739  btf_32_add_sub_avx2(&x1[0], &x1[15]);
    740  btf_32_add_sub_avx2(&x1[1], &x1[14]);
    741  btf_32_add_sub_avx2(&x1[2], &x1[13]);
    742  btf_32_add_sub_avx2(&x1[3], &x1[12]);
    743  btf_32_add_sub_avx2(&x1[4], &x1[11]);
    744  btf_32_add_sub_avx2(&x1[5], &x1[10]);
    745  btf_32_add_sub_avx2(&x1[6], &x1[9]);
    746  btf_32_add_sub_avx2(&x1[7], &x1[8]);
    747  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit);
    748  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit);
    749  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit);
    750  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit);
    751 
    752  // stage 3
    753  btf_32_add_sub_avx2(&x1[0], &x1[7]);
    754  btf_32_add_sub_avx2(&x1[1], &x1[6]);
    755  btf_32_add_sub_avx2(&x1[2], &x1[5]);
    756  btf_32_add_sub_avx2(&x1[3], &x1[4]);
    757  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit);
    758  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit);
    759  btf_32_add_sub_avx2(&x1[16], &x1[23]);
    760  btf_32_add_sub_avx2(&x1[17], &x1[22]);
    761  btf_32_add_sub_avx2(&x1[18], &x1[21]);
    762  btf_32_add_sub_avx2(&x1[19], &x1[20]);
    763  btf_32_add_sub_avx2(&x1[31], &x1[24]);
    764  btf_32_add_sub_avx2(&x1[30], &x1[25]);
    765  btf_32_add_sub_avx2(&x1[29], &x1[26]);
    766  btf_32_add_sub_avx2(&x1[28], &x1[27]);
    767 
    768  // stage 4
    769  btf_32_add_sub_avx2(&x1[0], &x1[3]);
    770  btf_32_add_sub_avx2(&x1[1], &x1[2]);
    771  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit);
    772  btf_32_add_sub_avx2(&x1[8], &x1[11]);
    773  btf_32_add_sub_avx2(&x1[9], &x1[10]);
    774  btf_32_add_sub_avx2(&x1[15], &x1[12]);
    775  btf_32_add_sub_avx2(&x1[14], &x1[13]);
    776  btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit);
    777  btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit);
    778  btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit);
    779  btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit);
    780 
    781  // stage 5
    782  btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit);
    783  btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit);
    784  btf_32_add_sub_avx2(&x1[4], &x1[5]);
    785  btf_32_add_sub_avx2(&x1[7], &x1[6]);
    786  btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit);
    787  btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit);
    788  btf_32_add_sub_avx2(&x1[16], &x1[19]);
    789  btf_32_add_sub_avx2(&x1[17], &x1[18]);
    790  btf_32_add_sub_avx2(&x1[23], &x1[20]);
    791  btf_32_add_sub_avx2(&x1[22], &x1[21]);
    792  btf_32_add_sub_avx2(&x1[24], &x1[27]);
    793  btf_32_add_sub_avx2(&x1[25], &x1[26]);
    794  btf_32_add_sub_avx2(&x1[31], &x1[28]);
    795  btf_32_add_sub_avx2(&x1[30], &x1[29]);
    796 
    797  // stage 6
    798  btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit);
    799  btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit);
    800  btf_32_add_sub_avx2(&x1[8], &x1[9]);
    801  btf_32_add_sub_avx2(&x1[11], &x1[10]);
    802  btf_32_add_sub_avx2(&x1[12], &x1[13]);
    803  btf_32_add_sub_avx2(&x1[15], &x1[14]);
    804  btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit);
    805  btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit);
    806  btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit);
    807  btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit);
    808 
    809  // stage 7
    810  btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit);
    811  btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit);
    812  btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit);
    813  btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit);
    814  btf_32_add_sub_avx2(&x1[16], &x1[17]);
    815  btf_32_add_sub_avx2(&x1[19], &x1[18]);
    816  btf_32_add_sub_avx2(&x1[20], &x1[21]);
    817  btf_32_add_sub_avx2(&x1[23], &x1[22]);
    818  btf_32_add_sub_avx2(&x1[24], &x1[25]);
    819  btf_32_add_sub_avx2(&x1[27], &x1[26]);
    820  btf_32_add_sub_avx2(&x1[28], &x1[29]);
    821  btf_32_add_sub_avx2(&x1[31], &x1[30]);
    822 
    823  // stage 8
    824  btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit);
    825  btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit);
    826  btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit);
    827  btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit);
    828  btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit);
    829  btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit);
    830  btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit);
    831  btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit);
    832 
    833  // stage 9
    834  output[0] = x1[0];
    835  output[1] = x1[16];
    836  output[2] = x1[8];
    837  output[3] = x1[24];
    838  output[4] = x1[4];
    839  output[5] = x1[20];
    840  output[6] = x1[12];
    841  output[7] = x1[28];
    842  output[8] = x1[2];
    843  output[9] = x1[18];
    844  output[10] = x1[10];
    845  output[11] = x1[26];
    846  output[12] = x1[6];
    847  output[13] = x1[22];
    848  output[14] = x1[14];
    849  output[15] = x1[30];
    850  output[16] = x1[1];
    851  output[17] = x1[17];
    852  output[18] = x1[9];
    853  output[19] = x1[25];
    854  output[20] = x1[5];
    855  output[21] = x1[21];
    856  output[22] = x1[13];
    857  output[23] = x1[29];
    858  output[24] = x1[3];
    859  output[25] = x1[19];
    860  output[26] = x1[11];
    861  output[27] = x1[27];
    862  output[28] = x1[7];
    863  output[29] = x1[23];
    864  output[30] = x1[15];
    865  output[31] = x1[31];
    866 }
    867 
    868 static inline void fdct64_new_avx2(const __m256i *input, __m256i *output,
    869                                   int8_t cos_bit) {
    870  const int32_t *cospi = cospi_arr(cos_bit);
    871  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
    872 
    873  __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
    874  __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
    875  __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
    876  __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
    877  __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
    878  __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
    879  __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
    880  __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
    881  __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
    882  __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
    883  __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
    884  __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
    885  __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
    886  __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
    887  __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
    888  __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
    889  __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
    890  __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
    891  __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
    892  __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
    893  __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
    894  __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
    895  __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
    896  __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
    897  __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
    898  __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
    899  __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
    900  __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
    901  __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
    902  __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
    903  __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
    904  __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
    905  __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
    906  __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
    907  __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
    908  __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
    909  __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
    910  __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
    911  __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
    912  __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
    913  __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
    914  __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
    915  __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
    916  __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
    917  __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
    918  __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
    919  __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
    920  __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
    921  __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
    922  __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
    923  __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
    924  __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
    925  __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
    926  __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
    927  __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
    928  __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
    929  __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
    930  __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
    931  __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
    932  __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
    933  __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
    934  __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
    935  __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
    936  __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
    937  __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
    938  __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
    939  __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
    940  __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
    941  __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
    942  __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
    943  __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
    944  __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
    945  __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
    946  __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
    947  __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
    948  __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
    949  __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
    950  __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
    951 
    952  // stage 1
    953  __m256i x1[64];
    954  btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]);
    955  btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]);
    956  btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]);
    957  btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]);
    958  btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]);
    959  btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]);
    960  btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]);
    961  btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]);
    962  btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]);
    963  btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]);
    964  btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]);
    965  btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]);
    966  btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]);
    967  btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]);
    968  btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]);
    969  btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]);
    970  btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]);
    971  btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]);
    972  btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]);
    973  btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]);
    974  btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]);
    975  btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]);
    976  btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]);
    977  btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]);
    978  btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]);
    979  btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]);
    980  btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]);
    981  btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]);
    982  btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]);
    983  btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]);
    984  btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]);
    985  btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]);
    986 
    987  // stage 2
    988  btf_32_add_sub_avx2(&x1[0], &x1[31]);
    989  btf_32_add_sub_avx2(&x1[1], &x1[30]);
    990  btf_32_add_sub_avx2(&x1[2], &x1[29]);
    991  btf_32_add_sub_avx2(&x1[3], &x1[28]);
    992  btf_32_add_sub_avx2(&x1[4], &x1[27]);
    993  btf_32_add_sub_avx2(&x1[5], &x1[26]);
    994  btf_32_add_sub_avx2(&x1[6], &x1[25]);
    995  btf_32_add_sub_avx2(&x1[7], &x1[24]);
    996  btf_32_add_sub_avx2(&x1[8], &x1[23]);
    997  btf_32_add_sub_avx2(&x1[9], &x1[22]);
    998  btf_32_add_sub_avx2(&x1[10], &x1[21]);
    999  btf_32_add_sub_avx2(&x1[11], &x1[20]);
   1000  btf_32_add_sub_avx2(&x1[12], &x1[19]);
   1001  btf_32_add_sub_avx2(&x1[13], &x1[18]);
   1002  btf_32_add_sub_avx2(&x1[14], &x1[17]);
   1003  btf_32_add_sub_avx2(&x1[15], &x1[16]);
   1004  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit);
   1005  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit);
   1006  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit);
   1007  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit);
   1008  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit);
   1009  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit);
   1010  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit);
   1011  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit);
   1012 
   1013  // stage 3
   1014  btf_32_add_sub_avx2(&x1[0], &x1[15]);
   1015  btf_32_add_sub_avx2(&x1[1], &x1[14]);
   1016  btf_32_add_sub_avx2(&x1[2], &x1[13]);
   1017  btf_32_add_sub_avx2(&x1[3], &x1[12]);
   1018  btf_32_add_sub_avx2(&x1[4], &x1[11]);
   1019  btf_32_add_sub_avx2(&x1[5], &x1[10]);
   1020  btf_32_add_sub_avx2(&x1[6], &x1[9]);
   1021  btf_32_add_sub_avx2(&x1[7], &x1[8]);
   1022  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit);
   1023  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit);
   1024  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit);
   1025  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit);
   1026  btf_32_add_sub_avx2(&x1[32], &x1[47]);
   1027  btf_32_add_sub_avx2(&x1[33], &x1[46]);
   1028  btf_32_add_sub_avx2(&x1[34], &x1[45]);
   1029  btf_32_add_sub_avx2(&x1[35], &x1[44]);
   1030  btf_32_add_sub_avx2(&x1[36], &x1[43]);
   1031  btf_32_add_sub_avx2(&x1[37], &x1[42]);
   1032  btf_32_add_sub_avx2(&x1[38], &x1[41]);
   1033  btf_32_add_sub_avx2(&x1[39], &x1[40]);
   1034  btf_32_add_sub_avx2(&x1[63], &x1[48]);
   1035  btf_32_add_sub_avx2(&x1[62], &x1[49]);
   1036  btf_32_add_sub_avx2(&x1[61], &x1[50]);
   1037  btf_32_add_sub_avx2(&x1[60], &x1[51]);
   1038  btf_32_add_sub_avx2(&x1[59], &x1[52]);
   1039  btf_32_add_sub_avx2(&x1[58], &x1[53]);
   1040  btf_32_add_sub_avx2(&x1[57], &x1[54]);
   1041  btf_32_add_sub_avx2(&x1[56], &x1[55]);
   1042 
   1043  // stage 4
   1044  btf_32_add_sub_avx2(&x1[0], &x1[7]);
   1045  btf_32_add_sub_avx2(&x1[1], &x1[6]);
   1046  btf_32_add_sub_avx2(&x1[2], &x1[5]);
   1047  btf_32_add_sub_avx2(&x1[3], &x1[4]);
   1048  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit);
   1049  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit);
   1050  btf_32_add_sub_avx2(&x1[16], &x1[23]);
   1051  btf_32_add_sub_avx2(&x1[17], &x1[22]);
   1052  btf_32_add_sub_avx2(&x1[18], &x1[21]);
   1053  btf_32_add_sub_avx2(&x1[19], &x1[20]);
   1054  btf_32_add_sub_avx2(&x1[31], &x1[24]);
   1055  btf_32_add_sub_avx2(&x1[30], &x1[25]);
   1056  btf_32_add_sub_avx2(&x1[29], &x1[26]);
   1057  btf_32_add_sub_avx2(&x1[28], &x1[27]);
   1058  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit);
   1059  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit);
   1060  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit);
   1061  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit);
   1062  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit);
   1063  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit);
   1064  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit);
   1065  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit);
   1066 
   1067  // stage 5
   1068  btf_32_add_sub_avx2(&x1[0], &x1[3]);
   1069  btf_32_add_sub_avx2(&x1[1], &x1[2]);
   1070  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit);
   1071  btf_32_add_sub_avx2(&x1[8], &x1[11]);
   1072  btf_32_add_sub_avx2(&x1[9], &x1[10]);
   1073  btf_32_add_sub_avx2(&x1[15], &x1[12]);
   1074  btf_32_add_sub_avx2(&x1[14], &x1[13]);
   1075  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit);
   1076  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit);
   1077  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit);
   1078  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit);
   1079  btf_32_add_sub_avx2(&x1[32], &x1[39]);
   1080  btf_32_add_sub_avx2(&x1[33], &x1[38]);
   1081  btf_32_add_sub_avx2(&x1[34], &x1[37]);
   1082  btf_32_add_sub_avx2(&x1[35], &x1[36]);
   1083  btf_32_add_sub_avx2(&x1[47], &x1[40]);
   1084  btf_32_add_sub_avx2(&x1[46], &x1[41]);
   1085  btf_32_add_sub_avx2(&x1[45], &x1[42]);
   1086  btf_32_add_sub_avx2(&x1[44], &x1[43]);
   1087  btf_32_add_sub_avx2(&x1[48], &x1[55]);
   1088  btf_32_add_sub_avx2(&x1[49], &x1[54]);
   1089  btf_32_add_sub_avx2(&x1[50], &x1[53]);
   1090  btf_32_add_sub_avx2(&x1[51], &x1[52]);
   1091  btf_32_add_sub_avx2(&x1[63], &x1[56]);
   1092  btf_32_add_sub_avx2(&x1[62], &x1[57]);
   1093  btf_32_add_sub_avx2(&x1[61], &x1[58]);
   1094  btf_32_add_sub_avx2(&x1[60], &x1[59]);
   1095 
   1096  // stage 6
   1097  btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit);
   1098  btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit);
   1099  btf_32_add_sub_avx2(&x1[4], &x1[5]);
   1100  btf_32_add_sub_avx2(&x1[7], &x1[6]);
   1101  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit);
   1102  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit);
   1103  btf_32_add_sub_avx2(&x1[16], &x1[19]);
   1104  btf_32_add_sub_avx2(&x1[17], &x1[18]);
   1105  btf_32_add_sub_avx2(&x1[23], &x1[20]);
   1106  btf_32_add_sub_avx2(&x1[22], &x1[21]);
   1107  btf_32_add_sub_avx2(&x1[24], &x1[27]);
   1108  btf_32_add_sub_avx2(&x1[25], &x1[26]);
   1109  btf_32_add_sub_avx2(&x1[31], &x1[28]);
   1110  btf_32_add_sub_avx2(&x1[30], &x1[29]);
   1111  btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit);
   1112  btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit);
   1113  btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit);
   1114  btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit);
   1115  btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit);
   1116  btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit);
   1117  btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit);
   1118  btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit);
   1119 
   1120  // stage 7
   1121  btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit);
   1122  btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit);
   1123  btf_32_add_sub_avx2(&x1[8], &x1[9]);
   1124  btf_32_add_sub_avx2(&x1[11], &x1[10]);
   1125  btf_32_add_sub_avx2(&x1[12], &x1[13]);
   1126  btf_32_add_sub_avx2(&x1[15], &x1[14]);
   1127  btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit);
   1128  btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit);
   1129  btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit);
   1130  btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit);
   1131  btf_32_add_sub_avx2(&x1[32], &x1[35]);
   1132  btf_32_add_sub_avx2(&x1[33], &x1[34]);
   1133  btf_32_add_sub_avx2(&x1[39], &x1[36]);
   1134  btf_32_add_sub_avx2(&x1[38], &x1[37]);
   1135  btf_32_add_sub_avx2(&x1[40], &x1[43]);
   1136  btf_32_add_sub_avx2(&x1[41], &x1[42]);
   1137  btf_32_add_sub_avx2(&x1[47], &x1[44]);
   1138  btf_32_add_sub_avx2(&x1[46], &x1[45]);
   1139  btf_32_add_sub_avx2(&x1[48], &x1[51]);
   1140  btf_32_add_sub_avx2(&x1[49], &x1[50]);
   1141  btf_32_add_sub_avx2(&x1[55], &x1[52]);
   1142  btf_32_add_sub_avx2(&x1[54], &x1[53]);
   1143  btf_32_add_sub_avx2(&x1[56], &x1[59]);
   1144  btf_32_add_sub_avx2(&x1[57], &x1[58]);
   1145  btf_32_add_sub_avx2(&x1[63], &x1[60]);
   1146  btf_32_add_sub_avx2(&x1[62], &x1[61]);
   1147 
   1148  // stage 8
   1149  btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit);
   1150  btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit);
   1151  btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit);
   1152  btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit);
   1153  btf_32_add_sub_avx2(&x1[16], &x1[17]);
   1154  btf_32_add_sub_avx2(&x1[19], &x1[18]);
   1155  btf_32_add_sub_avx2(&x1[20], &x1[21]);
   1156  btf_32_add_sub_avx2(&x1[23], &x1[22]);
   1157  btf_32_add_sub_avx2(&x1[24], &x1[25]);
   1158  btf_32_add_sub_avx2(&x1[27], &x1[26]);
   1159  btf_32_add_sub_avx2(&x1[28], &x1[29]);
   1160  btf_32_add_sub_avx2(&x1[31], &x1[30]);
   1161  btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit);
   1162  btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit);
   1163  btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit);
   1164  btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit);
   1165  btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit);
   1166  btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit);
   1167  btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit);
   1168  btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit);
   1169 
   1170  // stage 9
   1171  btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit);
   1172  btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit);
   1173  btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit);
   1174  btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit);
   1175  btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit);
   1176  btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit);
   1177  btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit);
   1178  btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit);
   1179  btf_32_add_sub_avx2(&x1[32], &x1[33]);
   1180  btf_32_add_sub_avx2(&x1[35], &x1[34]);
   1181  btf_32_add_sub_avx2(&x1[36], &x1[37]);
   1182  btf_32_add_sub_avx2(&x1[39], &x1[38]);
   1183  btf_32_add_sub_avx2(&x1[40], &x1[41]);
   1184  btf_32_add_sub_avx2(&x1[43], &x1[42]);
   1185  btf_32_add_sub_avx2(&x1[44], &x1[45]);
   1186  btf_32_add_sub_avx2(&x1[47], &x1[46]);
   1187  btf_32_add_sub_avx2(&x1[48], &x1[49]);
   1188  btf_32_add_sub_avx2(&x1[51], &x1[50]);
   1189  btf_32_add_sub_avx2(&x1[52], &x1[53]);
   1190  btf_32_add_sub_avx2(&x1[55], &x1[54]);
   1191  btf_32_add_sub_avx2(&x1[56], &x1[57]);
   1192  btf_32_add_sub_avx2(&x1[59], &x1[58]);
   1193  btf_32_add_sub_avx2(&x1[60], &x1[61]);
   1194  btf_32_add_sub_avx2(&x1[63], &x1[62]);
   1195 
   1196  // stage 10
   1197  btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit);
   1198  btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit);
   1199  btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit);
   1200  btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit);
   1201  btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit);
   1202  btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit);
   1203  btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit);
   1204  btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit);
   1205  btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit);
   1206  btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit);
   1207  btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit);
   1208  btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit);
   1209  btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit);
   1210  btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit);
   1211  btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit);
   1212  btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit);
   1213 
   1214  // stage 11
   1215  output[0] = x1[0];
   1216  output[1] = x1[32];
   1217  output[2] = x1[16];
   1218  output[3] = x1[48];
   1219  output[4] = x1[8];
   1220  output[5] = x1[40];
   1221  output[6] = x1[24];
   1222  output[7] = x1[56];
   1223  output[8] = x1[4];
   1224  output[9] = x1[36];
   1225  output[10] = x1[20];
   1226  output[11] = x1[52];
   1227  output[12] = x1[12];
   1228  output[13] = x1[44];
   1229  output[14] = x1[28];
   1230  output[15] = x1[60];
   1231  output[16] = x1[2];
   1232  output[17] = x1[34];
   1233  output[18] = x1[18];
   1234  output[19] = x1[50];
   1235  output[20] = x1[10];
   1236  output[21] = x1[42];
   1237  output[22] = x1[26];
   1238  output[23] = x1[58];
   1239  output[24] = x1[6];
   1240  output[25] = x1[38];
   1241  output[26] = x1[22];
   1242  output[27] = x1[54];
   1243  output[28] = x1[14];
   1244  output[29] = x1[46];
   1245  output[30] = x1[30];
   1246  output[31] = x1[62];
   1247  output[32] = x1[1];
   1248  output[33] = x1[33];
   1249  output[34] = x1[17];
   1250  output[35] = x1[49];
   1251  output[36] = x1[9];
   1252  output[37] = x1[41];
   1253  output[38] = x1[25];
   1254  output[39] = x1[57];
   1255  output[40] = x1[5];
   1256  output[41] = x1[37];
   1257  output[42] = x1[21];
   1258  output[43] = x1[53];
   1259  output[44] = x1[13];
   1260  output[45] = x1[45];
   1261  output[46] = x1[29];
   1262  output[47] = x1[61];
   1263  output[48] = x1[3];
   1264  output[49] = x1[35];
   1265  output[50] = x1[19];
   1266  output[51] = x1[51];
   1267  output[52] = x1[11];
   1268  output[53] = x1[43];
   1269  output[54] = x1[27];
   1270  output[55] = x1[59];
   1271  output[56] = x1[7];
   1272  output[57] = x1[39];
   1273  output[58] = x1[23];
   1274  output[59] = x1[55];
   1275  output[60] = x1[15];
   1276  output[61] = x1[47];
   1277  output[62] = x1[31];
   1278  output[63] = x1[63];
   1279 }
   1280 
   1281 static inline void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
   1282                                       int8_t cos_bit) {
   1283  const int32_t *cospi = cospi_arr(cos_bit);
   1284  const __m256i __zero = _mm256_setzero_si256();
   1285  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
   1286 
   1287  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
   1288  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
   1289  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
   1290  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
   1291  __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
   1292  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
   1293  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
   1294  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
   1295  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
   1296  __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
   1297  __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
   1298  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
   1299  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
   1300  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
   1301  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
   1302  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
   1303  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
   1304  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
   1305  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
   1306  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
   1307  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
   1308  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
   1309  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
   1310  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
   1311  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
   1312  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
   1313  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
   1314 
   1315  // stage 1
   1316  __m256i x1[16];
   1317  x1[0] = input[0];
   1318  x1[1] = _mm256_subs_epi16(__zero, input[15]);
   1319  x1[2] = _mm256_subs_epi16(__zero, input[7]);
   1320  x1[3] = input[8];
   1321  x1[4] = _mm256_subs_epi16(__zero, input[3]);
   1322  x1[5] = input[12];
   1323  x1[6] = input[4];
   1324  x1[7] = _mm256_subs_epi16(__zero, input[11]);
   1325  x1[8] = _mm256_subs_epi16(__zero, input[1]);
   1326  x1[9] = input[14];
   1327  x1[10] = input[6];
   1328  x1[11] = _mm256_subs_epi16(__zero, input[9]);
   1329  x1[12] = input[2];
   1330  x1[13] = _mm256_subs_epi16(__zero, input[13]);
   1331  x1[14] = _mm256_subs_epi16(__zero, input[5]);
   1332  x1[15] = input[10];
   1333 
   1334  // stage 2
   1335  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
   1336  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
   1337  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
   1338  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
   1339 
   1340  // stage 3
   1341  btf_16_adds_subs_avx2(&x1[0], &x1[2]);
   1342  btf_16_adds_subs_avx2(&x1[1], &x1[3]);
   1343  btf_16_adds_subs_avx2(&x1[4], &x1[6]);
   1344  btf_16_adds_subs_avx2(&x1[5], &x1[7]);
   1345  btf_16_adds_subs_avx2(&x1[8], &x1[10]);
   1346  btf_16_adds_subs_avx2(&x1[9], &x1[11]);
   1347  btf_16_adds_subs_avx2(&x1[12], &x1[14]);
   1348  btf_16_adds_subs_avx2(&x1[13], &x1[15]);
   1349 
   1350  // stage 4
   1351  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
   1352  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit);
   1353  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
   1354  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit);
   1355 
   1356  // stage 5
   1357  btf_16_adds_subs_avx2(&x1[0], &x1[4]);
   1358  btf_16_adds_subs_avx2(&x1[1], &x1[5]);
   1359  btf_16_adds_subs_avx2(&x1[2], &x1[6]);
   1360  btf_16_adds_subs_avx2(&x1[3], &x1[7]);
   1361  btf_16_adds_subs_avx2(&x1[8], &x1[12]);
   1362  btf_16_adds_subs_avx2(&x1[9], &x1[13]);
   1363  btf_16_adds_subs_avx2(&x1[10], &x1[14]);
   1364  btf_16_adds_subs_avx2(&x1[11], &x1[15]);
   1365 
   1366  // stage 6
   1367  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
   1368  btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit);
   1369  btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit);
   1370  btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit);
   1371 
   1372  // stage 7
   1373  btf_16_adds_subs_avx2(&x1[0], &x1[8]);
   1374  btf_16_adds_subs_avx2(&x1[1], &x1[9]);
   1375  btf_16_adds_subs_avx2(&x1[2], &x1[10]);
   1376  btf_16_adds_subs_avx2(&x1[3], &x1[11]);
   1377  btf_16_adds_subs_avx2(&x1[4], &x1[12]);
   1378  btf_16_adds_subs_avx2(&x1[5], &x1[13]);
   1379  btf_16_adds_subs_avx2(&x1[6], &x1[14]);
   1380  btf_16_adds_subs_avx2(&x1[7], &x1[15]);
   1381 
   1382  // stage 8
   1383  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
   1384  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
   1385  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
   1386  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
   1387  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
   1388  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
   1389  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
   1390  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
   1391 
   1392  // stage 9
   1393  output[0] = x1[1];
   1394  output[1] = x1[14];
   1395  output[2] = x1[3];
   1396  output[3] = x1[12];
   1397  output[4] = x1[5];
   1398  output[5] = x1[10];
   1399  output[6] = x1[7];
   1400  output[7] = x1[8];
   1401  output[8] = x1[9];
   1402  output[9] = x1[6];
   1403  output[10] = x1[11];
   1404  output[11] = x1[4];
   1405  output[12] = x1[13];
   1406  output[13] = x1[2];
   1407  output[14] = x1[15];
   1408  output[15] = x1[0];
   1409 }
   1410 
   1411 static inline void fidentity16x16_new_avx2(const __m256i *input,
   1412                                           __m256i *output, int8_t cos_bit) {
   1413  (void)cos_bit;
   1414  const __m256i one = _mm256_set1_epi16(1);
   1415 
   1416  for (int i = 0; i < 16; ++i) {
   1417    const __m256i a_lo = _mm256_unpacklo_epi16(input[i], one);
   1418    const __m256i a_hi = _mm256_unpackhi_epi16(input[i], one);
   1419    const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
   1420    const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
   1421    output[i] = _mm256_packs_epi32(b_lo, b_hi);
   1422  }
   1423 }
   1424 
   1425 static inline void fidentity16x32_avx2(const __m256i *input, __m256i *output,
   1426                                       int8_t cos_bit) {
   1427  (void)cos_bit;
   1428  for (int i = 0; i < 32; ++i) {
   1429    output[i] = _mm256_slli_epi16(input[i], 2);
   1430  }
   1431 }
   1432 
   1433 static inline void store_output_32bit_w16(int32_t *const out,
   1434                                          const __m256i *const in1,
   1435                                          const __m256i *const in2,
   1436                                          const int stride,
   1437                                          const int out_size) {
   1438  for (int i = 0; i < out_size; ++i) {
   1439    _mm256_store_si256((__m256i *)(out + stride * i), in1[i]);
   1440    _mm256_store_si256((__m256i *)(out + stride * i + 8), in2[i]);
   1441  }
   1442 }
   1443 
   1444 // Store 8 16 bit values. Sign extend the values.
   1445 static inline void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
   1446                                                        int32_t *out,
   1447                                                        const int stride,
   1448                                                        const int out_size) {
   1449  for (int i = 0; i < out_size; ++i) {
   1450    _mm256_store_si256((__m256i *)(out),
   1451                       _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i])));
   1452    _mm256_store_si256(
   1453        (__m256i *)(out + 8),
   1454        _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1)));
   1455    out += stride;
   1456  }
   1457 }
   1458 
   1459 static inline void store_rect_16bit_to_32bit_avx2(const __m256i a,
   1460                                                  int32_t *const b) {
   1461  const __m256i one = _mm256_set1_epi16(1);
   1462  const __m256i a_reoder = _mm256_permute4x64_epi64(a, 0xd8);
   1463  const __m256i a_lo = _mm256_unpacklo_epi16(a_reoder, one);
   1464  const __m256i a_hi = _mm256_unpackhi_epi16(a_reoder, one);
   1465  const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2);
   1466  const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2);
   1467  _mm256_store_si256((__m256i *)b, b_lo);
   1468  _mm256_store_si256((__m256i *)(b + 8), b_hi);
   1469 }
   1470 
   1471 static inline void store_rect_buffer_16bit_to_32bit_w16_avx2(
   1472    const __m256i *const in, int32_t *const out, const int stride,
   1473    const int out_size) {
   1474  for (int i = 0; i < out_size; ++i) {
   1475    store_rect_16bit_to_32bit_avx2(in[i], out + i * stride);
   1476  }
   1477 }
   1478 
   1479 typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
   1480                                  int8_t cos_bit);
   1481 
   1482 static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = {
   1483  fdct16x32_avx2,       // DCT_DCT
   1484  NULL,                 // ADST_DCT
   1485  NULL,                 // DCT_ADST
   1486  NULL,                 // ADST_ADST
   1487  NULL,                 // FLIPADST_DCT
   1488  NULL,                 // DCT_FLIPADST
   1489  NULL,                 // FLIPADST_FLIPADST
   1490  NULL,                 // ADST_FLIPADST
   1491  NULL,                 // FLIPADST_ADST
   1492  fidentity16x32_avx2,  // IDTX
   1493  fdct16x32_avx2,       // V_DCT
   1494  fidentity16x32_avx2,  // H_DCT
   1495  NULL,                 // V_ADST
   1496  NULL,                 // H_ADST
   1497  NULL,                 // V_FLIPADST
   1498  NULL                  // H_FLIPADST
   1499 };
   1500 
   1501 static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = {
   1502  fdct16x32_avx2,       // DCT_DCT
   1503  NULL,                 // ADST_DCT
   1504  NULL,                 // DCT_ADST
   1505  NULL,                 // ADST_ADST
   1506  NULL,                 // FLIPADST_DCT
   1507  NULL,                 // DCT_FLIPADST
   1508  NULL,                 // FLIPADST_FLIPADST
   1509  NULL,                 // ADST_FLIPADST
   1510  NULL,                 // FLIPADST_ADST
   1511  fidentity16x32_avx2,  // IDTX
   1512  fidentity16x32_avx2,  // V_DCT
   1513  fdct16x32_avx2,       // H_DCT
   1514  NULL,                 // V_ADST
   1515  NULL,                 // H_ADST
   1516  NULL,                 // V_FLIPADST
   1517  NULL                  // H_FLIPADST
   1518 };
   1519 
   1520 static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = {
   1521  fdct16x16_new_avx2,       // DCT_DCT
   1522  fadst16x16_new_avx2,      // ADST_DCT
   1523  fdct16x16_new_avx2,       // DCT_ADST
   1524  fadst16x16_new_avx2,      // ADST_ADST
   1525  fadst16x16_new_avx2,      // FLIPADST_DCT
   1526  fdct16x16_new_avx2,       // DCT_FLIPADST
   1527  fadst16x16_new_avx2,      // FLIPADST_FLIPADST
   1528  fadst16x16_new_avx2,      // ADST_FLIPADST
   1529  fadst16x16_new_avx2,      // FLIPADST_ADST
   1530  fidentity16x16_new_avx2,  // IDTX
   1531  fdct16x16_new_avx2,       // V_DCT
   1532  fidentity16x16_new_avx2,  // H_DCT
   1533  fadst16x16_new_avx2,      // V_ADST
   1534  fidentity16x16_new_avx2,  // H_ADST
   1535  fadst16x16_new_avx2,      // V_FLIPADST
   1536  fidentity16x16_new_avx2   // H_FLIPADST
   1537 };
   1538 
   1539 static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = {
   1540  fdct16x16_new_avx2,       // DCT_DCT
   1541  fdct16x16_new_avx2,       // ADST_DCT
   1542  fadst16x16_new_avx2,      // DCT_ADST
   1543  fadst16x16_new_avx2,      // ADST_ADST
   1544  fdct16x16_new_avx2,       // FLIPADST_DCT
   1545  fadst16x16_new_avx2,      // DCT_FLIPADST
   1546  fadst16x16_new_avx2,      // FLIPADST_FLIPADST
   1547  fadst16x16_new_avx2,      // ADST_FLIPADST
   1548  fadst16x16_new_avx2,      // FLIPADST_ADST
   1549  fidentity16x16_new_avx2,  // IDTX
   1550  fidentity16x16_new_avx2,  // V_DCT
   1551  fdct16x16_new_avx2,       // H_DCT
   1552  fidentity16x16_new_avx2,  // V_ADST
   1553  fadst16x16_new_avx2,      // H_ADST
   1554  fidentity16x16_new_avx2,  // V_FLIPADST
   1555  fadst16x16_new_avx2       // H_FLIPADST
   1556 };
   1557 
   1558 static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = {
   1559  fdct8x8_new_sse2,       // DCT_DCT
   1560  fadst8x8_new_sse2,      // ADST_DCT
   1561  fdct8x8_new_sse2,       // DCT_ADST
   1562  fadst8x8_new_sse2,      // ADST_ADST
   1563  fadst8x8_new_sse2,      // FLIPADST_DCT
   1564  fdct8x8_new_sse2,       // DCT_FLIPADST
   1565  fadst8x8_new_sse2,      // FLIPADST_FLIPADST
   1566  fadst8x8_new_sse2,      // ADST_FLIPADST
   1567  fadst8x8_new_sse2,      // FLIPADST_ADST
   1568  fidentity8x8_new_sse2,  // IDTX
   1569  fdct8x8_new_sse2,       // V_DCT
   1570  fidentity8x8_new_sse2,  // H_DCT
   1571  fadst8x8_new_sse2,      // V_ADST
   1572  fidentity8x8_new_sse2,  // H_ADST
   1573  fadst8x8_new_sse2,      // V_FLIPADST
   1574  fidentity8x8_new_sse2,  // H_FLIPADST
   1575 };
   1576 
   1577 static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = {
   1578  fdct8x8_new_sse2,       // DCT_DCT
   1579  fdct8x8_new_sse2,       // ADST_DCT
   1580  fadst8x8_new_sse2,      // DCT_ADST
   1581  fadst8x8_new_sse2,      // ADST_ADST
   1582  fdct8x8_new_sse2,       // FLIPADST_DCT
   1583  fadst8x8_new_sse2,      // DCT_FLIPADST
   1584  fadst8x8_new_sse2,      // FLIPADST_FLIPADST
   1585  fadst8x8_new_sse2,      // ADST_FLIPADST
   1586  fadst8x8_new_sse2,      // FLIPADST_ADST
   1587  fidentity8x8_new_sse2,  // IDTX
   1588  fidentity8x8_new_sse2,  // V_DCT
   1589  fdct8x8_new_sse2,       // H_DCT
   1590  fidentity8x8_new_sse2,  // V_ADST
   1591  fadst8x8_new_sse2,      // H_ADST
   1592  fidentity8x8_new_sse2,  // V_FLIPADST
   1593  fadst8x8_new_sse2       // H_FLIPADST
   1594 };
   1595 
   1596 static inline void load_buffer_and_round_shift(const int16_t *in, int stride,
   1597                                               __m128i *out, int bit) {
   1598  out[0] = _mm_load_si128((const __m128i *)(in + 0 * stride));
   1599  out[1] = _mm_load_si128((const __m128i *)(in + 1 * stride));
   1600  out[2] = _mm_load_si128((const __m128i *)(in + 2 * stride));
   1601  out[3] = _mm_load_si128((const __m128i *)(in + 3 * stride));
   1602  out[4] = _mm_load_si128((const __m128i *)(in + 4 * stride));
   1603  out[5] = _mm_load_si128((const __m128i *)(in + 5 * stride));
   1604  out[6] = _mm_load_si128((const __m128i *)(in + 6 * stride));
   1605  out[7] = _mm_load_si128((const __m128i *)(in + 7 * stride));
   1606  out[0] = _mm_slli_epi16(out[0], bit);
   1607  out[1] = _mm_slli_epi16(out[1], bit);
   1608  out[2] = _mm_slli_epi16(out[2], bit);
   1609  out[3] = _mm_slli_epi16(out[3], bit);
   1610  out[4] = _mm_slli_epi16(out[4], bit);
   1611  out[5] = _mm_slli_epi16(out[5], bit);
   1612  out[6] = _mm_slli_epi16(out[6], bit);
   1613  out[7] = _mm_slli_epi16(out[7], bit);
   1614 }
   1615 
   1616 static inline void load_buffer_and_flip_round_shift(const int16_t *in,
   1617                                                    int stride, __m128i *out,
   1618                                                    int bit) {
   1619  out[7] = load_16bit_to_16bit(in + 0 * stride);
   1620  out[6] = load_16bit_to_16bit(in + 1 * stride);
   1621  out[5] = load_16bit_to_16bit(in + 2 * stride);
   1622  out[4] = load_16bit_to_16bit(in + 3 * stride);
   1623  out[3] = load_16bit_to_16bit(in + 4 * stride);
   1624  out[2] = load_16bit_to_16bit(in + 5 * stride);
   1625  out[1] = load_16bit_to_16bit(in + 6 * stride);
   1626  out[0] = load_16bit_to_16bit(in + 7 * stride);
   1627  out[7] = _mm_slli_epi16(out[7], bit);
   1628  out[6] = _mm_slli_epi16(out[6], bit);
   1629  out[5] = _mm_slli_epi16(out[5], bit);
   1630  out[4] = _mm_slli_epi16(out[4], bit);
   1631  out[3] = _mm_slli_epi16(out[3], bit);
   1632  out[2] = _mm_slli_epi16(out[2], bit);
   1633  out[1] = _mm_slli_epi16(out[1], bit);
   1634  out[0] = _mm_slli_epi16(out[0], bit);
   1635 }
   1636 
   1637 #define TRANSPOSE_8X8_AVX2()                                         \
   1638  {                                                                  \
   1639    /* aa0:    00 10 01 11  02 12 03 13 | 40 50 41 51  42 52 43 53*/ \
   1640    /* aa1:    04 14 05 15  06 16 07 17 | 44 54 45 55  46 56 47 57*/ \
   1641    /* aa2:    20 30 21 31  22 32 23 33 | 60 70 61 71  62 72 63 73*/ \
   1642    /* aa3:    24 34 25 35  26 36 27 37 | 64 74 65 75  66 76 67 77*/ \
   1643    const __m256i aa0 = _mm256_unpacklo_epi16(b0, b1);               \
   1644    const __m256i aa1 = _mm256_unpackhi_epi16(b0, b1);               \
   1645    const __m256i aa2 = _mm256_unpacklo_epi16(b2, b3);               \
   1646    const __m256i aa3 = _mm256_unpackhi_epi16(b2, b3);               \
   1647    /* Unpack 32 bit elements resulting in: */                       \
   1648    /* bb0: 00 10 20 30  01 11 21 31 | 40 50 60 70  41 51 61 71*/    \
   1649    /* bb1: 02 12 22 32  03 13 23 33 | 42 52 62 72  43 53 63 73*/    \
   1650    /* bb2: 04 14 24 34  05 15 25 35 | 44 54 64 74  45 55 65 75*/    \
   1651    /* bb2: 06 16 26 36  07 17 27 37 | 46 56 66 76  47 57 67 77*/    \
   1652    const __m256i bb0 = _mm256_unpacklo_epi32(aa0, aa2);             \
   1653    const __m256i bb1 = _mm256_unpackhi_epi32(aa0, aa2);             \
   1654    const __m256i bb2 = _mm256_unpacklo_epi32(aa1, aa3);             \
   1655    const __m256i bb3 = _mm256_unpackhi_epi32(aa1, aa3);             \
   1656    /* bb0: 00 10 20 30 40 50 60 70| 01 11 21 31 41 51 61 71*/       \
   1657    /* bb1: 02 12 22 32 42 52 62 72| 03 13 23 33 43 53 63 73*/       \
   1658    /* bb2: 04 14 24 34 44 54 64 74| 05 15 25 35 45 55 65 75*/       \
   1659    /* bb2: 06 16 26 36 46 56 66 76| 07 17 27 37 47 57 67 77*/       \
   1660    c0 = _mm256_permute4x64_epi64(bb0, 0xd8);                        \
   1661    c1 = _mm256_permute4x64_epi64(bb1, 0xd8);                        \
   1662    c2 = _mm256_permute4x64_epi64(bb2, 0xd8);                        \
   1663    c3 = _mm256_permute4x64_epi64(bb3, 0xd8);                        \
   1664  }
   1665 
   1666 static inline void transpose_round_shift_flip_8x8(__m128i *const in,
   1667                                                  __m128i *const out, int bit) {
   1668  __m256i c0, c1, c2, c3;
   1669  bit = -bit;
   1670  const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));
   1671  const __m256i s04 =
   1672      _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
   1673  const __m256i s15 =
   1674      _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
   1675  const __m256i s26 =
   1676      _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
   1677  const __m256i s37 =
   1678      _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
   1679 
   1680  const __m256i a0 = _mm256_adds_epi16(s04, rounding);
   1681  const __m256i a1 = _mm256_adds_epi16(s15, rounding);
   1682  const __m256i a2 = _mm256_adds_epi16(s26, rounding);
   1683  const __m256i a3 = _mm256_adds_epi16(s37, rounding);
   1684 
   1685  // b0: 00 01 02 03  04 05 06 07 | 40 41 42 43  44 45 46 47
   1686  // b1: 10 11 12 13  14 15 16 17 | 50 51 52 53  54 55 56 57
   1687  // b2: 20 21 22 23  24 25 26 27 | 60 61 62 63  64 65 66 67
   1688  // b3: 30 31 32 33  34 35 36 37 | 70 71 72 73  74 75 76 77
   1689  const __m256i b0 = _mm256_srai_epi16(a0, bit);
   1690  const __m256i b1 = _mm256_srai_epi16(a1, bit);
   1691  const __m256i b2 = _mm256_srai_epi16(a2, bit);
   1692  const __m256i b3 = _mm256_srai_epi16(a3, bit);
   1693 
   1694  TRANSPOSE_8X8_AVX2()
   1695 
   1696  // Unpack 64 bit elements resulting in:
   1697  // out[7]: 00 10 20 30  40 50 60 70
   1698  // out[6]: 01 11 21 31  41 51 61 71
   1699  // out[5]: 02 12 22 32  42 52 62 72
   1700  // out[4]: 03 13 23 33  43 53 63 73
   1701  // out[3]: 04 14 24 34  44 54 64 74
   1702  // out[2]: 05 15 25 35  45 55 65 75
   1703  // out[1]: 06 16 26 36  46 56 66 76
   1704  // out[0]: 07 17 27 37  47 57 67 77
   1705  out[7] = _mm256_castsi256_si128(c0);
   1706  out[6] = _mm256_extractf128_si256(c0, 1);
   1707  out[5] = _mm256_castsi256_si128(c1);
   1708  out[4] = _mm256_extractf128_si256(c1, 1);
   1709  out[3] = _mm256_castsi256_si128(c2);
   1710  out[2] = _mm256_extractf128_si256(c2, 1);
   1711  out[1] = _mm256_castsi256_si128(c3);
   1712  out[0] = _mm256_extractf128_si256(c3, 1);
   1713 }
   1714 
   1715 static inline void transpose_round_shift_8x8(__m128i *const in,
   1716                                             __m128i *const out, int bit) {
   1717  __m256i c0, c1, c2, c3;
   1718  bit = -bit;
   1719  const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));
   1720  const __m256i s04 =
   1721      _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
   1722  const __m256i s15 =
   1723      _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
   1724  const __m256i s26 =
   1725      _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
   1726  const __m256i s37 =
   1727      _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
   1728 
   1729  const __m256i a0 = _mm256_adds_epi16(s04, rounding);
   1730  const __m256i a1 = _mm256_adds_epi16(s15, rounding);
   1731  const __m256i a2 = _mm256_adds_epi16(s26, rounding);
   1732  const __m256i a3 = _mm256_adds_epi16(s37, rounding);
   1733 
   1734  // b0: 00 01 02 03  04 05 06 07 | 40 41 42 43  44 45 46 47
   1735  // b1: 10 11 12 13  14 15 16 17 | 50 51 52 53  54 55 56 57
   1736  // b2: 20 21 22 23  24 25 26 27 | 60 61 62 63  64 65 66 67
   1737  // b3: 30 31 32 33  34 35 36 37 | 70 71 72 73  74 75 76 77
   1738  const __m256i b0 = _mm256_srai_epi16(a0, bit);
   1739  const __m256i b1 = _mm256_srai_epi16(a1, bit);
   1740  const __m256i b2 = _mm256_srai_epi16(a2, bit);
   1741  const __m256i b3 = _mm256_srai_epi16(a3, bit);
   1742 
   1743  TRANSPOSE_8X8_AVX2()
   1744  // Unpack 64 bit elements resulting in:
   1745  // out[7]: 00 10 20 30  40 50 60 70
   1746  // out[6]: 01 11 21 31  41 51 61 71
   1747  // out[5]: 02 12 22 32  42 52 62 72
   1748  // out[4]: 03 13 23 33  43 53 63 73
   1749  // out[3]: 04 14 24 34  44 54 64 74
   1750  // out[2]: 05 15 25 35  45 55 65 75
   1751  // out[1]: 06 16 26 36  46 56 66 76
   1752  // out[0]: 07 17 27 37  47 57 67 77
   1753  out[0] = _mm256_castsi256_si128(c0);
   1754  out[1] = _mm256_extractf128_si256(c0, 1);
   1755  out[2] = _mm256_castsi256_si128(c1);
   1756  out[3] = _mm256_extractf128_si256(c1, 1);
   1757  out[4] = _mm256_castsi256_si128(c2);
   1758  out[5] = _mm256_extractf128_si256(c2, 1);
   1759  out[6] = _mm256_castsi256_si128(c3);
   1760  out[7] = _mm256_extractf128_si256(c3, 1);
   1761 }
   1762 
   1763 static inline void store_buffer_16bit_to_32bit_w8_avx2(const __m128i *const in,
   1764                                                       int32_t *const out,
   1765                                                       const int stride,
   1766                                                       const int out_size) {
   1767  for (int i = 0; i < out_size; ++i) {
   1768    _mm256_store_si256((__m256i *)(out + i * stride),
   1769                       _mm256_cvtepi16_epi32(in[i]));
   1770  }
   1771 }
   1772 
   1773 static void av1_lowbd_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *output,
   1774                                          int stride, TX_TYPE tx_type, int bd) {
   1775  (void)bd;
   1776  __m128i buf0[8], buf1[8], *buf;
   1777  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
   1778  const int txw_idx = get_txw_idx(TX_8X8);
   1779  const int txh_idx = get_txh_idx(TX_8X8);
   1780  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   1781  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   1782  const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
   1783  const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
   1784  int ud_flip, lr_flip;
   1785 
   1786  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   1787  // Condition to check shift bit is avoided while round shifting, by assuming
   1788  // that shift[0] will always be positive.
   1789  assert(shift[0] > 0);
   1790  if (ud_flip)
   1791    load_buffer_and_flip_round_shift(input, stride, buf0, shift[0]);
   1792  else
   1793    load_buffer_and_round_shift(input, stride, buf0, shift[0]);
   1794 
   1795  col_txfm(buf0, buf0, cos_bit_col);
   1796  // Condition to check shift bit is avoided while round shifting, by assuming
   1797  // that shift[1] will always be negative.
   1798  assert(shift[1] < 0);
   1799 
   1800  if (lr_flip) {
   1801    transpose_round_shift_flip_8x8(buf0, buf1, shift[1]);
   1802  } else {
   1803    transpose_round_shift_8x8(buf0, buf1, shift[1]);
   1804  }
   1805 
   1806  buf = buf1;
   1807  row_txfm(buf, buf, cos_bit_row);
   1808 
   1809  // Round and shift operation is avoided here as the shift bit is assumed to be
   1810  // zero always.
   1811  assert(shift[2] == 0);
   1812  store_buffer_16bit_to_32bit_w8_avx2(buf, output, 8, 8);
   1813 }
   1814 
   1815 static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output,
   1816                                        int stride, TX_TYPE tx_type, int bd) {
   1817  (void)bd;
   1818  const TX_SIZE tx_size = TX_16X16;
   1819  __m256i buf0[16], buf1[16];
   1820  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   1821  const int txw_idx = get_txw_idx(tx_size);
   1822  const int txh_idx = get_txh_idx(tx_size);
   1823  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   1824  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   1825  const int width = tx_size_wide[tx_size];
   1826  const int height = tx_size_high[tx_size];
   1827  const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
   1828  const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
   1829  int ud_flip, lr_flip;
   1830 
   1831  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   1832  const int32_t i = 0;
   1833  if (ud_flip) {
   1834    load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height);
   1835  } else {
   1836    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
   1837  }
   1838  round_shift_16bit_w16_avx2(buf0, height, shift[0]);
   1839  col_txfm(buf0, buf0, cos_bit_col);
   1840  round_shift_16bit_w16_avx2(buf0, height, shift[1]);
   1841  transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
   1842 
   1843  __m256i *buf;
   1844  if (lr_flip) {
   1845    buf = buf0;
   1846    flip_buf_avx2(buf1 + width * i, buf, width);
   1847  } else {
   1848    buf = buf1 + width * i;
   1849  }
   1850  row_txfm(buf, buf, cos_bit_row);
   1851  round_shift_16bit_w16_avx2(buf, width, shift[2]);
   1852  store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width);
   1853 }
   1854 
   1855 static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
   1856                                        int stride, TX_TYPE tx_type, int bd) {
   1857  (void)bd;
   1858  const TX_SIZE tx_size = TX_32X32;
   1859  __m256i buf0[32], buf1[128];
   1860  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   1861  const int txw_idx = get_txw_idx(tx_size);
   1862  const int txh_idx = get_txh_idx(tx_size);
   1863  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   1864  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   1865  const int width = tx_size_wide[tx_size];
   1866  const int height = tx_size_high[tx_size];
   1867  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
   1868  const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
   1869 
   1870  int ud_flip, lr_flip;
   1871  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   1872 
   1873  for (int i = 0; i < 2; i++) {
   1874    if (ud_flip) {
   1875      load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
   1876                                           height);
   1877    } else {
   1878      load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
   1879    }
   1880    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
   1881    col_txfm(buf0, buf0, cos_bit_col);
   1882    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
   1883    transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i);
   1884    transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i);
   1885  }
   1886 
   1887  for (int i = 0; i < 2; i++) {
   1888    __m256i *buf;
   1889    if (lr_flip) {
   1890      buf = buf0;
   1891      flip_buf_avx2(buf1 + width * i, buf, width);
   1892    } else {
   1893      buf = buf1 + width * i;
   1894    }
   1895    row_txfm(buf, buf, cos_bit_row);
   1896    round_shift_16bit_w16_avx2(buf, width, shift[2]);
   1897    store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width);
   1898  }
   1899 }
   1900 
   1901 static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
   1902                                        int stride, TX_TYPE tx_type, int bd) {
   1903  (void)bd;
   1904  (void)tx_type;
   1905  assert(tx_type == DCT_DCT);
   1906  const TX_SIZE tx_size = TX_64X64;
   1907  __m256i buf0[64], buf1[256];
   1908  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   1909  const int txw_idx = get_txw_idx(tx_size);
   1910  const int txh_idx = get_txh_idx(tx_size);
   1911  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   1912  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   1913  const int width = tx_size_wide[tx_size];
   1914  const int height = tx_size_high[tx_size];
   1915  const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
   1916  const int width_div16 = (width >> 4);
   1917  const int height_div16 = (height >> 4);
   1918 
   1919  for (int i = 0; i < width_div16; i++) {
   1920    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
   1921    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
   1922    col_txfm(buf0, buf0, cos_bit_col);
   1923    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
   1924    for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
   1925      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
   1926    }
   1927  }
   1928 
   1929  for (int i = 0; i < AOMMIN(2, height_div16); i++) {
   1930    __m256i bufA[64];
   1931    __m256i bufB[64];
   1932    __m128i *buf = (__m128i *)(buf1 + width * i);
   1933    for (int j = 0; j < width; ++j) {
   1934      bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
   1935      bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
   1936    }
   1937    fdct64_new_avx2(bufA, bufA, cos_bit_row);
   1938    fdct64_new_avx2(bufB, bufB, cos_bit_row);
   1939    round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);
   1940    round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);
   1941    store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
   1942  }
   1943 }
   1944 
   1945 static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output,
   1946                                        int stride, TX_TYPE tx_type, int bd) {
   1947  (void)bd;
   1948  const TX_SIZE tx_size = TX_16X32;
   1949  __m256i buf0[32], buf1[32];
   1950  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   1951  const int txw_idx = get_txw_idx(tx_size);
   1952  const int txh_idx = get_txh_idx(tx_size);
   1953  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   1954  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   1955  const int width = tx_size_wide[tx_size];
   1956  const int height = tx_size_high[tx_size];
   1957  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
   1958  const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
   1959 
   1960  int ud_flip, lr_flip;
   1961  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   1962 
   1963  if (ud_flip) {
   1964    load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height);
   1965  } else {
   1966    load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height);
   1967  }
   1968  round_shift_16bit_w16_avx2(buf0, height, shift[0]);
   1969  col_txfm(buf0, buf0, cos_bit_col);
   1970  round_shift_16bit_w16_avx2(buf0, height, shift[1]);
   1971  transpose_16bit_16x16_avx2(buf0, buf1);
   1972  transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16);
   1973 
   1974  for (int i = 0; i < 2; i++) {
   1975    __m256i *buf;
   1976    if (lr_flip) {
   1977      buf = buf0;
   1978      flip_buf_avx2(buf1 + width * i, buf, width);
   1979    } else {
   1980      buf = buf1 + width * i;
   1981    }
   1982    row_txfm(buf, buf, cos_bit_row);
   1983    round_shift_16bit_w16_avx2(buf, width, shift[2]);
   1984    store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height,
   1985                                              width);
   1986  }
   1987 }
   1988 
   1989 static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output,
   1990                                        int stride, TX_TYPE tx_type, int bd) {
   1991  (void)bd;
   1992  __m256i buf0[32], buf1[64];
   1993  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
   1994  const int txw_idx = get_txw_idx(TX_32X16);
   1995  const int txh_idx = get_txh_idx(TX_32X16);
   1996  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   1997  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   1998  const int width = 32;
   1999  const int height = 16;
   2000  const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
   2001  const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
   2002 
   2003  int ud_flip, lr_flip;
   2004  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2005 
   2006  for (int i = 0; i < 2; i++) {
   2007    if (ud_flip) {
   2008      load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
   2009                                           height);
   2010    } else {
   2011      load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
   2012    }
   2013    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
   2014    col_txfm(buf0, buf0, cos_bit_col);
   2015    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
   2016    transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
   2017  }
   2018 
   2019  __m256i *buf;
   2020  if (lr_flip) {
   2021    buf = buf0;
   2022    flip_buf_avx2(buf1, buf, width);
   2023  } else {
   2024    buf = buf1;
   2025  }
   2026  row_txfm(buf, buf, cos_bit_row);
   2027  round_shift_16bit_w16_avx2(buf, width, shift[2]);
   2028  store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, height, width);
   2029 }
   2030 
   2031 static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output,
   2032                                        int stride, TX_TYPE tx_type, int bd) {
   2033  (void)bd;
   2034  const TX_SIZE tx_size = TX_64X32;
   2035  __m256i buf0[64], buf1[256];
   2036  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   2037  const int txw_idx = get_txw_idx(tx_size);
   2038  const int txh_idx = get_txh_idx(tx_size);
   2039  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   2040  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   2041  const int width = tx_size_wide[tx_size];
   2042  const int height = tx_size_high[tx_size];
   2043  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
   2044  const int width_div16 = (width >> 4);
   2045  const int height_div16 = (height >> 4);
   2046 
   2047  for (int i = 0; i < width_div16; i++) {
   2048    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
   2049    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
   2050    col_txfm(buf0, buf0, cos_bit_col);
   2051    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
   2052    for (int j = 0; j < AOMMIN(4, height_div16); ++j) {
   2053      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
   2054    }
   2055  }
   2056  assert(tx_type == DCT_DCT);
   2057  for (int i = 0; i < AOMMIN(2, height_div16); i++) {
   2058    __m256i bufA[64];
   2059    __m256i bufB[64];
   2060    __m128i *buf = (__m128i *)(buf1 + width * i);
   2061    for (int j = 0; j < width; ++j) {
   2062      bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
   2063      bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
   2064    }
   2065    fdct64_new_avx2(bufA, bufA, cos_bit_row);
   2066    fdct64_new_avx2(bufB, bufB, cos_bit_row);
   2067    round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
   2068    round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
   2069 
   2070    store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
   2071  }
   2072 }
   2073 
   2074 static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output,
   2075                                        int stride, TX_TYPE tx_type, int bd) {
   2076  (void)bd;
   2077  (void)tx_type;
   2078  assert(tx_type == DCT_DCT);
   2079  const TX_SIZE tx_size = TX_32X64;
   2080  __m256i buf0[64], buf1[256];
   2081  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   2082  const int txw_idx = get_txw_idx(tx_size);
   2083  const int txh_idx = get_txh_idx(tx_size);
   2084  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   2085  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   2086  const int width = tx_size_wide[tx_size];
   2087  const int height = tx_size_high[tx_size];
   2088  const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
   2089  const int width_div16 = (width >> 4);
   2090  const int height_div16 = (height >> 4);
   2091 
   2092  for (int i = 0; i < width_div16; i++) {
   2093    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
   2094    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
   2095    col_txfm(buf0, buf0, cos_bit_col);
   2096    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
   2097    for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
   2098      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
   2099    }
   2100  }
   2101 
   2102  for (int i = 0; i < AOMMIN(2, height_div16); i++) {
   2103    __m256i bufA[32];
   2104    __m256i bufB[32];
   2105    __m128i *buf = (__m128i *)(buf1 + width * i);
   2106    for (int j = 0; j < width; ++j) {
   2107      bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
   2108      bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
   2109    }
   2110    fdct32_avx2(bufA, bufA, cos_bit_row);
   2111    fdct32_avx2(bufB, bufB, cos_bit_row);
   2112    round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
   2113    round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
   2114 
   2115    store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
   2116  }
   2117 }
   2118 
   2119 static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output,
   2120                                        int stride, TX_TYPE tx_type, int bd) {
   2121  (void)bd;
   2122  (void)tx_type;
   2123  assert(tx_type == DCT_DCT);
   2124  const TX_SIZE tx_size = TX_16X64;
   2125  __m256i buf0[64], buf1[64];
   2126  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   2127  const int txw_idx = get_txw_idx(tx_size);
   2128  const int txh_idx = get_txh_idx(tx_size);
   2129  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   2130  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   2131  const int width = tx_size_wide[tx_size];
   2132  const int height = tx_size_high[tx_size];
   2133  const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
   2134  const transform_1d_avx2 row_txfm = fdct16x16_new_avx2;
   2135  const int width_div16 = (width >> 4);
   2136  const int height_div16 = (height >> 4);
   2137 
   2138  for (int i = 0; i < width_div16; i++) {
   2139    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
   2140    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
   2141    col_txfm(buf0, buf0, cos_bit_col);
   2142    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
   2143    for (int j = 0; j < height_div16; ++j) {
   2144      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
   2145    }
   2146  }
   2147 
   2148  for (int i = 0; i < AOMMIN(2, height_div16); i++) {
   2149    __m256i *buf = buf1 + width * i;
   2150    row_txfm(buf, buf, cos_bit_row);
   2151    round_shift_16bit_w16_avx2(buf, width, shift[2]);
   2152    store_buffer_16bit_to_32bit_w16_avx2(buf, output + width * i, 32, width);
   2153  }
   2154 }
   2155 
   2156 static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output,
   2157                                        int stride, TX_TYPE tx_type, int bd) {
   2158  (void)bd;
   2159  (void)tx_type;
   2160  assert(tx_type == DCT_DCT);
   2161  const TX_SIZE tx_size = TX_64X16;
   2162  __m256i buf0[64], buf1[64];
   2163  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   2164  const int txw_idx = get_txw_idx(tx_size);
   2165  const int txh_idx = get_txh_idx(tx_size);
   2166  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   2167  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   2168  const int width = tx_size_wide[tx_size];
   2169  const int height = tx_size_high[tx_size];
   2170  const transform_1d_avx2 col_txfm = fdct16x16_new_avx2;
   2171  const transform_1d_avx2 row_txfm = fdct16x64_new_avx2;
   2172  const int width_div16 = (width >> 4);
   2173  const int height_div16 = (height >> 4);
   2174 
   2175  for (int i = 0; i < width_div16; i++) {
   2176    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
   2177    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
   2178    col_txfm(buf0, buf0, cos_bit_col);
   2179    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
   2180    for (int j = 0; j < height_div16; ++j) {
   2181      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
   2182    }
   2183  }
   2184 
   2185  for (int i = 0; i < height_div16; i++) {
   2186    __m256i *buf = buf1 + width * i;
   2187    row_txfm(buf, buf, cos_bit_row);
   2188    round_shift_16bit_w16_avx2(buf, width, shift[2]);
   2189    store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * i, 16, 32);
   2190  }
   2191  // Zero out the bottom 16x32 area.
   2192  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
   2193 }
   2194 
   2195 static inline void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0,
   2196                               __m256i *in1, __m128i *out0, __m128i *out1,
   2197                               __m128i *out2, __m128i *out3,
   2198                               const __m256i *__rounding, int8_t *cos_bit) {
   2199  __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
   2200  __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
   2201  __m256i u0 = _mm256_madd_epi16(t0, *w0);
   2202  __m256i u1 = _mm256_madd_epi16(t1, *w0);
   2203  __m256i v0 = _mm256_madd_epi16(t0, *w1);
   2204  __m256i v1 = _mm256_madd_epi16(t1, *w1);
   2205 
   2206  __m256i a0 = _mm256_add_epi32(u0, *__rounding);
   2207  __m256i a1 = _mm256_add_epi32(u1, *__rounding);
   2208  __m256i b0 = _mm256_add_epi32(v0, *__rounding);
   2209  __m256i b1 = _mm256_add_epi32(v1, *__rounding);
   2210 
   2211  __m256i c0 = _mm256_srai_epi32(a0, *cos_bit);
   2212  __m256i c1 = _mm256_srai_epi32(a1, *cos_bit);
   2213  __m256i d0 = _mm256_srai_epi32(b0, *cos_bit);
   2214  __m256i d1 = _mm256_srai_epi32(b1, *cos_bit);
   2215 
   2216  __m256i temp0 = _mm256_packs_epi32(c0, c1);
   2217  __m256i temp1 = _mm256_packs_epi32(d0, d1);
   2218 
   2219  *out0 = _mm256_castsi256_si128(temp0);
   2220  *out1 = _mm256_castsi256_si128(temp1);
   2221  *out2 = _mm256_extracti128_si256(temp0, 0x01);
   2222  *out3 = _mm256_extracti128_si256(temp1, 0x01);
   2223 }
   2224 
   2225 static inline void fdct8x8_new_avx2(const __m256i *input, __m256i *output,
   2226                                    int8_t cos_bit) {
   2227  const int32_t *cospi = cospi_arr(cos_bit);
   2228  const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
   2229 
   2230  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
   2231  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
   2232  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
   2233  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
   2234  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
   2235  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
   2236  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
   2237  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
   2238  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
   2239 
   2240  // stage 1
   2241  __m256i x1[8];
   2242  x1[0] = _mm256_adds_epi16(input[0], input[7]);
   2243  x1[7] = _mm256_subs_epi16(input[0], input[7]);
   2244  x1[1] = _mm256_adds_epi16(input[1], input[6]);
   2245  x1[6] = _mm256_subs_epi16(input[1], input[6]);
   2246  x1[2] = _mm256_adds_epi16(input[2], input[5]);
   2247  x1[5] = _mm256_subs_epi16(input[2], input[5]);
   2248  x1[3] = _mm256_adds_epi16(input[3], input[4]);
   2249  x1[4] = _mm256_subs_epi16(input[3], input[4]);
   2250 
   2251  // stage 2
   2252  __m256i x2[8];
   2253  x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
   2254  x2[3] = _mm256_subs_epi16(x1[0], x1[3]);
   2255  x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
   2256  x2[2] = _mm256_subs_epi16(x1[1], x1[2]);
   2257  x2[4] = x1[4];
   2258  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding,
   2259                  cos_bit);
   2260  x2[5] = x1[5];
   2261  x2[6] = x1[6];
   2262  x2[7] = x1[7];
   2263 
   2264  // stage 3
   2265  __m256i x3[8];
   2266  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding,
   2267                  cos_bit);
   2268  x3[0] = x2[0];
   2269  x3[1] = x2[1];
   2270  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding,
   2271                  cos_bit);
   2272  x3[2] = x2[2];
   2273  x3[3] = x2[3];
   2274  x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
   2275  x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
   2276  x3[6] = _mm256_subs_epi16(x2[7], x2[6]);
   2277  x3[7] = _mm256_adds_epi16(x2[7], x2[6]);
   2278 
   2279  // stage 4
   2280  __m256i x4[8];
   2281  x4[0] = x3[0];
   2282  x4[1] = x3[1];
   2283  x4[2] = x3[2];
   2284  x4[3] = x3[3];
   2285  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding,
   2286                  cos_bit);
   2287  x4[4] = x3[4];
   2288  x4[7] = x3[7];
   2289  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], __rounding,
   2290                  cos_bit);
   2291  x4[5] = x3[5];
   2292  x4[6] = x3[6];
   2293  // stage 5
   2294  output[0] = x4[0];
   2295  output[1] = x4[4];
   2296  output[2] = x4[2];
   2297  output[3] = x4[6];
   2298  output[4] = x4[1];
   2299  output[5] = x4[5];
   2300  output[6] = x4[3];
   2301  output[7] = x4[7];
   2302 }
   2303 
   2304 static inline void fadst8x8_new_avx2(const __m256i *input, __m256i *output,
   2305                                     int8_t cos_bit) {
   2306  const int32_t *cospi = cospi_arr(cos_bit);
   2307  const __m256i __zero = _mm256_setzero_si256();
   2308  const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
   2309 
   2310  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
   2311  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
   2312  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
   2313  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
   2314  __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
   2315  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
   2316  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
   2317  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
   2318  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
   2319  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
   2320  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
   2321  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
   2322  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
   2323 
   2324  // stage 1
   2325  __m256i x1[8];
   2326  x1[0] = input[0];
   2327  x1[1] = _mm256_subs_epi16(__zero, input[7]);
   2328  x1[2] = _mm256_subs_epi16(__zero, input[3]);
   2329  x1[3] = input[4];
   2330  x1[4] = _mm256_subs_epi16(__zero, input[1]);
   2331  x1[5] = input[6];
   2332  x1[6] = input[2];
   2333  x1[7] = _mm256_subs_epi16(__zero, input[5]);
   2334 
   2335  // stage 2
   2336  __m256i x2[8];
   2337  x2[0] = x1[0];
   2338  x2[1] = x1[1];
   2339  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding,
   2340                  cos_bit);
   2341  x2[2] = x1[2];
   2342  x2[3] = x1[3];
   2343  x2[4] = x1[4];
   2344  x2[5] = x1[5];
   2345  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], __rounding,
   2346                  cos_bit);
   2347  x2[6] = x1[6];
   2348  x2[7] = x1[7];
   2349 
   2350  // stage 3
   2351  __m256i x3[8];
   2352  x3[0] = _mm256_adds_epi16(x2[0], x2[2]);
   2353  x3[2] = _mm256_subs_epi16(x2[0], x2[2]);
   2354  x3[1] = _mm256_adds_epi16(x2[1], x2[3]);
   2355  x3[3] = _mm256_subs_epi16(x2[1], x2[3]);
   2356  x3[4] = _mm256_adds_epi16(x2[4], x2[6]);
   2357  x3[6] = _mm256_subs_epi16(x2[4], x2[6]);
   2358  x3[5] = _mm256_adds_epi16(x2[5], x2[7]);
   2359  x3[7] = _mm256_subs_epi16(x2[5], x2[7]);
   2360 
   2361  // stage 4
   2362  __m256i x4[8];
   2363  x4[0] = x3[0];
   2364  x4[1] = x3[1];
   2365  x4[2] = x3[2];
   2366  x4[3] = x3[3];
   2367  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding,
   2368                  cos_bit);
   2369  x4[4] = x3[4];
   2370  x4[5] = x3[5];
   2371  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding,
   2372                  cos_bit);
   2373  x4[6] = x3[6];
   2374  x4[7] = x3[7];
   2375 
   2376  // stage 5
   2377  __m256i x5[8];
   2378  x5[0] = _mm256_adds_epi16(x4[0], x4[4]);
   2379  x5[4] = _mm256_subs_epi16(x4[0], x4[4]);
   2380  x5[1] = _mm256_adds_epi16(x4[1], x4[5]);
   2381  x5[5] = _mm256_subs_epi16(x4[1], x4[5]);
   2382  x5[2] = _mm256_adds_epi16(x4[2], x4[6]);
   2383  x5[6] = _mm256_subs_epi16(x4[2], x4[6]);
   2384  x5[3] = _mm256_adds_epi16(x4[3], x4[7]);
   2385  x5[7] = _mm256_subs_epi16(x4[3], x4[7]);
   2386 
   2387  // stage 6
   2388  __m256i x6[8];
   2389  btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding,
   2390                  cos_bit);
   2391  x6[0] = x5[0];
   2392  x6[1] = x5[1];
   2393  btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding,
   2394                  cos_bit);
   2395  x6[2] = x5[2];
   2396  x6[3] = x5[3];
   2397  btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding,
   2398                  cos_bit);
   2399  x6[4] = x5[4];
   2400  x6[5] = x5[5];
   2401  btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding,
   2402                  cos_bit);
   2403  x6[6] = x5[6];
   2404  x6[7] = x5[7];
   2405 
   2406  // stage 7
   2407  output[0] = x6[1];
   2408  output[1] = x6[6];
   2409  output[2] = x6[3];
   2410  output[3] = x6[4];
   2411  output[4] = x6[5];
   2412  output[5] = x6[2];
   2413  output[6] = x6[7];
   2414  output[7] = x6[0];
   2415 }
   2416 
   2417 static inline void fidentity8x8_new_avx2(const __m256i *input, __m256i *output,
   2418                                         int8_t cos_bit) {
   2419  (void)cos_bit;
   2420 
   2421  output[0] = _mm256_adds_epi16(input[0], input[0]);
   2422  output[1] = _mm256_adds_epi16(input[1], input[1]);
   2423  output[2] = _mm256_adds_epi16(input[2], input[2]);
   2424  output[3] = _mm256_adds_epi16(input[3], input[3]);
   2425  output[4] = _mm256_adds_epi16(input[4], input[4]);
   2426  output[5] = _mm256_adds_epi16(input[5], input[5]);
   2427  output[6] = _mm256_adds_epi16(input[6], input[6]);
   2428  output[7] = _mm256_adds_epi16(input[7], input[7]);
   2429 }
   2430 
   2431 static inline void fdct8x16_new_avx2(const __m128i *input, __m128i *output,
   2432                                     int8_t cos_bit) {
   2433  const int32_t *cospi = cospi_arr(cos_bit);
   2434  const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
   2435  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
   2436  __m128i temp0, temp1, temp2, temp3;
   2437  __m256i in0, in1;
   2438  __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
   2439  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
   2440  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
   2441  __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
   2442  __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
   2443  __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
   2444  __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
   2445  __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
   2446  __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
   2447  __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
   2448  __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
   2449  __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
   2450  __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
   2451  __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
   2452  __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
   2453  __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
   2454  __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
   2455  __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
   2456 
   2457  __m256i cospi_arr[12];
   2458 
   2459  cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32),
   2460                                         cospi_m32_p32, 0x1);
   2461  cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
   2462                                         cospi_p32_p32, 0x1);
   2463  cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
   2464                                         cospi_p48_p16, 0x1);
   2465  cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
   2466                                         cospi_m16_p48, 0x1);
   2467  cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48),
   2468                                         cospi_m48_m16, 0x1);
   2469  cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16),
   2470                                         cospi_m16_p48, 0x1);
   2471  cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08),
   2472                                         cospi_p24_p40, 0x1);
   2473  cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56),
   2474                                         cospi_m40_p24, 0x1);
   2475  cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04),
   2476                                         cospi_p28_p36, 0x1);
   2477  cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60),
   2478                                         cospi_m36_p28, 0x1);
   2479  cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20),
   2480                                          cospi_p12_p52, 0x1);
   2481  cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44),
   2482                                          cospi_m52_p12, 0x1);
   2483 
   2484  __m256i x[8];
   2485  x[0] =
   2486      _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1);
   2487  x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14],
   2488                                 0x1);
   2489  x[2] =
   2490      _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1);
   2491  x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12],
   2492                                 0x1);
   2493  x[4] =
   2494      _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1);
   2495  x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11],
   2496                                 0x1);
   2497  x[6] =
   2498      _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1);
   2499  x[7] =
   2500      _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1);
   2501 
   2502  // stage 1
   2503  __m256i x1[8];
   2504  x1[0] = _mm256_adds_epi16(x[0], x[1]);
   2505  x1[7] = _mm256_subs_epi16(x[0], x[1]);
   2506  x1[1] = _mm256_adds_epi16(x[2], x[3]);
   2507  x1[6] = _mm256_subs_epi16(x[2], x[3]);
   2508  x1[2] = _mm256_adds_epi16(x[4], x[5]);
   2509  x1[5] = _mm256_subs_epi16(x[4], x[5]);
   2510  x1[3] = _mm256_adds_epi16(x[6], x[7]);
   2511  x1[4] = _mm256_subs_epi16(x[6], x[7]);
   2512 
   2513  // stage 2
   2514  __m256i x2[8];
   2515  x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
   2516  x2[7] = _mm256_subs_epi16(x1[0], x1[3]);
   2517  x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
   2518  x2[6] = _mm256_subs_epi16(x1[1], x1[2]);
   2519  x2[2] = x1[4];
   2520  x2[3] = x1[7];
   2521  btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &x1[5], &x1[6], &temp0, &temp1,
   2522              &temp2, &temp3, &__rounding_256, &cos_bit);
   2523  x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1);
   2524  x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
   2525 
   2526  // stage 3
   2527  __m256i x3[8];
   2528  x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e);
   2529  x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
   2530  x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
   2531  x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0);
   2532  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]),
   2533              _mm256_extractf128_si256(x2[7], 0x01), temp0, temp1);
   2534  x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1);
   2535  x3[3] = _mm256_adds_epi16(x2[2], x2[4]);
   2536  x3[4] = _mm256_subs_epi16(x2[2], x2[4]);
   2537  x3[5] = _mm256_adds_epi16(x2[3], x2[5]);
   2538  x3[6] = _mm256_subs_epi16(x2[3], x2[5]);
   2539 
   2540  // stage 4
   2541  __m256i x4[8];
   2542  x4[0] = _mm256_blend_epi32(x3[0], x3[1], 0xf0);
   2543  x4[1] = _mm256_permute2f128_si256(x3[0], x3[1], 0x21);
   2544  btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &x4[0], &x4[1], &output[0],
   2545              &output[8], &output[4], &output[12], &__rounding_256, &cos_bit);
   2546  x4[2] = _mm256_adds_epi16(x3[2], x3[7]);
   2547  x4[3] = _mm256_subs_epi16(x3[2], x3[7]);
   2548  x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20);
   2549  x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20);
   2550  in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31);
   2551  in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31);
   2552  btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
   2553              &temp3, &__rounding_256, &cos_bit);
   2554 
   2555  x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1);
   2556  x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
   2557 
   2558  // stage 5
   2559  __m256i x5[4];
   2560  in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31);
   2561  in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20);
   2562  btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &output[2], &output[14],
   2563              &output[10], &output[6], &__rounding_256, &cos_bit);
   2564  x5[0] = _mm256_adds_epi16(x4[4], x4[6]);
   2565  x5[1] = _mm256_subs_epi16(x4[4], x4[6]);
   2566  x5[2] = _mm256_adds_epi16(x4[5], x4[7]);
   2567  x5[3] = _mm256_subs_epi16(x4[5], x4[7]);
   2568 
   2569  // stage 6
   2570  in0 = _mm256_permute2f128_si256(x5[0], x5[1], 0x20);
   2571  in1 = _mm256_permute2f128_si256(x5[2], x5[3], 0x31);
   2572  btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &output[1], &output[15],
   2573              &output[9], &output[7], &__rounding_256, &cos_bit);
   2574  in0 = _mm256_permute2f128_si256(x5[1], x5[0], 0x31);
   2575  in1 = _mm256_permute2f128_si256(x5[3], x5[2], 0x20);
   2576  btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &output[5],
   2577              &output[11], &output[13], &output[3], &__rounding_256, &cos_bit);
   2578 }
   2579 
   2580 static inline void fadst8x16_new_avx2(const __m128i *input, __m128i *output,
   2581                                      int8_t cos_bit) {
   2582  const int32_t *cospi = cospi_arr(cos_bit);
   2583  const __m256i __zero = _mm256_setzero_si256();
   2584  const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
   2585  __m256i in0, in1;
   2586  __m128i temp0, temp1, temp2, temp3;
   2587 
   2588  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
   2589  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
   2590  __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
   2591  __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
   2592  __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
   2593  __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
   2594  __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
   2595  __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
   2596  __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
   2597  __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
   2598  __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
   2599  __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
   2600  __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
   2601  __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
   2602  __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
   2603  __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
   2604  __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
   2605  __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
   2606  __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
   2607  __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
   2608  __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
   2609  __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
   2610  __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
   2611  __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
   2612  __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
   2613  __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
   2614  __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
   2615 
   2616  __m256i cospi_arr[20];
   2617 
   2618  cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
   2619                                         cospi_p32_p32, 0x1);
   2620  cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
   2621                                         cospi_p32_m32, 0x1);
   2622  cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
   2623                                         cospi_p32_p32, 0x1);
   2624  cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
   2625                                         cospi_p32_m32, 0x1);
   2626  cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
   2627                                         cospi_m48_p16, 0x1);
   2628  cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
   2629                                         cospi_p16_p48, 0x1);
   2630  cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
   2631                                         cospi_m48_p16, 0x1);
   2632  cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
   2633                                         cospi_p16_p48, 0x1);
   2634  cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
   2635                                         cospi_p40_p24, 0x1);
   2636  cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08),
   2637                                         cospi_p24_m40, 0x1);
   2638  cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08),
   2639                                          cospi_m24_p40, 0x1);
   2640  cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
   2641                                          cospi_p40_p24, 0x1);
   2642  cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62),
   2643                                          cospi_p10_p54, 0x1);
   2644  cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02),
   2645                                          cospi_p54_m10, 0x1);
   2646  cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46),
   2647                                          cospi_p26_p38, 0x1);
   2648  cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18),
   2649                                          cospi_p38_m26, 0x1);
   2650  cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30),
   2651                                          cospi_p42_p22, 0x1);
   2652  cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34),
   2653                                          cospi_p22_m42, 0x1);
   2654  cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14),
   2655                                          cospi_p58_p06, 0x1);
   2656  cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50),
   2657                                          cospi_p06_m58, 0x1);
   2658 
   2659  __m256i x[8];
   2660  x[0] =
   2661      _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1);
   2662  x[1] =
   2663      _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1);
   2664  x[2] =
   2665      _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1);
   2666  x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14],
   2667                                 0x1);
   2668  x[4] =
   2669      _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1);
   2670  x[5] =
   2671      _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1);
   2672  x[6] =
   2673      _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1);
   2674  x[7] =
   2675      _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1);
   2676 
   2677  // stage 1
   2678  __m256i x1[8];
   2679  x1[0] = x[0];
   2680  x1[1] = _mm256_subs_epi16(__zero, x[7]);
   2681  x1[2] = x[2];
   2682  x1[3] = _mm256_subs_epi16(__zero, x[5]);
   2683  x1[4] = _mm256_subs_epi16(__zero, x[4]);
   2684  x1[5] = x[3];
   2685  x1[6] = _mm256_subs_epi16(__zero, x[6]);
   2686  x1[7] = x[1];
   2687 
   2688  // stage 2
   2689  __m256i x2[8];
   2690  x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0);
   2691  x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0);
   2692  x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0);
   2693  x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0);
   2694  in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0);
   2695  in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0);
   2696  btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &in0, &in1, &temp0, &temp1, &temp2,
   2697              &temp3, &__rounding_256, &cos_bit);
   2698  x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
   2699  x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
   2700  in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21);
   2701  in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21);
   2702  btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &in0, &in1, &temp0, &temp1, &temp2,
   2703              &temp3, &__rounding_256, &cos_bit);
   2704  x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
   2705  x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
   2706 
   2707  // stage 3
   2708  __m256i x3[8];
   2709  x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
   2710  x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
   2711  x3[2] = _mm256_adds_epi16(x2[3], x2[2]);
   2712  x3[3] = _mm256_subs_epi16(x2[3], x2[2]);
   2713  x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
   2714  x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
   2715  x3[6] = _mm256_adds_epi16(x2[7], x2[6]);
   2716  x3[7] = _mm256_subs_epi16(x2[7], x2[6]);
   2717 
   2718  // stage 4
   2719  __m256i x4[8];
   2720  x4[0] = x3[0];
   2721  x4[1] = x3[1];
   2722  x4[4] = x3[4];
   2723  x4[5] = x3[5];
   2724  in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20);
   2725  in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31);
   2726  btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
   2727              &temp3, &__rounding_256, &cos_bit);
   2728  x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
   2729  x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
   2730  in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20);
   2731  in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31);
   2732  btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &temp0, &temp1, &temp2,
   2733              &temp3, &__rounding_256, &cos_bit);
   2734  x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
   2735  x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
   2736 
   2737  // stage 5
   2738  __m256i x5[8];
   2739  x5[0] = _mm256_adds_epi16(x4[0], x4[2]);
   2740  x5[1] = _mm256_subs_epi16(x4[0], x4[2]);
   2741  x5[2] = _mm256_adds_epi16(x4[1], x4[3]);
   2742  x5[3] = _mm256_subs_epi16(x4[1], x4[3]);
   2743  x5[4] = _mm256_adds_epi16(x4[4], x4[6]);
   2744  x5[5] = _mm256_subs_epi16(x4[4], x4[6]);
   2745  x5[6] = _mm256_adds_epi16(x4[5], x4[7]);
   2746  x5[7] = _mm256_subs_epi16(x4[5], x4[7]);
   2747 
   2748  // stage 6
   2749  __m256i x6[8];
   2750  x6[0] = x5[0];
   2751  x6[1] = x5[2];
   2752  x6[2] = x5[1];
   2753  x6[3] = x5[3];
   2754  in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20);
   2755  in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31);
   2756  btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &temp0, &temp1, &temp2,
   2757              &temp3, &__rounding_256, &cos_bit);
   2758  x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
   2759  x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
   2760  in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20);
   2761  in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31);
   2762  btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &temp0, &temp1,
   2763              &temp2, &temp3, &__rounding_256, &cos_bit);
   2764  x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
   2765  x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
   2766 
   2767  // stage 7
   2768  __m256i x7[8];
   2769  x7[0] = _mm256_adds_epi16(x6[0], x6[4]);
   2770  x7[1] = _mm256_subs_epi16(x6[0], x6[4]);
   2771  x7[2] = _mm256_adds_epi16(x6[1], x6[5]);
   2772  x7[3] = _mm256_subs_epi16(x6[1], x6[5]);
   2773  x7[4] = _mm256_adds_epi16(x6[2], x6[6]);
   2774  x7[5] = _mm256_subs_epi16(x6[2], x6[6]);
   2775  x7[6] = _mm256_adds_epi16(x6[3], x6[7]);
   2776  x7[7] = _mm256_subs_epi16(x6[3], x6[7]);
   2777 
   2778  // stage 8
   2779  in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20);
   2780  in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31);
   2781  btf_16_avx2(&cospi_arr[12], &cospi_arr[13], &in0, &in1, &output[15],
   2782              &output[0], &output[13], &output[2], &__rounding_256, &cos_bit);
   2783  in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20);
   2784  in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31);
   2785  btf_16_avx2(&cospi_arr[14], &cospi_arr[15], &in0, &in1, &output[11],
   2786              &output[4], &output[9], &output[6], &__rounding_256, &cos_bit);
   2787  in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20);
   2788  in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31);
   2789  btf_16_avx2(&cospi_arr[16], &cospi_arr[17], &in0, &in1, &output[7],
   2790              &output[8], &output[5], &output[10], &__rounding_256, &cos_bit);
   2791  in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20);
   2792  in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31);
   2793  btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3],
   2794              &output[12], &output[1], &output[14], &__rounding_256, &cos_bit);
   2795 }
   2796 
   2797 static inline void fidentity8x16_new_avx2(const __m128i *input, __m128i *output,
   2798                                          int8_t cos_bit) {
   2799  (void)cos_bit;
   2800  const __m256i one = _mm256_set1_epi16(1);
   2801  __m256i temp;
   2802  for (int i = 0; i < 16; i += 2) {
   2803    temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]),
   2804                                   input[i + 1], 0x1);
   2805    const __m256i a_lo = _mm256_unpacklo_epi16(temp, one);
   2806    const __m256i a_hi = _mm256_unpackhi_epi16(temp, one);
   2807    const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
   2808    const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
   2809    temp = _mm256_packs_epi32(b_lo, b_hi);
   2810    output[i] = _mm256_castsi256_si128(temp);
   2811    output[i + 1] = _mm256_extractf128_si256(temp, 0x1);
   2812  }
   2813 }
   2814 
   2815 static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = {
   2816  fdct8x8_new_avx2,       // DCT_DCT
   2817  fdct8x8_new_avx2,       // ADST_DCT
   2818  fadst8x8_new_avx2,      // DCT_ADST
   2819  fadst8x8_new_avx2,      // ADST_ADST
   2820  fdct8x8_new_avx2,       // FLIPADST_DCT
   2821  fadst8x8_new_avx2,      // DCT_FLIPADST
   2822  fadst8x8_new_avx2,      // FLIPADST_FLIPADST
   2823  fadst8x8_new_avx2,      // ADST_FLIPADST
   2824  fadst8x8_new_avx2,      // FLIPADST_ADST
   2825  fidentity8x8_new_avx2,  // IDTX
   2826  fidentity8x8_new_avx2,  // V_DCT
   2827  fdct8x8_new_avx2,       // H_DCT
   2828  fidentity8x8_new_avx2,  // V_ADST
   2829  fadst8x8_new_avx2,      // H_ADST
   2830  fidentity8x8_new_avx2,  // V_FLIPADST
   2831  fadst8x8_new_avx2       // H_FLIPADST
   2832 };
   2833 
   2834 static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
   2835  fdct8x16_new_avx2,       // DCT_DCT
   2836  fadst8x16_new_avx2,      // ADST_DCT
   2837  fdct8x16_new_avx2,       // DCT_ADST
   2838  fadst8x16_new_avx2,      // ADST_ADST
   2839  fadst8x16_new_avx2,      // FLIPADST_DCT
   2840  fdct8x16_new_avx2,       // DCT_FLIPADST
   2841  fadst8x16_new_avx2,      // FLIPADST_FLIPADST
   2842  fadst8x16_new_avx2,      // ADST_FLIPADST
   2843  fadst8x16_new_avx2,      // FLIPADST_ADST
   2844  fidentity8x16_new_avx2,  // IDTX
   2845  fdct8x16_new_avx2,       // V_DCT
   2846  fidentity8x16_new_avx2,  // H_DCT
   2847  fadst8x16_new_avx2,      // V_ADST
   2848  fidentity8x16_new_avx2,  // H_ADST
   2849  fadst8x16_new_avx2,      // V_FLIPADST
   2850  fidentity8x16_new_avx2   // H_FLIPADST
   2851 };
   2852 
   2853 static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = {
   2854  fdct8x8_new_avx2,       // DCT_DCT
   2855  fadst8x8_new_avx2,      // ADST_DCT
   2856  fdct8x8_new_avx2,       // DCT_ADST
   2857  fadst8x8_new_avx2,      // ADST_ADST
   2858  fadst8x8_new_avx2,      // FLIPADST_DCT
   2859  fdct8x8_new_avx2,       // DCT_FLIPADST
   2860  fadst8x8_new_avx2,      // FLIPADST_FLIPADST
   2861  fadst8x8_new_avx2,      // ADST_FLIPADST
   2862  fadst8x8_new_avx2,      // FLIPADST_ADST
   2863  fidentity8x8_new_avx2,  // IDTX
   2864  fdct8x8_new_avx2,       // V_DCT
   2865  fidentity8x8_new_avx2,  // H_DCT
   2866  fadst8x8_new_avx2,      // V_ADST
   2867  fidentity8x8_new_avx2,  // H_ADST
   2868  fadst8x8_new_avx2,      // V_FLIPADST
   2869  fidentity8x8_new_avx2,  // H_FLIPADST
   2870 };
   2871 
   2872 static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = {
   2873  fdct8x16_new_avx2,       // DCT_DCT
   2874  fdct8x16_new_avx2,       // ADST_DCT
   2875  fadst8x16_new_avx2,      // DCT_ADST
   2876  fadst8x16_new_avx2,      // ADST_ADST
   2877  fdct8x16_new_avx2,       // FLIPADST_DCT
   2878  fadst8x16_new_avx2,      // DCT_FLIPADST
   2879  fadst8x16_new_avx2,      // FLIPADST_FLIPADST
   2880  fadst8x16_new_avx2,      // ADST_FLIPADST
   2881  fadst8x16_new_avx2,      // FLIPADST_ADST
   2882  fidentity8x16_new_avx2,  // IDTX
   2883  fidentity8x16_new_avx2,  // V_DCT
   2884  fdct8x16_new_avx2,       // H_DCT
   2885  fidentity8x16_new_avx2,  // V_ADST
   2886  fadst8x16_new_avx2,      // H_ADST
   2887  fidentity8x16_new_avx2,  // V_FLIPADST
   2888  fadst8x16_new_avx2       // H_FLIPADST
   2889 };
   2890 
   2891 static void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output,
   2892                                       int stride, TX_TYPE tx_type, int bd) {
   2893  (void)bd;
   2894  __m128i buf0[16], buf1[16];
   2895  __m256i buf2[8];
   2896  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
   2897  const int txw_idx = get_txw_idx(TX_8X16);
   2898  const int txh_idx = get_txh_idx(TX_8X16);
   2899  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   2900  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   2901  const int width = 8;
   2902  const int height = 16;
   2903  const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
   2904  const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type];
   2905  int ud_flip, lr_flip;
   2906 
   2907  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2908  if (ud_flip) {
   2909    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
   2910  } else {
   2911    load_buffer_16bit_to_16bit(input, stride, buf0, height);
   2912  }
   2913  round_shift_16bit(buf0, height, shift[0]);
   2914  col_txfm(buf0, buf0, cos_bit_col);
   2915  round_shift_16bit(buf0, height, shift[1]);
   2916  transpose_16bit_8x8(buf0, buf1);
   2917  transpose_16bit_8x8(buf0 + 8, buf1 + 8);
   2918 
   2919  __m128i *bufl, *bufu;
   2920  if (lr_flip) {
   2921    bufl = buf0;
   2922    bufu = buf0 + 8;
   2923    flip_buf_sse2(buf1 + width * 0, bufl, width);
   2924    flip_buf_sse2(buf1 + width * 1, bufu, width);
   2925  } else {
   2926    bufl = buf1 + width * 0;
   2927    bufu = buf1 + width * 1;
   2928  }
   2929  pack_reg(bufl, bufu, buf2);
   2930  row_txfm(buf2, buf2, cos_bit_row);
   2931  round_shift_16bit_w16_avx2(buf2, width, shift[2]);
   2932  store_rect_buffer_16bit_to_32bit_w16_avx2(buf2, output, height, width);
   2933 }
   2934 
   2935 static void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output,
   2936                                       int stride, TX_TYPE tx_type, int bd) {
   2937  (void)bd;
   2938  __m128i buf0[16], buf1[16];
   2939  __m256i buf2[8];
   2940  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
   2941  const int txw_idx = get_txw_idx(TX_16X8);
   2942  const int txh_idx = get_txh_idx(TX_16X8);
   2943  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   2944  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   2945  const int width = 16;
   2946  const int height = 8;
   2947  const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type];
   2948  const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type];
   2949  __m128i *buf;
   2950  int ud_flip, lr_flip;
   2951 
   2952  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2953 
   2954  if (ud_flip) {
   2955    load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height);
   2956    load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], height);
   2957  } else {
   2958    load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height);
   2959    load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height);
   2960  }
   2961  pack_reg(buf0, &buf0[8], buf2);
   2962  round_shift_16bit_w16_avx2(buf2, height, shift[0]);
   2963  col_txfm(buf2, buf2, cos_bit_col);
   2964  round_shift_16bit_w16_avx2(buf2, height, shift[1]);
   2965  transpose_16bit_16x8_avx2(buf2, buf2);
   2966  extract_reg(buf2, buf1);
   2967 
   2968  if (lr_flip) {
   2969    buf = buf0;
   2970    flip_buf_sse2(buf1, buf, width);
   2971  } else {
   2972    buf = buf1;
   2973  }
   2974  row_txfm(buf, buf, cos_bit_row);
   2975  round_shift_16bit(buf, width, shift[2]);
   2976  store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width);
   2977 }
   2978 
   2979 static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
   2980  av1_lowbd_fwd_txfm2d_4x4_sse2,   // 4x4 transform
   2981  av1_lowbd_fwd_txfm2d_8x8_avx2,   // 8x8 transform
   2982  lowbd_fwd_txfm2d_16x16_avx2,     // 16x16 transform
   2983  lowbd_fwd_txfm2d_32x32_avx2,     // 32x32 transform
   2984  lowbd_fwd_txfm2d_64x64_avx2,     // 64x64 transform
   2985  av1_lowbd_fwd_txfm2d_4x8_sse2,   // 4x8 transform
   2986  av1_lowbd_fwd_txfm2d_8x4_sse2,   // 8x4 transform
   2987  lowbd_fwd_txfm2d_8x16_avx2,      // 8x16 transform
   2988  lowbd_fwd_txfm2d_16x8_avx2,      // 16x8 transform
   2989  lowbd_fwd_txfm2d_16x32_avx2,     // 16x32 transform
   2990  lowbd_fwd_txfm2d_32x16_avx2,     // 32x16 transform
   2991  lowbd_fwd_txfm2d_32x64_avx2,     // 32x64 transform
   2992  lowbd_fwd_txfm2d_64x32_avx2,     // 64x32 transform
   2993  av1_lowbd_fwd_txfm2d_4x16_sse2,  // 4x16 transform
   2994  av1_lowbd_fwd_txfm2d_16x4_sse2,  // 16x4 transform
   2995  av1_lowbd_fwd_txfm2d_8x32_sse2,  // 8x32 transform
   2996  av1_lowbd_fwd_txfm2d_32x8_sse2,  // 32x8 transform
   2997  lowbd_fwd_txfm2d_16x64_avx2,     // 16x64 transform
   2998  lowbd_fwd_txfm2d_64x16_avx2,     // 64x16 transform
   2999 };
   3000 
   3001 void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff,
   3002                             int diff_stride, TxfmParam *txfm_param) {
   3003  FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
   3004  if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
   3005    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
   3006  } else {
   3007    fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
   3008                    txfm_param->bd);
   3009  }
   3010 }