tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

av1_inv_txfm_avx2.c (89957B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include "config/aom_config.h"
     13 
     14 #include "config/av1_rtcd.h"
     15 
     16 #include "av1/common/av1_inv_txfm1d_cfg.h"
     17 #include "av1/common/x86/av1_txfm_sse2.h"
     18 #include "av1/common/x86/av1_inv_txfm_avx2.h"
     19 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
     20 
     21 // TODO(venkatsanampudi@ittiam.com): move this to header file
     22 
// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
// Fixed-point powers of sqrt(2) in Q12 (5793 ~= sqrt(2) * 4096), one entry
// per transform size. NOTE(review): presumably used to rescale rectangular
// transforms whose dimensions differ by a power of two — confirm at callers.
static const int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793,
                                                4 * 4096, 4 * 5793 };
     26 
// Stage 5 of the 16-point inverse DCT: add/sub butterflies on x1[0..3]
// and on the x1[8..15] group, plus a +/-cospi[32] rotation of the
// (x1[5], x1[6]) pair. _r is the fixed-point rounding offset.
static inline void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);

  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
  // Note the swapped operand order (hi, lo) for the upper pairs.
  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
}
     40 
// Stage 6 of the 16-point inverse DCT: mirrored add/sub butterflies on
// x[0..7] and +/-cospi[32] rotations of the (10,13) and (11,12) pairs.
static inline void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
}
     52 
     53 static inline void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
     54  btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]);
     55  btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]);
     56  btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]);
     57  btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]);
     58  btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]);
     59  btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]);
     60  btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]);
     61  btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
     62 }
     63 
// Full 16-point inverse DCT over 16 lanes of 16-bit coefficients.
// input/output are arrays of 16 __m256i rows; stage 1 consumes the input
// rows in bit-reversed order, then seven butterfly stages refine x1.
static void idct16_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset: 0.5 in Q(INV_COS_BIT), applied by the rotations.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed (even, odd) cosine-pair constants for the rotations below;
  // naming: pNN = +cospi[NN], mNN = -cospi[NN].
  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);

  // stage 1: bit-reversed permutation of the input rows.
  __m256i x1[16];
  x1[0] = input[0];
  x1[1] = input[8];
  x1[2] = input[4];
  x1[3] = input[12];
  x1[4] = input[2];
  x1[5] = input[10];
  x1[6] = input[6];
  x1[7] = input[14];
  x1[8] = input[1];
  x1[9] = input[9];
  x1[10] = input[5];
  x1[11] = input[13];
  x1[12] = input[3];
  x1[13] = input[11];
  x1[14] = input[7];
  x1[15] = input[15];

  // stage 2: rotations on the odd-coefficient half (x1[8..15]).
  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
                  INV_COS_BIT);

  // stage 3: rotations on x1[4..7] and add/sub butterflies on x1[8..15].
  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
                  INV_COS_BIT);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);

  // stage 4: rotations on x1[0..3] and the (9,14)/(10,13) cross pairs.
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
                  INV_COS_BIT);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
                  INV_COS_BIT);

  // stages 5-7 are shared with the low-coefficient variants.
  idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
  idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  idct16_stage7_avx2(output, x1);
}
    143 
// 16-point inverse DCT specialised for inputs whose upper 8 coefficient
// rows are zero. Stages 2-4 use the single-input (_0) butterfly variant,
// in which the zero partner lane is implicit.
static void idct16_low8_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);

  // stage 1: bit-reversed load of the 8 available rows.
  __m256i x1[16];
  x1[0] = input[0];
  x1[2] = input[4];
  x1[4] = input[2];
  x1[6] = input[6];
  x1[8] = input[1];
  x1[10] = input[5];
  x1[12] = input[3];
  x1[14] = input[7];

  // stage 2: one-sided rotations populating the missing odd lanes.
  btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]);

  // stage 3
  btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);

  // stage 4
  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
                  INV_COS_BIT);

  // stages 5-7 are shared with the full-input path.
  idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
  idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  idct16_stage7_avx2(output, x1);
}
    191 
    192 static void idct16_low1_avx2(const __m256i *input, __m256i *output) {
    193  const int32_t *cospi = cospi_arr(INV_COS_BIT);
    194 
    195  // stage 1
    196  __m256i x1[2];
    197  x1[0] = input[0];
    198 
    199  // stage 2
    200  // stage 3
    201  // stage 4
    202  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
    203 
    204  // stage 5
    205  // stage 6
    206  output[0] = x1[0];
    207  output[1] = x1[1];
    208  output[2] = x1[1];
    209  output[3] = x1[0];
    210  output[4] = x1[0];
    211  output[5] = x1[1];
    212  output[6] = x1[1];
    213  output[7] = x1[0];
    214  output[8] = x1[0];
    215  output[9] = x1[1];
    216  output[10] = x1[1];
    217  output[11] = x1[0];
    218  output[12] = x1[0];
    219  output[13] = x1[1];
    220  output[14] = x1[1];
    221  output[15] = x1[0];
    222 }
    223 
    224 static inline void iadst16_stage3_avx2(__m256i *x) {
    225  btf_16_adds_subs_avx2(&x[0], &x[8]);
    226  btf_16_adds_subs_avx2(&x[1], &x[9]);
    227  btf_16_adds_subs_avx2(&x[2], &x[10]);
    228  btf_16_adds_subs_avx2(&x[3], &x[11]);
    229  btf_16_adds_subs_avx2(&x[4], &x[12]);
    230  btf_16_adds_subs_avx2(&x[5], &x[13]);
    231  btf_16_adds_subs_avx2(&x[6], &x[14]);
    232  btf_16_adds_subs_avx2(&x[7], &x[15]);
    233 }
    234 
// Stage 4 of the 16-point inverse ADST: fixed-angle rotations on the four
// adjacent pairs of the upper half, x[8..15].
static inline void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
  const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit);
  btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit);
}
    248 
    249 static inline void iadst16_stage5_avx2(__m256i *x) {
    250  btf_16_adds_subs_avx2(&x[0], &x[4]);
    251  btf_16_adds_subs_avx2(&x[1], &x[5]);
    252  btf_16_adds_subs_avx2(&x[2], &x[6]);
    253  btf_16_adds_subs_avx2(&x[3], &x[7]);
    254  btf_16_adds_subs_avx2(&x[8], &x[12]);
    255  btf_16_adds_subs_avx2(&x[9], &x[13]);
    256  btf_16_adds_subs_avx2(&x[10], &x[14]);
    257  btf_16_adds_subs_avx2(&x[11], &x[15]);
    258 }
    259 
// Stage 6 of the 16-point inverse ADST: cospi[16]/cospi[48] rotations on
// the (4,5)/(6,7) and (12,13)/(14,15) pairs.
static inline void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit);
}
    270 
    271 static inline void iadst16_stage7_avx2(__m256i *x) {
    272  btf_16_adds_subs_avx2(&x[0], &x[2]);
    273  btf_16_adds_subs_avx2(&x[1], &x[3]);
    274  btf_16_adds_subs_avx2(&x[4], &x[6]);
    275  btf_16_adds_subs_avx2(&x[5], &x[7]);
    276  btf_16_adds_subs_avx2(&x[8], &x[10]);
    277  btf_16_adds_subs_avx2(&x[9], &x[11]);
    278  btf_16_adds_subs_avx2(&x[12], &x[14]);
    279  btf_16_adds_subs_avx2(&x[13], &x[15]);
    280 }
    281 
    282 static inline void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
    283                                       const __m256i _r, int8_t cos_bit) {
    284  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
    285  const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
    286  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
    287  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
    288  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
    289  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
    290 }
    291 
// Final ADST stage: write the outputs through a fixed permutation of x1,
// negating every odd-indexed output (0 - value, with saturation).
static inline void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
  const __m256i __zero = _mm256_setzero_si256();
  output[0] = x1[0];
  output[1] = _mm256_subs_epi16(__zero, x1[8]);
  output[2] = x1[12];
  output[3] = _mm256_subs_epi16(__zero, x1[4]);
  output[4] = x1[6];
  output[5] = _mm256_subs_epi16(__zero, x1[14]);
  output[6] = x1[10];
  output[7] = _mm256_subs_epi16(__zero, x1[2]);
  output[8] = x1[3];
  output[9] = _mm256_subs_epi16(__zero, x1[11]);
  output[10] = x1[15];
  output[11] = _mm256_subs_epi16(__zero, x1[7]);
  output[12] = x1[5];
  output[13] = _mm256_subs_epi16(__zero, x1[13]);
  output[14] = x1[9];
  output[15] = _mm256_subs_epi16(__zero, x1[1]);
}
    311 
// Full 16-point inverse ADST over 16 lanes of 16-bit coefficients.
// Stage 1 loads the input rows in the ADST interleave order, stage 2
// applies per-pair rotations, and stages 3-9 are shared helpers.
static void iadst16_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // Rounding offset: 0.5 in Q(INV_COS_BIT).
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed cosine-pair constants for the stage-2 rotations.
  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);

  // stage 1: ADST input interleave (alternating high/low rows).
  __m256i x1[16];
  x1[0] = input[15];
  x1[1] = input[0];
  x1[2] = input[13];
  x1[3] = input[2];
  x1[4] = input[11];
  x1[5] = input[4];
  x1[6] = input[9];
  x1[7] = input[6];
  x1[8] = input[7];
  x1[9] = input[8];
  x1[10] = input[5];
  x1[11] = input[10];
  x1[12] = input[3];
  x1[13] = input[12];
  x1[14] = input[1];
  x1[15] = input[14];

  // stage 2: rotate each adjacent pair by its own angle.
  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r,
                  INV_COS_BIT);

  // stages 3-9: shared with the low-coefficient variants.
  iadst16_stage3_avx2(x1);
  iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
  iadst16_stage5_avx2(x1);
  iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  iadst16_stage7_avx2(x1);
  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
  iadst16_stage9_avx2(output, x1);
}
    379 
    380 static void iadst16_low8_avx2(const __m256i *input, __m256i *output) {
    381  const int32_t *cospi = cospi_arr(INV_COS_BIT);
    382  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
    383 
    384  // stage 1
    385  __m256i x1[16];
    386  x1[1] = input[0];
    387  x1[3] = input[2];
    388  x1[5] = input[4];
    389  x1[7] = input[6];
    390  x1[8] = input[7];
    391  x1[10] = input[5];
    392  x1[12] = input[3];
    393  x1[14] = input[1];
    394 
    395  // stage 2
    396  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
    397  btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
    398  btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
    399  btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
    400  btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
    401  btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
    402  btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
    403  btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]);
    404 
    405  iadst16_stage3_avx2(x1);
    406  iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
    407  iadst16_stage5_avx2(x1);
    408  iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
    409  iadst16_stage7_avx2(x1);
    410  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
    411  iadst16_stage9_avx2(output, x1);
    412 }
    413 
// 16-point inverse ADST where only input[0] is non-zero. Stages that
// would merely copy values are materialised as direct assignments, so
// only the rotations that actually mix the surviving path execute.
static void iadst16_low1_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);

  // stage 1
  __m256i x1[16];
  x1[1] = input[0];

  // stage 2: one-sided rotation of the only non-zero pair.
  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);

  // stage 3: the stride-8 butterfly degenerates to copies.
  x1[8] = x1[0];
  x1[9] = x1[1];

  // stage 4
  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r,
                  INV_COS_BIT);

  // stage 5: stride-4 butterfly degenerates to copies.
  x1[4] = x1[0];
  x1[5] = x1[1];

  x1[12] = x1[8];
  x1[13] = x1[9];

  // stage 6
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r,
                  INV_COS_BIT);

  // stage 7: stride-2 butterfly degenerates to copies.
  x1[2] = x1[0];
  x1[3] = x1[1];
  x1[6] = x1[4];
  x1[7] = x1[5];
  x1[10] = x1[8];
  x1[11] = x1[9];
  x1[14] = x1[12];
  x1[15] = x1[13];

  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
  iadst16_stage9_avx2(output, x1);
}
    464 
    465 static inline void idct32_high16_stage3_avx2(__m256i *x) {
    466  btf_16_adds_subs_avx2(&x[16], &x[17]);
    467  btf_16_adds_subs_avx2(&x[19], &x[18]);
    468  btf_16_adds_subs_avx2(&x[20], &x[21]);
    469  btf_16_adds_subs_avx2(&x[23], &x[22]);
    470  btf_16_adds_subs_avx2(&x[24], &x[25]);
    471  btf_16_adds_subs_avx2(&x[27], &x[26]);
    472  btf_16_adds_subs_avx2(&x[28], &x[29]);
    473  btf_16_adds_subs_avx2(&x[31], &x[30]);
    474 }
    475 
// Stage 4 of the 32-point inverse DCT for the high 16 registers:
// cospi[8]/cospi[56] and cospi[24]/cospi[40] rotations on the mirrored
// pairs (17,30), (18,29), (21,26), (22,25).
static inline void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
}
    489 
// Stage 5 of the 32-point inverse DCT touching x[9..30]: cospi[16]/cospi[48]
// rotations on the (9,14)/(10,13) pairs, then add/sub butterflies across
// the high 16 registers (note the swapped (hi, lo) order on half of them).
static inline void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[16], &x[19]);
  btf_16_adds_subs_avx2(&x[17], &x[18]);
  btf_16_adds_subs_avx2(&x[23], &x[20]);
  btf_16_adds_subs_avx2(&x[22], &x[21]);
  btf_16_adds_subs_avx2(&x[24], &x[27]);
  btf_16_adds_subs_avx2(&x[25], &x[26]);
  btf_16_adds_subs_avx2(&x[31], &x[28]);
  btf_16_adds_subs_avx2(&x[30], &x[29]);
}
    506 
// Stage 6 of the 32-point inverse DCT touching x[5..29]: a cospi[32]
// rotation of (5,6), add/sub butterflies on x[8..15], and four
// cospi[16]/cospi[48] rotations on mirrored high pairs.
static inline void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[8], &x[11]);
  btf_16_adds_subs_avx2(&x[9], &x[10]);
  btf_16_adds_subs_avx2(&x[15], &x[12]);
  btf_16_adds_subs_avx2(&x[14], &x[13]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
}
    524 
// Stage 7 of the 32-point inverse DCT: mirrored add/sub butterflies on
// x[0..7] and x[16..31], plus cospi[32] rotations of (10,13) and (11,12).
static inline void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[16], &x[23]);
  btf_16_adds_subs_avx2(&x[17], &x[22]);
  btf_16_adds_subs_avx2(&x[18], &x[21]);
  btf_16_adds_subs_avx2(&x[19], &x[20]);
  // Upper group uses (hi, lo) operand order.
  btf_16_adds_subs_avx2(&x[31], &x[24]);
  btf_16_adds_subs_avx2(&x[30], &x[25]);
  btf_16_adds_subs_avx2(&x[29], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[27]);
}
    544 
    545 static inline void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
    546                                      const __m256i _r, int8_t cos_bit) {
    547  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
    548  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
    549  btf_16_adds_subs_avx2(&x[0], &x[15]);
    550  btf_16_adds_subs_avx2(&x[1], &x[14]);
    551  btf_16_adds_subs_avx2(&x[2], &x[13]);
    552  btf_16_adds_subs_avx2(&x[3], &x[12]);
    553  btf_16_adds_subs_avx2(&x[4], &x[11]);
    554  btf_16_adds_subs_avx2(&x[5], &x[10]);
    555  btf_16_adds_subs_avx2(&x[6], &x[9]);
    556  btf_16_adds_subs_avx2(&x[7], &x[8]);
    557  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
    558  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
    559  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
    560  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
    561 }
    562 
    563 static inline void idct32_stage9_avx2(__m256i *output, __m256i *x) {
    564  btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]);
    565  btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]);
    566  btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]);
    567  btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]);
    568  btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]);
    569  btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]);
    570  btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]);
    571  btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]);
    572  btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]);
    573  btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]);
    574  btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]);
    575  btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]);
    576  btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]);
    577  btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]);
    578  btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]);
    579  btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
    580 }
    581 
    582 static void idct32_low1_avx2(const __m256i *input, __m256i *output) {
    583  const int32_t *cospi = cospi_arr(INV_COS_BIT);
    584 
    585  // stage 1
    586  __m256i x[2];
    587  x[0] = input[0];
    588 
    589  // stage 2
    590  // stage 3
    591  // stage 4
    592  // stage 5
    593  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
    594 
    595  // stage 6
    596  // stage 7
    597  // stage 8
    598  // stage 9
    599  output[0] = x[0];
    600  output[31] = x[0];
    601  output[1] = x[1];
    602  output[30] = x[1];
    603  output[2] = x[1];
    604  output[29] = x[1];
    605  output[3] = x[0];
    606  output[28] = x[0];
    607  output[4] = x[0];
    608  output[27] = x[0];
    609  output[5] = x[1];
    610  output[26] = x[1];
    611  output[6] = x[1];
    612  output[25] = x[1];
    613  output[7] = x[0];
    614  output[24] = x[0];
    615  output[8] = x[0];
    616  output[23] = x[0];
    617  output[9] = x[1];
    618  output[22] = x[1];
    619  output[10] = x[1];
    620  output[21] = x[1];
    621  output[11] = x[0];
    622  output[20] = x[0];
    623  output[12] = x[0];
    624  output[19] = x[0];
    625  output[13] = x[1];
    626  output[18] = x[1];
    627  output[14] = x[1];
    628  output[17] = x[1];
    629  output[15] = x[0];
    630  output[16] = x[0];
    631 }
    632 
// 32-point inverse DCT specialised for inputs whose coefficients beyond
// the first 8 rows are zero. Stages whose butterflies have a zero input
// degenerate into copies or single-input (_0) rotations.
static void idct32_low8_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: bit-reversed load of the 8 available rows.
  __m256i x[32];
  x[0] = input[0];
  x[4] = input[4];
  x[8] = input[2];
  x[12] = input[6];
  x[16] = input[1];
  x[20] = input[5];
  x[24] = input[3];
  x[28] = input[7];

  // stage 2: one-sided rotations populating the missing odd lanes.
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3: butterflies with a zero partner reduce to copies.
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];

  // stage 4
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 5
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);
  // stage 6
  x[3] = x[0];
  x[2] = x[1];
  idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);

  // stages 7-9: shared with the full-input path.
  idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
  idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
  idct32_stage9_avx2(output, x);
}
    688 
// 32-point inverse DCT over 16 lanes of 16-bit coefficients, specialized for
// the case where only input[0..15] can be non-zero; all higher-frequency
// inputs are treated as zero, so stages 2-5 use the single-input "_0"
// butterfly form instead of the full two-input butterflies.
static void idct32_low16_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset: 0.5 in the INV_COS_BIT fixed-point format.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: scatter the 16 live inputs into the even slots of the 32-entry
  // working array in the order the butterfly network expects; odd slots are
  // produced by the single-input butterflies in later stages.
  __m256i x[32];
  x[0] = input[0];
  x[2] = input[8];
  x[4] = input[4];
  x[6] = input[12];
  x[8] = input[2];
  x[10] = input[10];
  x[12] = input[6];
  x[14] = input[14];
  x[16] = input[1];
  x[18] = input[9];
  x[20] = input[5];
  x[22] = input[13];
  x[24] = input[3];
  x[26] = input[11];
  x[28] = input[7];
  x[30] = input[15];

  // stage 2
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  idct32_high16_stage3_avx2(x);

  // stage 4
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_avx2(&x[8], &x[9]);
  btf_16_adds_subs_avx2(&x[11], &x[10]);
  btf_16_adds_subs_avx2(&x[12], &x[13]);
  btf_16_adds_subs_avx2(&x[15], &x[14]);
  idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 5
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_avx2(&x[4], &x[5]);
  btf_16_adds_subs_avx2(&x[7], &x[6]);
  idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 6
  btf_16_adds_subs_avx2(&x[0], &x[3]);
  btf_16_adds_subs_avx2(&x[1], &x[2]);
  idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);

  // stages 7-9: shared with the other idct32 variants.
  idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
  idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
  idct32_stage9_avx2(output, x);
}
    753 
// Full 32-point inverse DCT over 16 lanes of 16-bit coefficients: all 32
// inputs may be non-zero, so every stage uses the full two-input butterfly
// (btf_16_w16_avx2) with precomputed packed cosine-pair constants.
static void idct32_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset: 0.5 in the INV_COS_BIT fixed-point format.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed (even, odd) 16-bit cosine pairs for each butterfly; the naming is
  // cospi_pAA_mBB == pair(cospi[AA], -cospi[BB]).
  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);

  // stage 1: permute all 32 inputs into butterfly-network order.
  __m256i x1[32];
  x1[0] = input[0];
  x1[1] = input[16];
  x1[2] = input[8];
  x1[3] = input[24];
  x1[4] = input[4];
  x1[5] = input[20];
  x1[6] = input[12];
  x1[7] = input[28];
  x1[8] = input[2];
  x1[9] = input[18];
  x1[10] = input[10];
  x1[11] = input[26];
  x1[12] = input[6];
  x1[13] = input[22];
  x1[14] = input[14];
  x1[15] = input[30];
  x1[16] = input[1];
  x1[17] = input[17];
  x1[18] = input[9];
  x1[19] = input[25];
  x1[20] = input[5];
  x1[21] = input[21];
  x1[22] = input[13];
  x1[23] = input[29];
  x1[24] = input[3];
  x1[25] = input[19];
  x1[26] = input[11];
  x1[27] = input[27];
  x1[28] = input[7];
  x1[29] = input[23];
  x1[30] = input[15];
  x1[31] = input[31];

  // stage 2
  btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r,
                  INV_COS_BIT);

  // stage 3
  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
                  INV_COS_BIT);
  idct32_high16_stage3_avx2(x1);

  // stage 4
  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
                  INV_COS_BIT);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
  idct32_high16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);

  // stage 5
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
                  INV_COS_BIT);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  idct32_high24_stage5_avx2(x1, cospi, _r, INV_COS_BIT);

  // stage 6
  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  idct32_high28_stage6_avx2(x1, cospi, _r, INV_COS_BIT);

  // stages 7-9: shared with the other idct32 variants.
  idct32_stage7_avx2(x1, cospi, _r, INV_COS_BIT);
  idct32_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
  idct32_stage9_avx2(output, x1);
}
    884 
// Stage-4 butterflies for the upper half (x[32..63]) of the 64-point inverse
// DCT working array, applying the spec-defined cosine rotation pairs.
// NOTE(review): cos_bit is also passed to btf_16_w16_avx2 below; the
// (void)cos_bit presumably silences an unused-parameter warning in builds
// where that macro ignores the argument -- confirm against the macro.
static inline void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
                                            const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
}
    909 
// Stage-5 processing for x[16..63] of the 64-point inverse DCT: cosine
// rotations on x[17..30] pairs followed by add/sub butterflies over the
// mirrored x[32..63] pairs.
static inline void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
                                            const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[32], &x[35]);
  btf_16_adds_subs_avx2(&x[33], &x[34]);
  btf_16_adds_subs_avx2(&x[39], &x[36]);
  btf_16_adds_subs_avx2(&x[38], &x[37]);
  btf_16_adds_subs_avx2(&x[40], &x[43]);
  btf_16_adds_subs_avx2(&x[41], &x[42]);
  btf_16_adds_subs_avx2(&x[47], &x[44]);
  btf_16_adds_subs_avx2(&x[46], &x[45]);
  btf_16_adds_subs_avx2(&x[48], &x[51]);
  btf_16_adds_subs_avx2(&x[49], &x[50]);
  btf_16_adds_subs_avx2(&x[55], &x[52]);
  btf_16_adds_subs_avx2(&x[54], &x[53]);
  btf_16_adds_subs_avx2(&x[56], &x[59]);
  btf_16_adds_subs_avx2(&x[57], &x[58]);
  btf_16_adds_subs_avx2(&x[63], &x[60]);
  btf_16_adds_subs_avx2(&x[62], &x[61]);
}
    940 
// Stage-6 cosine rotations on the mirrored pairs in x[34..45] / x[50..61] of
// the 64-point inverse DCT; each rotation constant is applied to two
// consecutive pairs.
static inline void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
                                            const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit);
}
    959 
// Stage-6 processing for x[16..63]: add/sub butterflies on the x[16..31]
// pairs, then the shared rotation helper for the top half.
static inline void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
                                            const __m256i _r, int8_t cos_bit) {
  btf_16_adds_subs_avx2(&x[16], &x[19]);
  btf_16_adds_subs_avx2(&x[17], &x[18]);
  btf_16_adds_subs_avx2(&x[23], &x[20]);
  btf_16_adds_subs_avx2(&x[22], &x[21]);
  btf_16_adds_subs_avx2(&x[24], &x[27]);
  btf_16_adds_subs_avx2(&x[25], &x[26]);
  btf_16_adds_subs_avx2(&x[31], &x[28]);
  btf_16_adds_subs_avx2(&x[30], &x[29]);
  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
}
    972 
// Stage-7 processing for x[16..63]: +/-cospi[16]/cospi[48] rotations on the
// x[18..21] / x[26..29] pairs, then add/sub butterflies over the x[32..63]
// mirrored groups of eight.
static inline void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
                                            const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[32], &x[39]);
  btf_16_adds_subs_avx2(&x[33], &x[38]);
  btf_16_adds_subs_avx2(&x[34], &x[37]);
  btf_16_adds_subs_avx2(&x[35], &x[36]);
  btf_16_adds_subs_avx2(&x[47], &x[40]);
  btf_16_adds_subs_avx2(&x[46], &x[41]);
  btf_16_adds_subs_avx2(&x[45], &x[42]);
  btf_16_adds_subs_avx2(&x[44], &x[43]);
  btf_16_adds_subs_avx2(&x[48], &x[55]);
  btf_16_adds_subs_avx2(&x[49], &x[54]);
  btf_16_adds_subs_avx2(&x[50], &x[53]);
  btf_16_adds_subs_avx2(&x[51], &x[52]);
  btf_16_adds_subs_avx2(&x[63], &x[56]);
  btf_16_adds_subs_avx2(&x[62], &x[57]);
  btf_16_adds_subs_avx2(&x[61], &x[58]);
  btf_16_adds_subs_avx2(&x[60], &x[59]);
}
   1000 
// Stage-8 processing for x[16..63]: add/sub butterflies across the x[16..31]
// mirrored pairs, then +/-cospi[16]/cospi[48] rotations on x[36..43] against
// their x[52..59] partners.
static inline void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
                                            const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_adds_subs_avx2(&x[16], &x[23]);
  btf_16_adds_subs_avx2(&x[17], &x[22]);
  btf_16_adds_subs_avx2(&x[18], &x[21]);
  btf_16_adds_subs_avx2(&x[19], &x[20]);
  btf_16_adds_subs_avx2(&x[31], &x[24]);
  btf_16_adds_subs_avx2(&x[30], &x[25]);
  btf_16_adds_subs_avx2(&x[29], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[27]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit);
}
   1024 
// Stage 9 of the 64-point inverse DCT: add/sub butterflies over x[0..15]
// mirrored pairs, +/-cospi[32] rotations on x[20..23] vs x[24..27], and
// add/sub butterflies over the x[32..63] mirrored halves.
static inline void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
                                     const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[15]);
  btf_16_adds_subs_avx2(&x[1], &x[14]);
  btf_16_adds_subs_avx2(&x[2], &x[13]);
  btf_16_adds_subs_avx2(&x[3], &x[12]);
  btf_16_adds_subs_avx2(&x[4], &x[11]);
  btf_16_adds_subs_avx2(&x[5], &x[10]);
  btf_16_adds_subs_avx2(&x[6], &x[9]);
  btf_16_adds_subs_avx2(&x[7], &x[8]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[32], &x[47]);
  btf_16_adds_subs_avx2(&x[33], &x[46]);
  btf_16_adds_subs_avx2(&x[34], &x[45]);
  btf_16_adds_subs_avx2(&x[35], &x[44]);
  btf_16_adds_subs_avx2(&x[36], &x[43]);
  btf_16_adds_subs_avx2(&x[37], &x[42]);
  btf_16_adds_subs_avx2(&x[38], &x[41]);
  btf_16_adds_subs_avx2(&x[39], &x[40]);
  btf_16_adds_subs_avx2(&x[63], &x[48]);
  btf_16_adds_subs_avx2(&x[62], &x[49]);
  btf_16_adds_subs_avx2(&x[61], &x[50]);
  btf_16_adds_subs_avx2(&x[60], &x[51]);
  btf_16_adds_subs_avx2(&x[59], &x[52]);
  btf_16_adds_subs_avx2(&x[58], &x[53]);
  btf_16_adds_subs_avx2(&x[57], &x[54]);
  btf_16_adds_subs_avx2(&x[56], &x[55]);
}
   1059 
// Stage 10 of the 64-point inverse DCT: add/sub butterflies over the full
// x[0..31] mirrored pairs, then +/-cospi[32] rotations on x[40..47] against
// their x[48..55] partners.
static inline void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[31]);
  btf_16_adds_subs_avx2(&x[1], &x[30]);
  btf_16_adds_subs_avx2(&x[2], &x[29]);
  btf_16_adds_subs_avx2(&x[3], &x[28]);
  btf_16_adds_subs_avx2(&x[4], &x[27]);
  btf_16_adds_subs_avx2(&x[5], &x[26]);
  btf_16_adds_subs_avx2(&x[6], &x[25]);
  btf_16_adds_subs_avx2(&x[7], &x[24]);
  btf_16_adds_subs_avx2(&x[8], &x[23]);
  btf_16_adds_subs_avx2(&x[9], &x[22]);
  btf_16_adds_subs_avx2(&x[10], &x[21]);
  btf_16_adds_subs_avx2(&x[11], &x[20]);
  btf_16_adds_subs_avx2(&x[12], &x[19]);
  btf_16_adds_subs_avx2(&x[13], &x[18]);
  btf_16_adds_subs_avx2(&x[14], &x[17]);
  btf_16_adds_subs_avx2(&x[15], &x[16]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit);
}
   1090 
// Final stage (11) of the 64-point inverse DCT: combine each mirrored pair
// (x[i], x[63-i]) via an add/sub butterfly directly into output[i] and
// output[63-i].
static inline void idct64_stage11_avx2(__m256i *output, __m256i *x) {
  btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]);
  btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]);
  btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]);
  btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]);
  btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]);
  btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]);
  btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]);
  btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]);
  btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]);
  btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]);
  btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]);
  btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]);
  btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]);
  btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]);
  btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]);
  btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]);
  btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]);
  btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]);
  btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]);
  btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]);
  btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]);
  btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]);
  btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]);
  btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]);
  btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]);
  btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]);
  btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]);
  btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]);
  btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]);
  btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]);
  btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]);
  btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
}
   1125 
// DC-only 64-point inverse DCT: only input[0] may be non-zero, so every
// butterfly stage degenerates and the whole transform reduces to a single
// cospi[32] scaling (stage 6) fanned out to all 64 outputs.
// NOTE(review): x[0] and x[1] are both produced from the same input with the
// same weight cospi[32], so they presumably hold identical values and the
// alternating x[0]/x[1] assignment pattern below is only a scheduling
// artifact -- confirm against btf_16_w16_0_avx2 before relying on it.
static void idct64_low1_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  __m256i x[32];
  x[0] = input[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);

  // stage 7
  // stage 8
  // stage 9
  // stage 10
  // stage 11: broadcast the scaled DC to every output row.
  output[0] = x[0];
  output[63] = x[0];
  output[1] = x[1];
  output[62] = x[1];
  output[2] = x[1];
  output[61] = x[1];
  output[3] = x[0];
  output[60] = x[0];
  output[4] = x[0];
  output[59] = x[0];
  output[5] = x[1];
  output[58] = x[1];
  output[6] = x[1];
  output[57] = x[1];
  output[7] = x[0];
  output[56] = x[0];
  output[8] = x[0];
  output[55] = x[0];
  output[9] = x[1];
  output[54] = x[1];
  output[10] = x[1];
  output[53] = x[1];
  output[11] = x[0];
  output[52] = x[0];
  output[12] = x[0];
  output[51] = x[0];
  output[13] = x[1];
  output[50] = x[1];
  output[14] = x[1];
  output[49] = x[1];
  output[15] = x[0];
  output[48] = x[0];
  output[16] = x[0];
  output[47] = x[0];
  output[17] = x[1];
  output[46] = x[1];
  output[18] = x[1];
  output[45] = x[1];
  output[19] = x[0];
  output[44] = x[0];
  output[20] = x[0];
  output[43] = x[0];
  output[21] = x[1];
  output[42] = x[1];
  output[22] = x[1];
  output[41] = x[1];
  output[23] = x[0];
  output[40] = x[0];
  output[24] = x[0];
  output[39] = x[0];
  output[25] = x[1];
  output[38] = x[1];
  output[26] = x[1];
  output[37] = x[1];
  output[27] = x[0];
  output[36] = x[0];
  output[28] = x[0];
  output[35] = x[0];
  output[29] = x[1];
  output[34] = x[1];
  output[30] = x[1];
  output[33] = x[1];
  output[31] = x[0];
  output[32] = x[0];
}
   1210 
// 64-point inverse DCT specialized for inputs where only input[0..7] can be
// non-zero: stages 2-6 replace the full two-input butterflies with the
// single-input "_0" form and plain copies where one operand is known zero.
static void idct64_low8_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset: 0.5 in the INV_COS_BIT fixed-point format.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1: scatter the 8 live inputs into butterfly-network positions.
  __m256i x[64];
  x[0] = input[0];
  x[8] = input[4];
  x[16] = input[2];
  x[24] = input[6];
  x[32] = input[1];
  x[40] = input[5];
  x[48] = input[3];
  x[56] = input[7];

  // stage 2
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3: with the partner inputs zero, the butterflies reduce to copies.
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[38] = x[39];
  x[41] = x[40];
  x[46] = x[47];
  x[49] = x[48];
  x[54] = x[55];
  x[57] = x[56];
  x[62] = x[63];

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  x[17] = x[16];
  x[22] = x[23];
  x[25] = x[24];
  x[30] = x[31];
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r,
                  INV_COS_BIT);

  // stage 5
  x[9] = x[8];
  x[14] = x[15];
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r,
                  INV_COS_BIT);
  x[35] = x[32];
  x[34] = x[33];
  x[36] = x[39];
  x[37] = x[38];
  x[43] = x[40];
  x[42] = x[41];
  x[44] = x[47];
  x[45] = x[46];
  x[51] = x[48];
  x[50] = x[49];
  x[52] = x[55];
  x[53] = x[54];
  x[59] = x[56];
  x[58] = x[57];
  x[60] = x[63];
  x[61] = x[62];

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
  x[19] = x[16];
  x[18] = x[17];
  x[20] = x[23];
  x[21] = x[22];
  x[27] = x[24];
  x[26] = x[25];
  x[28] = x[31];
  x[29] = x[30];
  idct64_stage6_high32_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  x[11] = x[8];
  x[10] = x[9];
  x[12] = x[15];
  x[13] = x[14];
  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 8
  x[7] = x[0];
  x[6] = x[1];
  x[5] = x[2];
  x[4] = x[3];
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
                  INV_COS_BIT);
  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stages 9-11: shared with the other idct64 variants.
  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage11_avx2(output, x);
}
   1336 
// 64-point inverse DCT over 16-wide lanes of 16-bit coefficients, fast
// path for the case where only the first 16 input rows can be non-zero
// (chosen by eob).  Reads input[0..15], runs the 11-stage butterfly
// network on the 64-entry state x[], and writes the 64 results through
// idct64_stage11_avx2().
static void idct64_low16_avx2(const __m256i *input, __m256i *output) {
 const int32_t *cospi = cospi_arr(INV_COS_BIT);
 // Rounding offset applied before the INV_COS_BIT right shift inside the
 // butterfly helpers.
 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
 const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

 // stage 1
 // Scatter the 16 live coefficients to their butterfly positions.  All
 // other x[] slots correspond to zero coefficients and are only written
 // (never read) until later stages fill them in.
 __m256i x[64];
 x[0] = input[0];
 x[4] = input[8];
 x[8] = input[4];
 x[12] = input[12];
 x[16] = input[2];
 x[20] = input[10];
 x[24] = input[6];
 x[28] = input[14];
 x[32] = input[1];
 x[36] = input[9];
 x[40] = input[5];
 x[44] = input[13];
 x[48] = input[3];
 x[52] = input[11];
 x[56] = input[7];
 x[60] = input[15];

 // stage 2
 // One butterfly input is zero here, so btf_16_w16_0_avx2 derives both
 // outputs from the single live input scaled by the two cosine weights.
 btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
 btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
 btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
 btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
 btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
 btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
 btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
 btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

 // stage 3
 btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
 btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
 btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
 btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
 // The add/sub butterflies of the full transform collapse to plain copies
 // because the second operand of each pair is zero (compare
 // idct64_low32_avx2, which uses btf_16_adds_subs_avx2 here).
 x[33] = x[32];
 x[34] = x[35];
 x[37] = x[36];
 x[38] = x[39];
 x[41] = x[40];
 x[42] = x[43];
 x[45] = x[44];
 x[46] = x[47];
 x[49] = x[48];
 x[50] = x[51];
 x[53] = x[52];
 x[54] = x[55];
 x[57] = x[56];
 x[58] = x[59];
 x[61] = x[60];
 x[62] = x[63];

 // stage 4
 btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
 btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
 x[17] = x[16];
 x[18] = x[19];
 x[21] = x[20];
 x[22] = x[23];
 x[25] = x[24];
 x[26] = x[27];
 x[29] = x[28];
 x[30] = x[31];
 idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);

 // stage 5
 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
 x[9] = x[8];
 x[10] = x[11];
 x[13] = x[12];
 x[14] = x[15];
 idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);

 // stage 6
 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
 x[5] = x[4];
 x[6] = x[7];
 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
                 INV_COS_BIT);
 idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);

 // stage 7
 x[3] = x[0];
 x[2] = x[1];
 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
 btf_16_adds_subs_avx2(&x[8], &x[11]);
 btf_16_adds_subs_avx2(&x[9], &x[10]);
 btf_16_adds_subs_avx2(&x[15], &x[12]);
 btf_16_adds_subs_avx2(&x[14], &x[13]);
 idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);

 // stage 8
 btf_16_adds_subs_avx2(&x[0], &x[7]);
 btf_16_adds_subs_avx2(&x[1], &x[6]);
 btf_16_adds_subs_avx2(&x[2], &x[5]);
 btf_16_adds_subs_avx2(&x[3], &x[4]);
 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
                 INV_COS_BIT);
 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
                 INV_COS_BIT);
 idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);

 // stages 9-11: final recombination and output store.
 idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
 idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
 idct64_stage11_avx2(output, x);
}
   1453 
// 64-point inverse DCT fast path for at most 32 non-zero input rows.
// Same structure as idct64_low16_avx2, but with twice as many live
// coefficients the early stages keep their real add/sub butterflies
// instead of degenerating to copies.
static void idct64_low32_avx2(const __m256i *input, __m256i *output) {
 const int32_t *cospi = cospi_arr(INV_COS_BIT);
 // Rounding offset for the INV_COS_BIT fixed-point butterflies.
 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
 const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

 // stage 1
 // Scatter the 32 live coefficients to their butterfly positions; odd
 // x[] slots are produced by the later stages.
 __m256i x[64];
 x[0] = input[0];
 x[2] = input[16];
 x[4] = input[8];
 x[6] = input[24];
 x[8] = input[4];
 x[10] = input[20];
 x[12] = input[12];
 x[14] = input[28];
 x[16] = input[2];
 x[18] = input[18];
 x[20] = input[10];
 x[22] = input[26];
 x[24] = input[6];
 x[26] = input[22];
 x[28] = input[14];
 x[30] = input[30];
 x[32] = input[1];
 x[34] = input[17];
 x[36] = input[9];
 x[38] = input[25];
 x[40] = input[5];
 x[42] = input[21];
 x[44] = input[13];
 x[46] = input[29];
 x[48] = input[3];
 x[50] = input[19];
 x[52] = input[11];
 x[54] = input[27];
 x[56] = input[7];
 x[58] = input[23];
 x[60] = input[15];
 x[62] = input[31];

 // stage 2
 // One butterfly input is zero, so btf_16_w16_0_avx2 derives both outputs
 // from the single live input.
 btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
 btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
 btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
 btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
 btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
 btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
 btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
 btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
 btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
 btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
 btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
 btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
 btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
 btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
 btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
 btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

 // stage 3
 btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
 btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
 btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
 btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
 btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
 btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
 btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
 btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
 btf_16_adds_subs_avx2(&x[32], &x[33]);
 btf_16_adds_subs_avx2(&x[35], &x[34]);
 btf_16_adds_subs_avx2(&x[36], &x[37]);
 btf_16_adds_subs_avx2(&x[39], &x[38]);
 btf_16_adds_subs_avx2(&x[40], &x[41]);
 btf_16_adds_subs_avx2(&x[43], &x[42]);
 btf_16_adds_subs_avx2(&x[44], &x[45]);
 btf_16_adds_subs_avx2(&x[47], &x[46]);
 btf_16_adds_subs_avx2(&x[48], &x[49]);
 btf_16_adds_subs_avx2(&x[51], &x[50]);
 btf_16_adds_subs_avx2(&x[52], &x[53]);
 btf_16_adds_subs_avx2(&x[55], &x[54]);
 btf_16_adds_subs_avx2(&x[56], &x[57]);
 btf_16_adds_subs_avx2(&x[59], &x[58]);
 btf_16_adds_subs_avx2(&x[60], &x[61]);
 btf_16_adds_subs_avx2(&x[63], &x[62]);

 // stage 4
 btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
 btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
 btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
 btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
 btf_16_adds_subs_avx2(&x[16], &x[17]);
 btf_16_adds_subs_avx2(&x[19], &x[18]);
 btf_16_adds_subs_avx2(&x[20], &x[21]);
 btf_16_adds_subs_avx2(&x[23], &x[22]);
 btf_16_adds_subs_avx2(&x[24], &x[25]);
 btf_16_adds_subs_avx2(&x[27], &x[26]);
 btf_16_adds_subs_avx2(&x[28], &x[29]);
 btf_16_adds_subs_avx2(&x[31], &x[30]);
 idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);

 // stage 5
 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
 btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
 btf_16_adds_subs_avx2(&x[8], &x[9]);
 btf_16_adds_subs_avx2(&x[11], &x[10]);
 btf_16_adds_subs_avx2(&x[12], &x[13]);
 btf_16_adds_subs_avx2(&x[15], &x[14]);
 idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);

 // stage 6
 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
 btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
 btf_16_adds_subs_avx2(&x[4], &x[5]);
 btf_16_adds_subs_avx2(&x[7], &x[6]);
 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
                 INV_COS_BIT);
 idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);

 // stage 7
 btf_16_adds_subs_avx2(&x[0], &x[3]);
 btf_16_adds_subs_avx2(&x[1], &x[2]);
 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
 btf_16_adds_subs_avx2(&x[8], &x[11]);
 btf_16_adds_subs_avx2(&x[9], &x[10]);
 btf_16_adds_subs_avx2(&x[15], &x[12]);
 btf_16_adds_subs_avx2(&x[14], &x[13]);
 idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);

 // stage 8
 btf_16_adds_subs_avx2(&x[0], &x[7]);
 btf_16_adds_subs_avx2(&x[1], &x[6]);
 btf_16_adds_subs_avx2(&x[2], &x[5]);
 btf_16_adds_subs_avx2(&x[3], &x[4]);
 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
                 INV_COS_BIT);
 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
                 INV_COS_BIT);
 idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);

 // stage 9~11
 idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
 idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
 idct64_stage11_avx2(output, x);
}
   1603 
// Signature shared by all 16-lane 1-D inverse transform kernels below.
typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output);

// 1D functions process 16 pixels at one time.
// Indexed as [tx size][1-D transform type][eob class]; each inner row
// lists kernels specialized for progressively more non-zero input
// coefficients (low1 / low8 / low16-or-full / full).  NULL entries are
// size/type combinations this AVX2 path does not implement (handled by
// other code paths, presumably SSSE3/SSE2 -- see the 8x8 table below).
static const transform_1d_avx2
   lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
     {
         { NULL, NULL, NULL, NULL },
         { NULL, NULL, NULL, NULL },
         { NULL, NULL, NULL, NULL },
     },
     { { NULL, NULL, NULL, NULL },
       { NULL, NULL, NULL, NULL },
       { NULL, NULL, NULL, NULL } },
     {
         { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
         { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
         { NULL, NULL, NULL, NULL },
     },
     { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
       { NULL, NULL, NULL, NULL },
       { NULL, NULL, NULL, NULL } },
     { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2,
         idct64_low32_avx2 },
       { NULL, NULL, NULL, NULL },
       { NULL, NULL, NULL, NULL } }
   };
   1630 
   1631 // only process w >= 16 h >= 16
// 2-D inverse transform + reconstruction for non-identity transform pairs.
// Pass 1: for each 16-row band of dequantized coefficients that can be
// non-zero (bounded by eoby), run the row transform, rescale by shift[0]
// and transpose into buf1.  Pass 2: run the column transform down each
// 16-column strip, rescale by shift[1], then clip-add the residual into
// the `output` pixel buffer.  lr_flip/ud_flip mirror the result for the
// flipped ADST variants.
static inline void lowbd_inv_txfm2d_add_no_identity_avx2(
   const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
   TX_SIZE tx_size, int eob) {
 __m256i buf1[64 * 16];
 int eobx, eoby;
 get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 const int txw_idx = get_txw_idx(tx_size);
 const int txh_idx = get_txh_idx(tx_size);
 const int txfm_size_col = tx_size_wide[tx_size];
 const int txfm_size_row = tx_size_high[tx_size];
 const int buf_size_w_div16 = txfm_size_col >> 4;
 // Round the eob-derived extents up to whole 16-wide tiles.
 const int buf_size_nonzero_w = ((eobx + 16) >> 4) << 4;
 const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
 const int input_stride = AOMMIN(32, txfm_size_row);
 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

 // Pick kernels specialized for the number of non-zero coefficients.
 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
 const transform_1d_avx2 row_txfm =
     lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
 const transform_1d_avx2 col_txfm =
     lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

 assert(col_txfm != NULL);
 assert(row_txfm != NULL);
 int ud_flip, lr_flip;
 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 // mulhrs with 1 << (15 + shift) implements a rounded rescale by 2^shift.
 const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0]));
 for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
   __m256i buf0[64];
   load_buffer_32bit_to_16bit_w16_avx2(input + 16 * i, input_stride, buf0,
                                       buf_size_nonzero_w);
   if (rect_type == 1 || rect_type == -1) {
     round_shift_avx2(buf0, buf0, buf_size_nonzero_w);  // rect special code
   }
   row_txfm(buf0, buf0);
   for (int j = 0; j < txfm_size_col; ++j) {
     buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0);
   }

   __m256i *buf1_cur = buf1 + (i << 4);
   if (lr_flip) {
     // Mirror the 16-wide tiles horizontally while transposing.
     for (int j = 0; j < buf_size_w_div16; ++j) {
       __m256i temp[16];
       flip_buf_avx2(buf0 + 16 * j, temp, 16);
       int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
       transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
     }
   } else {
     for (int j = 0; j < buf_size_w_div16; ++j) {
       transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j);
     }
   }
 }
 const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1]));
 for (int i = 0; i < buf_size_w_div16; i++) {
   __m256i *buf1_cur = buf1 + i * txfm_size_row;
   col_txfm(buf1_cur, buf1_cur);
   for (int j = 0; j < txfm_size_row; ++j) {
     buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1);
   }
 }
 // Add the residual into the destination pixels, 16 columns at a time.
 for (int i = 0; i < buf_size_w_div16; i++) {
   lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
                                stride, ud_flip, txfm_size_row);
 }
}
   1700 
// Row pass of the identity transform on a 16-wide column strip: loads
// `height` rows of 32-bit coefficients, scales each by
// NewSqrt2list[txw_idx], and applies the NewSqrt2Bits fixed-point rounding
// combined with the shift[0] rescale in a single arithmetic shift.
// Rectangular blocks (rect_type == +/-1) are pre-scaled by
// NewInvSqrt2 via mulhrs before the identity scaling.
static inline void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
                                          int stride, int shift, int height,
                                          int txw_idx, int rect_type) {
 const int32_t *input_row = input;
 const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
 // Combined rounding term for both right shifts folded into one constant.
 const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
                                      (1 << (NewSqrt2Bits - shift - 1)));
 const __m256i one = _mm256_set1_epi16(1);
 // Interleaving (scale, _r) so that madd against (src, 1) computes
 // src * scale + _r in one 32-bit multiply-add per lane pair.
 const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r);
 if (rect_type != 1 && rect_type != -1) {
   for (int i = 0; i < height; ++i) {
     const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
     input_row += stride;
     __m256i lo = _mm256_unpacklo_epi16(src, one);
     __m256i hi = _mm256_unpackhi_epi16(src, one);
     lo = _mm256_madd_epi16(lo, scale__r);
     hi = _mm256_madd_epi16(hi, scale__r);
     lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
     hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
     out[i] = _mm256_packs_epi32(lo, hi);
   }
 } else {
   // Rectangular block: extra 1/sqrt(2) pre-scale (rect special case).
   const __m256i rect_scale =
       _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
   for (int i = 0; i < height; ++i) {
     __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
     src = _mm256_mulhrs_epi16(src, rect_scale);
     input_row += stride;
     __m256i lo = _mm256_unpacklo_epi16(src, one);
     __m256i hi = _mm256_unpackhi_epi16(src, one);
     lo = _mm256_madd_epi16(lo, scale__r);
     hi = _mm256_madd_epi16(hi, scale__r);
     lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
     hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
     out[i] = _mm256_packs_epi32(lo, hi);
   }
 }
}
   1739 
// Column pass of the identity transform: scales each 16-bit row of `buf`
// by NewSqrt2list[txh_idx] with NewSqrt2Bits rounding, applies the final
// rounded right shift by -shift (shift is negative here, as the rounding
// constant 1 << (-shift - 1) requires), then saturating-packs and
// clip-adds the result to the `output` pixels via write_recon_w16_avx2.
static inline void iidentity_col_16xn_avx2(uint8_t *output, int stride,
                                          __m256i *buf, int shift, int height,
                                          int txh_idx) {
 const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
 const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
 const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1));
 const __m256i one = _mm256_set1_epi16(1);
 // madd of (sample, 1) with (scale, rounding) yields sample*scale+round.
 const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r);
 for (int h = 0; h < height; ++h) {
   __m256i lo = _mm256_unpacklo_epi16(buf[h], one);
   __m256i hi = _mm256_unpackhi_epi16(buf[h], one);
   lo = _mm256_madd_epi16(lo, scale_coeff);
   hi = _mm256_madd_epi16(hi, scale_coeff);
   lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
   hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
   lo = _mm256_add_epi32(lo, shift__r);
   hi = _mm256_add_epi32(hi, shift__r);
   lo = _mm256_srai_epi32(lo, -shift);
   hi = _mm256_srai_epi32(hi, -shift);
   const __m256i x = _mm256_packs_epi32(lo, hi);
   write_recon_w16_avx2(x, output);
   output += stride;
 }
}
   1764 
// 2-D identity (IDTX) path: no butterfly transform at all, just per-tile
// row scaling, a 16x16 transpose, and column scaling straight into the
// destination pixels.  Extents are capped at 32 in each dimension
// (presumably because coefficients beyond the top-left 32x32 cannot be
// coded -- eob is unused here).
static inline void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
                                                 uint8_t *output, int stride,
                                                 TX_SIZE tx_size,
                                                 int32_t eob) {
 (void)eob;
 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 const int txw_idx = get_txw_idx(tx_size);
 const int txh_idx = get_txh_idx(tx_size);
 const int txfm_size_col = tx_size_wide[tx_size];
 const int txfm_size_row = tx_size_high[tx_size];
 const int col_max = AOMMIN(32, txfm_size_col);
 const int row_max = AOMMIN(32, txfm_size_row);
 const int input_stride = row_max;
 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 __m256i buf[32];

 // Process the block one 16x16 tile at a time.
 for (int i = 0; i < (col_max >> 4); ++i) {
   for (int j = 0; j < (row_max >> 4); j++) {
     iidentity_row_16xn_avx2(buf, input + j * 16 + i * 16 * input_stride,
                             row_max, shift[0], 16, txw_idx, rect_type);
     transpose_16bit_16x16_avx2(buf, buf);
     iidentity_col_16xn_avx2(output + i * 16 + j * 16 * stride, stride, buf,
                             shift[1], 16, txh_idx);
   }
 }
}
   1791 
// Horizontal-identity path: rows use the cheap identity scaling, columns
// use a real 1-D inverse transform.  For each 16-column strip (bounded by
// eobx) the row pass scales and transposes the coefficient tiles, then the
// column kernel runs, and the result is rescaled by shift[1] and clip-added
// to `output`.  ud_flip reverses row order for the flipped-ADST types.
static inline void lowbd_inv_txfm2d_add_h_identity_avx2(
   const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
   TX_SIZE tx_size, int eob) {
 int eobx, eoby;
 get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 const int txw_idx = get_txw_idx(tx_size);
 const int txh_idx = get_txh_idx(tx_size);
 const int txfm_size_col = tx_size_wide[tx_size];
 const int txfm_size_row = tx_size_high[tx_size];
 const int txfm_size_row_notzero = AOMMIN(32, txfm_size_row);
 const int input_stride = txfm_size_row_notzero;
 const int buf_size_w_div16 = (eobx + 16) >> 4;
 const int buf_size_h_div16 = (eoby + 16) >> 4;
 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

 // Column kernel specialized for the number of non-zero rows.
 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
 const transform_1d_avx2 col_txfm =
     lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

 assert(col_txfm != NULL);

 int ud_flip, lr_flip;
 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 for (int i = 0; i < buf_size_w_div16; i++) {
   __m256i buf0[64];
   for (int j = 0; j < buf_size_h_div16; j++) {
     __m256i *buf0_cur = buf0 + j * 16;
     const int32_t *input_cur = input + i * 16 * input_stride + j * 16;
     iidentity_row_16xn_avx2(buf0_cur, input_cur, input_stride, shift[0], 16,
                             txw_idx, rect_type);
     transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
   }
   col_txfm(buf0, buf0);
   // mulhrs with 1 << (15 + shift[1]) is a rounded rescale by 2^shift[1].
   __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
   int k = ud_flip ? (txfm_size_row - 1) : 0;
   const int step = ud_flip ? -1 : 1;
   for (int j = 0; j < txfm_size_row; ++j, k += step) {
     __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
     write_recon_w16_avx2(res, output + (i << 4) + j * stride);
   }
 }
}
   1835 
// Vertical-identity path: rows use a real 1-D inverse transform, columns
// use the cheap identity scaling.  Each 16-row band (bounded by eoby) is
// row-transformed, rescaled by shift[0], transposed (with optional
// horizontal mirroring for lr_flip), then the identity column pass scales
// by shift[1] and clip-adds the residual into `output`.
static inline void lowbd_inv_txfm2d_add_v_identity_avx2(
   const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
   TX_SIZE tx_size, int eob) {
 __m256i buf1[64];
 int eobx, eoby;
 get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 const int txw_idx = get_txw_idx(tx_size);
 const int txh_idx = get_txh_idx(tx_size);
 const int txfm_size_col = tx_size_wide[tx_size];
 const int txfm_size_row = tx_size_high[tx_size];
 const int buf_size_w_div16 = txfm_size_col >> 4;
 const int buf_size_h_div16 = (eoby + 16) >> 4;
 // Note: width rounded to 8-wide granularity here (row kernels allow it).
 const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
 const int input_stride = AOMMIN(32, txfm_size_row);
 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

 // Row kernel specialized for the number of non-zero columns.
 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
 const transform_1d_avx2 row_txfm =
     lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];

 assert(row_txfm != NULL);

 int ud_flip, lr_flip;
 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 for (int i = 0; i < buf_size_h_div16; i++) {
   __m256i buf0[64];
   load_buffer_32bit_to_16bit_w16_avx2(input + i * 16, input_stride, buf0,
                                       buf_size_nonzero_w);
   if (rect_type == 1 || rect_type == -1) {
     round_shift_avx2(buf0, buf0, buf_size_nonzero_w);  // rect special code
   }
   row_txfm(buf0, buf0);
   round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
   __m256i *_buf1 = buf1;
   if (lr_flip) {
     // Mirror the 16-wide tiles horizontally while transposing.
     for (int j = 0; j < buf_size_w_div16; ++j) {
       __m256i temp[16];
       flip_buf_avx2(buf0 + 16 * j, temp, 16);
       transpose_16bit_16x16_avx2(temp,
                                  _buf1 + 16 * (buf_size_w_div16 - 1 - j));
     }
   } else {
     for (int j = 0; j < buf_size_w_div16; ++j) {
       transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j);
     }
   }
   for (int j = 0; j < buf_size_w_div16; ++j) {
     iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride,
                             buf1 + j * 16, shift[1], 16, txh_idx);
   }
 }
}
   1889 
// 8-point 1-D kernels reused from the SSSE3/SSE2 implementations:
// row 0 holds the IDCT variants, row 1 the IADST variants; column 0 is
// the "low1" fast path (presumably single non-zero coefficient), column 1
// the general kernel.
static const transform_1d_ssse3 lowbd_txfm_all_1d_zeros_8x8_arr[2][2] = {
 { av1_idct8_low1_ssse3, av1_idct8_sse2 },
 { av1_iadst8_low1_ssse3, av1_iadst8_sse2 }
};
   1894 
   1895 static inline void load_buffer_avx2(const int32_t *in, int stride,
   1896                                    __m128i *out) {
   1897  const __m256i a = _mm256_load_si256((const __m256i *)in);
   1898  const __m256i b = _mm256_load_si256((const __m256i *)(in + stride * 1));
   1899  const __m256i c = _mm256_load_si256((const __m256i *)(in + stride * 2));
   1900  const __m256i d = _mm256_load_si256((const __m256i *)(in + stride * 3));
   1901  const __m256i e = _mm256_load_si256((const __m256i *)(in + stride * 4));
   1902  const __m256i f = _mm256_load_si256((const __m256i *)(in + stride * 5));
   1903  const __m256i g = _mm256_load_si256((const __m256i *)(in + stride * 6));
   1904  const __m256i h = _mm256_load_si256((const __m256i *)(in + stride * 7));
   1905 
   1906  // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
   1907  const __m256i ab_16bit = _mm256_packs_epi32(a, b);
   1908  // c0 c1 c2 c3 d0 d1 d2 d3 c4 c5 c6 c7 d4 d5 d6 d7
   1909  const __m256i cd_16bit = _mm256_packs_epi32(c, d);
   1910  // e0 e1 e2 e3 f0 f1 f2 f3 e4 e5 e6 e7 f4 f5 f6 f7
   1911  const __m256i ef_16bit = _mm256_packs_epi32(e, f);
   1912  // g0 g1 g2 g3 h0 h1 h2 h3 g4 g5 g6 g7 h4 h5 h6 h7
   1913  const __m256i gh_16bit = _mm256_packs_epi32(g, h);
   1914 
   1915  // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7
   1916  const __m256i ab = _mm256_permute4x64_epi64(ab_16bit, 0xd8);
   1917  // c0 c1 c2 c3 c4 c5 c6 c7 d0 d1 d2 d3 d4 d5 d6 d7
   1918  const __m256i cd = _mm256_permute4x64_epi64(cd_16bit, 0xd8);
   1919  // e0 e1 e2 e3 e4 e5 e6 e7 f0 f1 f2 f3 f4 f5 f6 f7
   1920  const __m256i ef = _mm256_permute4x64_epi64(ef_16bit, 0xd8);
   1921  // g0 g1 g2 g3 g4 g5 g6 g7 h0 h1 h2 h3 h4 h5 h6 h7
   1922  const __m256i gh = _mm256_permute4x64_epi64(gh_16bit, 0xd8);
   1923 
   1924  out[0] = _mm256_castsi256_si128(ab);
   1925  out[1] = _mm256_extractf128_si256(ab, 1);
   1926  out[2] = _mm256_castsi256_si128(cd);
   1927  out[3] = _mm256_extractf128_si256(cd, 1);
   1928  out[4] = _mm256_castsi256_si128(ef);
   1929  out[5] = _mm256_extractf128_si256(ef, 1);
   1930  out[6] = _mm256_castsi256_si128(gh);
   1931  out[7] = _mm256_extractf128_si256(gh, 1);
   1932 }
   1933 
// Rescale eight 8-sample 16-bit rows and transpose them in one pass.
// Each sample is multiplied by 1 << (15 + bit) through _mm256_mulhrs_epi16,
// which for negative `bit` is a rounded right shift by -bit.  When
// *lr_flip is set the rows are consumed in reverse order, mirroring the
// block.  The row-pair comments track lane contents through the shuffle
// network (digits are row/column indices).
static inline void round_and_transpose_avx2(const __m128i *const in,
                                           __m128i *const out, int bit,
                                           int *lr_flip) {
 __m256i buf_temp[4];
 const __m256i scale = _mm256_set1_epi16(1 << (15 + bit));
 int j = *lr_flip ? 7 : 0;
 const int step = *lr_flip ? -1 : 1;

 // Pack rows four apart into one 256-bit register each so the rescale and
 // transpose work on all eight rows in four registers.
 // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37
 buf_temp[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
                                       in[j + 4 * step], 1);
 j += step;
 // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27
 buf_temp[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
                                       in[j + 4 * step], 1);
 j += step;
 // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17
 buf_temp[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
                                       in[j + 4 * step], 1);
 j += step;
 // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07
 buf_temp[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
                                       in[j + 4 * step], 1);

 // Rounded rescale of every sample.
 // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37
 buf_temp[0] = _mm256_mulhrs_epi16(buf_temp[0], scale);
 // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27
 buf_temp[1] = _mm256_mulhrs_epi16(buf_temp[1], scale);
 // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17
 buf_temp[2] = _mm256_mulhrs_epi16(buf_temp[2], scale);
 // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07
 buf_temp[3] = _mm256_mulhrs_epi16(buf_temp[3], scale);

 // 8x8 transpose: 16-bit interleave, 32-bit interleave, then a 64-bit
 // lane permute to produce whole transposed rows.
 // 70 60 71 61 72 62 73 63 | 30 20 31 21 32 22 33 23
 const __m256i unpcklo0 = _mm256_unpacklo_epi16(buf_temp[0], buf_temp[1]);
 // 74 64 75 65 76 66 77 67 | 34 24 35 25 36 26 37 27
 const __m256i unpckhi0 = _mm256_unpackhi_epi16(buf_temp[0], buf_temp[1]);
 // 50 40 51 41 52 42 53 43 | 10 00 11 01 12 02 13 03
 const __m256i unpcklo1 = _mm256_unpacklo_epi16(buf_temp[2], buf_temp[3]);
 // 54 44 55 45 56 46 57 47 | 14 04 15 05 16 06 17 07
 const __m256i unpckhi1 = _mm256_unpackhi_epi16(buf_temp[2], buf_temp[3]);

 // 70 60 50 40 71 61 51 41 | 30 20 10 00 31 21 11 01
 const __m256i unpcklo00 = _mm256_unpacklo_epi32(unpcklo0, unpcklo1);
 // 72 62 52 42 73 63 53 43 | 32 22 12 02 33 23 13 03
 const __m256i unpckhi00 = _mm256_unpackhi_epi32(unpcklo0, unpcklo1);
 // 74 64 54 44 75 65 55 45 | 34 24 14 04 35 25 15 05
 const __m256i unpcklo01 = _mm256_unpacklo_epi32(unpckhi0, unpckhi1);
 // 76 66 56 46 77 67 57 47 | 36 26 16 06 37 27 17 07
 const __m256i unpckhi01 = _mm256_unpackhi_epi32(unpckhi0, unpckhi1);

 // 70 60 50 40 30 20 10 00 | 71 61 51 41 31 21 11 01
 const __m256i reg_00 = _mm256_permute4x64_epi64(unpcklo00, 0xd8);
 // 72 62 52 42 32 22 12 02 | 73 63 53 43 33 23 13 03
 const __m256i reg_01 = _mm256_permute4x64_epi64(unpckhi00, 0xd8);
 // 74 64 54 44 34 24 14 04 | 75 65 55 45 35 25 15 05
 const __m256i reg_10 = _mm256_permute4x64_epi64(unpcklo01, 0xd8);
 // 76 66 56 46 36 26 16 06 | 77 67 57 47 37 27 17 07
 const __m256i reg_11 = _mm256_permute4x64_epi64(unpckhi01, 0xd8);

 // Split each 256-bit register into two transposed output rows.
 // 70 60 50 40 30 20 10 00
 out[0] = _mm256_castsi256_si128(reg_00);
 // 71 61 51 41 31 21 11 01
 out[1] = _mm256_extracti128_si256(reg_00, 1);
 // 72 62 52 42 32 22 12 02
 out[2] = _mm256_castsi256_si128(reg_01);
 // 73 63 53 43 33 23 13 03
 out[3] = _mm256_extracti128_si256(reg_01, 1);
 // 74 64 54 44 34 24 14 04
 out[4] = _mm256_castsi256_si128(reg_10);
 // 75 65 55 45 35 25 15 05
 out[5] = _mm256_extracti128_si256(reg_10, 1);
 // 76 66 56 46 36 26 16 06
 out[6] = _mm256_castsi256_si128(reg_11);
 // 77 67 57 47 37 27 17 07
 out[7] = _mm256_extracti128_si256(reg_11, 1);
}
   2011 
// Rounds the eight 16-bit rows held in in[0..7] by 'bit' (bit < 0; applied as
// a rounded right shift via mulhrs), adds them to the 8x8 block of existing
// destination pixels at 'output', saturates to 8 bits, and stores the result.
// A non-zero 'flipud' reads the input rows in reverse order (vertical flip).
static inline void round_shift_lowbd_write_buffer_avx2(__m128i *in, int bit,
                                                       uint8_t *output,
                                                       int stride, int flipud) {
  __m256i in_256[4], v_256[4];
  // Walk the input rows forward (j = 0, step = +1) or backward when flipping
  // vertically (j = 7, step = -1).
  int j = flipud ? 7 : 0;
  const int step = flipud ? -1 : 1;
  // _mm256_mulhrs_epi16 computes round((a * b) >> 15); with b = 1 << (15 +
  // bit) and bit < 0 this is a rounded right shift of 'a' by -bit.
  const __m256i scale = _mm256_set1_epi16(1 << (15 + bit));
  const __m256i zero = _mm256_setzero_si256();
  // Pack pairs of 128-bit rows into 256-bit registers (two rows per lane).
  // in[0], in[1]
  in_256[0] =
      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
  j += 2 * step;
  // in[2], in[3]
  in_256[1] =
      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
  j += 2 * step;
  // in[4], in[5]
  in_256[2] =
      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
  j += 2 * step;
  // in[6], in[7]
  in_256[3] =
      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);

  // Apply the rounded shift to all eight rows.
  // i00 i01 i02 i03 i04 i05 i06 i07 i10 i11 i12 i13 i14 i15 i16 i17
  in_256[0] = _mm256_mulhrs_epi16(in_256[0], scale);
  // i20 i21 i22 i23 i24 i25 i26 i27 i30 i31 i32 i33 i34 i35 i36 i37
  in_256[1] = _mm256_mulhrs_epi16(in_256[1], scale);
  // i40 i41 i42 i43 i44 i45 i46 i47 i50 i51 i52 i53 i54 i55 i56 i57
  in_256[2] = _mm256_mulhrs_epi16(in_256[2], scale);
  // i60 i61 i62 i63 i64 i65 i66 i67 i70 i71 i72 i73 i74 i75 i76 i77
  in_256[3] = _mm256_mulhrs_epi16(in_256[3], scale);

  // Load the eight existing 8-byte destination rows.
  const __m128i v0 = _mm_loadl_epi64((__m128i const *)(output));
  const __m128i v1 = _mm_loadl_epi64((__m128i const *)(output + stride));
  const __m128i v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
  const __m128i v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
  const __m128i v4 = _mm_loadl_epi64((__m128i const *)(output + 4 * stride));
  const __m128i v5 = _mm_loadl_epi64((__m128i const *)(output + 5 * stride));
  const __m128i v6 = _mm_loadl_epi64((__m128i const *)(output + 6 * stride));
  const __m128i v7 = _mm_loadl_epi64((__m128i const *)(output + 7 * stride));

  v_256[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(v0), v1, 1);
  v_256[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(v2), v3, 1);
  v_256[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(v4), v5, 1);
  v_256[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(v6), v7, 1);

  // Widen the destination pixels from u8 to s16 so they can be added to the
  // residual with saturation.
  const __m256i unpcklo0 = _mm256_unpacklo_epi8(v_256[0], zero);
  const __m256i unpcklo1 = _mm256_unpacklo_epi8(v_256[1], zero);
  const __m256i unpcklo2 = _mm256_unpacklo_epi8(v_256[2], zero);
  const __m256i unpcklo3 = _mm256_unpacklo_epi8(v_256[3], zero);
  // 00 01 10 11
  const __m256i x0 = _mm256_adds_epi16(in_256[0], unpcklo0);
  // 20 21 30 31
  const __m256i x1 = _mm256_adds_epi16(in_256[1], unpcklo1);
  // 40 41 50 51
  const __m256i x2 = _mm256_adds_epi16(in_256[2], unpcklo2);
  // 60 61 70 71
  const __m256i x3 = _mm256_adds_epi16(in_256[3], unpcklo3);

  // Pack back to u8 with unsigned saturation. packus interleaves per lane, so
  // the row order within each register becomes 0,2 / 1,3 (see comments).
  // 00 01 20 21 10 11 30 31
  const __m256i res_0123 = _mm256_packus_epi16(x0, x1);
  // 40 41 60 61 50 51 70 71
  const __m256i res_4567 = _mm256_packus_epi16(x2, x3);

  // 00 01 20 21
  const __m128i res_02 = _mm256_castsi256_si128(res_0123);
  // 10 11 30 31
  const __m128i res_13 = _mm256_extracti128_si256(res_0123, 1);
  // 40 41 60 61
  const __m128i res_46 = _mm256_castsi256_si128(res_4567);
  // 50 51 70 71
  const __m128i res_57 = _mm256_extracti128_si256(res_4567, 1);

  // Store each 8-byte row; odd-position rows come from the high halves.
  // 00 01
  _mm_storel_epi64((__m128i *)(output), res_02);
  // 10 11
  _mm_storel_epi64((__m128i *)(output + stride), res_13);
  // 20 21
  _mm_storel_epi64((__m128i *)(output + 2 * stride),
                   _mm_unpackhi_epi64(res_02, res_02));
  // 30 31
  _mm_storel_epi64((__m128i *)(output + 3 * stride),
                   _mm_unpackhi_epi64(res_13, res_13));
  // 40 41
  _mm_storel_epi64((__m128i *)(output + 4 * stride), res_46);
  // 50 51
  _mm_storel_epi64((__m128i *)(output + 5 * stride), res_57);
  // 60 61
  _mm_storel_epi64((__m128i *)(output + 6 * stride),
                   _mm_unpackhi_epi64(res_46, res_46));
  // 70 71
  _mm_storel_epi64((__m128i *)(output + 7 * stride),
                   _mm_unpackhi_epi64(res_57, res_57));
}
   2107 
// The AVX2 implementation has an advantage when multiple operations are
// combined together.
   2110 static inline void lowbd_inv_txfm2d_8x8_no_identity_avx2(
   2111    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
   2112    TX_SIZE tx_size, int eob) {
   2113  __m128i buf1[8];
   2114  const int input_stride = 8;
   2115  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   2116  assert(hitx_1d_tab[tx_type] < 2);
   2117  assert(vitx_1d_tab[tx_type] < 2);
   2118  const transform_1d_ssse3 row_txfm =
   2119      lowbd_txfm_all_1d_zeros_8x8_arr[hitx_1d_tab[tx_type]][eob != 1];
   2120  const transform_1d_ssse3 col_txfm =
   2121      lowbd_txfm_all_1d_zeros_8x8_arr[vitx_1d_tab[tx_type]][eob != 1];
   2122 
   2123  assert(col_txfm != NULL);
   2124  assert(row_txfm != NULL);
   2125  int ud_flip, lr_flip;
   2126  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2127 
   2128  __m128i buf0[8];
   2129  __m128i *buf0_cur = buf0;
   2130  load_buffer_avx2(input, input_stride, buf0_cur);
   2131  row_txfm(buf0, buf0);
   2132 
   2133  assert(shift[0] < 0);
   2134  __m128i *_buf1 = buf1;
   2135  round_and_transpose_avx2(buf0, _buf1, shift[0], &lr_flip);
   2136  assert(shift[1] < 0);
   2137  col_txfm(buf1, buf1);
   2138  round_shift_lowbd_write_buffer_avx2(buf1, shift[1], output, stride, ud_flip);
   2139 }
   2140 
// AVX2 implementation of the 8x8 inverse transform. Coding AVX2 paths for
// tx_types with identity in either direction was observed to have no
// advantage.
   2143 static void lowbd_inv_txfm2d_add_8x8_avx2(const int32_t *input, uint8_t *output,
   2144                                          int stride, TX_TYPE tx_type,
   2145                                          TX_SIZE tx_size, int eob) {
   2146  switch (tx_type) {
   2147    case IDTX:
   2148      av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
   2149 
   2150      break;
   2151    case V_DCT:
   2152    case V_ADST:
   2153    case V_FLIPADST:
   2154      av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
   2155                                                tx_size, eob);
   2156      break;
   2157    case H_DCT:
   2158    case H_ADST:
   2159    case H_FLIPADST:
   2160      av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
   2161                                                tx_size, eob);
   2162      break;
   2163    default:
   2164      lowbd_inv_txfm2d_8x8_no_identity_avx2(input, output, stride, tx_type,
   2165                                            tx_size, eob);
   2166  }
   2167 }
   2168 
   2169 // for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64
   2170 static inline void lowbd_inv_txfm2d_add_universe_avx2(
   2171    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
   2172    TX_SIZE tx_size, int eob) {
   2173  (void)eob;
   2174  switch (tx_type) {
   2175    case DCT_DCT:
   2176    case ADST_DCT:   // ADST in vertical, DCT in horizontal
   2177    case DCT_ADST:   // DCT  in vertical, ADST in horizontal
   2178    case ADST_ADST:  // ADST in both directions
   2179    case FLIPADST_DCT:
   2180    case DCT_FLIPADST:
   2181    case FLIPADST_FLIPADST:
   2182    case ADST_FLIPADST:
   2183    case FLIPADST_ADST:
   2184      lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type,
   2185                                            tx_size, eob);
   2186      break;
   2187    case IDTX:
   2188      lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob);
   2189      break;
   2190    case V_DCT:
   2191    case V_ADST:
   2192    case V_FLIPADST:
   2193      lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type,
   2194                                           tx_size, eob);
   2195      break;
   2196    case H_DCT:
   2197    case H_ADST:
   2198    case H_FLIPADST:
   2199      lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type,
   2200                                           tx_size, eob);
   2201      break;
   2202    default:
   2203      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
   2204                                     eob);
   2205      break;
   2206  }
   2207 }
   2208 
   2209 void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
   2210                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
   2211                                   int eob) {
   2212  switch (tx_size) {
   2213    case TX_4X4:
   2214    case TX_4X8:
   2215    case TX_8X4:
   2216    case TX_8X16:
   2217    case TX_16X8:
   2218    case TX_4X16:
   2219    case TX_16X4:
   2220    case TX_8X32:
   2221    case TX_32X8:
   2222      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
   2223                                     eob);
   2224      break;
   2225    case TX_8X8:
   2226      lowbd_inv_txfm2d_add_8x8_avx2(input, output, stride, tx_type, tx_size,
   2227                                    eob);
   2228      break;
   2229    case TX_16X16:
   2230    case TX_32X32:
   2231    case TX_64X64:
   2232    case TX_16X32:
   2233    case TX_32X16:
   2234    case TX_32X64:
   2235    case TX_64X32:
   2236    case TX_16X64:
   2237    case TX_64X16:
   2238    default:
   2239      lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type,
   2240                                         tx_size, eob);
   2241      break;
   2242  }
   2243 }
   2244 
   2245 void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
   2246                           const TxfmParam *txfm_param) {
   2247  const TX_TYPE tx_type = txfm_param->tx_type;
   2248  if (!txfm_param->lossless) {
   2249    av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type,
   2250                                  txfm_param->tx_size, txfm_param->eob);
   2251  } else {
   2252    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
   2253  }
   2254 }