tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

av1_inv_txfm_ssse3.c (113960B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include "config/aom_config.h"
     13 #include "config/av1_rtcd.h"
     14 
     15 #include "av1/common/av1_inv_txfm1d_cfg.h"
     16 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
     17 #include "av1/common/x86/av1_txfm_sse2.h"
     18 
     19 // TODO(venkatsanampudi@ittiam.com): move this to header file
     20 
// Powers of sqrt(2) in Q12 fixed point: 5793 ~= 2^12 * sqrt(2), so entry i
// holds 2^12 * sqrt(2)^(i+1) — Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 —
// one rectangular-block scale factor per TX size.
static const int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793,
                                               4 * 4096, 4 * 5793 };
     24 
     25 // TODO(binpengsmail@gmail.com): replace some for loop with do {} while
     26 
     27 static void idct4_sse2(const __m128i *input, __m128i *output) {
     28  const int8_t cos_bit = INV_COS_BIT;
     29  const int32_t *cospi = cospi_arr(INV_COS_BIT);
     30  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
     31 
     32  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
     33  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
     34  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
     35  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
     36 
     37  // stage 1
     38  __m128i x[4];
     39  x[0] = input[0];
     40  x[1] = input[2];
     41  x[2] = input[1];
     42  x[3] = input[3];
     43 
     44  // stage 2
     45  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
     46  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
     47 
     48  // stage 3
     49  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
     50  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
     51 }
     52 
     53 static void idct4_w4_sse2(const __m128i *input, __m128i *output) {
     54  const int8_t cos_bit = INV_COS_BIT;
     55  const int32_t *cospi = cospi_arr(INV_COS_BIT);
     56  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
     57 
     58  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
     59  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
     60  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
     61  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
     62 
     63  // stage 1
     64  __m128i x[4];
     65  x[0] = input[0];
     66  x[1] = input[2];
     67  x[2] = input[1];
     68  x[3] = input[3];
     69 
     70  // stage 2
     71  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
     72  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
     73 
     74  // stage 3
     75  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
     76  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
     77 }
     78 
     79 void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output) {
     80  const int32_t *cospi = cospi_arr(INV_COS_BIT);
     81 
     82  // stage 1
     83  __m128i x[2];
     84  x[0] = input[0];
     85 
     86  // stage 2
     87  // stage 3
     88  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
     89 
     90  // stage 4
     91  // stage 5
     92  output[0] = x[0];
     93  output[7] = x[0];
     94  output[1] = x[1];
     95  output[6] = x[1];
     96  output[2] = x[1];
     97  output[5] = x[1];
     98  output[3] = x[0];
     99  output[4] = x[0];
    100 }
    101 
// Full 8-point inverse DCT over eight packed 16-bit lanes (one __m128i per
// coefficient row), as a 5-stage butterfly/rotation network. The btf_16_sse2
// macro implicitly reads the locals `cos_bit` and `__rounding`, so those
// exact names are required.
void av1_idct8_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Pre-built (cos, sin) weight pairs for the rotations in stages 2-4.
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: permute inputs — even coefficients (0,4,2,6) feed the 4-point
  // even core, odd coefficients (1,5,3,7) feed the high half.
  __m128i x[8];
  x[0] = input[0];
  x[1] = input[4];
  x[2] = input[2];
  x[3] = input[6];
  x[4] = input[1];
  x[5] = input[5];
  x[6] = input[3];
  x[7] = input[7];

  // stage 2: rotations of the odd half, in place
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);

  // stage 3: even-half rotations plus odd-half add/sub butterflies
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);

  // stage 4: even-half butterflies and the cospi32 rotation of x[5]/x[6]
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);

  // stage 5: final butterflies, output[i] / output[7 - i] pairs
  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
}
    149 
// 8-point inverse DCT, narrow-block variant: identical network to
// av1_idct8_sse2 but the rotations go through btf_16_4p_sse2 (the 4-wide
// kernel, per its name — defined in av1_txfm_sse2.h). `cos_bit` and
// `__rounding` are read implicitly by the btf macros.
static void idct8_w4_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // (cos, sin) weight pairs for the rotations below.
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: permute inputs into butterfly order (evens, then odds)
  __m128i x[8];
  x[0] = input[0];
  x[1] = input[4];
  x[2] = input[2];
  x[3] = input[6];
  x[4] = input[1];
  x[5] = input[5];
  x[6] = input[3];
  x[7] = input[7];

  // stage 2: odd-half rotations
  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);

  // stage 3: even-half rotations plus odd-half butterflies
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);

  // stage 4: even butterflies and the cospi32 rotation of x[5]/x[6]
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);

  // stage 5: final butterflies into output[i] / output[7 - i]
  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
}
    197 
    198 static inline void idct16_stage5_sse2(__m128i *x, const int32_t *cospi,
    199                                      const __m128i __rounding,
    200                                      int8_t cos_bit) {
    201  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
    202  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    203  btf_16_adds_subs_sse2(x[0], x[3]);
    204  btf_16_adds_subs_sse2(x[1], x[2]);
    205  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
    206  btf_16_adds_subs_sse2(x[8], x[11]);
    207  btf_16_adds_subs_sse2(x[9], x[10]);
    208  btf_16_subs_adds_sse2(x[15], x[12]);
    209  btf_16_subs_adds_sse2(x[14], x[13]);
    210 }
    211 
    212 static inline void idct16_stage6_sse2(__m128i *x, const int32_t *cospi,
    213                                      const __m128i __rounding,
    214                                      int8_t cos_bit) {
    215  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
    216  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    217  btf_16_adds_subs_sse2(x[0], x[7]);
    218  btf_16_adds_subs_sse2(x[1], x[6]);
    219  btf_16_adds_subs_sse2(x[2], x[5]);
    220  btf_16_adds_subs_sse2(x[3], x[4]);
    221  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
    222  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
    223 }
    224 
    225 static inline void idct16_stage7_sse2(__m128i *output, __m128i *x) {
    226  btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
    227  btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
    228  btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
    229  btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
    230  btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
    231  btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
    232  btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
    233  btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
    234 }
    235 
    236 static void idct16_low1_ssse3(const __m128i *input, __m128i *output) {
    237  const int32_t *cospi = cospi_arr(INV_COS_BIT);
    238 
    239  // stage 1
    240  __m128i x[2];
    241  x[0] = input[0];
    242 
    243  // stage 2
    244  // stage 3
    245  // stage 4
    246  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    247 
    248  // stage 5
    249  // stage 6
    250  // stage 7
    251  output[0] = x[0];
    252  output[15] = x[0];
    253  output[1] = x[1];
    254  output[14] = x[1];
    255  output[2] = x[1];
    256  output[13] = x[1];
    257  output[3] = x[0];
    258  output[12] = x[0];
    259  output[4] = x[0];
    260  output[11] = x[0];
    261  output[5] = x[1];
    262  output[10] = x[1];
    263  output[6] = x[1];
    264  output[9] = x[1];
    265  output[7] = x[0];
    266  output[8] = x[0];
    267 }
    268 
// 16-point inverse DCT specialised for at most 8 non-zero coefficient rows
// (only input[0..7] are read); butterflies against known-zero partners are
// folded into single-input btf_16_ssse3 rotations. Note the argument order:
// btf_16_ssse3(w0, w1, in, out0, out1) reads `in` and writes both outputs —
// e.g. the -cospi[36] line consumes x[14] and produces x[9] and x[14].
// `cos_bit` and `__rounding` are read implicitly by btf_16_sse2.
static void idct16_low8_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);

  // stage 1: scatter the 8 present rows into their butterfly slots; the
  // remaining slots are produced by the rotations below.
  __m128i x[16];
  x[0] = input[0];
  x[2] = input[4];
  x[4] = input[2];
  x[6] = input[6];
  x[8] = input[1];
  x[10] = input[5];
  x[12] = input[3];
  x[14] = input[7];

  // stage 2: single-input rotations for the odd half
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);

  // stage 3: rotations for x[4..7] plus odd-half butterflies
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4: even-half rotations, x[4..7] butterflies, odd-half rotations
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stages 5-7: shared with the full 16-point path
  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage7_sse2(output, x);
}
    314 
// Full 16-point inverse DCT over eight packed 16-bit lanes per coefficient
// row: a 7-stage butterfly/rotation network, with stages 5-7 shared via the
// idct16_stage*_sse2 helpers. `cos_bit` and `__rounding` are read implicitly
// by the btf_16_sse2 macro.
static void idct16_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // (cos, sin) weight pairs for the rotations in stages 2-4.
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);

  // stage 1: permute inputs into butterfly order — evens (0,8,4,12,...)
  // feed the 8-point even core, odds (1,9,5,13,...) the high half.
  __m128i x[16];
  x[0] = input[0];
  x[1] = input[8];
  x[2] = input[4];
  x[3] = input[12];
  x[4] = input[2];
  x[5] = input[10];
  x[6] = input[6];
  x[7] = input[14];
  x[8] = input[1];
  x[9] = input[9];
  x[10] = input[5];
  x[11] = input[13];
  x[12] = input[3];
  x[13] = input[11];
  x[14] = input[7];
  x[15] = input[15];

  // stage 2: rotations of the odd half x[8..15], in place
  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);

  // stage 3: rotations of x[4..7] plus odd-half butterflies
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4: even-core rotations, x[4..7] butterflies, odd-half rotations
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stage 5~7
  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage7_sse2(output, x);
}
    386 
// 16-point inverse DCT, narrow-block variant: same network as idct16_sse2
// but all rotations use btf_16_4p_sse2 (the 4-wide kernel, per its name).
// Stages 5 and 6 are inlined here rather than reusing idct16_stage5/6_sse2
// because those helpers use the full-width btf_16_sse2 rotation. `cos_bit`
// and `__rounding` are read implicitly by the btf macros.
static void idct16_w4_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // (cos, sin) weight pairs for the rotations in stages 2-6.
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: permute inputs into butterfly order (evens, then odds)
  __m128i x[16];
  x[0] = input[0];
  x[1] = input[8];
  x[2] = input[4];
  x[3] = input[12];
  x[4] = input[2];
  x[5] = input[10];
  x[6] = input[6];
  x[7] = input[14];
  x[8] = input[1];
  x[9] = input[9];
  x[10] = input[5];
  x[11] = input[13];
  x[12] = input[3];
  x[13] = input[11];
  x[14] = input[7];
  x[15] = input[15];

  // stage 2: odd-half rotations, in place
  btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);

  // stage 3: x[4..7] rotations plus odd-half butterflies
  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4: even-core rotations, x[4..7] butterflies, odd-half rotations
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stage 5 (inlined 4p equivalent of idct16_stage5_sse2)
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);

  // stage 6 (inlined 4p equivalent of idct16_stage6_sse2)
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);

  // stage 7: width-independent output butterflies
  idct16_stage7_sse2(output, x);
}
    474 
    475 static inline void idct32_high16_stage3_sse2(__m128i *x) {
    476  btf_16_adds_subs_sse2(x[16], x[17]);
    477  btf_16_subs_adds_sse2(x[19], x[18]);
    478  btf_16_adds_subs_sse2(x[20], x[21]);
    479  btf_16_subs_adds_sse2(x[23], x[22]);
    480  btf_16_adds_subs_sse2(x[24], x[25]);
    481  btf_16_subs_adds_sse2(x[27], x[26]);
    482  btf_16_adds_subs_sse2(x[28], x[29]);
    483  btf_16_subs_adds_sse2(x[31], x[30]);
    484 }
    485 
    486 static inline void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
    487                                             const __m128i __rounding,
    488                                             int8_t cos_bit) {
    489  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
    490  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
    491  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
    492  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
    493  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
    494  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
    495  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
    496  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
    497  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
    498  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
    499 }
    500 
    501 static inline void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
    502                                             const __m128i __rounding,
    503                                             int8_t cos_bit) {
    504  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    505  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    506  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
    507  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
    508  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
    509  btf_16_adds_subs_sse2(x[16], x[19]);
    510  btf_16_adds_subs_sse2(x[17], x[18]);
    511  btf_16_subs_adds_sse2(x[23], x[20]);
    512  btf_16_subs_adds_sse2(x[22], x[21]);
    513  btf_16_adds_subs_sse2(x[24], x[27]);
    514  btf_16_adds_subs_sse2(x[25], x[26]);
    515  btf_16_subs_adds_sse2(x[31], x[28]);
    516  btf_16_subs_adds_sse2(x[30], x[29]);
    517 }
    518 
// Stage 6 of the 32-point idct for x[5..31] (x[0..4] are untouched here):
// the cospi32 rotation of x[5]/x[6], butterflies on x[8..15], and four
// +-16/48 rotations in the odd half. `__rounding` and `cos_bit` are read
// implicitly by btf_16_sse2 and must keep these names.
static inline void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
                                            const __m128i __rounding,
                                            int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  // Middle section: rotation plus butterflies.
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
  // Odd half: in-place rotations on mirrored index pairs.
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
}
    537 
// Stage 7 of the 32-point idct: even-half butterflies (x[0..7]), cospi32
// rotations of x[10..13], and add/sub butterflies across the odd half
// x[16..31]. `__rounding` and `cos_bit` are read implicitly by btf_16_sse2
// and must keep these names.
static inline void idct32_stage7_sse2(__m128i *x, const int32_t *cospi,
                                     const __m128i __rounding,
                                     int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  // Even half: x[i] paired with x[7 - i].
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  // Odd half: x[16 + i] paired with x[23 - i], x[31 - i] with x[24 + i].
  btf_16_adds_subs_sse2(x[16], x[23]);
  btf_16_adds_subs_sse2(x[17], x[22]);
  btf_16_adds_subs_sse2(x[18], x[21]);
  btf_16_adds_subs_sse2(x[19], x[20]);
  btf_16_subs_adds_sse2(x[31], x[24]);
  btf_16_subs_adds_sse2(x[30], x[25]);
  btf_16_subs_adds_sse2(x[29], x[26]);
  btf_16_subs_adds_sse2(x[28], x[27]);
}
    558 
    559 static inline void idct32_stage8_sse2(__m128i *x, const int32_t *cospi,
    560                                      const __m128i __rounding,
    561                                      int8_t cos_bit) {
    562  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
    563  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    564  btf_16_adds_subs_sse2(x[0], x[15]);
    565  btf_16_adds_subs_sse2(x[1], x[14]);
    566  btf_16_adds_subs_sse2(x[2], x[13]);
    567  btf_16_adds_subs_sse2(x[3], x[12]);
    568  btf_16_adds_subs_sse2(x[4], x[11]);
    569  btf_16_adds_subs_sse2(x[5], x[10]);
    570  btf_16_adds_subs_sse2(x[6], x[9]);
    571  btf_16_adds_subs_sse2(x[7], x[8]);
    572  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
    573  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
    574  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
    575  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
    576 }
    577 
    578 static inline void idct32_stage9_sse2(__m128i *output, __m128i *x) {
    579  btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
    580  btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
    581  btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
    582  btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
    583  btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
    584  btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
    585  btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
    586  btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
    587  btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
    588  btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
    589  btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
    590  btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
    591  btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
    592  btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
    593  btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
    594  btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
    595 }
    596 
    597 static void idct32_low1_ssse3(const __m128i *input, __m128i *output) {
    598  const int32_t *cospi = cospi_arr(INV_COS_BIT);
    599 
    600  // stage 1
    601  __m128i x[2];
    602  x[0] = input[0];
    603 
    604  // stage 2
    605  // stage 3
    606  // stage 4
    607  // stage 5
    608  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    609 
    610  // stage 6
    611  // stage 7
    612  // stage 8
    613  // stage 9
    614  output[0] = x[0];
    615  output[31] = x[0];
    616  output[1] = x[1];
    617  output[30] = x[1];
    618  output[2] = x[1];
    619  output[29] = x[1];
    620  output[3] = x[0];
    621  output[28] = x[0];
    622  output[4] = x[0];
    623  output[27] = x[0];
    624  output[5] = x[1];
    625  output[26] = x[1];
    626  output[6] = x[1];
    627  output[25] = x[1];
    628  output[7] = x[0];
    629  output[24] = x[0];
    630  output[8] = x[0];
    631  output[23] = x[0];
    632  output[9] = x[1];
    633  output[22] = x[1];
    634  output[10] = x[1];
    635  output[21] = x[1];
    636  output[11] = x[0];
    637  output[20] = x[0];
    638  output[12] = x[0];
    639  output[19] = x[0];
    640  output[13] = x[1];
    641  output[18] = x[1];
    642  output[14] = x[1];
    643  output[17] = x[1];
    644  output[15] = x[0];
    645  output[16] = x[0];
    646 }
    647 
// 32-point inverse DCT specialised for at most 8 non-zero coefficient rows
// (only input[0..7] are read). Butterflies whose partner coefficient is zero
// collapse: rotations become single-input btf_16_ssse3 calls, and add/sub
// butterflies become plain copies (sum and difference of {a, 0} are both a).
// btf_16_ssse3(w0, w1, in, out0, out1) reads `in` and writes both outputs.
// `cos_bit` and `__rounding` are read implicitly by the btf_16_sse2-based
// stage helpers.
static void idct32_low8_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: scatter the 8 present rows into their butterfly slots
  __m128i x[32];
  x[0] = input[0];
  x[4] = input[4];
  x[8] = input[2];
  x[12] = input[6];
  x[16] = input[1];
  x[20] = input[5];
  x[24] = input[3];
  x[28] = input[7];

  // stage 2: single-input rotations for the odd half x[16..31]
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3: rotations for x[8..15]; the copies are degenerate butterflies
  // against zero partners
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];

  // stage 4: rotation for x[4..7], copies for x[8..15], shared odd-half
  // rotations
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

  // stage 5: DC rotation plus copies for x[4..7]
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
  // stage 6: copies for the even core
  x[3] = x[0];
  x[2] = x[1];
  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

  // stages 7-9: shared with the full 32-point path
  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage9_sse2(output, x);
}
    704 
// 32-point inverse DCT fast path for blocks whose non-zero coefficients are
// confined to the first 16 input rows.  Each __m128i carries 8 columns of
// 16-bit values.  Because the butterfly partner of every loaded row is a
// zero row, stages 2-5 use the one-input btf_16_ssse3 form instead of the
// full two-input butterfly; later stages fall through to the shared
// idct32_* stage helpers.
static void idct32_low16_ssse3(const __m128i *input, __m128i *output) {
 const int8_t cos_bit = INV_COS_BIT;
 const int32_t *cospi = cospi_arr(INV_COS_BIT);
 // Rounding constant consumed by the btf_16_* macro expansions.
 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

 // stage 1: gather the 16 non-zero rows into the even x[] slots in the
 // bit-reversed order the butterfly network expects; the remaining slots
 // are produced by the butterflies below before they are read.
 __m128i x[32];
 x[0] = input[0];
 x[2] = input[8];
 x[4] = input[4];
 x[6] = input[12];
 x[8] = input[2];
 x[10] = input[10];
 x[12] = input[6];
 x[14] = input[14];
 x[16] = input[1];
 x[18] = input[9];
 x[20] = input[5];
 x[22] = input[13];
 x[24] = input[3];
 x[26] = input[11];
 x[28] = input[7];
 x[30] = input[15];

 // stage 2: single-input butterflies for the odd-frequency terms x[16..31]
 // (their partners are zero rows, so only one source operand is needed).
 btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
 btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
 btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
 btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
 btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
 btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
 btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
 btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

 // stage 3
 btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
 btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
 btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
 btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
 idct32_high16_stage3_sse2(x);

 // stage 4
 btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
 btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
 btf_16_adds_subs_sse2(x[8], x[9]);
 btf_16_subs_adds_sse2(x[11], x[10]);
 btf_16_adds_subs_sse2(x[12], x[13]);
 btf_16_subs_adds_sse2(x[15], x[14]);
 idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

 // stage 5
 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
 btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
 btf_16_adds_subs_sse2(x[4], x[5]);
 btf_16_subs_adds_sse2(x[7], x[6]);
 idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);

 // stage 6
 btf_16_adds_subs_sse2(x[0], x[3]);
 btf_16_adds_subs_sse2(x[1], x[2]);
 idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

 // stages 7-9: shared with the other idct32 variants; stage 9 writes the
 // final 32 output rows.
 idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
 idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
 idct32_stage9_sse2(output, x);
}
    770 
// Full 32-point inverse DCT over 8 lanes of 16-bit coefficients per
// __m128i.  All 32 input rows are loaded (in bit-reversed butterfly order)
// and every stage uses the two-input btf_16_sse2 butterfly with the cosine
// pairs prepared below.  Stages mirror the AV1 reference idct32 flow graph.
static void idct32_sse2(const __m128i *input, __m128i *output) {
 const int8_t cos_bit = INV_COS_BIT;
 const int32_t *cospi = cospi_arr(INV_COS_BIT);
 // Rounding constant consumed by the btf_16_sse2 macro expansions.
 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

 // Pre-packed 16-bit cosine coefficient pairs for each butterfly rotation.
 const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
 const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
 const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
 const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
 const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
 const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
 const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
 const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
 const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
 const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
 const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
 const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
 const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
 const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
 const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
 const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
 const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
 const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
 const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
 const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
 const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
 const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
 const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
 const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
 const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
 const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
 const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
 const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
 const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
 const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
 const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

 // stage 1: load all 32 rows; x[k] = input[bitrev5(k)].
 __m128i x[32];
 x[0] = input[0];
 x[1] = input[16];
 x[2] = input[8];
 x[3] = input[24];
 x[4] = input[4];
 x[5] = input[20];
 x[6] = input[12];
 x[7] = input[28];
 x[8] = input[2];
 x[9] = input[18];
 x[10] = input[10];
 x[11] = input[26];
 x[12] = input[6];
 x[13] = input[22];
 x[14] = input[14];
 x[15] = input[30];
 x[16] = input[1];
 x[17] = input[17];
 x[18] = input[9];
 x[19] = input[25];
 x[20] = input[5];
 x[21] = input[21];
 x[22] = input[13];
 x[23] = input[29];
 x[24] = input[3];
 x[25] = input[19];
 x[26] = input[11];
 x[27] = input[27];
 x[28] = input[7];
 x[29] = input[23];
 x[30] = input[15];
 x[31] = input[31];

 // stage 2: rotations for the odd-frequency terms x[16..31].
 btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
 btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
 btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
 btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
 btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
 btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
 btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
 btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);

 // stage 3
 btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
 btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
 btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
 btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
 idct32_high16_stage3_sse2(x);

 // stage 4
 btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
 btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
 btf_16_adds_subs_sse2(x[8], x[9]);
 btf_16_subs_adds_sse2(x[11], x[10]);
 btf_16_adds_subs_sse2(x[12], x[13]);
 btf_16_subs_adds_sse2(x[15], x[14]);
 idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

 // stage 5
 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
 btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
 btf_16_adds_subs_sse2(x[4], x[5]);
 // NOTE(review): the idct32_low16 sibling uses btf_16_subs_adds_sse2 for
 // this x[7]/x[6] pair.  Presumably the two macros yield the same
 // sum/difference results (stylistic inconsistency only) — verify against
 // the macro definitions in av1_txfm_sse2.h.
 btf_16_adds_subs_sse2(x[7], x[6]);
 idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);

 // stage 6
 btf_16_adds_subs_sse2(x[0], x[3]);
 btf_16_adds_subs_sse2(x[1], x[2]);
 idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

 // stage 7~8 (stage 9 writes the final 32 output rows)
 idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
 idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
 idct32_stage9_sse2(output, x);
}
    887 
// idct64 stage 4, upper half only: eight cosine rotations on the
// odd-frequency terms x[33..62] of the 64-point inverse DCT.
// NOTE(review): __rounding and cos_bit look unused in this body; presumably
// the btf_16_sse2 macro expansion references them by name — confirm against
// av1_txfm_sse2.h.
static inline void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
                                            const __m128i __rounding,
                                            int8_t cos_bit) {
 const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
 const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
 const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
 const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
 const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
 const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
 const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
 const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
 const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
 const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
 const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
 const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
 btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
 btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
 btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
 btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
 btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
 btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
 btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
 btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
}
    912 
// idct64 stage 5 on x[16..63]: four cosine rotations on x[17..30] plus the
// saturating add/sub butterfly pairs across x[32..63].
// (__rounding/cos_bit are used by the btf_16_sse2 macro expansions.)
static inline void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
                                            const __m128i __rounding,
                                            int8_t cos_bit) {
 const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
 const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
 const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
 const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
 const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
 const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
 btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
 btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
 btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
 btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
 btf_16_adds_subs_sse2(x[32], x[35]);
 btf_16_adds_subs_sse2(x[33], x[34]);
 btf_16_subs_adds_sse2(x[39], x[36]);
 btf_16_subs_adds_sse2(x[38], x[37]);
 btf_16_adds_subs_sse2(x[40], x[43]);
 btf_16_adds_subs_sse2(x[41], x[42]);
 btf_16_subs_adds_sse2(x[47], x[44]);
 btf_16_subs_adds_sse2(x[46], x[45]);
 btf_16_adds_subs_sse2(x[48], x[51]);
 btf_16_adds_subs_sse2(x[49], x[50]);
 btf_16_subs_adds_sse2(x[55], x[52]);
 btf_16_subs_adds_sse2(x[54], x[53]);
 btf_16_adds_subs_sse2(x[56], x[59]);
 btf_16_adds_subs_sse2(x[57], x[58]);
 btf_16_subs_adds_sse2(x[63], x[60]);
 btf_16_subs_adds_sse2(x[62], x[61]);
}
    943 
// idct64 stage 6, upper half only: eight cosine rotations on x[34..61].
// (__rounding/cos_bit are used by the btf_16_sse2 macro expansions.)
static inline void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
                                            const __m128i __rounding,
                                            int8_t cos_bit) {
 const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
 const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
 const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
 const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
 const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
 const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
 btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
 btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
 btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
 btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
 btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
 btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
 btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
 btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
}
    962 
// idct64 stage 6 on x[16..63]: add/sub butterfly pairs on x[16..31], then
// delegate the x[32..63] rotations to idct64_stage6_high32_sse2.
static inline void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
                                            const __m128i __rounding,
                                            int8_t cos_bit) {
 btf_16_adds_subs_sse2(x[16], x[19]);
 btf_16_adds_subs_sse2(x[17], x[18]);
 btf_16_subs_adds_sse2(x[23], x[20]);
 btf_16_subs_adds_sse2(x[22], x[21]);
 btf_16_adds_subs_sse2(x[24], x[27]);
 btf_16_adds_subs_sse2(x[25], x[26]);
 btf_16_subs_adds_sse2(x[31], x[28]);
 btf_16_subs_adds_sse2(x[30], x[29]);
 idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
}
    976 
// idct64 stage 7 on x[16..63]: four +/-cospi16/48 rotations on x[18..27]
// and add/sub butterfly pairs across x[32..63].
// (__rounding/cos_bit are used by the btf_16_sse2 macro expansions.)
static inline void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
                                            const __m128i __rounding,
                                            int8_t cos_bit) {
 const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
 const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
 const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
 btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
 btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
 btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
 btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
 btf_16_adds_subs_sse2(x[32], x[39]);
 btf_16_adds_subs_sse2(x[33], x[38]);
 btf_16_adds_subs_sse2(x[34], x[37]);
 btf_16_adds_subs_sse2(x[35], x[36]);
 btf_16_subs_adds_sse2(x[47], x[40]);
 btf_16_subs_adds_sse2(x[46], x[41]);
 btf_16_subs_adds_sse2(x[45], x[42]);
 btf_16_subs_adds_sse2(x[44], x[43]);
 btf_16_adds_subs_sse2(x[48], x[55]);
 btf_16_adds_subs_sse2(x[49], x[54]);
 btf_16_adds_subs_sse2(x[50], x[53]);
 btf_16_adds_subs_sse2(x[51], x[52]);
 btf_16_subs_adds_sse2(x[63], x[56]);
 btf_16_subs_adds_sse2(x[62], x[57]);
 btf_16_subs_adds_sse2(x[61], x[58]);
 btf_16_subs_adds_sse2(x[60], x[59]);
}
   1004 
// idct64 stage 8 on x[16..63]: add/sub butterfly pairs on x[16..31] and
// eight +/-cospi16/48 rotations on x[36..59].
// (__rounding/cos_bit are used by the btf_16_sse2 macro expansions.)
static inline void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
                                            const __m128i __rounding,
                                            int8_t cos_bit) {
 const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
 const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
 const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
 btf_16_adds_subs_sse2(x[16], x[23]);
 btf_16_adds_subs_sse2(x[17], x[22]);
 btf_16_adds_subs_sse2(x[18], x[21]);
 btf_16_adds_subs_sse2(x[19], x[20]);
 btf_16_subs_adds_sse2(x[31], x[24]);
 btf_16_subs_adds_sse2(x[30], x[25]);
 btf_16_subs_adds_sse2(x[29], x[26]);
 btf_16_subs_adds_sse2(x[28], x[27]);
 btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
 btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
 btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
 btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
 btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
 btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
 btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
 btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
}
   1028 
// idct64 stage 9 (whole vector): add/sub pairs on x[0..15], half-turn
// (+/-cospi32) rotations on x[20..27], and add/sub pairs across x[32..63].
// (__rounding/cos_bit are used by the btf_16_sse2 macro expansions.)
static inline void idct64_stage9_sse2(__m128i *x, const int32_t *cospi,
                                     const __m128i __rounding,
                                     int8_t cos_bit) {
 const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
 btf_16_adds_subs_sse2(x[0], x[15]);
 btf_16_adds_subs_sse2(x[1], x[14]);
 btf_16_adds_subs_sse2(x[2], x[13]);
 btf_16_adds_subs_sse2(x[3], x[12]);
 btf_16_adds_subs_sse2(x[4], x[11]);
 btf_16_adds_subs_sse2(x[5], x[10]);
 btf_16_adds_subs_sse2(x[6], x[9]);
 btf_16_adds_subs_sse2(x[7], x[8]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
 btf_16_adds_subs_sse2(x[32], x[47]);
 btf_16_adds_subs_sse2(x[33], x[46]);
 btf_16_adds_subs_sse2(x[34], x[45]);
 btf_16_adds_subs_sse2(x[35], x[44]);
 btf_16_adds_subs_sse2(x[36], x[43]);
 btf_16_adds_subs_sse2(x[37], x[42]);
 btf_16_adds_subs_sse2(x[38], x[41]);
 btf_16_adds_subs_sse2(x[39], x[40]);
 btf_16_subs_adds_sse2(x[63], x[48]);
 btf_16_subs_adds_sse2(x[62], x[49]);
 btf_16_subs_adds_sse2(x[61], x[50]);
 btf_16_subs_adds_sse2(x[60], x[51]);
 btf_16_subs_adds_sse2(x[59], x[52]);
 btf_16_subs_adds_sse2(x[58], x[53]);
 btf_16_subs_adds_sse2(x[57], x[54]);
 btf_16_subs_adds_sse2(x[56], x[55]);
}
   1063 
// idct64 stage 10: add/sub pairs folding x[0..31] together and half-turn
// (+/-cospi32) rotations on x[40..55].
// (__rounding/cos_bit are used by the btf_16_sse2 macro expansions.)
static inline void idct64_stage10_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
 const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
 btf_16_adds_subs_sse2(x[0], x[31]);
 btf_16_adds_subs_sse2(x[1], x[30]);
 btf_16_adds_subs_sse2(x[2], x[29]);
 btf_16_adds_subs_sse2(x[3], x[28]);
 btf_16_adds_subs_sse2(x[4], x[27]);
 btf_16_adds_subs_sse2(x[5], x[26]);
 btf_16_adds_subs_sse2(x[6], x[25]);
 btf_16_adds_subs_sse2(x[7], x[24]);
 btf_16_adds_subs_sse2(x[8], x[23]);
 btf_16_adds_subs_sse2(x[9], x[22]);
 btf_16_adds_subs_sse2(x[10], x[21]);
 btf_16_adds_subs_sse2(x[11], x[20]);
 btf_16_adds_subs_sse2(x[12], x[19]);
 btf_16_adds_subs_sse2(x[13], x[18]);
 btf_16_adds_subs_sse2(x[14], x[17]);
 btf_16_adds_subs_sse2(x[15], x[16]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
}
   1094 
   1095 static inline void idct64_stage11_sse2(__m128i *output, __m128i *x) {
   1096  btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]);
   1097  btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]);
   1098  btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]);
   1099  btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]);
   1100  btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]);
   1101  btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]);
   1102  btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]);
   1103  btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]);
   1104  btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]);
   1105  btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]);
   1106  btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]);
   1107  btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]);
   1108  btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]);
   1109  btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]);
   1110  btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]);
   1111  btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]);
   1112  btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]);
   1113  btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]);
   1114  btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]);
   1115  btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]);
   1116  btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]);
   1117  btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]);
   1118  btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]);
   1119  btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]);
   1120  btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]);
   1121  btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]);
   1122  btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]);
   1123  btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]);
   1124  btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]);
   1125  btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]);
   1126  btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]);
   1127  btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
   1128 }
   1129 
   1130 static void idct64_low1_ssse3(const __m128i *input, __m128i *output) {
   1131  const int32_t *cospi = cospi_arr(INV_COS_BIT);
   1132 
   1133  // stage 1
   1134  __m128i x[32];
   1135  x[0] = input[0];
   1136 
   1137  // stage 2
   1138  // stage 3
   1139  // stage 4
   1140  // stage 5
   1141  // stage 6
   1142  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
   1143 
   1144  // stage 7
   1145  // stage 8
   1146  // stage 9
   1147  // stage 10
   1148  // stage 11
   1149  output[0] = x[0];
   1150  output[63] = x[0];
   1151  output[1] = x[1];
   1152  output[62] = x[1];
   1153  output[2] = x[1];
   1154  output[61] = x[1];
   1155  output[3] = x[0];
   1156  output[60] = x[0];
   1157  output[4] = x[0];
   1158  output[59] = x[0];
   1159  output[5] = x[1];
   1160  output[58] = x[1];
   1161  output[6] = x[1];
   1162  output[57] = x[1];
   1163  output[7] = x[0];
   1164  output[56] = x[0];
   1165  output[8] = x[0];
   1166  output[55] = x[0];
   1167  output[9] = x[1];
   1168  output[54] = x[1];
   1169  output[10] = x[1];
   1170  output[53] = x[1];
   1171  output[11] = x[0];
   1172  output[52] = x[0];
   1173  output[12] = x[0];
   1174  output[51] = x[0];
   1175  output[13] = x[1];
   1176  output[50] = x[1];
   1177  output[14] = x[1];
   1178  output[49] = x[1];
   1179  output[15] = x[0];
   1180  output[48] = x[0];
   1181  output[16] = x[0];
   1182  output[47] = x[0];
   1183  output[17] = x[1];
   1184  output[46] = x[1];
   1185  output[18] = x[1];
   1186  output[45] = x[1];
   1187  output[19] = x[0];
   1188  output[44] = x[0];
   1189  output[20] = x[0];
   1190  output[43] = x[0];
   1191  output[21] = x[1];
   1192  output[42] = x[1];
   1193  output[22] = x[1];
   1194  output[41] = x[1];
   1195  output[23] = x[0];
   1196  output[40] = x[0];
   1197  output[24] = x[0];
   1198  output[39] = x[0];
   1199  output[25] = x[1];
   1200  output[38] = x[1];
   1201  output[26] = x[1];
   1202  output[37] = x[1];
   1203  output[27] = x[0];
   1204  output[36] = x[0];
   1205  output[28] = x[0];
   1206  output[35] = x[0];
   1207  output[29] = x[1];
   1208  output[34] = x[1];
   1209  output[30] = x[1];
   1210  output[33] = x[1];
   1211  output[31] = x[0];
   1212  output[32] = x[0];
   1213 }
   1214 
// 64-point inverse DCT fast path for blocks whose non-zero coefficients are
// confined to the first 8 input rows (loaded below in bit-reversed order).
// Zero-partner butterflies use the one-input btf_16_ssse3 form, and values
// whose butterfly partner is zero are propagated by plain copies before the
// first full rotation touches them.
static void idct64_low8_ssse3(const __m128i *input, __m128i *output) {
 const int8_t cos_bit = INV_COS_BIT;
 const int32_t *cospi = cospi_arr(INV_COS_BIT);
 // Rounding constant consumed by the btf_16_* macro expansions.
 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
 // Pre-packed cosine pairs for the two-input rotations used below.
 const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
 const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
 const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
 const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
 const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
 const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
 const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
 const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
 const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
 const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
 const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
 const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
 const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
 const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
 const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

 // stage 1: gather rows 0..7 into their bit-reversed butterfly slots.
 __m128i x[64];
 x[0] = input[0];
 x[8] = input[4];
 x[16] = input[2];
 x[24] = input[6];
 x[32] = input[1];
 x[40] = input[5];
 x[48] = input[3];
 x[56] = input[7];

 // stage 2: single-input butterflies (the partner rows are zero).
 btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
 btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
 btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
 btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

 // stage 3
 btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
 btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
 x[33] = x[32];
 x[38] = x[39];
 x[41] = x[40];
 x[46] = x[47];
 x[49] = x[48];
 x[54] = x[55];
 x[57] = x[56];
 x[62] = x[63];

 // stage 4
 btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
 x[17] = x[16];
 x[22] = x[23];
 x[25] = x[24];
 x[30] = x[31];
 btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
 btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
 btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
 btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);

 // stage 5
 x[9] = x[8];
 x[14] = x[15];
 btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
 btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
 x[35] = x[32];
 x[34] = x[33];
 x[36] = x[39];
 x[37] = x[38];
 x[43] = x[40];
 x[42] = x[41];
 x[44] = x[47];
 x[45] = x[46];
 x[51] = x[48];
 x[50] = x[49];
 x[52] = x[55];
 x[53] = x[54];
 x[59] = x[56];
 x[58] = x[57];
 x[60] = x[63];
 x[61] = x[62];

 // stage 6
 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
 btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
 x[19] = x[16];
 x[18] = x[17];
 x[20] = x[23];
 x[21] = x[22];
 x[27] = x[24];
 x[26] = x[25];
 x[28] = x[31];
 x[29] = x[30];
 idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);

 // stage 7
 x[3] = x[0];
 x[2] = x[1];
 x[11] = x[8];
 x[10] = x[9];
 x[12] = x[15];
 x[13] = x[14];
 idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

 // stage 8
 x[7] = x[0];
 x[6] = x[1];
 x[5] = x[2];
 x[4] = x[3];
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
 idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

 // stages 9-11: shared tail; stage 11 writes all 64 output rows.
 idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
 idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
 idct64_stage11_sse2(output, x);
}
   1333 
// Inverse 64-point DCT (8 columns of 16-bit coefficients per call) for the
// common case where only the lowest-frequency 16 input rows are non-zero.
// Zero partner coefficients let the early stages collapse into single-input
// rotations and plain copies; stages 4 onward are shared with the other
// idct64 variants via the idct64_stage*_sse2 helpers.
static void idct64_low16_ssse3(const __m128i *input, __m128i *output) {
 const int8_t cos_bit = INV_COS_BIT;
 const int32_t *cospi = cospi_arr(INV_COS_BIT);
 // NOTE: cos_bit and __rounding look unused but are referenced by these
 // exact names from inside the btf_16_* macros below.
 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
 const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
 const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
 const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
 const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

 // stage 1: scatter the 16 non-zero inputs to their butterfly positions.
 // x[] entries not written here correspond to zero coefficients and are
 // always written before their first read below.
 __m128i x[64];
 x[0] = input[0];
 x[4] = input[8];
 x[8] = input[4];
 x[12] = input[12];
 x[16] = input[2];
 x[20] = input[10];
 x[24] = input[6];
 x[28] = input[14];
 x[32] = input[1];
 x[36] = input[9];
 x[40] = input[5];
 x[44] = input[13];
 x[48] = input[3];
 x[52] = input[11];
 x[56] = input[7];
 x[60] = input[15];

 // stage 2: single-input rotations (each butterfly's partner is zero).
 btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
 btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
 btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
 btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
 btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
 btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
 btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
 btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

 // stage 3 (the high-half add/sub butterflies reduce to copies because
 // their partners are zero)
 btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
 btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
 btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
 btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
 x[33] = x[32];
 x[34] = x[35];
 x[37] = x[36];
 x[38] = x[39];
 x[41] = x[40];
 x[42] = x[43];
 x[45] = x[44];
 x[46] = x[47];
 x[49] = x[48];
 x[50] = x[51];
 x[53] = x[52];
 x[54] = x[55];
 x[57] = x[56];
 x[58] = x[59];
 x[61] = x[60];
 x[62] = x[63];

 // stage 4
 btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
 btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
 x[17] = x[16];
 x[18] = x[19];
 x[21] = x[20];
 x[22] = x[23];
 x[25] = x[24];
 x[26] = x[27];
 x[29] = x[28];
 x[30] = x[31];
 idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);

 // stage 5
 btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
 x[9] = x[8];
 x[10] = x[11];
 x[13] = x[12];
 x[14] = x[15];
 idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);

 // stage 6
 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
 x[5] = x[4];
 x[6] = x[7];
 btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
 btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
 idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);

 // stage 7
 x[3] = x[0];
 x[2] = x[1];
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
 btf_16_adds_subs_sse2(x[8], x[11]);
 btf_16_adds_subs_sse2(x[9], x[10]);
 btf_16_subs_adds_sse2(x[15], x[12]);
 btf_16_subs_adds_sse2(x[14], x[13]);
 idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

 // stage 8
 btf_16_adds_subs_sse2(x[0], x[7]);
 btf_16_adds_subs_sse2(x[1], x[6]);
 btf_16_adds_subs_sse2(x[2], x[5]);
 btf_16_adds_subs_sse2(x[3], x[4]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
 idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

 // stages 9~11: tail shared with the other idct64 variants.
 idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
 idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
 idct64_stage11_sse2(output, x);
}
   1448 
// Inverse 64-point DCT (8 columns per call) for inputs where only the
// lowest-frequency 32 rows are non-zero.  Stages 2 and 3 still use the
// single-input btf_16_ssse3 form because each butterfly's partner
// coefficient is zero; later stages share the idct64_stage*_sse2 helpers
// with the other variants.
static void idct64_low32_ssse3(const __m128i *input, __m128i *output) {
 const int8_t cos_bit = INV_COS_BIT;
 const int32_t *cospi = cospi_arr(INV_COS_BIT);
 // NOTE: cos_bit and __rounding are referenced by these exact names from
 // inside the btf_16_* macros below.
 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
 const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
 const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
 const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
 const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

 // stage 1: scatter the 32 non-zero inputs to their butterfly positions.
 __m128i x[64];
 x[0] = input[0];
 x[2] = input[16];
 x[4] = input[8];
 x[6] = input[24];
 x[8] = input[4];
 x[10] = input[20];
 x[12] = input[12];
 x[14] = input[28];
 x[16] = input[2];
 x[18] = input[18];
 x[20] = input[10];
 x[22] = input[26];
 x[24] = input[6];
 x[26] = input[22];
 x[28] = input[14];
 x[30] = input[30];
 x[32] = input[1];
 x[34] = input[17];
 x[36] = input[9];
 x[38] = input[25];
 x[40] = input[5];
 x[42] = input[21];
 x[44] = input[13];
 x[46] = input[29];
 x[48] = input[3];
 x[50] = input[19];
 x[52] = input[11];
 x[54] = input[27];
 x[56] = input[7];
 x[58] = input[23];
 x[60] = input[15];
 x[62] = input[31];

 // stage 2
 btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
 btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
 btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
 btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
 btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
 btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
 btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
 btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
 btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
 btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
 btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
 btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
 btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
 btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
 btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
 btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

 // stage 3
 btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
 btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
 btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
 btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
 btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
 btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
 btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
 btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
 btf_16_adds_subs_sse2(x[32], x[33]);
 btf_16_subs_adds_sse2(x[35], x[34]);
 btf_16_adds_subs_sse2(x[36], x[37]);
 btf_16_subs_adds_sse2(x[39], x[38]);
 btf_16_adds_subs_sse2(x[40], x[41]);
 btf_16_subs_adds_sse2(x[43], x[42]);
 btf_16_adds_subs_sse2(x[44], x[45]);
 btf_16_subs_adds_sse2(x[47], x[46]);
 btf_16_adds_subs_sse2(x[48], x[49]);
 btf_16_subs_adds_sse2(x[51], x[50]);
 btf_16_adds_subs_sse2(x[52], x[53]);
 btf_16_subs_adds_sse2(x[55], x[54]);
 btf_16_adds_subs_sse2(x[56], x[57]);
 btf_16_subs_adds_sse2(x[59], x[58]);
 btf_16_adds_subs_sse2(x[60], x[61]);
 btf_16_subs_adds_sse2(x[63], x[62]);

 // stage 4
 btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
 btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
 btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
 btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
 btf_16_adds_subs_sse2(x[16], x[17]);
 btf_16_subs_adds_sse2(x[19], x[18]);
 btf_16_adds_subs_sse2(x[20], x[21]);
 btf_16_subs_adds_sse2(x[23], x[22]);
 btf_16_adds_subs_sse2(x[24], x[25]);
 btf_16_subs_adds_sse2(x[27], x[26]);
 btf_16_adds_subs_sse2(x[28], x[29]);
 btf_16_subs_adds_sse2(x[31], x[30]);
 idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);

 // stage 5
 btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
 btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
 btf_16_adds_subs_sse2(x[8], x[9]);
 btf_16_subs_adds_sse2(x[11], x[10]);
 btf_16_adds_subs_sse2(x[12], x[13]);
 btf_16_subs_adds_sse2(x[15], x[14]);
 idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);

 // stage 6
 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
 btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
 btf_16_adds_subs_sse2(x[4], x[5]);
 btf_16_subs_adds_sse2(x[7], x[6]);
 btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
 btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
 idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);

 // stage 7
 btf_16_adds_subs_sse2(x[0], x[3]);
 btf_16_adds_subs_sse2(x[1], x[2]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
 btf_16_adds_subs_sse2(x[8], x[11]);
 btf_16_adds_subs_sse2(x[9], x[10]);
 btf_16_subs_adds_sse2(x[15], x[12]);
 btf_16_subs_adds_sse2(x[14], x[13]);
 idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

 // stage 8
 btf_16_adds_subs_sse2(x[0], x[7]);
 btf_16_adds_subs_sse2(x[1], x[6]);
 btf_16_adds_subs_sse2(x[2], x[5]);
 btf_16_adds_subs_sse2(x[3], x[4]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
 idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

 // stage 9~11
 idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
 idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
 idct64_stage11_sse2(output, x);
}
   1596 
   1597 static void iadst4_sse2(const __m128i *input, __m128i *output) {
   1598  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
   1599  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
   1600  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
   1601  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
   1602  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
   1603  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
   1604  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
   1605  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
   1606  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
   1607  __m128i x0[4];
   1608  x0[0] = input[0];
   1609  x0[1] = input[1];
   1610  x0[2] = input[2];
   1611  x0[3] = input[3];
   1612 
   1613  __m128i u[4];
   1614  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
   1615  u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
   1616  u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
   1617  u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);
   1618 
   1619  __m128i x1[16];
   1620  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
   1621  x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
   1622  x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
   1623  x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
   1624  x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02);  // x1*sin3 + x3*sin2
   1625  x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
   1626  x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04);  // x1*sin3 - x3*sin4
   1627  x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
   1628  x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
   1629  x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
   1630  x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03);  // x2*sin3
   1631  x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
   1632  x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
   1633  x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02);
   1634  x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
   1635  x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01);
   1636 
   1637  __m128i x2[8];
   1638  x2[0] = _mm_add_epi32(x1[0], x1[4]);  // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2
   1639  x2[1] = _mm_add_epi32(x1[1], x1[5]);
   1640  x2[2] = _mm_add_epi32(x1[2], x1[6]);  // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4
   1641  x2[3] = _mm_add_epi32(x1[3], x1[7]);
   1642  x2[4] = _mm_add_epi32(x1[8], x1[10]);  // x0*sin3 -x2*sin3 +x3*sin3
   1643  x2[5] = _mm_add_epi32(x1[9], x1[11]);
   1644  x2[6] = _mm_add_epi32(x1[12], x1[14]);  // x0*sin1 +x2*sin4 +x0*sin2 -x2*sin1
   1645  x2[7] = _mm_add_epi32(x1[13], x1[15]);
   1646 
   1647  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
   1648  for (int i = 0; i < 4; ++i) {
   1649    __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
   1650    __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
   1651    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
   1652    out1 = _mm_srai_epi32(out1, INV_COS_BIT);
   1653    output[i] = _mm_packs_epi32(out0, out1);
   1654  }
   1655 }
   1656 
   1657 static void iadst4_w4_sse2(const __m128i *input, __m128i *output) {
   1658  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
   1659  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
   1660  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
   1661  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
   1662  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
   1663  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
   1664  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
   1665  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
   1666  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
   1667  __m128i x0[4];
   1668  x0[0] = input[0];
   1669  x0[1] = input[1];
   1670  x0[2] = input[2];
   1671  x0[3] = input[3];
   1672 
   1673  __m128i u[2];
   1674  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
   1675  u[1] = _mm_unpacklo_epi16(x0[1], x0[3]);
   1676 
   1677  __m128i x1[8];
   1678  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
   1679  x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
   1680  x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02);  // x1*sin3 + x3*sin2
   1681  x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04);  // x1*sin3 - x3*sin4
   1682  x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
   1683  x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03);    // x2*sin3
   1684  x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
   1685  x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
   1686 
   1687  __m128i x2[4];
   1688  x2[0] = _mm_add_epi32(x1[0], x1[2]);  // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
   1689  x2[1] = _mm_add_epi32(x1[1], x1[3]);  // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
   1690  x2[2] = _mm_add_epi32(x1[4], x1[5]);  // x0*sin3 - x2*sin3 + x3*sin3
   1691  x2[3] = _mm_add_epi32(x1[6], x1[7]);  // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1
   1692 
   1693  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
   1694  for (int i = 0; i < 4; ++i) {
   1695    __m128i out0 = _mm_add_epi32(x2[i], rounding);
   1696    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
   1697    output[i] = _mm_packs_epi32(out0, out0);
   1698  }
   1699 }
   1700 
// 8-point inverse ADST for the DC-only case (only input[0] is non-zero).
// Zero partner values collapse the stage-3/5 butterflies into copies.
void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output) {
 const int8_t cos_bit = INV_COS_BIT;
 const int32_t *cospi = cospi_arr(INV_COS_BIT);
 const __m128i __zero = _mm_setzero_si128();
 // NOTE: cos_bit and __rounding are referenced by these exact names from
 // inside the btf_16_* macros below.
 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

 const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
 const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
 const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

 // stage 1
 __m128i x[8];
 x[1] = input[0];

 // stage 2: single-input rotation of the only non-zero coefficient.
 btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);

 // stage 3: add/sub with zero partners reduces to copies.
 x[4] = x[0];
 x[5] = x[1];

 // stage 4
 btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);

 // stage 5: copies again (zero partners).
 x[2] = x[0];
 x[3] = x[1];
 x[6] = x[4];
 x[7] = x[5];

 // stage 6
 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);

 // stage 7: output permutation; odd-indexed outputs are negated.
 output[0] = x[0];
 output[1] = _mm_subs_epi16(__zero, x[4]);
 output[2] = x[6];
 output[3] = _mm_subs_epi16(__zero, x[2]);
 output[4] = x[3];
 output[5] = _mm_subs_epi16(__zero, x[7]);
 output[6] = x[5];
 output[7] = _mm_subs_epi16(__zero, x[1]);
}
   1746 
// Full 8-point inverse ADST over 8 columns of 16-bit coefficients.
void av1_iadst8_sse2(const __m128i *input, __m128i *output) {
 const int8_t cos_bit = INV_COS_BIT;
 const int32_t *cospi = cospi_arr(INV_COS_BIT);
 const __m128i __zero = _mm_setzero_si128();
 // NOTE: cos_bit and __rounding are referenced by these exact names from
 // inside the btf_16_sse2 macro below.
 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

 const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
 const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
 const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
 const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
 const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
 const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
 const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
 const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
 const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
 const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
 const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
 const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

 // stage 1: input permutation required by the ADST flow graph.
 __m128i x[8];
 x[0] = input[7];
 x[1] = input[0];
 x[2] = input[5];
 x[3] = input[2];
 x[4] = input[3];
 x[5] = input[4];
 x[6] = input[1];
 x[7] = input[6];

 // stage 2: pairwise rotations.
 btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
 btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
 btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
 btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);

 // stage 3
 btf_16_adds_subs_sse2(x[0], x[4]);
 btf_16_adds_subs_sse2(x[1], x[5]);
 btf_16_adds_subs_sse2(x[2], x[6]);
 btf_16_adds_subs_sse2(x[3], x[7]);

 // stage 4
 btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
 btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);

 // stage 5
 btf_16_adds_subs_sse2(x[0], x[2]);
 btf_16_adds_subs_sse2(x[1], x[3]);
 btf_16_adds_subs_sse2(x[4], x[6]);
 btf_16_adds_subs_sse2(x[5], x[7]);

 // stage 6
 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);

 // stage 7: output permutation; odd-indexed outputs are negated.
 output[0] = x[0];
 output[1] = _mm_subs_epi16(__zero, x[4]);
 output[2] = x[6];
 output[3] = _mm_subs_epi16(__zero, x[2]);
 output[4] = x[3];
 output[5] = _mm_subs_epi16(__zero, x[7]);
 output[6] = x[5];
 output[7] = _mm_subs_epi16(__zero, x[1]);
}
   1814 
// 4-column variant of av1_iadst8_sse2: identical flow graph, but rotations
// go through btf_16_4p_sse2 (the 4-sample butterfly form).
static void iadst8_w4_sse2(const __m128i *input, __m128i *output) {
 const int8_t cos_bit = INV_COS_BIT;
 const int32_t *cospi = cospi_arr(INV_COS_BIT);
 const __m128i __zero = _mm_setzero_si128();
 // NOTE: cos_bit and __rounding are referenced by these exact names from
 // inside the btf_16_4p_sse2 macro below.
 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

 const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
 const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
 const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
 const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
 const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
 const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
 const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
 const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
 const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
 const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
 const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
 const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

 // stage 1: input permutation required by the ADST flow graph.
 __m128i x[8];
 x[0] = input[7];
 x[1] = input[0];
 x[2] = input[5];
 x[3] = input[2];
 x[4] = input[3];
 x[5] = input[4];
 x[6] = input[1];
 x[7] = input[6];

 // stage 2
 btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
 btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
 btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
 btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);

 // stage 3
 btf_16_adds_subs_sse2(x[0], x[4]);
 btf_16_adds_subs_sse2(x[1], x[5]);
 btf_16_adds_subs_sse2(x[2], x[6]);
 btf_16_adds_subs_sse2(x[3], x[7]);

 // stage 4
 btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
 btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);

 // stage 5
 btf_16_adds_subs_sse2(x[0], x[2]);
 btf_16_adds_subs_sse2(x[1], x[3]);
 btf_16_adds_subs_sse2(x[4], x[6]);
 btf_16_adds_subs_sse2(x[5], x[7]);

 // stage 6
 btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
 btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);

 // stage 7: output permutation; odd-indexed outputs are negated.
 output[0] = x[0];
 output[1] = _mm_subs_epi16(__zero, x[4]);
 output[2] = x[6];
 output[3] = _mm_subs_epi16(__zero, x[2]);
 output[4] = x[3];
 output[5] = _mm_subs_epi16(__zero, x[7]);
 output[6] = x[5];
 output[7] = _mm_subs_epi16(__zero, x[1]);
}
   1882 
   1883 static inline void iadst16_stage3_ssse3(__m128i *x) {
   1884  btf_16_adds_subs_sse2(x[0], x[8]);
   1885  btf_16_adds_subs_sse2(x[1], x[9]);
   1886  btf_16_adds_subs_sse2(x[2], x[10]);
   1887  btf_16_adds_subs_sse2(x[3], x[11]);
   1888  btf_16_adds_subs_sse2(x[4], x[12]);
   1889  btf_16_adds_subs_sse2(x[5], x[13]);
   1890  btf_16_adds_subs_sse2(x[6], x[14]);
   1891  btf_16_adds_subs_sse2(x[7], x[15]);
   1892 }
   1893 
// Stage 4 of the 16-point inverse ADST: rotate the four pairs of the upper
// half x[8..15] with the (8,56) and (40,24) cospi constant pairs.
// __rounding and cos_bit are consumed inside the btf_16_sse2 macro.
static inline void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi,
                                       const __m128i __rounding,
                                       int8_t cos_bit) {
 const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
 const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
 const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
 const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
 const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
 const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
 btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
 btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
 btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
 btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
}
   1908 
   1909 static inline void iadst16_stage5_ssse3(__m128i *x) {
   1910  btf_16_adds_subs_sse2(x[0], x[4]);
   1911  btf_16_adds_subs_sse2(x[1], x[5]);
   1912  btf_16_adds_subs_sse2(x[2], x[6]);
   1913  btf_16_adds_subs_sse2(x[3], x[7]);
   1914  btf_16_adds_subs_sse2(x[8], x[12]);
   1915  btf_16_adds_subs_sse2(x[9], x[13]);
   1916  btf_16_adds_subs_sse2(x[10], x[14]);
   1917  btf_16_adds_subs_sse2(x[11], x[15]);
   1918 }
   1919 
   1920 static inline void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi,
   1921                                        const __m128i __rounding,
   1922                                        int8_t cos_bit) {
   1923  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
   1924  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
   1925  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
   1926  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
   1927  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
   1928  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
   1929  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
   1930 }
   1931 
   1932 static inline void iadst16_stage7_ssse3(__m128i *x) {
   1933  btf_16_adds_subs_sse2(x[0], x[2]);
   1934  btf_16_adds_subs_sse2(x[1], x[3]);
   1935  btf_16_adds_subs_sse2(x[4], x[6]);
   1936  btf_16_adds_subs_sse2(x[5], x[7]);
   1937  btf_16_adds_subs_sse2(x[8], x[10]);
   1938  btf_16_adds_subs_sse2(x[9], x[11]);
   1939  btf_16_adds_subs_sse2(x[12], x[14]);
   1940  btf_16_adds_subs_sse2(x[13], x[15]);
   1941 }
   1942 
   1943 static inline void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi,
   1944                                        const __m128i __rounding,
   1945                                        int8_t cos_bit) {
   1946  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
   1947  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
   1948  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
   1949  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
   1950  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
   1951  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
   1952 }
   1953 
   1954 static inline void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
   1955  const __m128i __zero = _mm_setzero_si128();
   1956  output[0] = x[0];
   1957  output[1] = _mm_subs_epi16(__zero, x[8]);
   1958  output[2] = x[12];
   1959  output[3] = _mm_subs_epi16(__zero, x[4]);
   1960  output[4] = x[6];
   1961  output[5] = _mm_subs_epi16(__zero, x[14]);
   1962  output[6] = x[10];
   1963  output[7] = _mm_subs_epi16(__zero, x[2]);
   1964  output[8] = x[3];
   1965  output[9] = _mm_subs_epi16(__zero, x[11]);
   1966  output[10] = x[15];
   1967  output[11] = _mm_subs_epi16(__zero, x[7]);
   1968  output[12] = x[5];
   1969  output[13] = _mm_subs_epi16(__zero, x[13]);
   1970  output[14] = x[9];
   1971  output[15] = _mm_subs_epi16(__zero, x[1]);
   1972 }
   1973 
// 16-point inverse ADST for the DC-only case (only input[0] is non-zero).
// Zero partner values collapse the stage-3/5/7 butterflies into copies.
static void iadst16_low1_ssse3(const __m128i *input, __m128i *output) {
 const int8_t cos_bit = INV_COS_BIT;
 const int32_t *cospi = cospi_arr(INV_COS_BIT);
 // NOTE: cos_bit and __rounding are referenced by these exact names from
 // inside the btf_16_* macros below.
 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

 const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
 const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
 const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
 const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);

 // stage 1
 __m128i x[16];
 x[1] = input[0];

 // stage 2: single-input rotation of the only non-zero coefficient.
 btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);

 // stage 3: add/sub with zero partners reduces to copies.
 x[8] = x[0];
 x[9] = x[1];

 // stage 4
 btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);

 // stage 5: copies again (zero partners).
 x[4] = x[0];
 x[5] = x[1];
 x[12] = x[8];
 x[13] = x[9];

 // stage 6
 btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
 btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);

 // stage 7: copies (zero partners).
 x[2] = x[0];
 x[3] = x[1];
 x[6] = x[4];
 x[7] = x[5];
 x[10] = x[8];
 x[11] = x[9];
 x[14] = x[12];
 x[15] = x[13];

 // stages 8~9 shared with the other iadst16 variants.
 iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
 iadst16_stage9_ssse3(output, x);
}
   2021 
// 16-point inverse ADST when only the first 8 input rows are non-zero.
// Stage 2 uses the single-input btf_16_ssse3 rotations (each butterfly's
// partner coefficient is zero); stages 3~9 share the iadst16_stage*
// helpers with the full transform.
static void iadst16_low8_ssse3(const __m128i *input, __m128i *output) {
 const int8_t cos_bit = INV_COS_BIT;
 const int32_t *cospi = cospi_arr(INV_COS_BIT);
 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

 // stage 1: scatter the 8 non-zero inputs to their butterfly positions.
 __m128i x[16];
 x[1] = input[0];
 x[3] = input[2];
 x[5] = input[4];
 x[7] = input[6];
 x[8] = input[7];
 x[10] = input[5];
 x[12] = input[3];
 x[14] = input[1];

 // stage 2
 btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
 btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
 btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
 btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
 btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
 btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
 btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
 btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);

 // stage 3~9
 iadst16_stage3_ssse3(x);
 iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
 iadst16_stage5_ssse3(x);
 iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
 iadst16_stage7_ssse3(x);
 iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
 iadst16_stage9_ssse3(output, x);
}
// Full 16-point inverse ADST over 8 columns of 16-bit coefficients.
static void iadst16_sse2(const __m128i *input, __m128i *output) {
 const int8_t cos_bit = INV_COS_BIT;
 const int32_t *cospi = cospi_arr(INV_COS_BIT);
 // NOTE: cos_bit and __rounding are referenced by these exact names from
 // inside the btf_16_sse2 macro below.
 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
 const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
 const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
 const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
 const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
 const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
 const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
 const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
 const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
 const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
 const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
 const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
 const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
 const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
 const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
 const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
 const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);

 // stage 1: input permutation required by the ADST flow graph.
 __m128i x[16];
 x[0] = input[15];
 x[1] = input[0];
 x[2] = input[13];
 x[3] = input[2];
 x[4] = input[11];
 x[5] = input[4];
 x[6] = input[9];
 x[7] = input[6];
 x[8] = input[7];
 x[9] = input[8];
 x[10] = input[5];
 x[11] = input[10];
 x[12] = input[3];
 x[13] = input[12];
 x[14] = input[1];
 x[15] = input[14];

 // stage 2: pairwise rotations.
 btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
 btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
 btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
 btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
 btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
 btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
 btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
 btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);

 // stage 3~9
 iadst16_stage3_ssse3(x);
 iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
 iadst16_stage5_ssse3(x);
 iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
 iadst16_stage7_ssse3(x);
 iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
 iadst16_stage9_ssse3(output, x);
}
   2116 
// 16-point inverse ADST for blocks only 4 pixels wide. Identical flow to
// iadst16_sse2, but the butterflies use btf_16_4p_sse2 (4-pixel variant),
// so stages 4, 6 and 8 are written out here instead of calling the shared
// 8-wide stage helpers.
static void iadst16_w4_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Fixed-point rounding offset consumed by the butterfly helpers below.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Rotation constants for stage 2 (odd angles) and the inlined stages
  // 4, 6 and 8: interleaved pairs of cospi values.
  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
  const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

  // stage 1: reorder the inputs into the ADST working order (even slots take
  // inputs from the end moving backwards, odd slots from the start forwards).
  __m128i x[16];
  x[0] = input[15];
  x[1] = input[0];
  x[2] = input[13];
  x[3] = input[2];
  x[4] = input[11];
  x[5] = input[4];
  x[6] = input[9];
  x[7] = input[6];
  x[8] = input[7];
  x[9] = input[8];
  x[10] = input[5];
  x[11] = input[10];
  x[12] = input[3];
  x[13] = input[12];
  x[14] = input[1];
  x[15] = input[14];

  // stage 2: pairwise rotations by the odd cosine angles.
  btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
  btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
  btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
  btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
  btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
  btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);

  // stage 3: add/sub pass (width-independent helper).
  iadst16_stage3_ssse3(x);

  // stage 4: rotations on the upper half, 4-pixel butterflies.
  btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
  btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
  btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
  btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);

  // stage 5: add/sub pass.
  iadst16_stage5_ssse3(x);

  // stage 6: 16/48-angle rotations, 4-pixel butterflies.
  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);

  // stage 7: add/sub pass.
  iadst16_stage7_ssse3(x);

  // stage 8: final 32-angle rotations, 4-pixel butterflies.
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);

  // stage 9: output permutation and sign flips.
  iadst16_stage9_ssse3(output, x);
}
   2209 
   2210 static void iidentity4_ssse3(const __m128i *input, __m128i *output) {
   2211  const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
   2212  const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
   2213  for (int i = 0; i < 4; ++i) {
   2214    __m128i x = _mm_mulhrs_epi16(input[i], scale);
   2215    output[i] = _mm_adds_epi16(x, input[i]);
   2216  }
   2217 }
   2218 
   2219 static void iidentity8_sse2(const __m128i *input, __m128i *output) {
   2220  for (int i = 0; i < 8; ++i) {
   2221    output[i] = _mm_adds_epi16(input[i], input[i]);
   2222  }
   2223 }
   2224 
   2225 static void iidentity16_ssse3(const __m128i *input, __m128i *output) {
   2226  const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
   2227  const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
   2228  for (int i = 0; i < 16; ++i) {
   2229    __m128i x = _mm_mulhrs_epi16(input[i], scale);
   2230    __m128i srcx2 = _mm_adds_epi16(input[i], input[i]);
   2231    output[i] = _mm_adds_epi16(x, srcx2);
   2232  }
   2233 }
   2234 
   2235 static inline __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
   2236                                               __m128i res) {
   2237  const __m128i zero = _mm_setzero_si128();
   2238  __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
   2239  return _mm_packus_epi16(x0, x0);
   2240 }
   2241 
   2242 static inline void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
   2243                                               int stride, int flipud,
   2244                                               const int height) {
   2245  int j = flipud ? (height - 1) : 0;
   2246  const int step = flipud ? -1 : 1;
   2247  const __m128i zero = _mm_setzero_si128();
   2248  for (int i = 0; i < height; ++i, j += step) {
   2249    const __m128i v = _mm_cvtsi32_si128(*((int *)(output + i * stride)));
   2250    __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
   2251    u = _mm_packus_epi16(u, zero);
   2252    *((int *)(output + i * stride)) = _mm_cvtsi128_si32(u);
   2253  }
   2254 }
   2255 
   2256 static inline void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
   2257                                               int stride, int flipud,
   2258                                               const int height) {
   2259  int j = flipud ? (height - 1) : 0;
   2260  const int step = flipud ? -1 : 1;
   2261  for (int i = 0; i < height; ++i, j += step) {
   2262    const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
   2263    const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
   2264    _mm_storel_epi64((__m128i *)(output + i * stride), u);
   2265  }
   2266 }
   2267 
// 1D transform functions that process 8 pixels at one time.
// Indexed by [tx-size index][1D tx type: DCT / ADST / identity].
// ADST and identity kernels do not exist for the 32- and 64-point sizes.
static const transform_1d_ssse3
    lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
      { idct4_sse2, iadst4_sse2, iidentity4_ssse3 },
      { av1_idct8_sse2, av1_iadst8_sse2, iidentity8_sse2 },
      { idct16_sse2, iadst16_sse2, iidentity16_ssse3 },
      { idct32_sse2, NULL, NULL },
      { idct64_low32_ssse3, NULL, NULL },
    };
   2277 
// functions for blocks with eob at DC and within
// topleft 8x8, 16x16, 32x32 corner
// Indexed by [tx-size index][1D tx type][lowbd_txfm_all_1d_zeros_idx bucket];
// later buckets handle a larger count of nonzero input coefficients.
static const transform_1d_ssse3
    lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { idct4_sse2, idct4_sse2, NULL, NULL },
          { iadst4_sse2, iadst4_sse2, NULL, NULL },
          { iidentity4_ssse3, iidentity4_ssse3, NULL, NULL },
      },
      { { av1_idct8_low1_ssse3, av1_idct8_sse2, NULL, NULL },
        { av1_iadst8_low1_ssse3, av1_iadst8_sse2, NULL, NULL },
        { iidentity8_sse2, iidentity8_sse2, NULL, NULL } },
      {
          { idct16_low1_ssse3, idct16_low8_ssse3, idct16_sse2, NULL },
          { iadst16_low1_ssse3, iadst16_low8_ssse3, iadst16_sse2, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32_low1_ssse3, idct32_low8_ssse3, idct32_low16_ssse3,
          idct32_sse2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      { { idct64_low1_ssse3, idct64_low8_ssse3, idct64_low16_ssse3,
          idct64_low32_ssse3 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };
   2304 
// 1D transform functions that process 4 pixels at one time.
// used in 4x4, 4x8, 4x16, 8x4, 16x4
// Indexed by [tx-size index][1D tx type: DCT / ADST / identity].
static const transform_1d_ssse3
    lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
      { idct4_w4_sse2, iadst4_w4_sse2, iidentity4_ssse3 },
      { idct8_w4_sse2, iadst8_w4_sse2, iidentity8_sse2 },
      { idct16_w4_sse2, iadst16_w4_sse2, iidentity16_ssse3 },
      { NULL, NULL, NULL },
      { NULL, NULL, NULL },
    };
   2315 
// Row identity transform for 8 columns x `height` rows: load the 32-bit
// input coefficients as 16-bit values, scale them by NewSqrt2list[txw_idx]
// and apply the row shift in a single fused multiply-add per half-register.
static inline void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input,
                                           int stride, int shift, int height,
                                           int txw_idx, int rect_type) {
  const int32_t *input_row = input;
  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
  // Merged rounding term for both the Sqrt2 scaling and the row shift.
  const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
                                          (1 << (NewSqrt2Bits - shift - 1)));
  const __m128i one = _mm_set1_epi16(1);
  // Interleave (scale, rounding) so that madd against (src, 1) pairs
  // computes src * scale + rounding in one instruction.
  const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
  if (rect_type != 1 && rect_type != -1) {
    for (int i = 0; i < height; ++i) {
      const __m128i src = load_32bit_to_16bit(input_row);
      input_row += stride;
      __m128i lo = _mm_unpacklo_epi16(src, one);
      __m128i hi = _mm_unpackhi_epi16(src, one);
      lo = _mm_madd_epi16(lo, scale_rounding);
      hi = _mm_madd_epi16(hi, scale_rounding);
      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm_packs_epi32(lo, hi);
    }
  } else {
    // Rectangular blocks are additionally pre-scaled by 1/Sqrt2 via a Q15
    // rounding multiply before the identity scaling above.
    const __m128i rect_scale =
        _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
    for (int i = 0; i < height; ++i) {
      __m128i src = load_32bit_to_16bit(input_row);
      src = _mm_mulhrs_epi16(src, rect_scale);
      input_row += stride;
      __m128i lo = _mm_unpacklo_epi16(src, one);
      __m128i hi = _mm_unpackhi_epi16(src, one);
      lo = _mm_madd_epi16(lo, scale_rounding);
      hi = _mm_madd_epi16(hi, scale_rounding);
      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm_packs_epi32(lo, hi);
    }
  }
}
   2354 
// Column identity transform plus reconstruction: scale each 8-wide row of
// buf by NewSqrt2list[txh_idx], apply the column shift, then add the
// predictor bytes and store the clamped result.
static inline void iidentity_col_8xn_ssse3(uint8_t *output, int stride,
                                           __m128i *buf, int shift, int height,
                                           int txh_idx) {
  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
  const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
  // shift is negative here: the final stage right-shifts by -shift, with
  // this rounding offset added first.
  const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
  const __m128i one = _mm_set1_epi16(1);
  // Interleave (scale, rounding) so that madd against (src, 1) pairs
  // computes src * scale + rounding in one instruction.
  const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
  const __m128i zero = _mm_setzero_si128();
  for (int h = 0; h < height; ++h) {
    __m128i lo = _mm_unpacklo_epi16(buf[h], one);
    __m128i hi = _mm_unpackhi_epi16(buf[h], one);
    lo = _mm_madd_epi16(lo, scale_coeff);
    hi = _mm_madd_epi16(hi, scale_coeff);
    lo = _mm_srai_epi32(lo, NewSqrt2Bits);
    hi = _mm_srai_epi32(hi, NewSqrt2Bits);
    lo = _mm_add_epi32(lo, shift_rounding);
    hi = _mm_add_epi32(hi, shift_rounding);
    lo = _mm_srai_epi32(lo, -shift);
    hi = _mm_srai_epi32(hi, -shift);
    __m128i x = _mm_packs_epi32(lo, hi);

    // Add the predictor (widened to 16 bits) and clamp to [0, 255].
    const __m128i pred = _mm_loadl_epi64((__m128i const *)(output));
    x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
    const __m128i u = _mm_packus_epi16(x, x);
    _mm_storel_epi64((__m128i *)(output), u);
    output += stride;
  }
}
   2384 
   2385 void av1_lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output,
   2386                                         int stride, TX_SIZE tx_size) {
   2387  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   2388  const int txw_idx = get_txw_idx(tx_size);
   2389  const int txh_idx = get_txh_idx(tx_size);
   2390  const int txfm_size_col = tx_size_wide[tx_size];
   2391  const int txfm_size_row = tx_size_high[tx_size];
   2392  const int col_max = AOMMIN(32, txfm_size_col);
   2393  const int row_max = AOMMIN(32, txfm_size_row);
   2394  const int input_stride = row_max;
   2395  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
   2396 
   2397  for (int i = 0; i < (col_max >> 3); ++i) {
   2398    for (int j = 0; j < (row_max >> 3); j++) {
   2399      __m128i buf[8];
   2400      iidentity_row_8xn_ssse3(buf, input + j * 8 + i * 8 * input_stride,
   2401                              row_max, shift[0], 8, txw_idx, rect_type);
   2402      transpose_16bit_8x8(buf, buf);
   2403      iidentity_col_8xn_ssse3(output + i * 8 + j * 8 * stride, stride, buf,
   2404                              shift[1], 8, txh_idx);
   2405    }
   2406  }
   2407 }
   2408 
// 4x4 inverse transform + reconstruction. tx_size_ and eob are unused: the
// block is small enough that all coefficients are always processed.
static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, TX_SIZE tx_size_,
                                           int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[4];
  const TX_SIZE tx_size = TX_4X4;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
  row_txfm(buf, buf);
  // Transpose rows -> columns; the horizontal flip is folded into the
  // transpose input.
  if (lr_flip) {
    __m128i temp[4];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_4x4(temp, buf);
  } else {
    transpose_16bit_4x4(buf, buf);
  }
  col_txfm(buf, buf);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  // The vertical flip (ud_flip) is applied at write-out time.
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
   2443 
   2444 static inline __m128i lowbd_get_recon_16x16_sse2(const __m128i pred,
   2445                                                 __m128i res0, __m128i res1) {
   2446  const __m128i zero = _mm_setzero_si128();
   2447  __m128i x0 = _mm_unpacklo_epi8(pred, zero);
   2448  __m128i x1 = _mm_unpackhi_epi8(pred, zero);
   2449  x0 = _mm_adds_epi16(res0, x0);
   2450  x1 = _mm_adds_epi16(res1, x1);
   2451  return _mm_packus_epi16(x0, x1);
   2452 }
   2453 
   2454 static inline void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output,
   2455                                                int stride, int flipud,
   2456                                                int height) {
   2457  int j = flipud ? (height - 1) : 0;
   2458  const int step = flipud ? -1 : 1;
   2459  for (int i = 0; i < height; ++i, j += step) {
   2460    __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
   2461    __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]);
   2462    _mm_storeu_si128((__m128i *)(output + i * stride), u);
   2463  }
   2464 }
   2465 
   2466 static inline void round_shift_ssse3(const __m128i *input, __m128i *output,
   2467                                     int size) {
   2468  const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8);
   2469  for (int i = 0; i < size; ++i) {
   2470    output[i] = _mm_mulhrs_epi16(input[i], scale);
   2471  }
   2472 }
   2473 
// General (no identity axis) 2D inverse transform + reconstruction.
// Rows are transformed 8 at a time, transposed into buf1 in column-major
// groups of 8 (group i at buf1 + i * txfm_size_row), then the column
// transform runs in place and results are added into the predictor.
// The eob-derived bounds skip row/column work on all-zero regions.
static inline void lowbd_inv_txfm2d_add_no_identity_ssse3(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m128i buf1[64 * 8];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;
  // Round the nonzero extents up to whole 8-wide groups.
  const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Pick row/column kernels sized to the nonzero coefficient extent.
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    __m128i buf0[64];
    load_buffer_32bit_to_16bit(input + 8 * i, input_stride, buf0,
                               buf_size_nonzero_w);
    if (rect_type == 1 || rect_type == -1) {
      round_shift_ssse3(buf0, buf0, buf_size_nonzero_w);  // rect special code
    }
    row_txfm(buf0, buf0);
    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
    __m128i *_buf1 = buf1 + i * 8;
    // Transpose each 8x8 tile into buf1; the horizontal flip is realized by
    // flipping within a tile and reversing the tile order.
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        __m128i temp[8];
        flip_buf_sse2(buf0 + 8 * j, temp, 8);
        transpose_16bit_8x8(temp,
                            _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
      }
    }
  }
  // Column transform over every 8-wide group, in place.
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row);
    round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
  }

  // Write out 16 or 8 pixels per row; the vertical flip is applied here.
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
                                   output + 16 * i, stride, ud_flip,
                                   txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row);
  }
}
   2540 
// 2D inverse transform with identity along rows and a real transform along
// columns (V_DCT / V_ADST / V_FLIPADST). Processes the block in 8-wide
// column strips: identity-scale and transpose each 8x8 tile, run the column
// transform, then add into the predictor.
void av1_lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input,
                                               uint8_t *output, int stride,
                                               TX_TYPE tx_type, TX_SIZE tx_size,
                                               int eob) {
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  int eobx, eoby;
  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  // Round the nonzero extents up to whole 8-wide groups.
  const int buf_size_w_div8 = (eobx + 8) >> 3;
  const int buf_size_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Column kernel sized to the nonzero coefficient extent.
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
  assert(fun_idx < 5);
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];

  assert(col_txfm != NULL);

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_w_div8; i++) {
    __m128i buf0[64];
    for (int j = 0; j < buf_size_h_div8; j++) {
      __m128i *buf0_cur = buf0 + j * 8;
      const int32_t *input_cur = input + i * 8 * input_stride + j * 8;
      iidentity_row_8xn_ssse3(buf0_cur, input_cur, input_stride, shift[0], 8,
                              txw_idx, rect_type);
      transpose_16bit_8x8(buf0_cur, buf0_cur);
    }
    col_txfm(buf0, buf0);
    // shift[1] is negative: mulhrs by 1 << (15 + shift[1]) performs the
    // final rounded right shift by -shift[1].
    __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
    int k = ud_flip ? (txfm_size_row - 1) : 0;
    const int step = ud_flip ? -1 : 1;
    uint8_t *out = output + 8 * i;
    for (int j = 0; j < txfm_size_row; ++j, k += step) {
      const __m128i v = _mm_loadl_epi64((__m128i const *)(out));
      __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
      const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
      _mm_storel_epi64((__m128i *)(out), u);
      out += stride;
    }
  }
}
   2589 
// 2D inverse transform with a real transform along rows and identity along
// columns (H_DCT / H_ADST / H_FLIPADST). Processes the block in 8-high row
// strips: row transform, transpose into buf1, then identity-scale each
// 8-wide group while adding into the predictor.
void av1_lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input,
                                               uint8_t *output, int stride,
                                               TX_TYPE tx_type, TX_SIZE tx_size,
                                               int eob) {
  __m128i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;
  // Round the nonzero extents up to whole 8-wide groups.
  const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
  const int buf_size_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Row kernel sized to the nonzero coefficient extent.
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];

  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_h_div8; i++) {
    __m128i buf0[64];
    load_buffer_32bit_to_16bit(input + i * 8, input_stride, buf0,
                               buf_size_nonzero_w);
    if (rect_type == 1 || rect_type == -1) {
      round_shift_ssse3(buf0, buf0, buf_size_nonzero_w);  // rect special code
    }
    row_txfm(buf0, buf0);
    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
    __m128i *_buf1 = buf1;
    // Transpose each 8x8 tile; the horizontal flip is realized by flipping
    // within a tile and reversing the tile order.
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        __m128i temp[8];
        flip_buf_sse2(buf0 + 8 * j, temp, 8);
        transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
      }
    }

    for (int j = 0; j < buf_size_w_div8; ++j) {
      iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride,
                              buf1 + j * 8, shift[1], 8, txh_idx);
    }
  }
}
   2643 
   2644 // for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64
   2645 static inline void lowbd_inv_txfm2d_add_universe_ssse3(
   2646    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
   2647    TX_SIZE tx_size, int eob) {
   2648  switch (tx_type) {
   2649    case DCT_DCT:
   2650      lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
   2651                                             tx_size, eob);
   2652      break;
   2653    case IDTX:
   2654      av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
   2655      break;
   2656    case V_DCT:
   2657    case V_ADST:
   2658    case V_FLIPADST:
   2659      av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
   2660                                                tx_size, eob);
   2661      break;
   2662    case H_DCT:
   2663    case H_ADST:
   2664    case H_FLIPADST:
   2665      av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
   2666                                                tx_size, eob);
   2667      break;
   2668    default:
   2669      lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
   2670                                             tx_size, eob);
   2671      break;
   2672  }
   2673 }
   2674 
// 4x8 inverse transform + reconstruction. tx_size_ and eob are unused: the
// block is small enough that all coefficients are always processed.
static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, TX_SIZE tx_size_,
                                           int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[8];
  const TX_SIZE tx_size = TX_4X8;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  // 4-point rows use the 8-pixel kernels (8 rows packed per register);
  // 8-point columns use the 4-pixel kernels (4 columns wide).
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit(input, txfm_size_row, buf, txfm_size_col);
  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
  row_txfm(buf, buf);
  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
  // Transpose rows -> columns; the horizontal flip is folded into the
  // transpose input.
  if (lr_flip) {
    __m128i temp[4];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_8x4(temp, buf);
  } else {
    transpose_16bit_8x4(buf, buf);
  }
  col_txfm(buf, buf);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  // The vertical flip (ud_flip) is applied at write-out time.
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
   2711 
// 8x4 inverse transform + reconstruction. tx_size_ and eob are unused: the
// block is small enough that all coefficients are always processed.
static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, TX_SIZE tx_size_,
                                           int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[8];
  const TX_SIZE tx_size = TX_8X4;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  // 8-point rows use the 4-pixel kernels (4 rows wide); 4-point columns use
  // the 8-pixel kernels (8 columns packed per register).
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
  row_txfm(buf, buf);
  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
  // Transpose rows -> columns; the horizontal flip is folded into the
  // transpose input.
  if (lr_flip) {
    __m128i temp[8];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_4x8(temp, buf);
  } else {
    transpose_16bit_4x8(buf, buf);
  }
  col_txfm(buf, buf);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  // The vertical flip (ud_flip) is applied at write-out time.
  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
   2748 
// 2D inverse transform + reconstruction for a 4 (wide) x 16 (tall) block,
// low-bitdepth path.  The 16 rows are processed in two batches of 8, each
// batch is transposed so the columns land in SIMD lanes, then one 16-point
// column transform runs over the whole buffer and the residual is added to
// `output`.
static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input,
                                            uint8_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size_,
                                            int eob) {
  (void)tx_size_;  // size is hard-coded to TX_4X16 below
  (void)eob;
  __m128i buf[16];
  const TX_SIZE tx_size = TX_4X16;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];  // 4
  const int txfm_size_row = tx_size_high[tx_size];  // 16

  // Row pass uses the 8-wide 1D kernels, column pass the 4-wide ones.
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  const int row_one_loop = 8;  // rows handled per batch
  for (int i = 0; i < 2; ++i) {
    const int32_t *input_cur = input + i * row_one_loop;
    __m128i *buf_cur = buf + i * row_one_loop;
    load_buffer_32bit_to_16bit(input_cur, txfm_size_row, buf_cur,
                               txfm_size_col);
    if (row_txfm == iidentity4_ssse3) {
      // Identity row transform special case: fold identity4's sqrt2 scaling
      // and the shift[0] rounding into a single madd + arithmetic shift.
      // NOTE(review): the rounding constant 3 << (NewSqrt2Bits - 1) appears
      // to account for both rounding stages combined under the
      // (NewSqrt2Bits + 1)-bit shift -- confirm against the scalar reference
      // before changing this path.
      const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1));
      const __m128i ones = _mm_set1_epi16(1);
      for (int j = 0; j < 4; ++j) {
        // Interleave with 1s so _mm_madd_epi16 computes x*NewSqrt2 + round.
        const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones);
        const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones);
        const __m128i buf_32_lo =
            _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
        const __m128i buf_32_hi =
            _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
        // Saturating pack back to 16-bit coefficients.
        buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
      }
    } else {
      row_txfm(buf_cur, buf_cur);
      round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
    }
    // Transpose 8x4 -> 4x8 for the column pass; mirror the rows first when
    // the transform type requires a left-right flip.
    if (lr_flip) {
      __m128i temp[8];
      flip_buf_sse2(buf_cur, temp, txfm_size_col);
      transpose_16bit_8x4(temp, buf_cur);
    } else {
      transpose_16bit_8x4(buf_cur, buf_cur);
    }
  }
  col_txfm(buf, buf);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  // Clamp/add the residual into the destination; ud_flip reverses row order.
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
   2805 
// 2D inverse transform + reconstruction for a 16 (wide) x 4 (tall) block,
// low-bitdepth path.  One 16-point row pass runs over all coefficients; the
// result is transposed into two 8-column halves, each of which gets its own
// 4-point column pass and write-out.
static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input,
                                            uint8_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size_,
                                            int eob) {
  (void)tx_size_;  // size is hard-coded to TX_16X4 below
  (void)eob;
  __m128i buf[16];
  const TX_SIZE tx_size = TX_16X4;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];  // 16
  const int txfm_size_row = tx_size_high[tx_size];  // 4
  const int buf_size_w_div8 = txfm_size_col >> 3;   // 2 column groups of 8

  // Row pass uses the 4-wide 1D kernels, column pass the 8-wide ones.
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int row_one_loop = 8;
  load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
  if (row_txfm == iidentity16_ssse3) {
    // Identity row transform special case: identity16 scales by 2*sqrt2,
    // fused here with the shift[0] rounding into a single madd + shift.
    // NOTE(review): the rounding constant 3 << (NewSqrt2Bits - 1) appears to
    // combine both rounding stages under the (NewSqrt2Bits + 1)-bit shift --
    // confirm against the scalar reference before changing this path.
    const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1));
    const __m128i ones = _mm_set1_epi16(1);
    for (int j = 0; j < 16; ++j) {
      // Interleave with 1s so _mm_madd_epi16 computes x*scale + round.
      const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones);
      const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones);
      const __m128i buf_32_lo =
          _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
      const __m128i buf_32_hi =
          _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
      // Saturating pack back to 16-bit coefficients.
      buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
    }
  } else {
    row_txfm(buf, buf);
    round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
  }
  // Transpose each 4x8 half for the column pass; mirror the full 16-wide
  // rows first when the transform type requires a left-right flip.
  if (lr_flip) {
    __m128i temp[16];
    flip_buf_sse2(buf, temp, 16);
    transpose_16bit_4x8(temp, buf);
    transpose_16bit_4x8(temp + 8, buf + 8);
  } else {
    transpose_16bit_4x8(buf, buf);
    transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
  }
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf + i * row_one_loop, buf + i * row_one_loop);
    round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
  }
  // Write the two 8-pixel-wide halves; ud_flip reverses row order.
  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
  lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
}
   2862 
   2863 void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
   2864                                    int stride, TX_TYPE tx_type,
   2865                                    TX_SIZE tx_size, int eob) {
   2866  switch (tx_size) {
   2867    case TX_4X4:
   2868      lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size,
   2869                                     eob);
   2870      break;
   2871    case TX_4X8:
   2872      lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size,
   2873                                     eob);
   2874      break;
   2875    case TX_8X4:
   2876      lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size,
   2877                                     eob);
   2878      break;
   2879    case TX_4X16:
   2880      lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size,
   2881                                      eob);
   2882      break;
   2883    case TX_16X4:
   2884      lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size,
   2885                                      eob);
   2886      break;
   2887    default:
   2888      lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type,
   2889                                          tx_size, eob);
   2890      break;
   2891  }
   2892 }
   2893 
   2894 void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
   2895                            const TxfmParam *txfm_param) {
   2896  if (!txfm_param->lossless) {
   2897    const TX_TYPE tx_type = txfm_param->tx_type;
   2898    av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
   2899                                   txfm_param->tx_size, txfm_param->eob);
   2900 
   2901  } else {
   2902    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
   2903  }
   2904 }