tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_fwd_txfm_sse4.c (95993B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 #include <assert.h>
     12 #include <smmintrin.h> /* SSE4.1 */
     13 
     14 #include "aom_dsp/txfm_common.h"
     15 #include "aom_dsp/x86/transpose_sse2.h"
     16 #include "aom_dsp/x86/txfm_common_sse2.h"
     17 #include "aom_ports/mem.h"
     18 #include "av1/common/av1_txfm.h"
     19 #include "av1/common/x86/highbd_txfm_utility_sse4.h"
     20 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
     21 #include "av1/encoder/x86/av1_txfm1d_sse4.h"
     22 #include "config/aom_config.h"
     23 #include "config/av1_rtcd.h"
     24 
     25 static inline void store_output_w4(int32_t *const out, const __m128i *const in,
     26                                   const int stride, const int out_size) {
     27  for (int i = 0; i < out_size; ++i) {
     28    _mm_store_si128((__m128i *)(out + i * stride), in[i]);
     29  }
     30 }
     31 
     32 void av1_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output, int stride) {
     33  __m128i in[4];
     34  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
     35  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
     36  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
     37  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
     38 
     39  // Convert to int32_t.
     40  __m128i op[4];
     41  op[0] = _mm_cvtepi16_epi32(in[0]);
     42  op[1] = _mm_cvtepi16_epi32(in[1]);
     43  op[2] = _mm_cvtepi16_epi32(in[2]);
     44  op[3] = _mm_cvtepi16_epi32(in[3]);
     45 
     46  for (int i = 0; i < 2; ++i) {
     47    __m128i a1 = op[0];
     48    __m128i b1 = op[1];
     49    __m128i c1 = op[2];
     50    __m128i d1 = op[3];
     51    __m128i e1;
     52 
     53    a1 = _mm_add_epi32(a1, b1);  // a1 += b1
     54    d1 = _mm_sub_epi32(d1, c1);  // d1 = d1 - c1
     55    e1 = _mm_sub_epi32(a1, d1);  // e1 = (a1 - d1) >> 1
     56    e1 = _mm_srai_epi32(e1, 1);
     57    b1 = _mm_sub_epi32(e1, b1);  // b1 = e1 - b1
     58    c1 = _mm_sub_epi32(e1, c1);  // c1 = e1 - c1
     59    a1 = _mm_sub_epi32(a1, c1);  // a1 -= c1
     60    d1 = _mm_add_epi32(d1, b1);  // d1 += b1
     61 
     62    op[0] = a1;
     63    op[1] = c1;
     64    op[2] = d1;
     65    op[3] = b1;
     66 
     67    if (i == 0) {
     68      transpose_32bit_4x4(op, op);
     69    }
     70  }
     71 
     72  op[0] = _mm_slli_epi32(op[0], UNIT_QUANT_SHIFT);
     73  op[1] = _mm_slli_epi32(op[1], UNIT_QUANT_SHIFT);
     74  op[2] = _mm_slli_epi32(op[2], UNIT_QUANT_SHIFT);
     75  op[3] = _mm_slli_epi32(op[3], UNIT_QUANT_SHIFT);
     76 
     77  _mm_storeu_si128((__m128i *)(output + 0), op[0]);
     78  _mm_storeu_si128((__m128i *)(output + 4), op[1]);
     79  _mm_storeu_si128((__m128i *)(output + 8), op[2]);
     80  _mm_storeu_si128((__m128i *)(output + 12), op[3]);
     81 }
     82 
     83 static inline void load_buffer_4x4(const int16_t *input, __m128i *in,
     84                                   int stride, int flipud, int fliplr,
     85                                   int shift) {
     86  if (!flipud) {
     87    in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
     88    in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
     89    in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
     90    in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
     91  } else {
     92    in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
     93    in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
     94    in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
     95    in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
     96  }
     97 
     98  if (fliplr) {
     99    in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
    100    in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
    101    in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
    102    in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
    103  }
    104 
    105  in[0] = _mm_cvtepi16_epi32(in[0]);
    106  in[1] = _mm_cvtepi16_epi32(in[1]);
    107  in[2] = _mm_cvtepi16_epi32(in[2]);
    108  in[3] = _mm_cvtepi16_epi32(in[3]);
    109 
    110  in[0] = _mm_slli_epi32(in[0], shift);
    111  in[1] = _mm_slli_epi32(in[1], shift);
    112  in[2] = _mm_slli_epi32(in[2], shift);
    113  in[3] = _mm_slli_epi32(in[3], shift);
    114 }
    115 
    116 // We only use stage-2 bit;
    117 // shift[0] is used in load_buffer_4x4()
    118 // shift[1] is used in txfm_func_col()
    119 // shift[2] is used in txfm_func_row()
// Forward 4-point DCT over four packed-int32 rows.
// `bit` is the cosine-table precision: products are rounded by
// (1 << (bit - 1)) and shifted right by `bit`.  `num_col` is the distance,
// in __m128i units, between consecutive rows, so wider buffers can be
// transformed one 4-wide column strip at a time.  All reads of `in` occur
// before the first write to `out`, so in == out aliasing is safe (callers
// pass the same array for both).
static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int bit,
                          const int num_col) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  __m128i s0, s1, s2, s3;
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  // Stage 1: butterflies pairing mirrored rows (0,3) and (1,2).
  int endidx = 3 * num_col;
  s0 = _mm_add_epi32(in[0], in[endidx]);
  s3 = _mm_sub_epi32(in[0], in[endidx]);
  endidx -= num_col;
  s1 = _mm_add_epi32(in[num_col], in[endidx]);
  s2 = _mm_sub_epi32(in[num_col], in[endidx]);

  // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
  u0 = _mm_mullo_epi32(s0, cospi32);
  u1 = _mm_mullo_epi32(s1, cospi32);
  u2 = _mm_add_epi32(u0, u1);
  v0 = _mm_sub_epi32(u0, u1);

  // Round and narrow back to `bit`-precision.
  u3 = _mm_add_epi32(u2, rnding);
  v1 = _mm_add_epi32(v0, rnding);

  u0 = _mm_srai_epi32(u3, bit);
  u2 = _mm_srai_epi32(v1, bit);

  // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
  v0 = _mm_mullo_epi32(s2, cospi48);
  v1 = _mm_mullo_epi32(s3, cospi16);
  v2 = _mm_add_epi32(v0, v1);

  v3 = _mm_add_epi32(v2, rnding);
  u1 = _mm_srai_epi32(v3, bit);

  v0 = _mm_mullo_epi32(s2, cospi16);
  v1 = _mm_mullo_epi32(s3, cospi48);
  v2 = _mm_sub_epi32(v1, v0);

  v3 = _mm_add_epi32(v2, rnding);
  u3 = _mm_srai_epi32(v3, bit);

  // Note: shift[1] and shift[2] are zeros

  out[0] = u0;
  out[1] = u1;
  out[2] = u2;
  out[3] = u3;
}
    172 
    173 static inline void write_buffer_4x4(__m128i *res, int32_t *output) {
    174  _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
    175  _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
    176  _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
    177  _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
    178 }
    179 
// Forward 4-point ADST using the sinpi[1..4] constant table.
// `bit` sets the fixed-point precision (each output is rounded by
// (1 << (bit - 1)) then shifted right by `bit`).  `num_col` is the
// register distance between consecutive rows.  All inputs are read before
// out[] is written, so in == out aliasing is safe.
static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int bit,
                           const int num_col) {
  const int32_t *sinpi = sinpi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
  __m128i t;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i x0, x1, x2, x3;
  __m128i u0, u1, u2, u3;

  // Products of each input row with the sinpi constants it contributes to.
  int idx = 0 * num_col;
  s0 = _mm_mullo_epi32(in[idx], sinpi1);
  s1 = _mm_mullo_epi32(in[idx], sinpi4);
  t = _mm_add_epi32(in[idx], in[idx + num_col]);  // in0 + in1
  idx += num_col;
  s2 = _mm_mullo_epi32(in[idx], sinpi2);
  s3 = _mm_mullo_epi32(in[idx], sinpi1);
  idx += num_col;
  s4 = _mm_mullo_epi32(in[idx], sinpi3);
  idx += num_col;
  s5 = _mm_mullo_epi32(in[idx], sinpi4);
  s6 = _mm_mullo_epi32(in[idx], sinpi2);
  s7 = _mm_sub_epi32(t, in[idx]);  // in0 + in1 - in3

  // Combine partial products into the four ADST basis projections.
  t = _mm_add_epi32(s0, s2);
  x0 = _mm_add_epi32(t, s5);
  x1 = _mm_mullo_epi32(s7, sinpi3);
  t = _mm_sub_epi32(s1, s3);
  x2 = _mm_add_epi32(t, s6);
  x3 = s4;

  s0 = _mm_add_epi32(x0, x3);
  s1 = x1;
  s2 = _mm_sub_epi32(x2, x3);
  t = _mm_sub_epi32(x2, x0);
  s3 = _mm_add_epi32(t, x3);

  // Round and shift each result back to `bit`-precision.
  u0 = _mm_add_epi32(s0, rnding);
  u0 = _mm_srai_epi32(u0, bit);

  u1 = _mm_add_epi32(s1, rnding);
  u1 = _mm_srai_epi32(u1, bit);

  u2 = _mm_add_epi32(s2, rnding);
  u2 = _mm_srai_epi32(u2, bit);

  u3 = _mm_add_epi32(s3, rnding);
  u3 = _mm_srai_epi32(u3, bit);

  out[0] = u0;
  out[1] = u1;
  out[2] = u2;
  out[3] = u3;
}
    237 static void idtx4x4_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
    238  (void)bit;
    239  __m128i fact = _mm_set1_epi32(NewSqrt2);
    240  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
    241  __m128i a_low;
    242 
    243  for (int i = 0; i < 4; i++) {
    244    a_low = _mm_mullo_epi32(in[i * col_num], fact);
    245    a_low = _mm_add_epi32(a_low, offset);
    246    out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits);
    247  }
    248 }
// 4x4 forward 2-D transform for high bit-depth input (SSE4.1 path).
// Every tx_type follows the same five-step pattern:
//   1. load (+ optional vertical/horizontal flip) with pre-shift shift[0],
//   2. column transform, 3. transpose, 4. row transform, 5. store.
// The FLIPADST variants reuse fadst4x4 on flipped input.  `bd` is unused:
// the 32-bit intermediate path is the same for all bit depths.
void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
                              int input_stride, TX_TYPE tx_type, int bd) {
  __m128i in[4];
  // shift[0] is consumed by load_buffer_4x4(); per the note in
  // fdct4x4_sse4_1(), shift[1] and shift[2] are zero for TX_4X4.
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
  const int txw_idx = get_txw_idx(TX_4X4);
  const int txh_idx = get_txh_idx(TX_4X4);

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      transpose_32bit_4x4(in, in);
      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case ADST_DCT:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      transpose_32bit_4x4(in, in);
      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case DCT_ADST:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      transpose_32bit_4x4(in, in);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case ADST_ADST:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      transpose_32bit_4x4(in, in);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case FLIPADST_DCT:
      load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      transpose_32bit_4x4(in, in);
      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case DCT_FLIPADST:
      load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      transpose_32bit_4x4(in, in);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      transpose_32bit_4x4(in, in);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case ADST_FLIPADST:
      load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      transpose_32bit_4x4(in, in);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case FLIPADST_ADST:
      load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      transpose_32bit_4x4(in, in);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case IDTX:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      transpose_32bit_4x4(in, in);
      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case V_DCT:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      transpose_32bit_4x4(in, in);
      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case H_DCT:
      // H_* cases run the identity stage first so the real transform acts
      // on the horizontal direction after the transpose.
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      transpose_32bit_4x4(in, in);
      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case V_ADST:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      transpose_32bit_4x4(in, in);
      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case H_ADST:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      transpose_32bit_4x4(in, in);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case V_FLIPADST:
      // NOTE(review): this case and H_FLIPADST pass av1_fwd_cos_bit_row to
      // the non-identity stage, unlike the other V_*/H_* cases which use
      // the col table there; presumably both tables hold the same value
      // for TX_4X4 — confirm against av1_fwd_cos_bit_col/row definitions.
      load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      transpose_32bit_4x4(in, in);
      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case H_FLIPADST:
      load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      transpose_32bit_4x4(in, in);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    default: assert(0);
  }
  (void)bd;
}
    373 
// Load an 8x8 block of int16 samples, optionally flipping vertically
// (flipud) and/or horizontally (fliplr), widen to int32 and pre-shift left
// by `shift`.  On return in[] holds 16 registers: row r of the (possibly
// flipped) block occupies in[2*r] (columns 0-3) and in[2*r+1] (columns 4-7).
static inline void load_buffer_8x8(const int16_t *input, __m128i *in,
                                  int stride, int flipud, int fliplr,
                                  int shift) {
  __m128i u;
  if (!flipud) {
    in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
    in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
    in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
    in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
    in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
    in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
    in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
    in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
  } else {
    // Vertical flip: read source rows bottom-to-top.
    in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
    in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
    in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
    in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
    in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
    in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
    in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
    in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
  }

  if (fliplr) {
    // Horizontal flip: reverse the eight 16-bit lanes of each row.
    in[0] = mm_reverse_epi16(in[0]);
    in[1] = mm_reverse_epi16(in[1]);
    in[2] = mm_reverse_epi16(in[2]);
    in[3] = mm_reverse_epi16(in[3]);
    in[4] = mm_reverse_epi16(in[4]);
    in[5] = mm_reverse_epi16(in[5]);
    in[6] = mm_reverse_epi16(in[6]);
    in[7] = mm_reverse_epi16(in[7]);
  }

  // Widen in place, 16-bit -> 32-bit (two registers per source row).
  // Rows 4..7 are expanded first into in[8..15], then rows 3..0 in
  // descending order, so each 16-bit source register is consumed before
  // its int32 destination slots overwrite it.
  u = _mm_unpackhi_epi64(in[4], in[4]);
  in[8] = _mm_cvtepi16_epi32(in[4]);
  in[9] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[5], in[5]);
  in[10] = _mm_cvtepi16_epi32(in[5]);
  in[11] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[6], in[6]);
  in[12] = _mm_cvtepi16_epi32(in[6]);
  in[13] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[7], in[7]);
  in[14] = _mm_cvtepi16_epi32(in[7]);
  in[15] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[3], in[3]);
  in[6] = _mm_cvtepi16_epi32(in[3]);
  in[7] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[2], in[2]);
  in[4] = _mm_cvtepi16_epi32(in[2]);
  in[5] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[1], in[1]);
  in[2] = _mm_cvtepi16_epi32(in[1]);
  in[3] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[0], in[0]);
  in[0] = _mm_cvtepi16_epi32(in[0]);
  in[1] = _mm_cvtepi16_epi32(u);

  // Apply the pre-transform upshift to all 16 widened registers.
  in[0] = _mm_slli_epi32(in[0], shift);
  in[1] = _mm_slli_epi32(in[1], shift);
  in[2] = _mm_slli_epi32(in[2], shift);
  in[3] = _mm_slli_epi32(in[3], shift);
  in[4] = _mm_slli_epi32(in[4], shift);
  in[5] = _mm_slli_epi32(in[5], shift);
  in[6] = _mm_slli_epi32(in[6], shift);
  in[7] = _mm_slli_epi32(in[7], shift);

  in[8] = _mm_slli_epi32(in[8], shift);
  in[9] = _mm_slli_epi32(in[9], shift);
  in[10] = _mm_slli_epi32(in[10], shift);
  in[11] = _mm_slli_epi32(in[11], shift);
  in[12] = _mm_slli_epi32(in[12], shift);
  in[13] = _mm_slli_epi32(in[13], shift);
  in[14] = _mm_slli_epi32(in[14], shift);
  in[15] = _mm_slli_epi32(in[15], shift);
}
    459 
    460 static inline void col_txfm_8x8_rounding(__m128i *in, int shift) {
    461  const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
    462 
    463  in[0] = _mm_add_epi32(in[0], rounding);
    464  in[1] = _mm_add_epi32(in[1], rounding);
    465  in[2] = _mm_add_epi32(in[2], rounding);
    466  in[3] = _mm_add_epi32(in[3], rounding);
    467  in[4] = _mm_add_epi32(in[4], rounding);
    468  in[5] = _mm_add_epi32(in[5], rounding);
    469  in[6] = _mm_add_epi32(in[6], rounding);
    470  in[7] = _mm_add_epi32(in[7], rounding);
    471  in[8] = _mm_add_epi32(in[8], rounding);
    472  in[9] = _mm_add_epi32(in[9], rounding);
    473  in[10] = _mm_add_epi32(in[10], rounding);
    474  in[11] = _mm_add_epi32(in[11], rounding);
    475  in[12] = _mm_add_epi32(in[12], rounding);
    476  in[13] = _mm_add_epi32(in[13], rounding);
    477  in[14] = _mm_add_epi32(in[14], rounding);
    478  in[15] = _mm_add_epi32(in[15], rounding);
    479 
    480  in[0] = _mm_srai_epi32(in[0], shift);
    481  in[1] = _mm_srai_epi32(in[1], shift);
    482  in[2] = _mm_srai_epi32(in[2], shift);
    483  in[3] = _mm_srai_epi32(in[3], shift);
    484  in[4] = _mm_srai_epi32(in[4], shift);
    485  in[5] = _mm_srai_epi32(in[5], shift);
    486  in[6] = _mm_srai_epi32(in[6], shift);
    487  in[7] = _mm_srai_epi32(in[7], shift);
    488  in[8] = _mm_srai_epi32(in[8], shift);
    489  in[9] = _mm_srai_epi32(in[9], shift);
    490  in[10] = _mm_srai_epi32(in[10], shift);
    491  in[11] = _mm_srai_epi32(in[11], shift);
    492  in[12] = _mm_srai_epi32(in[12], shift);
    493  in[13] = _mm_srai_epi32(in[13], shift);
    494  in[14] = _mm_srai_epi32(in[14], shift);
    495  in[15] = _mm_srai_epi32(in[15], shift);
    496 }
    497 
    498 static inline void col_txfm_4x8_rounding(__m128i *in, int shift) {
    499  const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
    500 
    501  in[0] = _mm_add_epi32(in[0], rounding);
    502  in[1] = _mm_add_epi32(in[1], rounding);
    503  in[2] = _mm_add_epi32(in[2], rounding);
    504  in[3] = _mm_add_epi32(in[3], rounding);
    505  in[4] = _mm_add_epi32(in[4], rounding);
    506  in[5] = _mm_add_epi32(in[5], rounding);
    507  in[6] = _mm_add_epi32(in[6], rounding);
    508  in[7] = _mm_add_epi32(in[7], rounding);
    509 
    510  in[0] = _mm_srai_epi32(in[0], shift);
    511  in[1] = _mm_srai_epi32(in[1], shift);
    512  in[2] = _mm_srai_epi32(in[2], shift);
    513  in[3] = _mm_srai_epi32(in[3], shift);
    514  in[4] = _mm_srai_epi32(in[4], shift);
    515  in[5] = _mm_srai_epi32(in[5], shift);
    516  in[6] = _mm_srai_epi32(in[6], shift);
    517  in[7] = _mm_srai_epi32(in[7], shift);
    518 }
    519 
    520 static inline void write_buffer_8x8(const __m128i *res, int32_t *output) {
    521  _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
    522  _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
    523  _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
    524  _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
    525 
    526  _mm_store_si128((__m128i *)(output + 4 * 4), res[4]);
    527  _mm_store_si128((__m128i *)(output + 5 * 4), res[5]);
    528  _mm_store_si128((__m128i *)(output + 6 * 4), res[6]);
    529  _mm_store_si128((__m128i *)(output + 7 * 4), res[7]);
    530 
    531  _mm_store_si128((__m128i *)(output + 8 * 4), res[8]);
    532  _mm_store_si128((__m128i *)(output + 9 * 4), res[9]);
    533  _mm_store_si128((__m128i *)(output + 10 * 4), res[10]);
    534  _mm_store_si128((__m128i *)(output + 11 * 4), res[11]);
    535 
    536  _mm_store_si128((__m128i *)(output + 12 * 4), res[12]);
    537  _mm_store_si128((__m128i *)(output + 13 * 4), res[13]);
    538  _mm_store_si128((__m128i *)(output + 14 * 4), res[14]);
    539  _mm_store_si128((__m128i *)(output + 15 * 4), res[15]);
    540 }
    541 
    542 static inline void write_buffer_16x8(const __m128i *res, int32_t *output,
    543                                     const int stride) {
    544  _mm_storeu_si128((__m128i *)(output), res[0]);
    545  _mm_storeu_si128((__m128i *)(output + 4), res[1]);
    546  _mm_storeu_si128((__m128i *)(output + stride), res[2]);
    547  _mm_storeu_si128((__m128i *)(output + stride + 4), res[3]);
    548 
    549  _mm_storeu_si128((__m128i *)(output + (stride * 2)), res[4]);
    550  _mm_storeu_si128((__m128i *)(output + (stride * 2) + 4), res[5]);
    551  _mm_storeu_si128((__m128i *)(output + (stride * 3)), res[6]);
    552  _mm_storeu_si128((__m128i *)(output + (stride * 3) + 4), res[7]);
    553 
    554  _mm_storeu_si128((__m128i *)(output + (stride * 4)), res[8]);
    555  _mm_storeu_si128((__m128i *)(output + (stride * 4) + 4), res[9]);
    556  _mm_storeu_si128((__m128i *)(output + (stride * 5)), res[10]);
    557  _mm_storeu_si128((__m128i *)(output + (stride * 5) + 4), res[11]);
    558 
    559  _mm_storeu_si128((__m128i *)(output + (stride * 6)), res[12]);
    560  _mm_storeu_si128((__m128i *)(output + (stride * 6) + 4), res[13]);
    561  _mm_storeu_si128((__m128i *)(output + (stride * 7)), res[14]);
    562  _mm_storeu_si128((__m128i *)(output + (stride * 7) + 4), res[15]);
    563 }
    564 
// Forward 8-point DCT over packed-int32 rows spaced `col_num` registers
// apart in both `in` and `out`.  All reads of `in` complete before the
// first write to `out`, so in == out aliasing is safe.  Outputs are
// scattered in bit-reversed order relative to the reference buffer:
// out[k * col_num] = buf0[bitrev3(k)] (tracked by the buf0[..] notes).
static void fdct4x8_sse4_1(__m128i *in, __m128i *out, int bit,
                          const int col_num) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  __m128i u[8], v[8];

  int startidx = 0 * col_num;
  int endidx = 7 * col_num;
  // Even 8 points 0, 2, ..., 14
  // stage 0
  // stage 1: butterflies pairing mirrored rows (0,7), (1,6), (2,5), (3,4).
  u[0] = _mm_add_epi32(in[startidx], in[endidx]);
  v[7] = _mm_sub_epi32(in[startidx], in[endidx]);  // v[7]
  startidx += col_num;
  endidx -= col_num;
  u[1] = _mm_add_epi32(in[startidx], in[endidx]);
  u[6] = _mm_sub_epi32(in[startidx], in[endidx]);
  startidx += col_num;
  endidx -= col_num;
  u[2] = _mm_add_epi32(in[startidx], in[endidx]);
  u[5] = _mm_sub_epi32(in[startidx], in[endidx]);
  startidx += col_num;
  endidx -= col_num;
  u[3] = _mm_add_epi32(in[startidx], in[endidx]);
  v[4] = _mm_sub_epi32(in[startidx], in[endidx]);  // v[4]

  // stage 2: even half butterflies plus the rotated (u[5], u[6]) pair.
  v[0] = _mm_add_epi32(u[0], u[3]);
  v[3] = _mm_sub_epi32(u[0], u[3]);
  v[1] = _mm_add_epi32(u[1], u[2]);
  v[2] = _mm_sub_epi32(u[1], u[2]);

  v[5] = _mm_mullo_epi32(u[5], cospim32);
  v[6] = _mm_mullo_epi32(u[6], cospi32);
  v[5] = _mm_add_epi32(v[5], v[6]);
  v[5] = _mm_add_epi32(v[5], rnding);
  v[5] = _mm_srai_epi32(v[5], bit);

  u[0] = _mm_mullo_epi32(u[5], cospi32);
  v[6] = _mm_mullo_epi32(u[6], cospim32);
  v[6] = _mm_sub_epi32(u[0], v[6]);
  v[6] = _mm_add_epi32(v[6], rnding);
  v[6] = _mm_srai_epi32(v[6], bit);

  // stage 3
  // type 0
  v[0] = _mm_mullo_epi32(v[0], cospi32);
  v[1] = _mm_mullo_epi32(v[1], cospi32);
  u[0] = _mm_add_epi32(v[0], v[1]);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_sub_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // type 1
  v[0] = _mm_mullo_epi32(v[2], cospi48);
  v[1] = _mm_mullo_epi32(v[3], cospi16);
  u[2] = _mm_add_epi32(v[0], v[1]);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  v[0] = _mm_mullo_epi32(v[2], cospi16);
  v[1] = _mm_mullo_epi32(v[3], cospi48);
  u[3] = _mm_sub_epi32(v[1], v[0]);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  u[4] = _mm_add_epi32(v[4], v[5]);
  u[5] = _mm_sub_epi32(v[4], v[5]);
  u[6] = _mm_sub_epi32(v[7], v[6]);
  u[7] = _mm_add_epi32(v[7], v[6]);

  // stage 4
  // stage 5: odd-half rotations, results rounded and written directly out.
  v[0] = _mm_mullo_epi32(u[4], cospi56);
  v[1] = _mm_mullo_epi32(u[7], cospi8);
  v[0] = _mm_add_epi32(v[0], v[1]);
  v[0] = _mm_add_epi32(v[0], rnding);
  out[1 * col_num] = _mm_srai_epi32(v[0], bit);  // buf0[4]

  v[0] = _mm_mullo_epi32(u[4], cospi8);
  v[1] = _mm_mullo_epi32(u[7], cospi56);
  v[0] = _mm_sub_epi32(v[1], v[0]);
  v[0] = _mm_add_epi32(v[0], rnding);
  out[7 * col_num] = _mm_srai_epi32(v[0], bit);  // buf0[7]

  v[0] = _mm_mullo_epi32(u[5], cospi24);
  v[1] = _mm_mullo_epi32(u[6], cospi40);
  v[0] = _mm_add_epi32(v[0], v[1]);
  v[0] = _mm_add_epi32(v[0], rnding);
  out[5 * col_num] = _mm_srai_epi32(v[0], bit);  // buf0[5]

  v[0] = _mm_mullo_epi32(u[5], cospi40);
  v[1] = _mm_mullo_epi32(u[6], cospi24);
  v[0] = _mm_sub_epi32(v[1], v[0]);
  v[0] = _mm_add_epi32(v[0], rnding);
  out[3 * col_num] = _mm_srai_epi32(v[0], bit);  // buf0[6]

  out[0 * col_num] = u[0];  // buf0[0]
  out[4 * col_num] = u[1];  // buf0[1]
  out[2 * col_num] = u[2];  // buf0[2]
  out[6 * col_num] = u[3];  // buf0[3]
}
    678 
    679 static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit,
    680                           const int col_num) {
    681  fdct4x8_sse4_1(in, out, bit, col_num);
    682  fdct4x8_sse4_1(in + 1, out + 1, bit, col_num);
    683 }
    684 
// Forward 8-point ADST (asymmetric DST) over 4-lane columns of 32-bit
// coefficients, used by the high-bitdepth forward transform paths.
//  in/out  : __m128i arrays laid out as in[row * col_num + col]; each
//            __m128i carries 4 independent columns of one row.
//  bit     : cos-bit precision of the cospi table; every butterfly
//            multiply is rounded with ((x + (1 << (bit - 1))) >> bit).
//  col_num : number of 4-lane column groups processed per call (2 for a
//            full 8x8 block).
static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit,
                           const int col_num) {
 const int32_t *cospi = cospi_arr(bit);
 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
 const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
 const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 const __m128i zero = _mm_setzero_si128();
 __m128i u0, u1, u2, u3, u4, u5, u6, u7;
 __m128i v0, v1, v2, v3, v4, v5, v6, v7;
 __m128i x, y;
 int col;

 // Note:
 //  Even column: 0, 2, ..., 14
 //  Odd column: 1, 3, ..., 15
 //  one even column plus one odd column constructs one row (8 coeffs)
 //  total we have 8 rows (8x8).
 for (col = 0; col < col_num; ++col) {
   // stage 0
   // stage 1: permute the inputs and fold in the ADST sign flips
   // (negations via subtraction from zero).
   u0 = in[col_num * 0 + col];
   u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]);
   u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]);
   u3 = in[col_num * 4 + col];
   u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]);
   u5 = in[col_num * 6 + col];
   u6 = in[col_num * 2 + col];
   u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]);

   // stage 2: rotate the pairs (u2, u3) and (u6, u7) by pi/4 (cospi32),
   // each product rounded and shifted back to `bit` precision.
   v0 = u0;
   v1 = u1;

   x = _mm_mullo_epi32(u2, cospi32);
   y = _mm_mullo_epi32(u3, cospi32);
   v2 = _mm_add_epi32(x, y);
   v2 = _mm_add_epi32(v2, rnding);
   v2 = _mm_srai_epi32(v2, bit);

   v3 = _mm_sub_epi32(x, y);
   v3 = _mm_add_epi32(v3, rnding);
   v3 = _mm_srai_epi32(v3, bit);

   v4 = u4;
   v5 = u5;

   x = _mm_mullo_epi32(u6, cospi32);
   y = _mm_mullo_epi32(u7, cospi32);
   v6 = _mm_add_epi32(x, y);
   v6 = _mm_add_epi32(v6, rnding);
   v6 = _mm_srai_epi32(v6, bit);

   v7 = _mm_sub_epi32(x, y);
   v7 = _mm_add_epi32(v7, rnding);
   v7 = _mm_srai_epi32(v7, bit);

   // stage 3: butterflies (add/sub) within each half.
   u0 = _mm_add_epi32(v0, v2);
   u1 = _mm_add_epi32(v1, v3);
   u2 = _mm_sub_epi32(v0, v2);
   u3 = _mm_sub_epi32(v1, v3);
   u4 = _mm_add_epi32(v4, v6);
   u5 = _mm_add_epi32(v5, v7);
   u6 = _mm_sub_epi32(v4, v6);
   u7 = _mm_sub_epi32(v5, v7);

   // stage 4: +-16/48 rotations on the upper half.
   v0 = u0;
   v1 = u1;
   v2 = u2;
   v3 = u3;

   x = _mm_mullo_epi32(u4, cospi16);
   y = _mm_mullo_epi32(u5, cospi48);
   v4 = _mm_add_epi32(x, y);
   v4 = _mm_add_epi32(v4, rnding);
   v4 = _mm_srai_epi32(v4, bit);

   x = _mm_mullo_epi32(u4, cospi48);
   y = _mm_mullo_epi32(u5, cospim16);
   v5 = _mm_add_epi32(x, y);
   v5 = _mm_add_epi32(v5, rnding);
   v5 = _mm_srai_epi32(v5, bit);

   x = _mm_mullo_epi32(u6, cospim48);
   y = _mm_mullo_epi32(u7, cospi16);
   v6 = _mm_add_epi32(x, y);
   v6 = _mm_add_epi32(v6, rnding);
   v6 = _mm_srai_epi32(v6, bit);

   x = _mm_mullo_epi32(u6, cospi16);
   y = _mm_mullo_epi32(u7, cospi48);
   v7 = _mm_add_epi32(x, y);
   v7 = _mm_add_epi32(v7, rnding);
   v7 = _mm_srai_epi32(v7, bit);

   // stage 5: cross butterflies between the two halves.
   u0 = _mm_add_epi32(v0, v4);
   u1 = _mm_add_epi32(v1, v5);
   u2 = _mm_add_epi32(v2, v6);
   u3 = _mm_add_epi32(v3, v7);
   u4 = _mm_sub_epi32(v0, v4);
   u5 = _mm_sub_epi32(v1, v5);
   u6 = _mm_sub_epi32(v2, v6);
   u7 = _mm_sub_epi32(v3, v7);

   // stage 6: final odd-angle rotations producing the coefficients.
   x = _mm_mullo_epi32(u0, cospi4);
   y = _mm_mullo_epi32(u1, cospi60);
   v0 = _mm_add_epi32(x, y);
   v0 = _mm_add_epi32(v0, rnding);
   v0 = _mm_srai_epi32(v0, bit);

   x = _mm_mullo_epi32(u0, cospi60);
   y = _mm_mullo_epi32(u1, cospim4);
   v1 = _mm_add_epi32(x, y);
   v1 = _mm_add_epi32(v1, rnding);
   v1 = _mm_srai_epi32(v1, bit);

   x = _mm_mullo_epi32(u2, cospi20);
   y = _mm_mullo_epi32(u3, cospi44);
   v2 = _mm_add_epi32(x, y);
   v2 = _mm_add_epi32(v2, rnding);
   v2 = _mm_srai_epi32(v2, bit);

   x = _mm_mullo_epi32(u2, cospi44);
   y = _mm_mullo_epi32(u3, cospim20);
   v3 = _mm_add_epi32(x, y);
   v3 = _mm_add_epi32(v3, rnding);
   v3 = _mm_srai_epi32(v3, bit);

   x = _mm_mullo_epi32(u4, cospi36);
   y = _mm_mullo_epi32(u5, cospi28);
   v4 = _mm_add_epi32(x, y);
   v4 = _mm_add_epi32(v4, rnding);
   v4 = _mm_srai_epi32(v4, bit);

   x = _mm_mullo_epi32(u4, cospi28);
   y = _mm_mullo_epi32(u5, cospim36);
   v5 = _mm_add_epi32(x, y);
   v5 = _mm_add_epi32(v5, rnding);
   v5 = _mm_srai_epi32(v5, bit);

   x = _mm_mullo_epi32(u6, cospi52);
   y = _mm_mullo_epi32(u7, cospi12);
   v6 = _mm_add_epi32(x, y);
   v6 = _mm_add_epi32(v6, rnding);
   v6 = _mm_srai_epi32(v6, bit);

   x = _mm_mullo_epi32(u6, cospi12);
   y = _mm_mullo_epi32(u7, cospim52);
   v7 = _mm_add_epi32(x, y);
   v7 = _mm_add_epi32(v7, rnding);
   v7 = _mm_srai_epi32(v7, bit);

   // stage 7: reorder the butterfly outputs into the ADST coefficient
   // output order.
   out[col_num * 0 + col] = v1;
   out[col_num * 1 + col] = v6;
   out[col_num * 2 + col] = v3;
   out[col_num * 3 + col] = v4;
   out[col_num * 4 + col] = v5;
   out[col_num * 5 + col] = v2;
   out[col_num * 6 + col] = v7;
   out[col_num * 7 + col] = v0;
 }
}
    866 static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
    867  (void)bit;
    868 
    869  for (int i = 0; i < col_num; i += 1) {
    870    out[0 + 8 * i] = _mm_add_epi32(in[0 + 8 * i], in[0 + 8 * i]);
    871    out[1 + 8 * i] = _mm_add_epi32(in[1 + 8 * i], in[1 + 8 * i]);
    872    out[2 + 8 * i] = _mm_add_epi32(in[2 + 8 * i], in[2 + 8 * i]);
    873    out[3 + 8 * i] = _mm_add_epi32(in[3 + 8 * i], in[3 + 8 * i]);
    874    out[4 + 8 * i] = _mm_add_epi32(in[4 + 8 * i], in[4 + 8 * i]);
    875    out[5 + 8 * i] = _mm_add_epi32(in[5 + 8 * i], in[5 + 8 * i]);
    876    out[6 + 8 * i] = _mm_add_epi32(in[6 + 8 * i], in[6 + 8 * i]);
    877    out[7 + 8 * i] = _mm_add_epi32(in[7 + 8 * i], in[7 + 8 * i]);
    878  }
    879 }
    880 #if !CONFIG_REALTIME_ONLY
    881 static void idtx32x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
    882  (void)bit;
    883  (void)col_num;
    884  for (int j = 0; j < 2; j++) {
    885    out[j + 8 * 0] = _mm_add_epi32(in[j + 8 * 0], in[j + 8 * 0]);
    886    out[j + 8 * 1] = _mm_add_epi32(in[j + 8 * 1], in[j + 8 * 1]);
    887    out[j + 8 * 2] = _mm_add_epi32(in[j + 8 * 2], in[j + 8 * 2]);
    888    out[j + 8 * 3] = _mm_add_epi32(in[j + 8 * 3], in[j + 8 * 3]);
    889    out[j + 8 * 4] = _mm_add_epi32(in[j + 8 * 4], in[j + 8 * 4]);
    890    out[j + 8 * 5] = _mm_add_epi32(in[j + 8 * 5], in[j + 8 * 5]);
    891    out[j + 8 * 6] = _mm_add_epi32(in[j + 8 * 6], in[j + 8 * 6]);
    892    out[j + 8 * 7] = _mm_add_epi32(in[j + 8 * 7], in[j + 8 * 7]);
    893  }
    894 }
    895 #endif
    896 void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
    897                               TX_TYPE tx_type, int bd) {
    898  __m128i in[16], out[16];
    899  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
    900  const int txw_idx = get_txw_idx(TX_8X8);
    901  const int txh_idx = get_txh_idx(TX_8X8);
    902 
    903  switch (tx_type) {
    904    case DCT_DCT:
    905      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
    906      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
    907      col_txfm_8x8_rounding(out, -shift[1]);
    908      transpose_8x8(out, in);
    909      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
    910      write_buffer_8x8(out, coeff);
    911      break;
    912    case ADST_DCT:
    913      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
    914      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
    915      col_txfm_8x8_rounding(out, -shift[1]);
    916      transpose_8x8(out, in);
    917      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
    918      write_buffer_8x8(out, coeff);
    919      break;
    920    case DCT_ADST:
    921      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
    922      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
    923      col_txfm_8x8_rounding(out, -shift[1]);
    924      transpose_8x8(out, in);
    925      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
    926      write_buffer_8x8(out, coeff);
    927      break;
    928    case ADST_ADST:
    929      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
    930      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
    931      col_txfm_8x8_rounding(out, -shift[1]);
    932      transpose_8x8(out, in);
    933      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
    934      write_buffer_8x8(out, coeff);
    935      break;
    936    case FLIPADST_DCT:
    937      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
    938      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
    939      col_txfm_8x8_rounding(out, -shift[1]);
    940      transpose_8x8(out, in);
    941      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
    942      write_buffer_8x8(out, coeff);
    943      break;
    944    case DCT_FLIPADST:
    945      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
    946      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
    947      col_txfm_8x8_rounding(out, -shift[1]);
    948      transpose_8x8(out, in);
    949      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
    950      write_buffer_8x8(out, coeff);
    951      break;
    952    case FLIPADST_FLIPADST:
    953      load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
    954      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
    955      col_txfm_8x8_rounding(out, -shift[1]);
    956      transpose_8x8(out, in);
    957      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
    958      write_buffer_8x8(out, coeff);
    959      break;
    960    case ADST_FLIPADST:
    961      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
    962      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
    963      col_txfm_8x8_rounding(out, -shift[1]);
    964      transpose_8x8(out, in);
    965      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
    966      write_buffer_8x8(out, coeff);
    967      break;
    968    case FLIPADST_ADST:
    969      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
    970      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
    971      col_txfm_8x8_rounding(out, -shift[1]);
    972      transpose_8x8(out, in);
    973      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
    974      write_buffer_8x8(out, coeff);
    975      break;
    976    case IDTX:
    977      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
    978      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
    979      col_txfm_8x8_rounding(out, -shift[1]);
    980      transpose_8x8(out, in);
    981      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
    982      write_buffer_8x8(out, coeff);
    983      break;
    984    case V_DCT:
    985      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
    986      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
    987      col_txfm_8x8_rounding(out, -shift[1]);
    988      transpose_8x8(out, in);
    989      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
    990      write_buffer_8x8(out, coeff);
    991      break;
    992    case H_DCT:
    993      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
    994      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
    995      col_txfm_8x8_rounding(out, -shift[1]);
    996      transpose_8x8(out, in);
    997      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
    998      write_buffer_8x8(out, coeff);
    999      break;
   1000    case V_ADST:
   1001      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
   1002      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
   1003      col_txfm_8x8_rounding(out, -shift[1]);
   1004      transpose_8x8(out, in);
   1005      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
   1006      write_buffer_8x8(out, coeff);
   1007      break;
   1008    case H_ADST:
   1009      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
   1010      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
   1011      col_txfm_8x8_rounding(out, -shift[1]);
   1012      transpose_8x8(out, in);
   1013      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
   1014      write_buffer_8x8(out, coeff);
   1015      break;
   1016    case V_FLIPADST:
   1017      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
   1018      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
   1019      col_txfm_8x8_rounding(out, -shift[1]);
   1020      transpose_8x8(out, in);
   1021      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
   1022      write_buffer_8x8(out, coeff);
   1023      break;
   1024    case H_FLIPADST:
   1025      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
   1026      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
   1027      col_txfm_8x8_rounding(out, -shift[1]);
   1028      transpose_8x8(out, in);
   1029      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
   1030      write_buffer_8x8(out, coeff);
   1031      break;
   1032    default: assert(0);
   1033  }
   1034  (void)bd;
   1035 }
   1036 
   1037 // Hybrid Transform 16x16
   1038 
   1039 static inline void convert_8x8_to_16x16(const __m128i *in, __m128i *out) {
   1040  int row_index = 0;
   1041  int dst_index = 0;
   1042  int src_index = 0;
   1043 
   1044  // row 0, 1, .., 7
   1045  do {
   1046    out[dst_index] = in[src_index];
   1047    out[dst_index + 1] = in[src_index + 1];
   1048    out[dst_index + 2] = in[src_index + 16];
   1049    out[dst_index + 3] = in[src_index + 17];
   1050    dst_index += 4;
   1051    src_index += 2;
   1052    row_index += 1;
   1053  } while (row_index < 8);
   1054 
   1055  // row 8, 9, ..., 15
   1056  src_index += 16;
   1057  do {
   1058    out[dst_index] = in[src_index];
   1059    out[dst_index + 1] = in[src_index + 1];
   1060    out[dst_index + 2] = in[src_index + 16];
   1061    out[dst_index + 3] = in[src_index + 17];
   1062    dst_index += 4;
   1063    src_index += 2;
   1064    row_index += 1;
   1065  } while (row_index < 16);
   1066 }
   1067 
   1068 static inline void load_buffer_16x16(const int16_t *input, __m128i *out,
   1069                                     int stride, int flipud, int fliplr,
   1070                                     int shift) {
   1071  __m128i in[64];
   1072  // Load 4 8x8 blocks
   1073  const int16_t *topL = input;
   1074  const int16_t *topR = input + 8;
   1075  const int16_t *botL = input + 8 * stride;
   1076  const int16_t *botR = input + 8 * stride + 8;
   1077 
   1078  const int16_t *tmp;
   1079 
   1080  if (flipud) {
   1081    // Swap left columns
   1082    tmp = topL;
   1083    topL = botL;
   1084    botL = tmp;
   1085    // Swap right columns
   1086    tmp = topR;
   1087    topR = botR;
   1088    botR = tmp;
   1089  }
   1090 
   1091  if (fliplr) {
   1092    // Swap top rows
   1093    tmp = topL;
   1094    topL = topR;
   1095    topR = tmp;
   1096    // Swap bottom rows
   1097    tmp = botL;
   1098    botL = botR;
   1099    botR = tmp;
   1100  }
   1101 
   1102  // load first 8 columns
   1103  load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift);
   1104  load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift);
   1105 
   1106  // load second 8 columns
   1107  load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift);
   1108  load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift);
   1109 
   1110  convert_8x8_to_16x16(in, out);
   1111 }
   1112 
   1113 static inline void load_buffer_8x16(const int16_t *input, __m128i *out,
   1114                                    int stride, int flipud, int fliplr,
   1115                                    int shift) {
   1116  const int16_t *topL = input;
   1117  const int16_t *botL = input + 8 * stride;
   1118 
   1119  const int16_t *tmp;
   1120 
   1121  if (flipud) {
   1122    tmp = topL;
   1123    topL = botL;
   1124    botL = tmp;
   1125  }
   1126 
   1127  load_buffer_8x8(topL, out, stride, flipud, fliplr, shift);
   1128  load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift);
   1129 }
   1130 
   1131 static inline void load_buffer_8x4(const int16_t *input, __m128i *out,
   1132                                   int stride, int flipud, int fliplr,
   1133                                   int shift) {
   1134  const int16_t *topL = input;
   1135  const int16_t *topR = input + 4;
   1136 
   1137  const int16_t *tmp;
   1138 
   1139  if (fliplr) {
   1140    tmp = topL;
   1141    topL = topR;
   1142    topR = tmp;
   1143  }
   1144 
   1145  load_buffer_4x4(topL, out, stride, flipud, fliplr, shift);
   1146  load_buffer_4x4(topR, out + 4, stride, flipud, fliplr, shift);
   1147 }
   1148 
   1149 static inline void load_buffer_16x4(const int16_t *input, __m128i *out,
   1150                                    int stride, int flipud, int fliplr,
   1151                                    int shift) {
   1152  const int16_t *topL = input;
   1153  const int16_t *topR = input + 8;
   1154 
   1155  const int16_t *tmp;
   1156 
   1157  if (fliplr) {
   1158    tmp = topL;
   1159    topL = topR;
   1160    topR = tmp;
   1161  }
   1162 
   1163  load_buffer_8x4(topL, out, stride, flipud, fliplr, shift);
   1164  load_buffer_8x4(topR, out + 8, stride, flipud, fliplr, shift);
   1165 }
   1166 
   1167 static inline void load_buffer_4x8(const int16_t *input, __m128i *out,
   1168                                   int stride, int flipud, int fliplr,
   1169                                   int shift) {
   1170  const int16_t *topL = input;
   1171  const int16_t *botL = input + 4 * stride;
   1172 
   1173  const int16_t *tmp;
   1174 
   1175  if (flipud) {
   1176    tmp = topL;
   1177    topL = botL;
   1178    botL = tmp;
   1179  }
   1180 
   1181  load_buffer_4x4(topL, out, stride, flipud, fliplr, shift);
   1182  load_buffer_4x4(botL, out + 4, stride, flipud, fliplr, shift);
   1183 }
   1184 
   1185 #if !CONFIG_REALTIME_ONLY
   1186 static inline void load_buffer_4x16(const int16_t *input, __m128i *out,
   1187                                    const int stride, const int flipud,
   1188                                    const int fliplr, const int shift) {
   1189  const int16_t *topL = input;
   1190  const int16_t *botL = input + 8 * stride;
   1191 
   1192  const int16_t *tmp;
   1193 
   1194  if (flipud) {
   1195    tmp = topL;
   1196    topL = botL;
   1197    botL = tmp;
   1198  }
   1199  load_buffer_4x8(topL, out, stride, flipud, fliplr, shift);
   1200  load_buffer_4x8(botL, out + 8, stride, flipud, fliplr, shift);
   1201 }
   1202 #endif
   1203 
   1204 static inline void load_buffer_32x8n(const int16_t *input, __m128i *out,
   1205                                     int stride, int flipud, int fliplr,
   1206                                     int shift, const int height) {
   1207  const int16_t *in = input;
   1208  __m128i *output = out;
   1209  for (int col = 0; col < height; col++) {
   1210    in = input + col * stride;
   1211    output = out + col * 8;
   1212    load_buffer_4x4(in, output, 4, flipud, fliplr, shift);
   1213    load_buffer_4x4((in + 16), (output + 4), 4, flipud, fliplr, shift);
   1214  }
   1215 }
   1216 
// Forward 16-point DCT over 4-lane columns of 32-bit coefficients.
//  in/out  : __m128i arrays laid out as in[row * col_num + col]; each
//            __m128i carries 4 independent columns of one row.
//  bit     : cos-bit precision of the cospi table; every butterfly
//            multiply is rounded with ((x + (1 << (bit - 1))) >> bit).
//  col_num : number of 4-lane column groups processed per call.
static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit,
                            const int col_num) {
 const int32_t *cospi = cospi_arr(bit);
 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 __m128i u[16], v[16], x;
 int col;

 // Calculate the column 0, 1, 2, 3
 for (col = 0; col < col_num; ++col) {
   // stage 0
   // stage 1: first butterfly layer pairing row r with row 15 - r.
   u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
   u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
   u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
   u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
   u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
   u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
   u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
   u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
   u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
   u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
   u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
   u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
   u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
   u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
   u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
   u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);

   // stage 2: butterflies on the even half; pi/4 rotations (cospi32) on
   // the middle of the odd half.
   v[0] = _mm_add_epi32(u[0], u[7]);
   v[7] = _mm_sub_epi32(u[0], u[7]);
   v[1] = _mm_add_epi32(u[1], u[6]);
   v[6] = _mm_sub_epi32(u[1], u[6]);
   v[2] = _mm_add_epi32(u[2], u[5]);
   v[5] = _mm_sub_epi32(u[2], u[5]);
   v[3] = _mm_add_epi32(u[3], u[4]);
   v[4] = _mm_sub_epi32(u[3], u[4]);
   v[8] = u[8];
   v[9] = u[9];

   v[10] = _mm_mullo_epi32(u[10], cospim32);
   x = _mm_mullo_epi32(u[13], cospi32);
   v[10] = _mm_add_epi32(v[10], x);
   v[10] = _mm_add_epi32(v[10], rnding);
   v[10] = _mm_srai_epi32(v[10], bit);

   v[13] = _mm_mullo_epi32(u[10], cospi32);
   x = _mm_mullo_epi32(u[13], cospim32);
   v[13] = _mm_sub_epi32(v[13], x);
   v[13] = _mm_add_epi32(v[13], rnding);
   v[13] = _mm_srai_epi32(v[13], bit);

   v[11] = _mm_mullo_epi32(u[11], cospim32);
   x = _mm_mullo_epi32(u[12], cospi32);
   v[11] = _mm_add_epi32(v[11], x);
   v[11] = _mm_add_epi32(v[11], rnding);
   v[11] = _mm_srai_epi32(v[11], bit);

   v[12] = _mm_mullo_epi32(u[11], cospi32);
   x = _mm_mullo_epi32(u[12], cospim32);
   v[12] = _mm_sub_epi32(v[12], x);
   v[12] = _mm_add_epi32(v[12], rnding);
   v[12] = _mm_srai_epi32(v[12], bit);
   v[14] = u[14];
   v[15] = u[15];

   // stage 3: 8-point DCT butterflies on v[0..7]; recombine the odd half.
   u[0] = _mm_add_epi32(v[0], v[3]);
   u[3] = _mm_sub_epi32(v[0], v[3]);
   u[1] = _mm_add_epi32(v[1], v[2]);
   u[2] = _mm_sub_epi32(v[1], v[2]);
   u[4] = v[4];

   u[5] = _mm_mullo_epi32(v[5], cospim32);
   x = _mm_mullo_epi32(v[6], cospi32);
   u[5] = _mm_add_epi32(u[5], x);
   u[5] = _mm_add_epi32(u[5], rnding);
   u[5] = _mm_srai_epi32(u[5], bit);

   u[6] = _mm_mullo_epi32(v[5], cospi32);
   x = _mm_mullo_epi32(v[6], cospim32);
   u[6] = _mm_sub_epi32(u[6], x);
   u[6] = _mm_add_epi32(u[6], rnding);
   u[6] = _mm_srai_epi32(u[6], bit);

   u[7] = v[7];
   u[8] = _mm_add_epi32(v[8], v[11]);
   u[11] = _mm_sub_epi32(v[8], v[11]);
   u[9] = _mm_add_epi32(v[9], v[10]);
   u[10] = _mm_sub_epi32(v[9], v[10]);
   u[12] = _mm_sub_epi32(v[15], v[12]);
   u[15] = _mm_add_epi32(v[15], v[12]);
   u[13] = _mm_sub_epi32(v[14], v[13]);
   u[14] = _mm_add_epi32(v[14], v[13]);

   // stage 4: produce the DC/Nyquist pair and the 16/48 rotations.
   u[0] = _mm_mullo_epi32(u[0], cospi32);
   u[1] = _mm_mullo_epi32(u[1], cospi32);
   v[0] = _mm_add_epi32(u[0], u[1]);
   v[0] = _mm_add_epi32(v[0], rnding);
   v[0] = _mm_srai_epi32(v[0], bit);

   v[1] = _mm_sub_epi32(u[0], u[1]);
   v[1] = _mm_add_epi32(v[1], rnding);
   v[1] = _mm_srai_epi32(v[1], bit);

   v[2] = _mm_mullo_epi32(u[2], cospi48);
   x = _mm_mullo_epi32(u[3], cospi16);
   v[2] = _mm_add_epi32(v[2], x);
   v[2] = _mm_add_epi32(v[2], rnding);
   v[2] = _mm_srai_epi32(v[2], bit);

   v[3] = _mm_mullo_epi32(u[2], cospi16);
   x = _mm_mullo_epi32(u[3], cospi48);
   v[3] = _mm_sub_epi32(x, v[3]);
   v[3] = _mm_add_epi32(v[3], rnding);
   v[3] = _mm_srai_epi32(v[3], bit);

   v[4] = _mm_add_epi32(u[4], u[5]);
   v[5] = _mm_sub_epi32(u[4], u[5]);
   v[6] = _mm_sub_epi32(u[7], u[6]);
   v[7] = _mm_add_epi32(u[7], u[6]);
   v[8] = u[8];

   v[9] = _mm_mullo_epi32(u[9], cospim16);
   x = _mm_mullo_epi32(u[14], cospi48);
   v[9] = _mm_add_epi32(v[9], x);
   v[9] = _mm_add_epi32(v[9], rnding);
   v[9] = _mm_srai_epi32(v[9], bit);

   v[14] = _mm_mullo_epi32(u[9], cospi48);
   x = _mm_mullo_epi32(u[14], cospim16);
   v[14] = _mm_sub_epi32(v[14], x);
   v[14] = _mm_add_epi32(v[14], rnding);
   v[14] = _mm_srai_epi32(v[14], bit);

   v[10] = _mm_mullo_epi32(u[10], cospim48);
   x = _mm_mullo_epi32(u[13], cospim16);
   v[10] = _mm_add_epi32(v[10], x);
   v[10] = _mm_add_epi32(v[10], rnding);
   v[10] = _mm_srai_epi32(v[10], bit);

   v[13] = _mm_mullo_epi32(u[10], cospim16);
   x = _mm_mullo_epi32(u[13], cospim48);
   v[13] = _mm_sub_epi32(v[13], x);
   v[13] = _mm_add_epi32(v[13], rnding);
   v[13] = _mm_srai_epi32(v[13], bit);

   v[11] = u[11];
   v[12] = u[12];
   v[15] = u[15];

   // stage 5: 8/56 and 24/40 rotations on u[4..7]; butterflies on the
   // odd half.
   u[0] = v[0];
   u[1] = v[1];
   u[2] = v[2];
   u[3] = v[3];

   u[4] = _mm_mullo_epi32(v[4], cospi56);
   x = _mm_mullo_epi32(v[7], cospi8);
   u[4] = _mm_add_epi32(u[4], x);
   u[4] = _mm_add_epi32(u[4], rnding);
   u[4] = _mm_srai_epi32(u[4], bit);

   u[7] = _mm_mullo_epi32(v[4], cospi8);
   x = _mm_mullo_epi32(v[7], cospi56);
   u[7] = _mm_sub_epi32(x, u[7]);
   u[7] = _mm_add_epi32(u[7], rnding);
   u[7] = _mm_srai_epi32(u[7], bit);

   u[5] = _mm_mullo_epi32(v[5], cospi24);
   x = _mm_mullo_epi32(v[6], cospi40);
   u[5] = _mm_add_epi32(u[5], x);
   u[5] = _mm_add_epi32(u[5], rnding);
   u[5] = _mm_srai_epi32(u[5], bit);

   u[6] = _mm_mullo_epi32(v[5], cospi40);
   x = _mm_mullo_epi32(v[6], cospi24);
   u[6] = _mm_sub_epi32(x, u[6]);
   u[6] = _mm_add_epi32(u[6], rnding);
   u[6] = _mm_srai_epi32(u[6], bit);

   u[8] = _mm_add_epi32(v[8], v[9]);
   u[9] = _mm_sub_epi32(v[8], v[9]);
   u[10] = _mm_sub_epi32(v[11], v[10]);
   u[11] = _mm_add_epi32(v[11], v[10]);
   u[12] = _mm_add_epi32(v[12], v[13]);
   u[13] = _mm_sub_epi32(v[12], v[13]);
   u[14] = _mm_sub_epi32(v[15], v[14]);
   u[15] = _mm_add_epi32(v[15], v[14]);

   // stage 6: final odd-angle rotations producing the odd coefficients.
   v[0] = u[0];
   v[1] = u[1];
   v[2] = u[2];
   v[3] = u[3];
   v[4] = u[4];
   v[5] = u[5];
   v[6] = u[6];
   v[7] = u[7];

   v[8] = _mm_mullo_epi32(u[8], cospi60);
   x = _mm_mullo_epi32(u[15], cospi4);
   v[8] = _mm_add_epi32(v[8], x);
   v[8] = _mm_add_epi32(v[8], rnding);
   v[8] = _mm_srai_epi32(v[8], bit);

   v[15] = _mm_mullo_epi32(u[8], cospi4);
   x = _mm_mullo_epi32(u[15], cospi60);
   v[15] = _mm_sub_epi32(x, v[15]);
   v[15] = _mm_add_epi32(v[15], rnding);
   v[15] = _mm_srai_epi32(v[15], bit);

   v[9] = _mm_mullo_epi32(u[9], cospi28);
   x = _mm_mullo_epi32(u[14], cospi36);
   v[9] = _mm_add_epi32(v[9], x);
   v[9] = _mm_add_epi32(v[9], rnding);
   v[9] = _mm_srai_epi32(v[9], bit);

   v[14] = _mm_mullo_epi32(u[9], cospi36);
   x = _mm_mullo_epi32(u[14], cospi28);
   v[14] = _mm_sub_epi32(x, v[14]);
   v[14] = _mm_add_epi32(v[14], rnding);
   v[14] = _mm_srai_epi32(v[14], bit);

   v[10] = _mm_mullo_epi32(u[10], cospi44);
   x = _mm_mullo_epi32(u[13], cospi20);
   v[10] = _mm_add_epi32(v[10], x);
   v[10] = _mm_add_epi32(v[10], rnding);
   v[10] = _mm_srai_epi32(v[10], bit);

   v[13] = _mm_mullo_epi32(u[10], cospi20);
   x = _mm_mullo_epi32(u[13], cospi44);
   v[13] = _mm_sub_epi32(x, v[13]);
   v[13] = _mm_add_epi32(v[13], rnding);
   v[13] = _mm_srai_epi32(v[13], bit);

   v[11] = _mm_mullo_epi32(u[11], cospi12);
   x = _mm_mullo_epi32(u[12], cospi52);
   v[11] = _mm_add_epi32(v[11], x);
   v[11] = _mm_add_epi32(v[11], rnding);
   v[11] = _mm_srai_epi32(v[11], bit);

   v[12] = _mm_mullo_epi32(u[11], cospi52);
   x = _mm_mullo_epi32(u[12], cospi12);
   v[12] = _mm_sub_epi32(x, v[12]);
   v[12] = _mm_add_epi32(v[12], rnding);
   v[12] = _mm_srai_epi32(v[12], bit);

   // Write-out: map the butterfly outputs to their natural coefficient
   // positions.
   out[0 * col_num + col] = v[0];
   out[1 * col_num + col] = v[8];
   out[2 * col_num + col] = v[4];
   out[3 * col_num + col] = v[12];
   out[4 * col_num + col] = v[2];
   out[5 * col_num + col] = v[10];
   out[6 * col_num + col] = v[6];
   out[7 * col_num + col] = v[14];
   out[8 * col_num + col] = v[1];
   out[9 * col_num + col] = v[9];
   out[10 * col_num + col] = v[5];
   out[11 * col_num + col] = v[13];
   out[12 * col_num + col] = v[3];
   out[13 * col_num + col] = v[11];
   out[14 * col_num + col] = v[7];
   out[15 * col_num + col] = v[15];
 }
}
   1502 
// 1-D 16-point forward ADST applied to `num_cols` interleaved columns of
// 4-lane 32-bit vectors. `in`/`out` are row-major: element (row r, col c)
// lives at index r * num_cols + c. `bit` selects the cosine table precision
// (cospi_arr); every butterfly stage rounds with 1 << (bit - 1) and then
// arithmetic-shifts right by `bit`. Operates in place-safe fashion only if
// in != out is not required by a stage (out is written solely in stage 9).
static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit,
                             const int num_cols) {
 const int32_t *cospi = cospi_arr(bit);
 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
 const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
 const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
 const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
 const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
 const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
 const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
 const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
 const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
 const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
 const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
 const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
 const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
 const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
 const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
 const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
 const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
 const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
 const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 const __m128i zero = _mm_setzero_si128();

 // u/v ping-pong between stages; x/y are butterfly scratch registers.
 __m128i u[16], v[16], x, y;
 int col;

 for (col = 0; col < num_cols; ++col) {
   // stage 0
   // stage 1: permute the input rows and negate selected ones (the ADST
   // sign pattern), implemented as 0 - x.
   u[0] = in[0 * num_cols + col];
   u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]);
   u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]);
   u[3] = in[8 * num_cols + col];
   u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]);
   u[5] = in[12 * num_cols + col];
   u[6] = in[4 * num_cols + col];
   u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]);
   u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]);
   u[9] = in[14 * num_cols + col];
   u[10] = in[6 * num_cols + col];
   u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]);
   u[12] = in[2 * num_cols + col];
   u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]);
   u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]);
   u[15] = in[10 * num_cols + col];

   // stage 2: scale selected pairs by cospi[32] and form the rounded
   // sum/difference of each pair.
   v[0] = u[0];
   v[1] = u[1];

   x = _mm_mullo_epi32(u[2], cospi32);
   y = _mm_mullo_epi32(u[3], cospi32);
   v[2] = _mm_add_epi32(x, y);
   v[2] = _mm_add_epi32(v[2], rnding);
   v[2] = _mm_srai_epi32(v[2], bit);

   v[3] = _mm_sub_epi32(x, y);
   v[3] = _mm_add_epi32(v[3], rnding);
   v[3] = _mm_srai_epi32(v[3], bit);

   v[4] = u[4];
   v[5] = u[5];

   x = _mm_mullo_epi32(u[6], cospi32);
   y = _mm_mullo_epi32(u[7], cospi32);
   v[6] = _mm_add_epi32(x, y);
   v[6] = _mm_add_epi32(v[6], rnding);
   v[6] = _mm_srai_epi32(v[6], bit);

   v[7] = _mm_sub_epi32(x, y);
   v[7] = _mm_add_epi32(v[7], rnding);
   v[7] = _mm_srai_epi32(v[7], bit);

   v[8] = u[8];
   v[9] = u[9];

   x = _mm_mullo_epi32(u[10], cospi32);
   y = _mm_mullo_epi32(u[11], cospi32);
   v[10] = _mm_add_epi32(x, y);
   v[10] = _mm_add_epi32(v[10], rnding);
   v[10] = _mm_srai_epi32(v[10], bit);

   v[11] = _mm_sub_epi32(x, y);
   v[11] = _mm_add_epi32(v[11], rnding);
   v[11] = _mm_srai_epi32(v[11], bit);

   v[12] = u[12];
   v[13] = u[13];

   x = _mm_mullo_epi32(u[14], cospi32);
   y = _mm_mullo_epi32(u[15], cospi32);
   v[14] = _mm_add_epi32(x, y);
   v[14] = _mm_add_epi32(v[14], rnding);
   v[14] = _mm_srai_epi32(v[14], bit);

   v[15] = _mm_sub_epi32(x, y);
   v[15] = _mm_add_epi32(v[15], rnding);
   v[15] = _mm_srai_epi32(v[15], bit);

   // stage 3: add/sub butterflies within groups of four.
   u[0] = _mm_add_epi32(v[0], v[2]);
   u[1] = _mm_add_epi32(v[1], v[3]);
   u[2] = _mm_sub_epi32(v[0], v[2]);
   u[3] = _mm_sub_epi32(v[1], v[3]);
   u[4] = _mm_add_epi32(v[4], v[6]);
   u[5] = _mm_add_epi32(v[5], v[7]);
   u[6] = _mm_sub_epi32(v[4], v[6]);
   u[7] = _mm_sub_epi32(v[5], v[7]);
   u[8] = _mm_add_epi32(v[8], v[10]);
   u[9] = _mm_add_epi32(v[9], v[11]);
   u[10] = _mm_sub_epi32(v[8], v[10]);
   u[11] = _mm_sub_epi32(v[9], v[11]);
   u[12] = _mm_add_epi32(v[12], v[14]);
   u[13] = _mm_add_epi32(v[13], v[15]);
   u[14] = _mm_sub_epi32(v[12], v[14]);
   u[15] = _mm_sub_epi32(v[13], v[15]);

   // stage 4: cospi[16]/cospi[48] rotations via half_btf_sse4_1
   // (each call computes (c0*a + c1*b + rnding) >> bit).
   v[0] = u[0];
   v[1] = u[1];
   v[2] = u[2];
   v[3] = u[3];
   v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
   v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
   v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
   v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
   v[8] = u[8];
   v[9] = u[9];
   v[10] = u[10];
   v[11] = u[11];
   v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
   v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
   v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
   v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);

   // stage 5: add/sub butterflies within groups of eight.
   u[0] = _mm_add_epi32(v[0], v[4]);
   u[1] = _mm_add_epi32(v[1], v[5]);
   u[2] = _mm_add_epi32(v[2], v[6]);
   u[3] = _mm_add_epi32(v[3], v[7]);
   u[4] = _mm_sub_epi32(v[0], v[4]);
   u[5] = _mm_sub_epi32(v[1], v[5]);
   u[6] = _mm_sub_epi32(v[2], v[6]);
   u[7] = _mm_sub_epi32(v[3], v[7]);
   u[8] = _mm_add_epi32(v[8], v[12]);
   u[9] = _mm_add_epi32(v[9], v[13]);
   u[10] = _mm_add_epi32(v[10], v[14]);
   u[11] = _mm_add_epi32(v[11], v[15]);
   u[12] = _mm_sub_epi32(v[8], v[12]);
   u[13] = _mm_sub_epi32(v[9], v[13]);
   u[14] = _mm_sub_epi32(v[10], v[14]);
   u[15] = _mm_sub_epi32(v[11], v[15]);

   // stage 6: cospi[8]/[56] and cospi[24]/[40] rotations on the upper half.
   v[0] = u[0];
   v[1] = u[1];
   v[2] = u[2];
   v[3] = u[3];
   v[4] = u[4];
   v[5] = u[5];
   v[6] = u[6];
   v[7] = u[7];
   v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
   v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
   v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
   v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
   v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
   v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
   v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
   v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);

   // stage 7: final add/sub butterflies across the full 16-element span.
   u[0] = _mm_add_epi32(v[0], v[8]);
   u[1] = _mm_add_epi32(v[1], v[9]);
   u[2] = _mm_add_epi32(v[2], v[10]);
   u[3] = _mm_add_epi32(v[3], v[11]);
   u[4] = _mm_add_epi32(v[4], v[12]);
   u[5] = _mm_add_epi32(v[5], v[13]);
   u[6] = _mm_add_epi32(v[6], v[14]);
   u[7] = _mm_add_epi32(v[7], v[15]);
   u[8] = _mm_sub_epi32(v[0], v[8]);
   u[9] = _mm_sub_epi32(v[1], v[9]);
   u[10] = _mm_sub_epi32(v[2], v[10]);
   u[11] = _mm_sub_epi32(v[3], v[11]);
   u[12] = _mm_sub_epi32(v[4], v[12]);
   u[13] = _mm_sub_epi32(v[5], v[13]);
   u[14] = _mm_sub_epi32(v[6], v[14]);
   u[15] = _mm_sub_epi32(v[7], v[15]);

   // stage 8: odd-index cosine rotations (cospi[2]..cospi[62]).
   v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
   v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
   v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
   v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
   v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
   v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
   v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
   v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
   v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
   v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
   v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
   v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
   v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
   v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
   v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
   v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);

   // stage 9: write results in the ADST output order.
   out[0 * num_cols + col] = v[1];
   out[1 * num_cols + col] = v[14];
   out[2 * num_cols + col] = v[3];
   out[3 * num_cols + col] = v[12];
   out[4 * num_cols + col] = v[5];
   out[5 * num_cols + col] = v[10];
   out[6 * num_cols + col] = v[7];
   out[7 * num_cols + col] = v[8];
   out[8 * num_cols + col] = v[9];
   out[9 * num_cols + col] = v[6];
   out[10 * num_cols + col] = v[11];
   out[11 * num_cols + col] = v[4];
   out[12 * num_cols + col] = v[13];
   out[13 * num_cols + col] = v[2];
   out[14 * num_cols + col] = v[15];
   out[15 * num_cols + col] = v[0];
 }
}
   1749 
   1750 static void col_txfm_16x16_rounding(__m128i *in, int shift) {
   1751  // Note:
   1752  //  We split 16x16 rounding into 4 sections of 8x8 rounding,
   1753  //  instead of 4 columns
   1754  col_txfm_8x8_rounding(&in[0], shift);
   1755  col_txfm_8x8_rounding(&in[16], shift);
   1756  col_txfm_8x8_rounding(&in[32], shift);
   1757  col_txfm_8x8_rounding(&in[48], shift);
   1758 }
   1759 
   1760 static void col_txfm_8x16_rounding(__m128i *in, int shift) {
   1761  col_txfm_8x8_rounding(&in[0], shift);
   1762  col_txfm_8x8_rounding(&in[16], shift);
   1763 }
   1764 
   1765 static void write_buffer_16x16(const __m128i *in, int32_t *output) {
   1766  const int size_8x8 = 16 * 4;
   1767  write_buffer_8x8(&in[0], output);
   1768  output += size_8x8;
   1769  write_buffer_8x8(&in[16], output);
   1770  output += size_8x8;
   1771  write_buffer_8x8(&in[32], output);
   1772  output += size_8x8;
   1773  write_buffer_8x8(&in[48], output);
   1774 }
   1775 static void idtx16x16_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
   1776  (void)bit;
   1777  __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
   1778  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
   1779  __m128i a_low;
   1780 
   1781  int num_iters = 16 * col_num;
   1782  for (int i = 0; i < num_iters; i++) {
   1783    a_low = _mm_mullo_epi32(in[i], fact);
   1784    a_low = _mm_add_epi32(a_low, offset);
   1785    out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits);
   1786  }
   1787 }
// 2-D forward transform of a high-bitdepth 16x16 residual block.
// input: 16-bit residual rows, `stride` elements apart; coeff: 256 int32
// coefficients. Every case runs the same pipeline:
//   load (with optional vertical/horizontal flip) -> column 1-D transform
//   -> intermediate rounding by shift[1] -> transpose -> row 1-D transform
//   -> store.
// col_num == 4 because 16 columns are packed 4 lanes per __m128i.
// `bd` is unused in this fixed-point path (see the (void)bd at the end).
void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
                                int stride, TX_TYPE tx_type, int bd) {
 __m128i in[64], out[64];
 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
 const int txw_idx = get_txw_idx(TX_16X16);
 const int txh_idx = get_txh_idx(TX_16X16);
 const int col_num = 4;
 switch (tx_type) {
   case DCT_DCT:
     load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
     fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
     col_txfm_16x16_rounding(out, -shift[1]);
     transpose_16x16(out, in);
     fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
     write_buffer_16x16(out, coeff);
     break;
   case ADST_DCT:
     load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
     fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
                       col_num);
     col_txfm_16x16_rounding(out, -shift[1]);
     transpose_16x16(out, in);
     fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
     write_buffer_16x16(out, coeff);
     break;
   case DCT_ADST:
     load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
     fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
     col_txfm_16x16_rounding(out, -shift[1]);
     transpose_16x16(out, in);
     fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
                       col_num);
     write_buffer_16x16(out, coeff);
     break;
   case ADST_ADST:
     load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
     fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
                       col_num);
     col_txfm_16x16_rounding(out, -shift[1]);
     transpose_16x16(out, in);
     fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
                       col_num);
     write_buffer_16x16(out, coeff);
     break;
   case FLIPADST_DCT:
     // FLIPADST variants reuse fadst; the flip is applied at load time
     // (ud_flip / lr_flip arguments of load_buffer_16x16).
     load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
     fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
                       col_num);
     col_txfm_16x16_rounding(out, -shift[1]);
     transpose_16x16(out, in);
     fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
     write_buffer_16x16(out, coeff);
     break;
   case DCT_FLIPADST:
     load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
     fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
     col_txfm_16x16_rounding(out, -shift[1]);
     transpose_16x16(out, in);
     fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
                       col_num);
     write_buffer_16x16(out, coeff);
     break;
   case FLIPADST_FLIPADST:
     load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
     fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
                       col_num);
     col_txfm_16x16_rounding(out, -shift[1]);
     transpose_16x16(out, in);
     fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
                       col_num);
     write_buffer_16x16(out, coeff);
     break;
   case ADST_FLIPADST:
     load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
     fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
                       col_num);
     col_txfm_16x16_rounding(out, -shift[1]);
     transpose_16x16(out, in);
     fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
                       col_num);
     write_buffer_16x16(out, coeff);
     break;
   case FLIPADST_ADST:
     load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
     fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
                       col_num);
     col_txfm_16x16_rounding(out, -shift[1]);
     transpose_16x16(out, in);
     fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
                       col_num);
     write_buffer_16x16(out, coeff);
     break;
   case IDTX:
     load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
     idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
     col_txfm_16x16_rounding(out, -shift[1]);
     transpose_16x16(out, in);
     idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
     write_buffer_16x16(out, coeff);
     break;
   case V_DCT:
     load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
     fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
     col_txfm_16x16_rounding(out, -shift[1]);
     transpose_16x16(out, in);
     idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
     write_buffer_16x16(out, coeff);
     break;
   case H_DCT:
     load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
     idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
     col_txfm_16x16_rounding(out, -shift[1]);
     transpose_16x16(out, in);
     fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
     write_buffer_16x16(out, coeff);
     break;
   case V_ADST:
     load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
     fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
                       col_num);
     col_txfm_16x16_rounding(out, -shift[1]);
     transpose_16x16(out, in);
     idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
     write_buffer_16x16(out, coeff);
     break;
   case H_ADST:
     load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
     idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
     col_txfm_16x16_rounding(out, -shift[1]);
     transpose_16x16(out, in);
     fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
                       col_num);
     write_buffer_16x16(out, coeff);
     break;
   case V_FLIPADST:
     load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
     fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
                       col_num);
     col_txfm_16x16_rounding(out, -shift[1]);
     transpose_16x16(out, in);
     idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
     write_buffer_16x16(out, coeff);
     break;
   case H_FLIPADST:
     load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
     idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
     col_txfm_16x16_rounding(out, -shift[1]);
     transpose_16x16(out, in);
     fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
                       col_num);
     write_buffer_16x16(out, coeff);
     break;
   default: assert(0);
 }
 (void)bd;
}
   1944 
   1945 static inline void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) {
   1946  for (int i = 0; i < size; i += 2) in[30 - i] = out[i];
   1947  for (int i = 1; i < size; i += 2) in[size - i] = out[i];
   1948 }
   1949 
// Column-pass (vertical) 1-D 8-point transforms, indexed by TX_TYPE.
static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = {
 fdct8x8_sse4_1,   // DCT_DCT
 fadst8x8_sse4_1,  // ADST_DCT
 fdct8x8_sse4_1,   // DCT_ADST
 fadst8x8_sse4_1,  // ADST_ADST
 fadst8x8_sse4_1,  // FLIPADST_DCT
 fdct8x8_sse4_1,   // DCT_FLIPADST
 fadst8x8_sse4_1,  // FLIPADST_FLIPADST
 fadst8x8_sse4_1,  // ADST_FLIPADST
 fadst8x8_sse4_1,  // FLIPADST_ADST
 idtx8x8_sse4_1,   // IDTX
 fdct8x8_sse4_1,   // V_DCT
 idtx8x8_sse4_1,   // H_DCT
 fadst8x8_sse4_1,  // V_ADST
 idtx8x8_sse4_1,   // H_ADST
 fadst8x8_sse4_1,  // V_FLIPADST
 idtx8x8_sse4_1    // H_FLIPADST
};
#if !CONFIG_REALTIME_ONLY
// Row-pass (horizontal) 1-D 8-point transforms for 32x8 blocks, indexed by
// TX_TYPE. Only DCT_DCT and IDTX have implementations here; NULL marks
// combinations this path does not handle.
static const fwd_transform_1d_sse4_1 row_highbd_txfm32x8_arr[TX_TYPES] = {
 fdct8x8_sse4_1,   // DCT_DCT
 NULL,             // ADST_DCT
 NULL,             // DCT_ADST
 NULL,             // ADST_ADST
 NULL,             // FLIPADST_DCT
 NULL,             // DCT_FLIPADST
 NULL,             // FLIPADST_FLIPADST
 NULL,             // ADST_FLIPADST
 NULL,             // FLIPADST_ADST
 idtx32x8_sse4_1,  // IDTX
 NULL,             // V_DCT
 NULL,             // H_DCT
 NULL,             // V_ADST
 NULL,             // H_ADST
 NULL,             // V_FLIPADST
 NULL,             // H_FLIPADST
};
#endif
// Column-pass 1-D 8-point transforms for 4-wide blocks, indexed by TX_TYPE.
static const fwd_transform_1d_sse4_1 col_highbd_txfm4x8_arr[TX_TYPES] = {
 fdct4x8_sse4_1,   // DCT_DCT
 fadst8x8_sse4_1,  // ADST_DCT
 fdct4x8_sse4_1,   // DCT_ADST
 fadst8x8_sse4_1,  // ADST_ADST
 fadst8x8_sse4_1,  // FLIPADST_DCT
 fdct4x8_sse4_1,   // DCT_FLIPADST
 fadst8x8_sse4_1,  // FLIPADST_FLIPADST
 fadst8x8_sse4_1,  // ADST_FLIPADST
 fadst8x8_sse4_1,  // FLIPADST_ADST
 idtx8x8_sse4_1,   // IDTX
 fdct4x8_sse4_1,   // V_DCT
 idtx8x8_sse4_1,   // H_DCT
 fadst8x8_sse4_1,  // V_ADST
 idtx8x8_sse4_1,   // H_ADST
 fadst8x8_sse4_1,  // V_FLIPADST
 idtx8x8_sse4_1    // H_FLIPADST
};
   2006 
// Row-pass 1-D 16-point transforms, indexed by TX_TYPE.
static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = {
 fdct16x16_sse4_1,   // DCT_DCT
 fdct16x16_sse4_1,   // ADST_DCT
 fadst16x16_sse4_1,  // DCT_ADST
 fadst16x16_sse4_1,  // ADST_ADST
 fdct16x16_sse4_1,   // FLIPADST_DCT
 fadst16x16_sse4_1,  // DCT_FLIPADST
 fadst16x16_sse4_1,  // FLIPADST_FLIPADST
 fadst16x16_sse4_1,  // ADST_FLIPADST
 fadst16x16_sse4_1,  // FLIPADST_ADST
 idtx16x16_sse4_1,   // IDTX
 idtx16x16_sse4_1,   // V_DCT
 fdct16x16_sse4_1,   // H_DCT
 idtx16x16_sse4_1,   // V_ADST
 fadst16x16_sse4_1,  // H_ADST
 idtx16x16_sse4_1,   // V_FLIPADST
 fadst16x16_sse4_1   // H_FLIPADST
};
   2025 
// Column-pass 1-D 16-point transforms, indexed by TX_TYPE.
static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = {
 fdct16x16_sse4_1,   // DCT_DCT
 fadst16x16_sse4_1,  // ADST_DCT
 fdct16x16_sse4_1,   // DCT_ADST
 fadst16x16_sse4_1,  // ADST_ADST
 fadst16x16_sse4_1,  // FLIPADST_DCT
 fdct16x16_sse4_1,   // DCT_FLIPADST
 fadst16x16_sse4_1,  // FLIPADST_FLIPADST
 fadst16x16_sse4_1,  // ADST_FLIPADST
 fadst16x16_sse4_1,  // FLIPADST_ADST
 idtx16x16_sse4_1,   // IDTX
 fdct16x16_sse4_1,   // V_DCT
 idtx16x16_sse4_1,   // H_DCT
 fadst16x16_sse4_1,  // V_ADST
 idtx16x16_sse4_1,   // H_ADST
 fadst16x16_sse4_1,  // V_FLIPADST
 idtx16x16_sse4_1    // H_FLIPADST
};
// Row-pass 1-D 8-point transforms, indexed by TX_TYPE.
static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = {
 fdct8x8_sse4_1,   // DCT_DCT
 fdct8x8_sse4_1,   // ADST_DCT
 fadst8x8_sse4_1,  // DCT_ADST
 fadst8x8_sse4_1,  // ADST_ADST
 fdct8x8_sse4_1,   // FLIPADST_DCT
 fadst8x8_sse4_1,  // DCT_FLIPADST
 fadst8x8_sse4_1,  // FLIPADST_FLIPADST
 fadst8x8_sse4_1,  // ADST_FLIPADST
 fadst8x8_sse4_1,  // FLIPADST_ADST
 idtx8x8_sse4_1,   // IDTX
 idtx8x8_sse4_1,   // V_DCT
 fdct8x8_sse4_1,   // H_DCT
 idtx8x8_sse4_1,   // V_ADST
 fadst8x8_sse4_1,  // H_ADST
 idtx8x8_sse4_1,   // V_FLIPADST
 fadst8x8_sse4_1   // H_FLIPADST
};
   2062 
// Row-pass 1-D 8-point transforms for 4-tall blocks, indexed by TX_TYPE.
static const fwd_transform_1d_sse4_1 row_highbd_txfm4x8_arr[TX_TYPES] = {
 fdct4x8_sse4_1,   // DCT_DCT
 fdct4x8_sse4_1,   // ADST_DCT
 fadst8x8_sse4_1,  // DCT_ADST
 fadst8x8_sse4_1,  // ADST_ADST
 fdct4x8_sse4_1,   // FLIPADST_DCT
 fadst8x8_sse4_1,  // DCT_FLIPADST
 fadst8x8_sse4_1,  // FLIPADST_FLIPADST
 fadst8x8_sse4_1,  // ADST_FLIPADST
 fadst8x8_sse4_1,  // FLIPADST_ADST
 idtx8x8_sse4_1,   // IDTX
 idtx8x8_sse4_1,   // V_DCT
 fdct4x8_sse4_1,   // H_DCT
 idtx8x8_sse4_1,   // V_ADST
 fadst8x8_sse4_1,  // H_ADST
 idtx8x8_sse4_1,   // V_FLIPADST
 fadst8x8_sse4_1   // H_FLIPADST
};
   2081 
// Row-pass 1-D 4-point transforms, indexed by TX_TYPE.
static const fwd_transform_1d_sse4_1 row_highbd_txfm4x4_arr[TX_TYPES] = {
 fdct4x4_sse4_1,   // DCT_DCT
 fdct4x4_sse4_1,   // ADST_DCT
 fadst4x4_sse4_1,  // DCT_ADST
 fadst4x4_sse4_1,  // ADST_ADST
 fdct4x4_sse4_1,   // FLIPADST_DCT
 fadst4x4_sse4_1,  // DCT_FLIPADST
 fadst4x4_sse4_1,  // FLIPADST_FLIPADST
 fadst4x4_sse4_1,  // ADST_FLIPADST
 fadst4x4_sse4_1,  // FLIPADST_ADST
 idtx4x4_sse4_1,   // IDTX
 idtx4x4_sse4_1,   // V_DCT
 fdct4x4_sse4_1,   // H_DCT
 idtx4x4_sse4_1,   // V_ADST
 fadst4x4_sse4_1,  // H_ADST
 idtx4x4_sse4_1,   // V_FLIPADST
 fadst4x4_sse4_1   // H_FLIPADST
};
   2100 
// Column-pass 1-D 4-point transforms, indexed by TX_TYPE.
static const fwd_transform_1d_sse4_1 col_highbd_txfm4x4_arr[TX_TYPES] = {
 fdct4x4_sse4_1,   // DCT_DCT
 fadst4x4_sse4_1,  // ADST_DCT
 fdct4x4_sse4_1,   // DCT_ADST
 fadst4x4_sse4_1,  // ADST_ADST
 fadst4x4_sse4_1,  // FLIPADST_DCT
 fdct4x4_sse4_1,   // DCT_FLIPADST
 fadst4x4_sse4_1,  // FLIPADST_FLIPADST
 fadst4x4_sse4_1,  // ADST_FLIPADST
 fadst4x4_sse4_1,  // FLIPADST_ADST
 idtx4x4_sse4_1,   // IDTX
 fdct4x4_sse4_1,   // V_DCT
 idtx4x4_sse4_1,   // H_DCT
 fadst4x4_sse4_1,  // V_ADST
 idtx4x4_sse4_1,   // H_ADST
 fadst4x4_sse4_1,  // V_FLIPADST
 idtx4x4_sse4_1    // H_FLIPADST
};
   2119 
// Column-pass 1-D 32-point transforms, indexed by TX_TYPE. Only DCT_DCT
// and IDTX are supported at this size; NULL marks unsupported types.
static const fwd_transform_1d_sse4_1 col_highbd_txfm8x32_arr[TX_TYPES] = {
 av1_fdct32_sse4_1,  // DCT_DCT
 NULL,               // ADST_DCT
 NULL,               // DCT_ADST
 NULL,               // ADST_ADST
 NULL,               // FLIPADST_DCT
 NULL,               // DCT_FLIPADST
 NULL,               // FLIPADST_FLIPADST
 NULL,               // ADST_FLIPADST
 NULL,               // FLIPADST_ADST
 av1_idtx32_sse4_1,  // IDTX
 NULL,               // V_DCT
 NULL,               // H_DCT
 NULL,               // V_ADST
 NULL,               // H_ADST
 NULL,               // V_FLIPADST
 NULL                // H_FLIPADST
};
   2138 
// Row-pass 1-D 16-point transforms for 16x32 shapes, indexed by TX_TYPE.
// Only DCT_DCT and IDTX are supported; NULL marks unsupported types.
static const fwd_transform_1d_sse4_1 row_highbd_txfm8x32_arr[TX_TYPES] = {
 fdct16x16_sse4_1,  // DCT_DCT
 NULL,              // ADST_DCT
 NULL,              // DCT_ADST
 NULL,              // ADST_ADST
 NULL,              // FLIPADST_DCT
 NULL,              // DCT_FLIPADST
 NULL,              // FLIPADST_FLIPADST
 NULL,              // ADST_FLIPADST
 NULL,              // FLIPADST_ADST
 idtx16x16_sse4_1,  // IDTX
 NULL,              // V_DCT
 NULL,              // H_DCT
 NULL,              // V_ADST
 NULL,              // H_ADST
 NULL,              // V_FLIPADST
 NULL               // H_FLIPADST
};
   2157 
// 2-D forward transform of a high-bitdepth 16x8 block.
// The column pass runs on the two side-by-side 8x8 halves; after a
// transpose, one 16-point row pass runs over the whole buffer. Because
// 16x8 is rectangular, the result is rescaled by the NewSqrt2 fixed-point
// constant (av1_round_shift_rect_array_32_sse4_1) before being stored.
// `bd` is unused in this path.
void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff,
                               int stride, TX_TYPE tx_type, int bd) {
 __m128i in[32], out[32];
 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
 const int txw_idx = get_txw_idx(TX_16X8);
 const int txh_idx = get_txh_idx(TX_16X8);
 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type];
 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
 int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
 int ud_flip, lr_flip;
 get_flip_cfg(tx_type, &ud_flip, &lr_flip);

 // Column pass: one 8x8 half at a time (left half i == 0, right i == 1).
 for (int i = 0; i < 2; i++) {
   load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
   col_txfm(in, in, bit, 2);
   col_txfm_8x8_rounding(in, -shift[1]);
   transpose_8x8(in, out + i * 16);
 }

 // Row pass; a left/right flip is realized by reversing the transposed
 // buffer first (flip_buf_sse4_1), not inside the 1-D transform.
 if (lr_flip) {
   flip_buf_sse4_1(in, out, 32);
   row_txfm(in, out, bit, 2);
 } else {
   row_txfm(out, out, bit, 2);
 }

 // Rectangular rescale and store, one 8x8 quadrant's worth at a time.
 for (int i = 0; i < 2; i++) {
   av1_round_shift_rect_array_32_sse4_1(out + i * 16, in, 16, -shift[2],
                                        NewSqrt2);
   write_buffer_8x8(in, coeff + i * 64);
 }
 (void)bd;
}
   2191 
// 2-D forward transform of a high-bitdepth 8x16 block.
// One 16-point column pass over the whole buffer (flips handled at load
// time), then two 8-point row passes on the transposed halves, each
// followed by the NewSqrt2 rectangular rescale. `bd` is unused.
void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff,
                               int stride, TX_TYPE tx_type, int bd) {
 __m128i in[32], out[32];
 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
 const int txw_idx = get_txw_idx(TX_8X16);
 const int txh_idx = get_txh_idx(TX_8X16);
 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type];
 int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
 int ud_flip, lr_flip;
 get_flip_cfg(tx_type, &ud_flip, &lr_flip);

 load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
 col_txfm(in, in, bit, 2);
 col_txfm_8x16_rounding(in, -shift[1]);
 transpose_8x8(in, out);
 transpose_8x8(in + 16, out + 16);

 // Row pass per transposed 8x8 half; outputs interleave at 16-coefficient
 // row pitch via write_buffer_16x8.
 for (int i = 0; i < 2; i++) {
   row_txfm(out + i * 16, out, bit, 2);
   av1_round_shift_rect_array_32_sse4_1(out, out, 16, -shift[2], NewSqrt2);
   write_buffer_16x8(out, coeff + i * 8, 16);
 }
 (void)bd;
}
   2217 
#if !CONFIG_REALTIME_ONLY
// 2-D forward transform of a high-bitdepth 4x16 block.
// One 16-point column pass over the single 4-wide column group (stored via
// outcoeff128 as scratch), then 4-point row passes processed four rows at
// a time, stored straight with store_output_w4 — there is no rectangular
// rescale step in this path. `bd` is unused.
void av1_fwd_txfm2d_4x16_sse4_1(const int16_t *input, int32_t *coeff,
                               int stride, TX_TYPE tx_type, int bd) {
 __m128i in[16];
 __m128i *outcoeff128 = (__m128i *)coeff;
 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
 const int txw_idx = get_txw_idx(TX_4X16);
 const int txh_idx = get_txh_idx(TX_4X16);
 const int txfm_size_col = tx_size_wide[TX_4X16];
 const int txfm_size_row = tx_size_high[TX_4X16];
 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type];

 int ud_flip, lr_flip;
 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 // col transform
 load_buffer_4x16(input, in, stride, ud_flip, lr_flip, shift[0]);
 col_txfm(in, outcoeff128, bitcol, 1);
 col_txfm_8x8_rounding(outcoeff128, -shift[1]);
 transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);

 // row transform: four rows per iteration (txfm_size_row >> 2 columns of
 // packed vectors), written directly into coeff.
 for (int i = 0; i < 4; i++) {
   __m128i tmp[4];
   row_txfm(in + i, tmp, bitrow, txfm_size_row >> 2);
   store_output_w4(coeff + i * 4, tmp, txfm_size_row, txfm_size_col);
 }
 (void)bd;
}
#endif
   2250 
// Forward 2-D 16x4 transform, high bit-depth, SSE4.1 path.
// The 16:4 aspect ratio is a power of 4, so no rectangular sqrt(2)
// scaling step is needed; shift[] absorbs the full scale.
void av1_fwd_txfm2d_16x4_sse4_1(const int16_t *input, int32_t *coeff,
                                int stride, TX_TYPE tx_type, int bd) {
  __m128i in[16];
  __m128i *outcoeff128 = (__m128i *)coeff;  // coeff viewed as SSE registers
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
  const int txw_idx = get_txw_idx(TX_16X4);
  const int txh_idx = get_txh_idx(TX_16X4);
  const int txfm_size_col = tx_size_wide[TX_16X4];  // 16
  const int txfm_size_row = tx_size_high[TX_16X4];  // 4
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type];
  const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // col transform
  load_buffer_16x4(input, in, stride, ud_flip, lr_flip, shift[0]);

  // Each group of 4 columns gets an in-place 4-point column transform and
  // is transposed in place, so the row pass can read rows contiguously.
  for (int i = 0; i < (txfm_size_col >> 2); i++) {
    __m128i *cur_in = &in[i * txfm_size_row];
    col_txfm(cur_in, cur_in, bitcol, 1);
    transpose_32bit_4x4(cur_in, cur_in);
  }
  col_txfm_8x8_rounding(in, -shift[1]);

  // row transform: one 16-point transform across the 4 rows, written
  // straight to the coefficient buffer.
  row_txfm(in, outcoeff128, bitrow, 1);
  (void)bd;  // bit depth not needed by this implementation
}
   2281 
// Forward 2-D 16x32 transform, high bit-depth, SSE4.1 path.
void av1_fwd_txfm2d_16x32_sse4_1(const int16_t *input, int32_t *coeff,
                                 int stride, TX_TYPE tx_type, int bd) {
  __m128i in[128];
  __m128i *outcoef128 = (__m128i *)coeff;  // coeff viewed as SSE registers
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
  const int txw_idx = get_txw_idx(TX_16X32);
  const int txh_idx = get_txh_idx(TX_16X32);
  const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type];
  const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x32_arr[tx_type];
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];

  // column transform: load the block as two stacked 16x16 halves.
  load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
  load_buffer_16x16(input + 16 * stride, in + 64, stride, 0, 0, shift[0]);

  // Four interleaved passes; col_num = 4 means registers belonging to one
  // 1-D transform are spaced 4 apart in `in`.
  for (int i = 0; i < 4; i++) {
    col_txfm((in + i), (in + i), bitcol, 4);
  }
  col_txfm_16x16_rounding(&in[0], -shift[1]);
  col_txfm_16x16_rounding(&in[64], -shift[1]);
  transpose_8nx8n(in, outcoef128, 16, 32);

  // row transform, then sqrt(2) scaling for the rectangular (1:2) size.
  row_txfm(outcoef128, in, bitrow, 8);
  av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 128, -shift[2],
                                       NewSqrt2);
  (void)bd;  // bit depth not needed by this implementation
}
   2311 
// Forward 2-D 32x64 transform, high bit-depth, SSE4.1 path.
// tx_type is ignored: 64-point transforms support DCT only.
void av1_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *coeff,
                                 int stride, TX_TYPE tx_type, int bd) {
  (void)tx_type;
  __m128i in[512];
  __m128i *outcoef128 = (__m128i *)coeff;  // coeff viewed as SSE registers
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64];
  const int txw_idx = get_txw_idx(TX_32X64);
  const int txh_idx = get_txh_idx(TX_32X64);
  const int txfm_size_col = tx_size_wide[TX_32X64];  // 32
  const int txfm_size_row = tx_size_high[TX_32X64];  // 64
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int num_row = txfm_size_row >> 2;  // 16
  const int num_col = txfm_size_col >> 2;  // 8

  // column transform: 64-point DCT over each of the 32 columns,
  // processed as num_col interleaved register lanes.
  load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
  for (int i = 0; i < num_col; i++) {
    av1_fdct64_sse4_1((in + i), (in + i), bitcol, num_col, num_col);
  }
  for (int i = 0; i < num_col; i++) {
    col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]);
  }
  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);

  // row transform: 32-point DCT.
  for (int i = 0; i < num_row; i++) {
    av1_fdct32_sse4_1((outcoef128 + i), (in + i), bitrow, num_row);
  }
  // Rectangular sqrt(2) scaling. Only 8 of every 16 registers are stored:
  // the high-frequency half of the 64-point dimension is discarded, as AV1
  // retains at most 32x32 coefficients for 64-point transforms.
  for (int i = 0; i < txfm_size_col; i++) {
    av1_round_shift_rect_array_32_sse4_1(in + i * 16, outcoef128 + i * 8, 8,
                                         -shift[2], NewSqrt2);
  }
  (void)bd;  // bit depth not needed by this implementation
}
   2347 
// Forward 2-D 64x32 transform, high bit-depth, SSE4.1 path.
// tx_type is ignored: 64-point transforms support DCT only.
void av1_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *coeff,
                                 int stride, TX_TYPE tx_type, int bd) {
  (void)tx_type;
  __m128i in[512];
  __m128i *outcoef128 = (__m128i *)coeff;  // coeff viewed as SSE registers
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32];
  const int txw_idx = get_txw_idx(TX_64X32);
  const int txh_idx = get_txh_idx(TX_64X32);
  const int txfm_size_col = tx_size_wide[TX_64X32];  // 64
  const int txfm_size_row = tx_size_high[TX_64X32];  // 32
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int num_row = txfm_size_row >> 2;  // 8
  const int num_col = txfm_size_col >> 2;  // 16

  // column transform: gather each 64-wide input row from four 4x4 loads
  // (columns 0, 16, 32 and 48), 16 registers per row.
  for (int i = 0; i < 32; i++) {
    load_buffer_4x4(input + 0 + i * stride, in + 0 + i * 16, 4, 0, 0, shift[0]);
    load_buffer_4x4(input + 16 + i * stride, in + 4 + i * 16, 4, 0, 0,
                    shift[0]);
    load_buffer_4x4(input + 32 + i * stride, in + 8 + i * 16, 4, 0, 0,
                    shift[0]);
    load_buffer_4x4(input + 48 + i * stride, in + 12 + i * 16, 4, 0, 0,
                    shift[0]);
  }

  // 32-point column DCT, processed as num_col interleaved register lanes.
  for (int i = 0; i < num_col; i++) {
    av1_fdct32_sse4_1((in + i), (in + i), bitcol, num_col);
  }

  for (int i = 0; i < num_row; i++) {
    col_txfm_16x16_rounding((in + i * txfm_size_col), -shift[1]);
  }
  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);

  // row transform: 64-point DCT, then sqrt(2) scaling over the full
  // buffer for the rectangular (2:1) size.
  for (int i = 0; i < num_row; i++) {
    av1_fdct64_sse4_1((outcoef128 + i), (in + i), bitrow, num_row, num_row);
  }
  av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 512, -shift[2],
                                       NewSqrt2);
  (void)bd;  // bit depth not needed by this implementation
}
   2391 
// Forward 2-D 32x16 transform, high bit-depth, SSE4.1 path.
void av1_fwd_txfm2d_32x16_sse4_1(const int16_t *input, int32_t *coeff,
                                 int stride, TX_TYPE tx_type, int bd) {
  __m128i in[128];
  __m128i *outcoef128 = (__m128i *)coeff;  // coeff viewed as SSE registers
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
  const int txw_idx = get_txw_idx(TX_32X16);
  const int txh_idx = get_txh_idx(TX_32X16);
  // NOTE: the *_8x32 tables are named for the transposed (tall) case, so
  // the "row" table serves the 16-point column pass here and the "col"
  // table serves the 32-point row pass — intentional, not a typo.
  const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm8x32_arr[tx_type];
  const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type];
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];

  // column transform
  load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 16);
  col_txfm(in, in, bitcol, 8);
  col_txfm_16x16_rounding(&in[0], -shift[1]);
  col_txfm_16x16_rounding(&in[64], -shift[1]);
  transpose_8nx8n(in, outcoef128, 32, 16);

  // row transform: four interleaved passes (registers of one transform
  // spaced 4 apart), then sqrt(2) scaling for the rectangular (2:1) size.
  for (int i = 0; i < 4; i++) {
    row_txfm((outcoef128 + i), (in + i), bitrow, 4);
  }
  av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 128, -shift[2],
                                       NewSqrt2);
  (void)bd;  // bit depth not needed by this implementation
}
   2419 
   2420 #if !CONFIG_REALTIME_ONLY
// Forward 2-D 8x32 transform, high bit-depth, SSE4.1 path.
// The 8:32 aspect ratio is a power of 4, so no rectangular sqrt(2)
// scaling step is needed; shift[] absorbs the full scale.
void av1_fwd_txfm2d_8x32_sse4_1(const int16_t *input, int32_t *coeff,
                                int stride, TX_TYPE tx_type, int bd) {
  __m128i in[64];
  __m128i *outcoef128 = (__m128i *)coeff;  // coeff viewed as SSE registers
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
  const int txw_idx = get_txw_idx(TX_8X32);
  const int txh_idx = get_txh_idx(TX_8X32);
  const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type];
  const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm32x8_arr[tx_type];
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];

  const int txfm_size_col = tx_size_wide[TX_8X32];  // 8
  const int txfm_size_row = tx_size_high[TX_8X32];  // 32
  const int num_col = txfm_size_col >> 2;           // 2

  // column transform: load the 8x32 block as two stacked 8x16 halves.
  load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
  load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row,
                   stride, 0, 0, shift[0]);

  // 32-point column transform, as num_col interleaved register lanes.
  for (int i = 0; i < num_col; i++) {
    col_txfm((in + i), (in + i), bitcol, num_col);
  }
  col_txfm_16x16_rounding(in, -shift[1]);
  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);

  // row transform: 8-point transforms, two register lanes per call, with
  // registers of one transform spaced txfm_size_col apart.
  for (int i = 0; i < txfm_size_col; i += 2) {
    row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, txfm_size_col);
  }
  (void)bd;  // bit depth not needed by this implementation
}
   2454 
// Forward 2-D 32x8 transform, high bit-depth, SSE4.1 path (mirror of the
// 8x32 case). The 32:8 aspect ratio is a power of 4, so no rectangular
// sqrt(2) scaling step is needed; shift[] absorbs the full scale.
void av1_fwd_txfm2d_32x8_sse4_1(const int16_t *input, int32_t *coeff,
                                int stride, TX_TYPE tx_type, int bd) {
  __m128i in[64];
  __m128i *outcoef128 = (__m128i *)coeff;  // coeff viewed as SSE registers
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
  const int txw_idx = get_txw_idx(TX_32X8);
  const int txh_idx = get_txh_idx(TX_32X8);
  const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm32x8_arr[tx_type];
  const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type];
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];

  const int txfm_size_col = tx_size_wide[TX_32X8];  // 32
  const int txfm_size_row = tx_size_high[TX_32X8];  // 8
  // NOTE: named num_col after the mirrored 8x32 path; it is derived from
  // the row size and counts the interleaved lanes of the row pass (2).
  const int num_col = txfm_size_row >> 2;

  // column transform: 8-point transforms, two register lanes per call.
  load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8);
  for (int i = 0; i < txfm_size_row; i += 2) {
    col_txfm((in + i), (in + i), bitcol, txfm_size_row);
  }

  col_txfm_16x16_rounding(&in[0], -shift[1]);
  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);

  // row transform: 32-point transforms over num_col interleaved lanes.
  for (int i = 0; i < num_col; i++) {
    row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, num_col);
  }
  (void)bd;  // bit depth not needed by this implementation
}
   2486 #endif
   2487 
   2488 void av1_fwd_txfm2d_4x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
   2489                               TX_TYPE tx_type, int bd) {
   2490  __m128i in[8];
   2491  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
   2492  const int txw_idx = get_txw_idx(TX_4X8);
   2493  const int txh_idx = get_txh_idx(TX_4X8);
   2494  const int txfm_size_col = tx_size_wide[TX_4X8];
   2495  const int txfm_size_row = tx_size_high[TX_4X8];
   2496  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   2497  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   2498  const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x8_arr[tx_type];
   2499  const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type];
   2500 
   2501  int ud_flip, lr_flip;
   2502  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2503 
   2504  load_buffer_4x8(input, in, stride, ud_flip, lr_flip, shift[0]);
   2505  col_txfm(in, in, bitcol, 1);
   2506  col_txfm_4x8_rounding(in, -shift[1]);
   2507 
   2508  for (int i = 0; i < 2; i++) {
   2509    __m128i *cur_in = &in[i * 4];
   2510    transpose_32bit_4x4(cur_in, cur_in);
   2511    row_txfm(cur_in, cur_in, bitrow, 1);
   2512    av1_round_shift_rect_array_32_sse4_1(cur_in, cur_in, txfm_size_col,
   2513                                         -shift[2], NewSqrt2);
   2514    store_output_w4(coeff + i * 4, cur_in, txfm_size_row, 4);
   2515  }
   2516  (void)bd;
   2517 }
   2518 
   2519 void av1_fwd_txfm2d_8x4_sse4_1(const int16_t *input, int32_t *coeff, int stride,
   2520                               TX_TYPE tx_type, int bd) {
   2521  __m128i in[8];
   2522  __m128i *outcoeff128 = (__m128i *)coeff;
   2523  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
   2524  const int txw_idx = get_txw_idx(TX_8X4);
   2525  const int txh_idx = get_txh_idx(TX_8X4);
   2526  const int txfm_size_col = tx_size_wide[TX_8X4];
   2527  const int txfm_size_row = tx_size_high[TX_8X4];
   2528  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   2529  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   2530  const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type];
   2531  const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x8_arr[tx_type];
   2532  int ud_flip, lr_flip;
   2533  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   2534  // col tranform
   2535  load_buffer_8x4(input, in, stride, ud_flip, lr_flip, shift[0]);
   2536  for (int i = 0; i < 2; i++) {
   2537    __m128i *cur_in = &in[i * txfm_size_row];
   2538    col_txfm(cur_in, cur_in, bitcol, 1);
   2539    transpose_32bit_4x4(cur_in, cur_in);
   2540  }
   2541  col_txfm_4x8_rounding(in, -shift[1]);
   2542 
   2543  // row tranform
   2544  row_txfm(in, outcoeff128, bitrow, 1);
   2545  av1_round_shift_rect_array_32_sse4_1(outcoeff128, outcoeff128, txfm_size_col,
   2546                                       -shift[2], NewSqrt2);
   2547  (void)bd;
   2548 }
   2549 
   2550 #if !CONFIG_REALTIME_ONLY
// Forward 2-D 16x64 transform, high bit-depth, SSE4.1 path.
void av1_fwd_txfm2d_16x64_sse4_1(const int16_t *input, int32_t *coeff,
                                 int stride, TX_TYPE tx_type, int bd) {
  __m128i in[256];
  __m128i *outcoeff128 = (__m128i *)coeff;  // coeff viewed as SSE registers
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64];
  const int txw_idx = get_txw_idx(TX_16X64);
  const int txh_idx = get_txh_idx(TX_16X64);
  const int txfm_size_col = tx_size_wide[TX_16X64];  // 16
  const int txfm_size_row = tx_size_high[TX_16X64];  // 64
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int num_col = txfm_size_col >> 2;  // 4
  // col transform: load the residual in 4x4 tiles, four rows per step.
  for (int i = 0; i < txfm_size_row; i += num_col) {
    load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col,
                    ud_flip, lr_flip, shift[0]);
  }

  // 64-point column DCT over num_col interleaved register lanes.
  for (int i = 0; i < num_col; i++) {
    av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitcol, num_col, num_col);
  }

  col_txfm_16x16_rounding(outcoeff128, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);

  // Only the first 32 of the 64 column-transform outputs are transposed
  // and row-transformed: AV1 discards the high-frequency half for
  // 64-point transforms, so the result is effectively 16x32.
  transpose_8nx8n(outcoeff128, in, txfm_size_col, 32);
  fdct16x16_sse4_1(in, outcoeff128, bitrow, 8);
  (void)bd;  // bit depth not needed by this implementation
}
   2590 
// Forward 2-D 64x16 transform, high bit-depth, SSE4.1 path.
void av1_fwd_txfm2d_64x16_sse4_1(const int16_t *input, int32_t *coeff,
                                 int stride, TX_TYPE tx_type, int bd) {
  __m128i in[256];
  __m128i *outcoeff128 = (__m128i *)coeff;  // coeff viewed as SSE registers
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16];
  const int txw_idx = get_txw_idx(TX_64X16);
  const int txh_idx = get_txh_idx(TX_64X16);
  const int txfm_size_col = tx_size_wide[TX_64X16];  // 64
  const int txfm_size_row = tx_size_high[TX_64X16];  // 16
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  // col transform: gather each 64-wide input row from four 4x4 loads
  // (columns 0, 16, 32 and 48).
  for (int i = 0; i < txfm_size_row; i++) {
    load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4,
                    ud_flip, lr_flip, shift[0]);
  }

  // 16-point column DCT over all 64 columns.
  fdct16x16_sse4_1(in, outcoeff128, bitcol, txfm_size_row);
  col_txfm_16x16_rounding(outcoeff128, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);

  // row transform: 64-point DCT over 4 interleaved register lanes.
  transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
  for (int i = 0; i < 4; i++) {
    av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitrow, 4, 4);
  }
  // Zero the second half of the coefficient buffer: AV1 retains only the
  // low-frequency 32 of the 64 row-transform outputs, so the block is
  // effectively 32x16.
  memset(coeff + txfm_size_row * 32, 0, txfm_size_row * 32 * sizeof(*coeff));
  (void)bd;  // bit depth not needed by this implementation
}
   2629 #endif