tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

av1_fwd_txfm1d.c (63521B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <stdlib.h>
     13 #include "av1/encoder/av1_fwd_txfm1d.h"
     14 #include "av1/common/av1_txfm.h"
     15 
     16 void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit,
     17               const int8_t *stage_range) {
     18  const int32_t size = 4;
     19  const int32_t *cospi;
     20 
     21  int32_t stage = 0;
     22  int32_t *bf0, *bf1;
     23  int32_t step[4];
     24 
     25  // stage 0;
     26  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
     27 
     28  // stage 1;
     29  stage++;
     30  bf1 = output;
     31  bf1[0] = input[0] + input[3];
     32  bf1[1] = input[1] + input[2];
     33  bf1[2] = -input[2] + input[1];
     34  bf1[3] = -input[3] + input[0];
     35  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
     36 
     37  // stage 2
     38  stage++;
     39  cospi = cospi_arr(cos_bit);
     40  bf0 = output;
     41  bf1 = step;
     42  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
     43  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
     44  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
     45  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
     46  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
     47 
     48  // stage 3
     49  stage++;
     50  bf0 = step;
     51  bf1 = output;
     52  bf1[0] = bf0[0];
     53  bf1[1] = bf0[2];
     54  bf1[2] = bf0[1];
     55  bf1[3] = bf0[3];
     56  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
     57 }
     58 
     59 void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit,
     60               const int8_t *stage_range) {
     61  const int32_t size = 8;
     62  const int32_t *cospi;
     63 
     64  int32_t stage = 0;
     65  int32_t *bf0, *bf1;
     66  int32_t step[8];
     67 
     68  // stage 0;
     69  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
     70 
     71  // stage 1;
     72  stage++;
     73  bf1 = output;
     74  bf1[0] = input[0] + input[7];
     75  bf1[1] = input[1] + input[6];
     76  bf1[2] = input[2] + input[5];
     77  bf1[3] = input[3] + input[4];
     78  bf1[4] = -input[4] + input[3];
     79  bf1[5] = -input[5] + input[2];
     80  bf1[6] = -input[6] + input[1];
     81  bf1[7] = -input[7] + input[0];
     82  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
     83 
     84  // stage 2
     85  stage++;
     86  cospi = cospi_arr(cos_bit);
     87  bf0 = output;
     88  bf1 = step;
     89  bf1[0] = bf0[0] + bf0[3];
     90  bf1[1] = bf0[1] + bf0[2];
     91  bf1[2] = -bf0[2] + bf0[1];
     92  bf1[3] = -bf0[3] + bf0[0];
     93  bf1[4] = bf0[4];
     94  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
     95  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
     96  bf1[7] = bf0[7];
     97  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
     98 
     99  // stage 3
    100  stage++;
    101  cospi = cospi_arr(cos_bit);
    102  bf0 = step;
    103  bf1 = output;
    104  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    105  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
    106  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
    107  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
    108  bf1[4] = bf0[4] + bf0[5];
    109  bf1[5] = -bf0[5] + bf0[4];
    110  bf1[6] = -bf0[6] + bf0[7];
    111  bf1[7] = bf0[7] + bf0[6];
    112  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    113 
    114  // stage 4
    115  stage++;
    116  cospi = cospi_arr(cos_bit);
    117  bf0 = output;
    118  bf1 = step;
    119  bf1[0] = bf0[0];
    120  bf1[1] = bf0[1];
    121  bf1[2] = bf0[2];
    122  bf1[3] = bf0[3];
    123  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    124  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
    125  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
    126  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
    127  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    128 
    129  // stage 5
    130  stage++;
    131  bf0 = step;
    132  bf1 = output;
    133  bf1[0] = bf0[0];
    134  bf1[1] = bf0[4];
    135  bf1[2] = bf0[2];
    136  bf1[3] = bf0[6];
    137  bf1[4] = bf0[1];
    138  bf1[5] = bf0[5];
    139  bf1[6] = bf0[3];
    140  bf1[7] = bf0[7];
    141  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    142 }
    143 
    144 void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit,
    145                const int8_t *stage_range) {
    146  const int32_t size = 16;
    147  const int32_t *cospi;
    148 
    149  int32_t stage = 0;
    150  int32_t *bf0, *bf1;
    151  int32_t step[16];
    152 
    153  // stage 0;
    154  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
    155 
    156  // stage 1;
    157  stage++;
    158  bf1 = output;
    159  bf1[0] = input[0] + input[15];
    160  bf1[1] = input[1] + input[14];
    161  bf1[2] = input[2] + input[13];
    162  bf1[3] = input[3] + input[12];
    163  bf1[4] = input[4] + input[11];
    164  bf1[5] = input[5] + input[10];
    165  bf1[6] = input[6] + input[9];
    166  bf1[7] = input[7] + input[8];
    167  bf1[8] = -input[8] + input[7];
    168  bf1[9] = -input[9] + input[6];
    169  bf1[10] = -input[10] + input[5];
    170  bf1[11] = -input[11] + input[4];
    171  bf1[12] = -input[12] + input[3];
    172  bf1[13] = -input[13] + input[2];
    173  bf1[14] = -input[14] + input[1];
    174  bf1[15] = -input[15] + input[0];
    175  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    176 
    177  // stage 2
    178  stage++;
    179  cospi = cospi_arr(cos_bit);
    180  bf0 = output;
    181  bf1 = step;
    182  bf1[0] = bf0[0] + bf0[7];
    183  bf1[1] = bf0[1] + bf0[6];
    184  bf1[2] = bf0[2] + bf0[5];
    185  bf1[3] = bf0[3] + bf0[4];
    186  bf1[4] = -bf0[4] + bf0[3];
    187  bf1[5] = -bf0[5] + bf0[2];
    188  bf1[6] = -bf0[6] + bf0[1];
    189  bf1[7] = -bf0[7] + bf0[0];
    190  bf1[8] = bf0[8];
    191  bf1[9] = bf0[9];
    192  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    193  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    194  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
    195  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
    196  bf1[14] = bf0[14];
    197  bf1[15] = bf0[15];
    198  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    199 
    200  // stage 3
    201  stage++;
    202  cospi = cospi_arr(cos_bit);
    203  bf0 = step;
    204  bf1 = output;
    205  bf1[0] = bf0[0] + bf0[3];
    206  bf1[1] = bf0[1] + bf0[2];
    207  bf1[2] = -bf0[2] + bf0[1];
    208  bf1[3] = -bf0[3] + bf0[0];
    209  bf1[4] = bf0[4];
    210  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    211  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    212  bf1[7] = bf0[7];
    213  bf1[8] = bf0[8] + bf0[11];
    214  bf1[9] = bf0[9] + bf0[10];
    215  bf1[10] = -bf0[10] + bf0[9];
    216  bf1[11] = -bf0[11] + bf0[8];
    217  bf1[12] = -bf0[12] + bf0[15];
    218  bf1[13] = -bf0[13] + bf0[14];
    219  bf1[14] = bf0[14] + bf0[13];
    220  bf1[15] = bf0[15] + bf0[12];
    221  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    222 
    223  // stage 4
    224  stage++;
    225  cospi = cospi_arr(cos_bit);
    226  bf0 = output;
    227  bf1 = step;
    228  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    229  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
    230  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
    231  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
    232  bf1[4] = bf0[4] + bf0[5];
    233  bf1[5] = -bf0[5] + bf0[4];
    234  bf1[6] = -bf0[6] + bf0[7];
    235  bf1[7] = bf0[7] + bf0[6];
    236  bf1[8] = bf0[8];
    237  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    238  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    239  bf1[11] = bf0[11];
    240  bf1[12] = bf0[12];
    241  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
    242  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
    243  bf1[15] = bf0[15];
    244  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    245 
    246  // stage 5
    247  stage++;
    248  cospi = cospi_arr(cos_bit);
    249  bf0 = step;
    250  bf1 = output;
    251  bf1[0] = bf0[0];
    252  bf1[1] = bf0[1];
    253  bf1[2] = bf0[2];
    254  bf1[3] = bf0[3];
    255  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    256  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
    257  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
    258  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
    259  bf1[8] = bf0[8] + bf0[9];
    260  bf1[9] = -bf0[9] + bf0[8];
    261  bf1[10] = -bf0[10] + bf0[11];
    262  bf1[11] = bf0[11] + bf0[10];
    263  bf1[12] = bf0[12] + bf0[13];
    264  bf1[13] = -bf0[13] + bf0[12];
    265  bf1[14] = -bf0[14] + bf0[15];
    266  bf1[15] = bf0[15] + bf0[14];
    267  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    268 
    269  // stage 6
    270  stage++;
    271  cospi = cospi_arr(cos_bit);
    272  bf0 = output;
    273  bf1 = step;
    274  bf1[0] = bf0[0];
    275  bf1[1] = bf0[1];
    276  bf1[2] = bf0[2];
    277  bf1[3] = bf0[3];
    278  bf1[4] = bf0[4];
    279  bf1[5] = bf0[5];
    280  bf1[6] = bf0[6];
    281  bf1[7] = bf0[7];
    282  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
    283  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
    284  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
    285  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
    286  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
    287  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
    288  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
    289  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
    290  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    291 
    292  // stage 7
    293  stage++;
    294  bf0 = step;
    295  bf1 = output;
    296  bf1[0] = bf0[0];
    297  bf1[1] = bf0[8];
    298  bf1[2] = bf0[4];
    299  bf1[3] = bf0[12];
    300  bf1[4] = bf0[2];
    301  bf1[5] = bf0[10];
    302  bf1[6] = bf0[6];
    303  bf1[7] = bf0[14];
    304  bf1[8] = bf0[1];
    305  bf1[9] = bf0[9];
    306  bf1[10] = bf0[5];
    307  bf1[11] = bf0[13];
    308  bf1[12] = bf0[3];
    309  bf1[13] = bf0[11];
    310  bf1[14] = bf0[7];
    311  bf1[15] = bf0[15];
    312  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    313 }
    314 
    315 void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit,
    316                const int8_t *stage_range) {
    317  const int32_t size = 32;
    318  const int32_t *cospi;
    319 
    320  int32_t stage = 0;
    321  int32_t *bf0, *bf1;
    322  int32_t step[32];
    323 
    324  // stage 0;
    325  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
    326 
    327  // stage 1;
    328  stage++;
    329  bf1 = output;
    330  bf1[0] = input[0] + input[31];
    331  bf1[1] = input[1] + input[30];
    332  bf1[2] = input[2] + input[29];
    333  bf1[3] = input[3] + input[28];
    334  bf1[4] = input[4] + input[27];
    335  bf1[5] = input[5] + input[26];
    336  bf1[6] = input[6] + input[25];
    337  bf1[7] = input[7] + input[24];
    338  bf1[8] = input[8] + input[23];
    339  bf1[9] = input[9] + input[22];
    340  bf1[10] = input[10] + input[21];
    341  bf1[11] = input[11] + input[20];
    342  bf1[12] = input[12] + input[19];
    343  bf1[13] = input[13] + input[18];
    344  bf1[14] = input[14] + input[17];
    345  bf1[15] = input[15] + input[16];
    346  bf1[16] = -input[16] + input[15];
    347  bf1[17] = -input[17] + input[14];
    348  bf1[18] = -input[18] + input[13];
    349  bf1[19] = -input[19] + input[12];
    350  bf1[20] = -input[20] + input[11];
    351  bf1[21] = -input[21] + input[10];
    352  bf1[22] = -input[22] + input[9];
    353  bf1[23] = -input[23] + input[8];
    354  bf1[24] = -input[24] + input[7];
    355  bf1[25] = -input[25] + input[6];
    356  bf1[26] = -input[26] + input[5];
    357  bf1[27] = -input[27] + input[4];
    358  bf1[28] = -input[28] + input[3];
    359  bf1[29] = -input[29] + input[2];
    360  bf1[30] = -input[30] + input[1];
    361  bf1[31] = -input[31] + input[0];
    362  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    363 
    364  // stage 2
    365  stage++;
    366  cospi = cospi_arr(cos_bit);
    367  bf0 = output;
    368  bf1 = step;
    369  bf1[0] = bf0[0] + bf0[15];
    370  bf1[1] = bf0[1] + bf0[14];
    371  bf1[2] = bf0[2] + bf0[13];
    372  bf1[3] = bf0[3] + bf0[12];
    373  bf1[4] = bf0[4] + bf0[11];
    374  bf1[5] = bf0[5] + bf0[10];
    375  bf1[6] = bf0[6] + bf0[9];
    376  bf1[7] = bf0[7] + bf0[8];
    377  bf1[8] = -bf0[8] + bf0[7];
    378  bf1[9] = -bf0[9] + bf0[6];
    379  bf1[10] = -bf0[10] + bf0[5];
    380  bf1[11] = -bf0[11] + bf0[4];
    381  bf1[12] = -bf0[12] + bf0[3];
    382  bf1[13] = -bf0[13] + bf0[2];
    383  bf1[14] = -bf0[14] + bf0[1];
    384  bf1[15] = -bf0[15] + bf0[0];
    385  bf1[16] = bf0[16];
    386  bf1[17] = bf0[17];
    387  bf1[18] = bf0[18];
    388  bf1[19] = bf0[19];
    389  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
    390  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
    391  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
    392  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
    393  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
    394  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
    395  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
    396  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
    397  bf1[28] = bf0[28];
    398  bf1[29] = bf0[29];
    399  bf1[30] = bf0[30];
    400  bf1[31] = bf0[31];
    401  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    402 
    403  // stage 3
    404  stage++;
    405  cospi = cospi_arr(cos_bit);
    406  bf0 = step;
    407  bf1 = output;
    408  bf1[0] = bf0[0] + bf0[7];
    409  bf1[1] = bf0[1] + bf0[6];
    410  bf1[2] = bf0[2] + bf0[5];
    411  bf1[3] = bf0[3] + bf0[4];
    412  bf1[4] = -bf0[4] + bf0[3];
    413  bf1[5] = -bf0[5] + bf0[2];
    414  bf1[6] = -bf0[6] + bf0[1];
    415  bf1[7] = -bf0[7] + bf0[0];
    416  bf1[8] = bf0[8];
    417  bf1[9] = bf0[9];
    418  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    419  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    420  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
    421  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
    422  bf1[14] = bf0[14];
    423  bf1[15] = bf0[15];
    424  bf1[16] = bf0[16] + bf0[23];
    425  bf1[17] = bf0[17] + bf0[22];
    426  bf1[18] = bf0[18] + bf0[21];
    427  bf1[19] = bf0[19] + bf0[20];
    428  bf1[20] = -bf0[20] + bf0[19];
    429  bf1[21] = -bf0[21] + bf0[18];
    430  bf1[22] = -bf0[22] + bf0[17];
    431  bf1[23] = -bf0[23] + bf0[16];
    432  bf1[24] = -bf0[24] + bf0[31];
    433  bf1[25] = -bf0[25] + bf0[30];
    434  bf1[26] = -bf0[26] + bf0[29];
    435  bf1[27] = -bf0[27] + bf0[28];
    436  bf1[28] = bf0[28] + bf0[27];
    437  bf1[29] = bf0[29] + bf0[26];
    438  bf1[30] = bf0[30] + bf0[25];
    439  bf1[31] = bf0[31] + bf0[24];
    440  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    441 
    442  // stage 4
    443  stage++;
    444  cospi = cospi_arr(cos_bit);
    445  bf0 = output;
    446  bf1 = step;
    447  bf1[0] = bf0[0] + bf0[3];
    448  bf1[1] = bf0[1] + bf0[2];
    449  bf1[2] = -bf0[2] + bf0[1];
    450  bf1[3] = -bf0[3] + bf0[0];
    451  bf1[4] = bf0[4];
    452  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    453  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    454  bf1[7] = bf0[7];
    455  bf1[8] = bf0[8] + bf0[11];
    456  bf1[9] = bf0[9] + bf0[10];
    457  bf1[10] = -bf0[10] + bf0[9];
    458  bf1[11] = -bf0[11] + bf0[8];
    459  bf1[12] = -bf0[12] + bf0[15];
    460  bf1[13] = -bf0[13] + bf0[14];
    461  bf1[14] = bf0[14] + bf0[13];
    462  bf1[15] = bf0[15] + bf0[12];
    463  bf1[16] = bf0[16];
    464  bf1[17] = bf0[17];
    465  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
    466  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
    467  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
    468  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
    469  bf1[22] = bf0[22];
    470  bf1[23] = bf0[23];
    471  bf1[24] = bf0[24];
    472  bf1[25] = bf0[25];
    473  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
    474  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
    475  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
    476  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
    477  bf1[30] = bf0[30];
    478  bf1[31] = bf0[31];
    479  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    480 
    481  // stage 5
    482  stage++;
    483  cospi = cospi_arr(cos_bit);
    484  bf0 = step;
    485  bf1 = output;
    486  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    487  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
    488  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
    489  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
    490  bf1[4] = bf0[4] + bf0[5];
    491  bf1[5] = -bf0[5] + bf0[4];
    492  bf1[6] = -bf0[6] + bf0[7];
    493  bf1[7] = bf0[7] + bf0[6];
    494  bf1[8] = bf0[8];
    495  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    496  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    497  bf1[11] = bf0[11];
    498  bf1[12] = bf0[12];
    499  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
    500  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
    501  bf1[15] = bf0[15];
    502  bf1[16] = bf0[16] + bf0[19];
    503  bf1[17] = bf0[17] + bf0[18];
    504  bf1[18] = -bf0[18] + bf0[17];
    505  bf1[19] = -bf0[19] + bf0[16];
    506  bf1[20] = -bf0[20] + bf0[23];
    507  bf1[21] = -bf0[21] + bf0[22];
    508  bf1[22] = bf0[22] + bf0[21];
    509  bf1[23] = bf0[23] + bf0[20];
    510  bf1[24] = bf0[24] + bf0[27];
    511  bf1[25] = bf0[25] + bf0[26];
    512  bf1[26] = -bf0[26] + bf0[25];
    513  bf1[27] = -bf0[27] + bf0[24];
    514  bf1[28] = -bf0[28] + bf0[31];
    515  bf1[29] = -bf0[29] + bf0[30];
    516  bf1[30] = bf0[30] + bf0[29];
    517  bf1[31] = bf0[31] + bf0[28];
    518  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    519 
    520  // stage 6
    521  stage++;
    522  cospi = cospi_arr(cos_bit);
    523  bf0 = output;
    524  bf1 = step;
    525  bf1[0] = bf0[0];
    526  bf1[1] = bf0[1];
    527  bf1[2] = bf0[2];
    528  bf1[3] = bf0[3];
    529  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    530  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
    531  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
    532  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
    533  bf1[8] = bf0[8] + bf0[9];
    534  bf1[9] = -bf0[9] + bf0[8];
    535  bf1[10] = -bf0[10] + bf0[11];
    536  bf1[11] = bf0[11] + bf0[10];
    537  bf1[12] = bf0[12] + bf0[13];
    538  bf1[13] = -bf0[13] + bf0[12];
    539  bf1[14] = -bf0[14] + bf0[15];
    540  bf1[15] = bf0[15] + bf0[14];
    541  bf1[16] = bf0[16];
    542  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
    543  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
    544  bf1[19] = bf0[19];
    545  bf1[20] = bf0[20];
    546  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
    547  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
    548  bf1[23] = bf0[23];
    549  bf1[24] = bf0[24];
    550  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
    551  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
    552  bf1[27] = bf0[27];
    553  bf1[28] = bf0[28];
    554  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
    555  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
    556  bf1[31] = bf0[31];
    557  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    558 
    559  // stage 7
    560  stage++;
    561  cospi = cospi_arr(cos_bit);
    562  bf0 = step;
    563  bf1 = output;
    564  bf1[0] = bf0[0];
    565  bf1[1] = bf0[1];
    566  bf1[2] = bf0[2];
    567  bf1[3] = bf0[3];
    568  bf1[4] = bf0[4];
    569  bf1[5] = bf0[5];
    570  bf1[6] = bf0[6];
    571  bf1[7] = bf0[7];
    572  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
    573  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
    574  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
    575  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
    576  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
    577  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
    578  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
    579  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
    580  bf1[16] = bf0[16] + bf0[17];
    581  bf1[17] = -bf0[17] + bf0[16];
    582  bf1[18] = -bf0[18] + bf0[19];
    583  bf1[19] = bf0[19] + bf0[18];
    584  bf1[20] = bf0[20] + bf0[21];
    585  bf1[21] = -bf0[21] + bf0[20];
    586  bf1[22] = -bf0[22] + bf0[23];
    587  bf1[23] = bf0[23] + bf0[22];
    588  bf1[24] = bf0[24] + bf0[25];
    589  bf1[25] = -bf0[25] + bf0[24];
    590  bf1[26] = -bf0[26] + bf0[27];
    591  bf1[27] = bf0[27] + bf0[26];
    592  bf1[28] = bf0[28] + bf0[29];
    593  bf1[29] = -bf0[29] + bf0[28];
    594  bf1[30] = -bf0[30] + bf0[31];
    595  bf1[31] = bf0[31] + bf0[30];
    596  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    597 
    598  // stage 8
    599  stage++;
    600  cospi = cospi_arr(cos_bit);
    601  bf0 = output;
    602  bf1 = step;
    603  bf1[0] = bf0[0];
    604  bf1[1] = bf0[1];
    605  bf1[2] = bf0[2];
    606  bf1[3] = bf0[3];
    607  bf1[4] = bf0[4];
    608  bf1[5] = bf0[5];
    609  bf1[6] = bf0[6];
    610  bf1[7] = bf0[7];
    611  bf1[8] = bf0[8];
    612  bf1[9] = bf0[9];
    613  bf1[10] = bf0[10];
    614  bf1[11] = bf0[11];
    615  bf1[12] = bf0[12];
    616  bf1[13] = bf0[13];
    617  bf1[14] = bf0[14];
    618  bf1[15] = bf0[15];
    619  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
    620  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
    621  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
    622  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
    623  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
    624  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
    625  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
    626  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
    627  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
    628  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
    629  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
    630  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
    631  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
    632  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
    633  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
    634  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
    635  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    636 
    637  // stage 9
    638  stage++;
    639  bf0 = step;
    640  bf1 = output;
    641  bf1[0] = bf0[0];
    642  bf1[1] = bf0[16];
    643  bf1[2] = bf0[8];
    644  bf1[3] = bf0[24];
    645  bf1[4] = bf0[4];
    646  bf1[5] = bf0[20];
    647  bf1[6] = bf0[12];
    648  bf1[7] = bf0[28];
    649  bf1[8] = bf0[2];
    650  bf1[9] = bf0[18];
    651  bf1[10] = bf0[10];
    652  bf1[11] = bf0[26];
    653  bf1[12] = bf0[6];
    654  bf1[13] = bf0[22];
    655  bf1[14] = bf0[14];
    656  bf1[15] = bf0[30];
    657  bf1[16] = bf0[1];
    658  bf1[17] = bf0[17];
    659  bf1[18] = bf0[9];
    660  bf1[19] = bf0[25];
    661  bf1[20] = bf0[5];
    662  bf1[21] = bf0[21];
    663  bf1[22] = bf0[13];
    664  bf1[23] = bf0[29];
    665  bf1[24] = bf0[3];
    666  bf1[25] = bf0[19];
    667  bf1[26] = bf0[11];
    668  bf1[27] = bf0[27];
    669  bf1[28] = bf0[7];
    670  bf1[29] = bf0[23];
    671  bf1[30] = bf0[15];
    672  bf1[31] = bf0[31];
    673  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    674 }
    675 
    676 void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
    677                const int8_t *stage_range) {
    678  int bit = cos_bit;
    679  const int32_t *sinpi = sinpi_arr(bit);
    680  int32_t x0, x1, x2, x3;
    681  int32_t s0, s1, s2, s3, s4, s5, s6, s7;
    682 
    683  // stage 0
    684  av1_range_check_buf(0, input, input, 4, stage_range[0]);
    685  x0 = input[0];
    686  x1 = input[1];
    687  x2 = input[2];
    688  x3 = input[3];
    689 
    690  if (!(x0 | x1 | x2 | x3)) {
    691    output[0] = output[1] = output[2] = output[3] = 0;
    692    return;
    693  }
    694 
    695  // stage 1
    696  s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]);
    697  s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]);
    698  s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]);
    699  s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]);
    700  s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]);
    701  s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]);
    702  s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]);
    703  s7 = range_check_value(x0 + x1, stage_range[1]);
    704 
    705  // stage 2
    706  s7 = range_check_value(s7 - x3, stage_range[2]);
    707 
    708  // stage 3
    709  x0 = range_check_value(s0 + s2, bit + stage_range[3]);
    710  x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]);
    711  x2 = range_check_value(s1 - s3, bit + stage_range[3]);
    712  x3 = range_check_value(s4, bit + stage_range[3]);
    713 
    714  // stage 4
    715  x0 = range_check_value(x0 + s5, bit + stage_range[4]);
    716  x2 = range_check_value(x2 + s6, bit + stage_range[4]);
    717 
    718  // stage 5
    719  s0 = range_check_value(x0 + x3, bit + stage_range[5]);
    720  s1 = range_check_value(x1, bit + stage_range[5]);
    721  s2 = range_check_value(x2 - x3, bit + stage_range[5]);
    722  s3 = range_check_value(x2 - x0, bit + stage_range[5]);
    723 
    724  // stage 6
    725  s3 = range_check_value(s3 + x3, bit + stage_range[6]);
    726 
    727  // 1-D transform scaling factor is sqrt(2).
    728  output[0] = round_shift(s0, bit);
    729  output[1] = round_shift(s1, bit);
    730  output[2] = round_shift(s2, bit);
    731  output[3] = round_shift(s3, bit);
    732  av1_range_check_buf(6, input, output, 4, stage_range[6]);
    733 }
    734 
    735 void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
    736                const int8_t *stage_range) {
    737  const int32_t size = 8;
    738  const int32_t *cospi;
    739 
    740  int32_t stage = 0;
    741  int32_t *bf0, *bf1;
    742  int32_t step[8];
    743 
    744  // stage 0;
    745  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
    746 
    747  // stage 1;
    748  stage++;
    749  assert(output != input);
    750  bf1 = output;
    751  bf1[0] = input[0];
    752  bf1[1] = -input[7];
    753  bf1[2] = -input[3];
    754  bf1[3] = input[4];
    755  bf1[4] = -input[1];
    756  bf1[5] = input[6];
    757  bf1[6] = input[2];
    758  bf1[7] = -input[5];
    759  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    760 
    761  // stage 2
    762  stage++;
    763  cospi = cospi_arr(cos_bit);
    764  bf0 = output;
    765  bf1 = step;
    766  bf1[0] = bf0[0];
    767  bf1[1] = bf0[1];
    768  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    769  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
    770  bf1[4] = bf0[4];
    771  bf1[5] = bf0[5];
    772  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    773  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
    774  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    775 
    776  // stage 3
    777  stage++;
    778  bf0 = step;
    779  bf1 = output;
    780  bf1[0] = bf0[0] + bf0[2];
    781  bf1[1] = bf0[1] + bf0[3];
    782  bf1[2] = bf0[0] - bf0[2];
    783  bf1[3] = bf0[1] - bf0[3];
    784  bf1[4] = bf0[4] + bf0[6];
    785  bf1[5] = bf0[5] + bf0[7];
    786  bf1[6] = bf0[4] - bf0[6];
    787  bf1[7] = bf0[5] - bf0[7];
    788  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    789 
    790  // stage 4
    791  stage++;
    792  cospi = cospi_arr(cos_bit);
    793  bf0 = output;
    794  bf1 = step;
    795  bf1[0] = bf0[0];
    796  bf1[1] = bf0[1];
    797  bf1[2] = bf0[2];
    798  bf1[3] = bf0[3];
    799  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    800  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
    801  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    802  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
    803  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    804 
    805  // stage 5
    806  stage++;
    807  bf0 = step;
    808  bf1 = output;
    809  bf1[0] = bf0[0] + bf0[4];
    810  bf1[1] = bf0[1] + bf0[5];
    811  bf1[2] = bf0[2] + bf0[6];
    812  bf1[3] = bf0[3] + bf0[7];
    813  bf1[4] = bf0[0] - bf0[4];
    814  bf1[5] = bf0[1] - bf0[5];
    815  bf1[6] = bf0[2] - bf0[6];
    816  bf1[7] = bf0[3] - bf0[7];
    817  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    818 
    819  // stage 6
    820  stage++;
    821  cospi = cospi_arr(cos_bit);
    822  bf0 = output;
    823  bf1 = step;
    824  bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
    825  bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
    826  bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
    827  bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
    828  bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
    829  bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
    830  bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
    831  bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
    832  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    833 
    834  // stage 7
    835  stage++;
    836  bf0 = step;
    837  bf1 = output;
    838  bf1[0] = bf0[1];
    839  bf1[1] = bf0[6];
    840  bf1[2] = bf0[3];
    841  bf1[3] = bf0[4];
    842  bf1[4] = bf0[5];
    843  bf1[5] = bf0[2];
    844  bf1[6] = bf0[7];
    845  bf1[7] = bf0[0];
    846  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    847 }
    848 
    849 void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
    850                 const int8_t *stage_range) {
    851  const int32_t size = 16;
    852  const int32_t *cospi;
    853 
    854  int32_t stage = 0;
    855  int32_t *bf0, *bf1;
    856  int32_t step[16];
    857 
    858  // stage 0;
    859  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
    860 
    861  // stage 1;
    862  stage++;
    863  assert(output != input);
    864  bf1 = output;
    865  bf1[0] = input[0];
    866  bf1[1] = -input[15];
    867  bf1[2] = -input[7];
    868  bf1[3] = input[8];
    869  bf1[4] = -input[3];
    870  bf1[5] = input[12];
    871  bf1[6] = input[4];
    872  bf1[7] = -input[11];
    873  bf1[8] = -input[1];
    874  bf1[9] = input[14];
    875  bf1[10] = input[6];
    876  bf1[11] = -input[9];
    877  bf1[12] = input[2];
    878  bf1[13] = -input[13];
    879  bf1[14] = -input[5];
    880  bf1[15] = input[10];
    881  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    882 
    883  // stage 2
    884  stage++;
    885  cospi = cospi_arr(cos_bit);
    886  bf0 = output;
    887  bf1 = step;
    888  bf1[0] = bf0[0];
    889  bf1[1] = bf0[1];
    890  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    891  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
    892  bf1[4] = bf0[4];
    893  bf1[5] = bf0[5];
    894  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    895  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
    896  bf1[8] = bf0[8];
    897  bf1[9] = bf0[9];
    898  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
    899  bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
    900  bf1[12] = bf0[12];
    901  bf1[13] = bf0[13];
    902  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
    903  bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
    904  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    905 
    906  // stage 3
    907  stage++;
    908  bf0 = step;
    909  bf1 = output;
    910  bf1[0] = bf0[0] + bf0[2];
    911  bf1[1] = bf0[1] + bf0[3];
    912  bf1[2] = bf0[0] - bf0[2];
    913  bf1[3] = bf0[1] - bf0[3];
    914  bf1[4] = bf0[4] + bf0[6];
    915  bf1[5] = bf0[5] + bf0[7];
    916  bf1[6] = bf0[4] - bf0[6];
    917  bf1[7] = bf0[5] - bf0[7];
    918  bf1[8] = bf0[8] + bf0[10];
    919  bf1[9] = bf0[9] + bf0[11];
    920  bf1[10] = bf0[8] - bf0[10];
    921  bf1[11] = bf0[9] - bf0[11];
    922  bf1[12] = bf0[12] + bf0[14];
    923  bf1[13] = bf0[13] + bf0[15];
    924  bf1[14] = bf0[12] - bf0[14];
    925  bf1[15] = bf0[13] - bf0[15];
    926  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    927 
    928  // stage 4
    929  stage++;
    930  cospi = cospi_arr(cos_bit);
    931  bf0 = output;
    932  bf1 = step;
    933  bf1[0] = bf0[0];
    934  bf1[1] = bf0[1];
    935  bf1[2] = bf0[2];
    936  bf1[3] = bf0[3];
    937  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    938  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
    939  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    940  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
    941  bf1[8] = bf0[8];
    942  bf1[9] = bf0[9];
    943  bf1[10] = bf0[10];
    944  bf1[11] = bf0[11];
    945  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
    946  bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
    947  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
    948  bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
    949  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    950 
    951  // stage 5
    952  stage++;
    953  bf0 = step;
    954  bf1 = output;
    955  bf1[0] = bf0[0] + bf0[4];
    956  bf1[1] = bf0[1] + bf0[5];
    957  bf1[2] = bf0[2] + bf0[6];
    958  bf1[3] = bf0[3] + bf0[7];
    959  bf1[4] = bf0[0] - bf0[4];
    960  bf1[5] = bf0[1] - bf0[5];
    961  bf1[6] = bf0[2] - bf0[6];
    962  bf1[7] = bf0[3] - bf0[7];
    963  bf1[8] = bf0[8] + bf0[12];
    964  bf1[9] = bf0[9] + bf0[13];
    965  bf1[10] = bf0[10] + bf0[14];
    966  bf1[11] = bf0[11] + bf0[15];
    967  bf1[12] = bf0[8] - bf0[12];
    968  bf1[13] = bf0[9] - bf0[13];
    969  bf1[14] = bf0[10] - bf0[14];
    970  bf1[15] = bf0[11] - bf0[15];
    971  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    972 
    973  // stage 6
    974  stage++;
    975  cospi = cospi_arr(cos_bit);
    976  bf0 = output;
    977  bf1 = step;
    978  bf1[0] = bf0[0];
    979  bf1[1] = bf0[1];
    980  bf1[2] = bf0[2];
    981  bf1[3] = bf0[3];
    982  bf1[4] = bf0[4];
    983  bf1[5] = bf0[5];
    984  bf1[6] = bf0[6];
    985  bf1[7] = bf0[7];
    986  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
    987  bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
    988  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
    989  bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
    990  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
    991  bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
    992  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
    993  bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
    994  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    995 
    996  // stage 7
    997  stage++;
    998  bf0 = step;
    999  bf1 = output;
   1000  bf1[0] = bf0[0] + bf0[8];
   1001  bf1[1] = bf0[1] + bf0[9];
   1002  bf1[2] = bf0[2] + bf0[10];
   1003  bf1[3] = bf0[3] + bf0[11];
   1004  bf1[4] = bf0[4] + bf0[12];
   1005  bf1[5] = bf0[5] + bf0[13];
   1006  bf1[6] = bf0[6] + bf0[14];
   1007  bf1[7] = bf0[7] + bf0[15];
   1008  bf1[8] = bf0[0] - bf0[8];
   1009  bf1[9] = bf0[1] - bf0[9];
   1010  bf1[10] = bf0[2] - bf0[10];
   1011  bf1[11] = bf0[3] - bf0[11];
   1012  bf1[12] = bf0[4] - bf0[12];
   1013  bf1[13] = bf0[5] - bf0[13];
   1014  bf1[14] = bf0[6] - bf0[14];
   1015  bf1[15] = bf0[7] - bf0[15];
   1016  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1017 
   1018  // stage 8
   1019  stage++;
   1020  cospi = cospi_arr(cos_bit);
   1021  bf0 = output;
   1022  bf1 = step;
   1023  bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
   1024  bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
   1025  bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
   1026  bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
   1027  bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
   1028  bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
   1029  bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
   1030  bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
   1031  bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
   1032  bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
   1033  bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
   1034  bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
   1035  bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
   1036  bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
   1037  bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
   1038  bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
   1039  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1040 
   1041  // stage 9
   1042  stage++;
   1043  bf0 = step;
   1044  bf1 = output;
   1045  bf1[0] = bf0[1];
   1046  bf1[1] = bf0[14];
   1047  bf1[2] = bf0[3];
   1048  bf1[3] = bf0[12];
   1049  bf1[4] = bf0[5];
   1050  bf1[5] = bf0[10];
   1051  bf1[6] = bf0[7];
   1052  bf1[7] = bf0[8];
   1053  bf1[8] = bf0[9];
   1054  bf1[9] = bf0[6];
   1055  bf1[10] = bf0[11];
   1056  bf1[11] = bf0[4];
   1057  bf1[12] = bf0[13];
   1058  bf1[13] = bf0[2];
   1059  bf1[14] = bf0[15];
   1060  bf1[15] = bf0[0];
   1061  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1062 }
   1063 
   1064 void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
   1065                      const int8_t *stage_range) {
   1066  (void)cos_bit;
   1067  for (int i = 0; i < 4; ++i)
   1068    output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits);
   1069  assert(stage_range[0] + NewSqrt2Bits <= 32);
   1070  av1_range_check_buf(0, input, output, 4, stage_range[0]);
   1071 }
   1072 
   1073 void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
   1074                      const int8_t *stage_range) {
   1075  (void)cos_bit;
   1076  for (int i = 0; i < 8; ++i) output[i] = input[i] * 2;
   1077  av1_range_check_buf(0, input, output, 8, stage_range[0]);
   1078 }
   1079 
   1080 void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
   1081                       const int8_t *stage_range) {
   1082  (void)cos_bit;
   1083  for (int i = 0; i < 16; ++i)
   1084    output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits);
   1085  assert(stage_range[0] + NewSqrt2Bits <= 32);
   1086  av1_range_check_buf(0, input, output, 16, stage_range[0]);
   1087 }
   1088 
   1089 void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
   1090                       const int8_t *stage_range) {
   1091  (void)cos_bit;
   1092  for (int i = 0; i < 32; ++i) output[i] = input[i] * 4;
   1093  av1_range_check_buf(0, input, output, 32, stage_range[0]);
   1094 }
   1095 
   1096 void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit,
   1097                const int8_t *stage_range) {
   1098  const int32_t size = 64;
   1099  const int32_t *cospi;
   1100 
   1101  int32_t stage = 0;
   1102  int32_t *bf0, *bf1;
   1103  int32_t step[64];
   1104 
   1105  // stage 0;
   1106  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
   1107 
   1108  // stage 1;
   1109  stage++;
   1110  bf1 = output;
   1111  bf1[0] = input[0] + input[63];
   1112  bf1[1] = input[1] + input[62];
   1113  bf1[2] = input[2] + input[61];
   1114  bf1[3] = input[3] + input[60];
   1115  bf1[4] = input[4] + input[59];
   1116  bf1[5] = input[5] + input[58];
   1117  bf1[6] = input[6] + input[57];
   1118  bf1[7] = input[7] + input[56];
   1119  bf1[8] = input[8] + input[55];
   1120  bf1[9] = input[9] + input[54];
   1121  bf1[10] = input[10] + input[53];
   1122  bf1[11] = input[11] + input[52];
   1123  bf1[12] = input[12] + input[51];
   1124  bf1[13] = input[13] + input[50];
   1125  bf1[14] = input[14] + input[49];
   1126  bf1[15] = input[15] + input[48];
   1127  bf1[16] = input[16] + input[47];
   1128  bf1[17] = input[17] + input[46];
   1129  bf1[18] = input[18] + input[45];
   1130  bf1[19] = input[19] + input[44];
   1131  bf1[20] = input[20] + input[43];
   1132  bf1[21] = input[21] + input[42];
   1133  bf1[22] = input[22] + input[41];
   1134  bf1[23] = input[23] + input[40];
   1135  bf1[24] = input[24] + input[39];
   1136  bf1[25] = input[25] + input[38];
   1137  bf1[26] = input[26] + input[37];
   1138  bf1[27] = input[27] + input[36];
   1139  bf1[28] = input[28] + input[35];
   1140  bf1[29] = input[29] + input[34];
   1141  bf1[30] = input[30] + input[33];
   1142  bf1[31] = input[31] + input[32];
   1143  bf1[32] = -input[32] + input[31];
   1144  bf1[33] = -input[33] + input[30];
   1145  bf1[34] = -input[34] + input[29];
   1146  bf1[35] = -input[35] + input[28];
   1147  bf1[36] = -input[36] + input[27];
   1148  bf1[37] = -input[37] + input[26];
   1149  bf1[38] = -input[38] + input[25];
   1150  bf1[39] = -input[39] + input[24];
   1151  bf1[40] = -input[40] + input[23];
   1152  bf1[41] = -input[41] + input[22];
   1153  bf1[42] = -input[42] + input[21];
   1154  bf1[43] = -input[43] + input[20];
   1155  bf1[44] = -input[44] + input[19];
   1156  bf1[45] = -input[45] + input[18];
   1157  bf1[46] = -input[46] + input[17];
   1158  bf1[47] = -input[47] + input[16];
   1159  bf1[48] = -input[48] + input[15];
   1160  bf1[49] = -input[49] + input[14];
   1161  bf1[50] = -input[50] + input[13];
   1162  bf1[51] = -input[51] + input[12];
   1163  bf1[52] = -input[52] + input[11];
   1164  bf1[53] = -input[53] + input[10];
   1165  bf1[54] = -input[54] + input[9];
   1166  bf1[55] = -input[55] + input[8];
   1167  bf1[56] = -input[56] + input[7];
   1168  bf1[57] = -input[57] + input[6];
   1169  bf1[58] = -input[58] + input[5];
   1170  bf1[59] = -input[59] + input[4];
   1171  bf1[60] = -input[60] + input[3];
   1172  bf1[61] = -input[61] + input[2];
   1173  bf1[62] = -input[62] + input[1];
   1174  bf1[63] = -input[63] + input[0];
   1175  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1176 
   1177  // stage 2
   1178  stage++;
   1179  cospi = cospi_arr(cos_bit);
   1180  bf0 = output;
   1181  bf1 = step;
   1182  bf1[0] = bf0[0] + bf0[31];
   1183  bf1[1] = bf0[1] + bf0[30];
   1184  bf1[2] = bf0[2] + bf0[29];
   1185  bf1[3] = bf0[3] + bf0[28];
   1186  bf1[4] = bf0[4] + bf0[27];
   1187  bf1[5] = bf0[5] + bf0[26];
   1188  bf1[6] = bf0[6] + bf0[25];
   1189  bf1[7] = bf0[7] + bf0[24];
   1190  bf1[8] = bf0[8] + bf0[23];
   1191  bf1[9] = bf0[9] + bf0[22];
   1192  bf1[10] = bf0[10] + bf0[21];
   1193  bf1[11] = bf0[11] + bf0[20];
   1194  bf1[12] = bf0[12] + bf0[19];
   1195  bf1[13] = bf0[13] + bf0[18];
   1196  bf1[14] = bf0[14] + bf0[17];
   1197  bf1[15] = bf0[15] + bf0[16];
   1198  bf1[16] = -bf0[16] + bf0[15];
   1199  bf1[17] = -bf0[17] + bf0[14];
   1200  bf1[18] = -bf0[18] + bf0[13];
   1201  bf1[19] = -bf0[19] + bf0[12];
   1202  bf1[20] = -bf0[20] + bf0[11];
   1203  bf1[21] = -bf0[21] + bf0[10];
   1204  bf1[22] = -bf0[22] + bf0[9];
   1205  bf1[23] = -bf0[23] + bf0[8];
   1206  bf1[24] = -bf0[24] + bf0[7];
   1207  bf1[25] = -bf0[25] + bf0[6];
   1208  bf1[26] = -bf0[26] + bf0[5];
   1209  bf1[27] = -bf0[27] + bf0[4];
   1210  bf1[28] = -bf0[28] + bf0[3];
   1211  bf1[29] = -bf0[29] + bf0[2];
   1212  bf1[30] = -bf0[30] + bf0[1];
   1213  bf1[31] = -bf0[31] + bf0[0];
   1214  bf1[32] = bf0[32];
   1215  bf1[33] = bf0[33];
   1216  bf1[34] = bf0[34];
   1217  bf1[35] = bf0[35];
   1218  bf1[36] = bf0[36];
   1219  bf1[37] = bf0[37];
   1220  bf1[38] = bf0[38];
   1221  bf1[39] = bf0[39];
   1222  bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
   1223  bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
   1224  bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
   1225  bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
   1226  bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
   1227  bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
   1228  bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
   1229  bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
   1230  bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
   1231  bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
   1232  bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
   1233  bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
   1234  bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
   1235  bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
   1236  bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
   1237  bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
   1238  bf1[56] = bf0[56];
   1239  bf1[57] = bf0[57];
   1240  bf1[58] = bf0[58];
   1241  bf1[59] = bf0[59];
   1242  bf1[60] = bf0[60];
   1243  bf1[61] = bf0[61];
   1244  bf1[62] = bf0[62];
   1245  bf1[63] = bf0[63];
   1246  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1247 
   1248  // stage 3
   1249  stage++;
   1250  cospi = cospi_arr(cos_bit);
   1251  bf0 = step;
   1252  bf1 = output;
   1253  bf1[0] = bf0[0] + bf0[15];
   1254  bf1[1] = bf0[1] + bf0[14];
   1255  bf1[2] = bf0[2] + bf0[13];
   1256  bf1[3] = bf0[3] + bf0[12];
   1257  bf1[4] = bf0[4] + bf0[11];
   1258  bf1[5] = bf0[5] + bf0[10];
   1259  bf1[6] = bf0[6] + bf0[9];
   1260  bf1[7] = bf0[7] + bf0[8];
   1261  bf1[8] = -bf0[8] + bf0[7];
   1262  bf1[9] = -bf0[9] + bf0[6];
   1263  bf1[10] = -bf0[10] + bf0[5];
   1264  bf1[11] = -bf0[11] + bf0[4];
   1265  bf1[12] = -bf0[12] + bf0[3];
   1266  bf1[13] = -bf0[13] + bf0[2];
   1267  bf1[14] = -bf0[14] + bf0[1];
   1268  bf1[15] = -bf0[15] + bf0[0];
   1269  bf1[16] = bf0[16];
   1270  bf1[17] = bf0[17];
   1271  bf1[18] = bf0[18];
   1272  bf1[19] = bf0[19];
   1273  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
   1274  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
   1275  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
   1276  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
   1277  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
   1278  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
   1279  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
   1280  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
   1281  bf1[28] = bf0[28];
   1282  bf1[29] = bf0[29];
   1283  bf1[30] = bf0[30];
   1284  bf1[31] = bf0[31];
   1285  bf1[32] = bf0[32] + bf0[47];
   1286  bf1[33] = bf0[33] + bf0[46];
   1287  bf1[34] = bf0[34] + bf0[45];
   1288  bf1[35] = bf0[35] + bf0[44];
   1289  bf1[36] = bf0[36] + bf0[43];
   1290  bf1[37] = bf0[37] + bf0[42];
   1291  bf1[38] = bf0[38] + bf0[41];
   1292  bf1[39] = bf0[39] + bf0[40];
   1293  bf1[40] = -bf0[40] + bf0[39];
   1294  bf1[41] = -bf0[41] + bf0[38];
   1295  bf1[42] = -bf0[42] + bf0[37];
   1296  bf1[43] = -bf0[43] + bf0[36];
   1297  bf1[44] = -bf0[44] + bf0[35];
   1298  bf1[45] = -bf0[45] + bf0[34];
   1299  bf1[46] = -bf0[46] + bf0[33];
   1300  bf1[47] = -bf0[47] + bf0[32];
   1301  bf1[48] = -bf0[48] + bf0[63];
   1302  bf1[49] = -bf0[49] + bf0[62];
   1303  bf1[50] = -bf0[50] + bf0[61];
   1304  bf1[51] = -bf0[51] + bf0[60];
   1305  bf1[52] = -bf0[52] + bf0[59];
   1306  bf1[53] = -bf0[53] + bf0[58];
   1307  bf1[54] = -bf0[54] + bf0[57];
   1308  bf1[55] = -bf0[55] + bf0[56];
   1309  bf1[56] = bf0[56] + bf0[55];
   1310  bf1[57] = bf0[57] + bf0[54];
   1311  bf1[58] = bf0[58] + bf0[53];
   1312  bf1[59] = bf0[59] + bf0[52];
   1313  bf1[60] = bf0[60] + bf0[51];
   1314  bf1[61] = bf0[61] + bf0[50];
   1315  bf1[62] = bf0[62] + bf0[49];
   1316  bf1[63] = bf0[63] + bf0[48];
   1317  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1318 
   1319  // stage 4
   1320  stage++;
   1321  cospi = cospi_arr(cos_bit);
   1322  bf0 = output;
   1323  bf1 = step;
   1324  bf1[0] = bf0[0] + bf0[7];
   1325  bf1[1] = bf0[1] + bf0[6];
   1326  bf1[2] = bf0[2] + bf0[5];
   1327  bf1[3] = bf0[3] + bf0[4];
   1328  bf1[4] = -bf0[4] + bf0[3];
   1329  bf1[5] = -bf0[5] + bf0[2];
   1330  bf1[6] = -bf0[6] + bf0[1];
   1331  bf1[7] = -bf0[7] + bf0[0];
   1332  bf1[8] = bf0[8];
   1333  bf1[9] = bf0[9];
   1334  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
   1335  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
   1336  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
   1337  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
   1338  bf1[14] = bf0[14];
   1339  bf1[15] = bf0[15];
   1340  bf1[16] = bf0[16] + bf0[23];
   1341  bf1[17] = bf0[17] + bf0[22];
   1342  bf1[18] = bf0[18] + bf0[21];
   1343  bf1[19] = bf0[19] + bf0[20];
   1344  bf1[20] = -bf0[20] + bf0[19];
   1345  bf1[21] = -bf0[21] + bf0[18];
   1346  bf1[22] = -bf0[22] + bf0[17];
   1347  bf1[23] = -bf0[23] + bf0[16];
   1348  bf1[24] = -bf0[24] + bf0[31];
   1349  bf1[25] = -bf0[25] + bf0[30];
   1350  bf1[26] = -bf0[26] + bf0[29];
   1351  bf1[27] = -bf0[27] + bf0[28];
   1352  bf1[28] = bf0[28] + bf0[27];
   1353  bf1[29] = bf0[29] + bf0[26];
   1354  bf1[30] = bf0[30] + bf0[25];
   1355  bf1[31] = bf0[31] + bf0[24];
   1356  bf1[32] = bf0[32];
   1357  bf1[33] = bf0[33];
   1358  bf1[34] = bf0[34];
   1359  bf1[35] = bf0[35];
   1360  bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
   1361  bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
   1362  bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
   1363  bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
   1364  bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
   1365  bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
   1366  bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
   1367  bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
   1368  bf1[44] = bf0[44];
   1369  bf1[45] = bf0[45];
   1370  bf1[46] = bf0[46];
   1371  bf1[47] = bf0[47];
   1372  bf1[48] = bf0[48];
   1373  bf1[49] = bf0[49];
   1374  bf1[50] = bf0[50];
   1375  bf1[51] = bf0[51];
   1376  bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
   1377  bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
   1378  bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
   1379  bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
   1380  bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
   1381  bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
   1382  bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
   1383  bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
   1384  bf1[60] = bf0[60];
   1385  bf1[61] = bf0[61];
   1386  bf1[62] = bf0[62];
   1387  bf1[63] = bf0[63];
   1388  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1389 
   1390  // stage 5
   1391  stage++;
   1392  cospi = cospi_arr(cos_bit);
   1393  bf0 = step;
   1394  bf1 = output;
   1395  bf1[0] = bf0[0] + bf0[3];
   1396  bf1[1] = bf0[1] + bf0[2];
   1397  bf1[2] = -bf0[2] + bf0[1];
   1398  bf1[3] = -bf0[3] + bf0[0];
   1399  bf1[4] = bf0[4];
   1400  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
   1401  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
   1402  bf1[7] = bf0[7];
   1403  bf1[8] = bf0[8] + bf0[11];
   1404  bf1[9] = bf0[9] + bf0[10];
   1405  bf1[10] = -bf0[10] + bf0[9];
   1406  bf1[11] = -bf0[11] + bf0[8];
   1407  bf1[12] = -bf0[12] + bf0[15];
   1408  bf1[13] = -bf0[13] + bf0[14];
   1409  bf1[14] = bf0[14] + bf0[13];
   1410  bf1[15] = bf0[15] + bf0[12];
   1411  bf1[16] = bf0[16];
   1412  bf1[17] = bf0[17];
   1413  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
   1414  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
   1415  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
   1416  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
   1417  bf1[22] = bf0[22];
   1418  bf1[23] = bf0[23];
   1419  bf1[24] = bf0[24];
   1420  bf1[25] = bf0[25];
   1421  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
   1422  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
   1423  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
   1424  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
   1425  bf1[30] = bf0[30];
   1426  bf1[31] = bf0[31];
   1427  bf1[32] = bf0[32] + bf0[39];
   1428  bf1[33] = bf0[33] + bf0[38];
   1429  bf1[34] = bf0[34] + bf0[37];
   1430  bf1[35] = bf0[35] + bf0[36];
   1431  bf1[36] = -bf0[36] + bf0[35];
   1432  bf1[37] = -bf0[37] + bf0[34];
   1433  bf1[38] = -bf0[38] + bf0[33];
   1434  bf1[39] = -bf0[39] + bf0[32];
   1435  bf1[40] = -bf0[40] + bf0[47];
   1436  bf1[41] = -bf0[41] + bf0[46];
   1437  bf1[42] = -bf0[42] + bf0[45];
   1438  bf1[43] = -bf0[43] + bf0[44];
   1439  bf1[44] = bf0[44] + bf0[43];
   1440  bf1[45] = bf0[45] + bf0[42];
   1441  bf1[46] = bf0[46] + bf0[41];
   1442  bf1[47] = bf0[47] + bf0[40];
   1443  bf1[48] = bf0[48] + bf0[55];
   1444  bf1[49] = bf0[49] + bf0[54];
   1445  bf1[50] = bf0[50] + bf0[53];
   1446  bf1[51] = bf0[51] + bf0[52];
   1447  bf1[52] = -bf0[52] + bf0[51];
   1448  bf1[53] = -bf0[53] + bf0[50];
   1449  bf1[54] = -bf0[54] + bf0[49];
   1450  bf1[55] = -bf0[55] + bf0[48];
   1451  bf1[56] = -bf0[56] + bf0[63];
   1452  bf1[57] = -bf0[57] + bf0[62];
   1453  bf1[58] = -bf0[58] + bf0[61];
   1454  bf1[59] = -bf0[59] + bf0[60];
   1455  bf1[60] = bf0[60] + bf0[59];
   1456  bf1[61] = bf0[61] + bf0[58];
   1457  bf1[62] = bf0[62] + bf0[57];
   1458  bf1[63] = bf0[63] + bf0[56];
   1459  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1460 
   1461  // stage 6
   1462  stage++;
   1463  cospi = cospi_arr(cos_bit);
   1464  bf0 = output;
   1465  bf1 = step;
   1466  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
   1467  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
   1468  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
   1469  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
   1470  bf1[4] = bf0[4] + bf0[5];
   1471  bf1[5] = -bf0[5] + bf0[4];
   1472  bf1[6] = -bf0[6] + bf0[7];
   1473  bf1[7] = bf0[7] + bf0[6];
   1474  bf1[8] = bf0[8];
   1475  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
   1476  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
   1477  bf1[11] = bf0[11];
   1478  bf1[12] = bf0[12];
   1479  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
   1480  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
   1481  bf1[15] = bf0[15];
   1482  bf1[16] = bf0[16] + bf0[19];
   1483  bf1[17] = bf0[17] + bf0[18];
   1484  bf1[18] = -bf0[18] + bf0[17];
   1485  bf1[19] = -bf0[19] + bf0[16];
   1486  bf1[20] = -bf0[20] + bf0[23];
   1487  bf1[21] = -bf0[21] + bf0[22];
   1488  bf1[22] = bf0[22] + bf0[21];
   1489  bf1[23] = bf0[23] + bf0[20];
   1490  bf1[24] = bf0[24] + bf0[27];
   1491  bf1[25] = bf0[25] + bf0[26];
   1492  bf1[26] = -bf0[26] + bf0[25];
   1493  bf1[27] = -bf0[27] + bf0[24];
   1494  bf1[28] = -bf0[28] + bf0[31];
   1495  bf1[29] = -bf0[29] + bf0[30];
   1496  bf1[30] = bf0[30] + bf0[29];
   1497  bf1[31] = bf0[31] + bf0[28];
   1498  bf1[32] = bf0[32];
   1499  bf1[33] = bf0[33];
   1500  bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
   1501  bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
   1502  bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
   1503  bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
   1504  bf1[38] = bf0[38];
   1505  bf1[39] = bf0[39];
   1506  bf1[40] = bf0[40];
   1507  bf1[41] = bf0[41];
   1508  bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
   1509  bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
   1510  bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
   1511  bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
   1512  bf1[46] = bf0[46];
   1513  bf1[47] = bf0[47];
   1514  bf1[48] = bf0[48];
   1515  bf1[49] = bf0[49];
   1516  bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
   1517  bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
   1518  bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
   1519  bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
   1520  bf1[54] = bf0[54];
   1521  bf1[55] = bf0[55];
   1522  bf1[56] = bf0[56];
   1523  bf1[57] = bf0[57];
   1524  bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
   1525  bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
   1526  bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
   1527  bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
   1528  bf1[62] = bf0[62];
   1529  bf1[63] = bf0[63];
   1530  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1531 
   1532  // stage 7
   1533  stage++;
   1534  cospi = cospi_arr(cos_bit);
   1535  bf0 = step;
   1536  bf1 = output;
   1537  bf1[0] = bf0[0];
   1538  bf1[1] = bf0[1];
   1539  bf1[2] = bf0[2];
   1540  bf1[3] = bf0[3];
   1541  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
   1542  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
   1543  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
   1544  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
   1545  bf1[8] = bf0[8] + bf0[9];
   1546  bf1[9] = -bf0[9] + bf0[8];
   1547  bf1[10] = -bf0[10] + bf0[11];
   1548  bf1[11] = bf0[11] + bf0[10];
   1549  bf1[12] = bf0[12] + bf0[13];
   1550  bf1[13] = -bf0[13] + bf0[12];
   1551  bf1[14] = -bf0[14] + bf0[15];
   1552  bf1[15] = bf0[15] + bf0[14];
   1553  bf1[16] = bf0[16];
   1554  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
   1555  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
   1556  bf1[19] = bf0[19];
   1557  bf1[20] = bf0[20];
   1558  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
   1559  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
   1560  bf1[23] = bf0[23];
   1561  bf1[24] = bf0[24];
   1562  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
   1563  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
   1564  bf1[27] = bf0[27];
   1565  bf1[28] = bf0[28];
   1566  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
   1567  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
   1568  bf1[31] = bf0[31];
   1569  bf1[32] = bf0[32] + bf0[35];
   1570  bf1[33] = bf0[33] + bf0[34];
   1571  bf1[34] = -bf0[34] + bf0[33];
   1572  bf1[35] = -bf0[35] + bf0[32];
   1573  bf1[36] = -bf0[36] + bf0[39];
   1574  bf1[37] = -bf0[37] + bf0[38];
   1575  bf1[38] = bf0[38] + bf0[37];
   1576  bf1[39] = bf0[39] + bf0[36];
   1577  bf1[40] = bf0[40] + bf0[43];
   1578  bf1[41] = bf0[41] + bf0[42];
   1579  bf1[42] = -bf0[42] + bf0[41];
   1580  bf1[43] = -bf0[43] + bf0[40];
   1581  bf1[44] = -bf0[44] + bf0[47];
   1582  bf1[45] = -bf0[45] + bf0[46];
   1583  bf1[46] = bf0[46] + bf0[45];
   1584  bf1[47] = bf0[47] + bf0[44];
   1585  bf1[48] = bf0[48] + bf0[51];
   1586  bf1[49] = bf0[49] + bf0[50];
   1587  bf1[50] = -bf0[50] + bf0[49];
   1588  bf1[51] = -bf0[51] + bf0[48];
   1589  bf1[52] = -bf0[52] + bf0[55];
   1590  bf1[53] = -bf0[53] + bf0[54];
   1591  bf1[54] = bf0[54] + bf0[53];
   1592  bf1[55] = bf0[55] + bf0[52];
   1593  bf1[56] = bf0[56] + bf0[59];
   1594  bf1[57] = bf0[57] + bf0[58];
   1595  bf1[58] = -bf0[58] + bf0[57];
   1596  bf1[59] = -bf0[59] + bf0[56];
   1597  bf1[60] = -bf0[60] + bf0[63];
   1598  bf1[61] = -bf0[61] + bf0[62];
   1599  bf1[62] = bf0[62] + bf0[61];
   1600  bf1[63] = bf0[63] + bf0[60];
   1601  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1602 
   1603  // stage 8
   1604  stage++;
   1605  cospi = cospi_arr(cos_bit);
   1606  bf0 = output;
   1607  bf1 = step;
   1608  bf1[0] = bf0[0];
   1609  bf1[1] = bf0[1];
   1610  bf1[2] = bf0[2];
   1611  bf1[3] = bf0[3];
   1612  bf1[4] = bf0[4];
   1613  bf1[5] = bf0[5];
   1614  bf1[6] = bf0[6];
   1615  bf1[7] = bf0[7];
   1616  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
   1617  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
   1618  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
   1619  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
   1620  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
   1621  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
   1622  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
   1623  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
   1624  bf1[16] = bf0[16] + bf0[17];
   1625  bf1[17] = -bf0[17] + bf0[16];
   1626  bf1[18] = -bf0[18] + bf0[19];
   1627  bf1[19] = bf0[19] + bf0[18];
   1628  bf1[20] = bf0[20] + bf0[21];
   1629  bf1[21] = -bf0[21] + bf0[20];
   1630  bf1[22] = -bf0[22] + bf0[23];
   1631  bf1[23] = bf0[23] + bf0[22];
   1632  bf1[24] = bf0[24] + bf0[25];
   1633  bf1[25] = -bf0[25] + bf0[24];
   1634  bf1[26] = -bf0[26] + bf0[27];
   1635  bf1[27] = bf0[27] + bf0[26];
   1636  bf1[28] = bf0[28] + bf0[29];
   1637  bf1[29] = -bf0[29] + bf0[28];
   1638  bf1[30] = -bf0[30] + bf0[31];
   1639  bf1[31] = bf0[31] + bf0[30];
   1640  bf1[32] = bf0[32];
   1641  bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
   1642  bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
   1643  bf1[35] = bf0[35];
   1644  bf1[36] = bf0[36];
   1645  bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
   1646  bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
   1647  bf1[39] = bf0[39];
   1648  bf1[40] = bf0[40];
   1649  bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
   1650  bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
   1651  bf1[43] = bf0[43];
   1652  bf1[44] = bf0[44];
   1653  bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
   1654  bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
   1655  bf1[47] = bf0[47];
   1656  bf1[48] = bf0[48];
   1657  bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
   1658  bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
   1659  bf1[51] = bf0[51];
   1660  bf1[52] = bf0[52];
   1661  bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
   1662  bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
   1663  bf1[55] = bf0[55];
   1664  bf1[56] = bf0[56];
   1665  bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
   1666  bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
   1667  bf1[59] = bf0[59];
   1668  bf1[60] = bf0[60];
   1669  bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
   1670  bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
   1671  bf1[63] = bf0[63];
   1672  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1673 
   1674  // stage 9
   1675  stage++;
   1676  cospi = cospi_arr(cos_bit);
   1677  bf0 = step;
   1678  bf1 = output;
   1679  bf1[0] = bf0[0];
   1680  bf1[1] = bf0[1];
   1681  bf1[2] = bf0[2];
   1682  bf1[3] = bf0[3];
   1683  bf1[4] = bf0[4];
   1684  bf1[5] = bf0[5];
   1685  bf1[6] = bf0[6];
   1686  bf1[7] = bf0[7];
   1687  bf1[8] = bf0[8];
   1688  bf1[9] = bf0[9];
   1689  bf1[10] = bf0[10];
   1690  bf1[11] = bf0[11];
   1691  bf1[12] = bf0[12];
   1692  bf1[13] = bf0[13];
   1693  bf1[14] = bf0[14];
   1694  bf1[15] = bf0[15];
   1695  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
   1696  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
   1697  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
   1698  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
   1699  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
   1700  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
   1701  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
   1702  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
   1703  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
   1704  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
   1705  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
   1706  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
   1707  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
   1708  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
   1709  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
   1710  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
   1711  bf1[32] = bf0[32] + bf0[33];
   1712  bf1[33] = -bf0[33] + bf0[32];
   1713  bf1[34] = -bf0[34] + bf0[35];
   1714  bf1[35] = bf0[35] + bf0[34];
   1715  bf1[36] = bf0[36] + bf0[37];
   1716  bf1[37] = -bf0[37] + bf0[36];
   1717  bf1[38] = -bf0[38] + bf0[39];
   1718  bf1[39] = bf0[39] + bf0[38];
   1719  bf1[40] = bf0[40] + bf0[41];
   1720  bf1[41] = -bf0[41] + bf0[40];
   1721  bf1[42] = -bf0[42] + bf0[43];
   1722  bf1[43] = bf0[43] + bf0[42];
   1723  bf1[44] = bf0[44] + bf0[45];
   1724  bf1[45] = -bf0[45] + bf0[44];
   1725  bf1[46] = -bf0[46] + bf0[47];
   1726  bf1[47] = bf0[47] + bf0[46];
   1727  bf1[48] = bf0[48] + bf0[49];
   1728  bf1[49] = -bf0[49] + bf0[48];
   1729  bf1[50] = -bf0[50] + bf0[51];
   1730  bf1[51] = bf0[51] + bf0[50];
   1731  bf1[52] = bf0[52] + bf0[53];
   1732  bf1[53] = -bf0[53] + bf0[52];
   1733  bf1[54] = -bf0[54] + bf0[55];
   1734  bf1[55] = bf0[55] + bf0[54];
   1735  bf1[56] = bf0[56] + bf0[57];
   1736  bf1[57] = -bf0[57] + bf0[56];
   1737  bf1[58] = -bf0[58] + bf0[59];
   1738  bf1[59] = bf0[59] + bf0[58];
   1739  bf1[60] = bf0[60] + bf0[61];
   1740  bf1[61] = -bf0[61] + bf0[60];
   1741  bf1[62] = -bf0[62] + bf0[63];
   1742  bf1[63] = bf0[63] + bf0[62];
   1743  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1744 
   1745  // stage 10
   1746  stage++;
   1747  cospi = cospi_arr(cos_bit);
   1748  bf0 = output;
   1749  bf1 = step;
   1750  bf1[0] = bf0[0];
   1751  bf1[1] = bf0[1];
   1752  bf1[2] = bf0[2];
   1753  bf1[3] = bf0[3];
   1754  bf1[4] = bf0[4];
   1755  bf1[5] = bf0[5];
   1756  bf1[6] = bf0[6];
   1757  bf1[7] = bf0[7];
   1758  bf1[8] = bf0[8];
   1759  bf1[9] = bf0[9];
   1760  bf1[10] = bf0[10];
   1761  bf1[11] = bf0[11];
   1762  bf1[12] = bf0[12];
   1763  bf1[13] = bf0[13];
   1764  bf1[14] = bf0[14];
   1765  bf1[15] = bf0[15];
   1766  bf1[16] = bf0[16];
   1767  bf1[17] = bf0[17];
   1768  bf1[18] = bf0[18];
   1769  bf1[19] = bf0[19];
   1770  bf1[20] = bf0[20];
   1771  bf1[21] = bf0[21];
   1772  bf1[22] = bf0[22];
   1773  bf1[23] = bf0[23];
   1774  bf1[24] = bf0[24];
   1775  bf1[25] = bf0[25];
   1776  bf1[26] = bf0[26];
   1777  bf1[27] = bf0[27];
   1778  bf1[28] = bf0[28];
   1779  bf1[29] = bf0[29];
   1780  bf1[30] = bf0[30];
   1781  bf1[31] = bf0[31];
   1782  bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
   1783  bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit);
   1784  bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
   1785  bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit);
   1786  bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
   1787  bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit);
   1788  bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
   1789  bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit);
   1790  bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
   1791  bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit);
   1792  bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
   1793  bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit);
   1794  bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
   1795  bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit);
   1796  bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
   1797  bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit);
   1798  bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
   1799  bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit);
   1800  bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
   1801  bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit);
   1802  bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
   1803  bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit);
   1804  bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
   1805  bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit);
   1806  bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
   1807  bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit);
   1808  bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
   1809  bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit);
   1810  bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
   1811  bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
   1812  bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
   1813  bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
   1814  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1815 
   1816  // stage 11
   1817  stage++;
   1818  bf0 = step;
   1819  bf1 = output;
   1820  bf1[0] = bf0[0];
   1821  bf1[1] = bf0[32];
   1822  bf1[2] = bf0[16];
   1823  bf1[3] = bf0[48];
   1824  bf1[4] = bf0[8];
   1825  bf1[5] = bf0[40];
   1826  bf1[6] = bf0[24];
   1827  bf1[7] = bf0[56];
   1828  bf1[8] = bf0[4];
   1829  bf1[9] = bf0[36];
   1830  bf1[10] = bf0[20];
   1831  bf1[11] = bf0[52];
   1832  bf1[12] = bf0[12];
   1833  bf1[13] = bf0[44];
   1834  bf1[14] = bf0[28];
   1835  bf1[15] = bf0[60];
   1836  bf1[16] = bf0[2];
   1837  bf1[17] = bf0[34];
   1838  bf1[18] = bf0[18];
   1839  bf1[19] = bf0[50];
   1840  bf1[20] = bf0[10];
   1841  bf1[21] = bf0[42];
   1842  bf1[22] = bf0[26];
   1843  bf1[23] = bf0[58];
   1844  bf1[24] = bf0[6];
   1845  bf1[25] = bf0[38];
   1846  bf1[26] = bf0[22];
   1847  bf1[27] = bf0[54];
   1848  bf1[28] = bf0[14];
   1849  bf1[29] = bf0[46];
   1850  bf1[30] = bf0[30];
   1851  bf1[31] = bf0[62];
   1852  bf1[32] = bf0[1];
   1853  bf1[33] = bf0[33];
   1854  bf1[34] = bf0[17];
   1855  bf1[35] = bf0[49];
   1856  bf1[36] = bf0[9];
   1857  bf1[37] = bf0[41];
   1858  bf1[38] = bf0[25];
   1859  bf1[39] = bf0[57];
   1860  bf1[40] = bf0[5];
   1861  bf1[41] = bf0[37];
   1862  bf1[42] = bf0[21];
   1863  bf1[43] = bf0[53];
   1864  bf1[44] = bf0[13];
   1865  bf1[45] = bf0[45];
   1866  bf1[46] = bf0[29];
   1867  bf1[47] = bf0[61];
   1868  bf1[48] = bf0[3];
   1869  bf1[49] = bf0[35];
   1870  bf1[50] = bf0[19];
   1871  bf1[51] = bf0[51];
   1872  bf1[52] = bf0[11];
   1873  bf1[53] = bf0[43];
   1874  bf1[54] = bf0[27];
   1875  bf1[55] = bf0[59];
   1876  bf1[56] = bf0[7];
   1877  bf1[57] = bf0[39];
   1878  bf1[58] = bf0[23];
   1879  bf1[59] = bf0[55];
   1880  bf1[60] = bf0[15];
   1881  bf1[61] = bf0[47];
   1882  bf1[62] = bf0[31];
   1883  bf1[63] = bf0[63];
   1884  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1885 }