tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

av1_inv_txfm1d.c (80404B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <stdlib.h>
     13 #include "av1/common/av1_inv_txfm1d.h"
     14 #include "av1/common/av1_txfm.h"
     15 
     16 void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit,
     17               const int8_t *stage_range) {
     18  assert(output != input);
     19  const int32_t size = 4;
     20  const int32_t *cospi = cospi_arr(cos_bit);
     21 
     22  int32_t stage = 0;
     23  int32_t *bf0, *bf1;
     24  int32_t step[4];
     25 
     26  // stage 0;
     27 
     28  // stage 1;
     29  stage++;
     30  bf1 = output;
     31  bf1[0] = input[0];
     32  bf1[1] = input[2];
     33  bf1[2] = input[1];
     34  bf1[3] = input[3];
     35  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
     36 
     37  // stage 2
     38  stage++;
     39  bf0 = output;
     40  bf1 = step;
     41  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
     42  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
     43  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
     44  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
     45  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
     46 
     47  // stage 3
     48  stage++;
     49  bf0 = step;
     50  bf1 = output;
     51  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
     52  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
     53  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
     54  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
     55 }
     56 
     57 void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit,
     58               const int8_t *stage_range) {
     59  assert(output != input);
     60  const int32_t size = 8;
     61  const int32_t *cospi = cospi_arr(cos_bit);
     62 
     63  int32_t stage = 0;
     64  int32_t *bf0, *bf1;
     65  int32_t step[8];
     66 
     67  // stage 0;
     68 
     69  // stage 1;
     70  stage++;
     71  bf1 = output;
     72  bf1[0] = input[0];
     73  bf1[1] = input[4];
     74  bf1[2] = input[2];
     75  bf1[3] = input[6];
     76  bf1[4] = input[1];
     77  bf1[5] = input[5];
     78  bf1[6] = input[3];
     79  bf1[7] = input[7];
     80  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
     81 
     82  // stage 2
     83  stage++;
     84  bf0 = output;
     85  bf1 = step;
     86  bf1[0] = bf0[0];
     87  bf1[1] = bf0[1];
     88  bf1[2] = bf0[2];
     89  bf1[3] = bf0[3];
     90  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
     91  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
     92  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
     93  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
     94  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
     95 
     96  // stage 3
     97  stage++;
     98  bf0 = step;
     99  bf1 = output;
    100  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    101  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
    102  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
    103  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
    104  bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
    105  bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
    106  bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
    107  bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
    108  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    109 
    110  // stage 4
    111  stage++;
    112  bf0 = output;
    113  bf1 = step;
    114  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
    115  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
    116  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
    117  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
    118  bf1[4] = bf0[4];
    119  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    120  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    121  bf1[7] = bf0[7];
    122  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    123 
    124  // stage 5
    125  stage++;
    126  bf0 = step;
    127  bf1 = output;
    128  bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
    129  bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
    130  bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
    131  bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
    132  bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
    133  bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
    134  bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
    135  bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
    136 }
    137 
    138 void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit,
    139                const int8_t *stage_range) {
    140  assert(output != input);
    141  const int32_t size = 16;
    142  const int32_t *cospi = cospi_arr(cos_bit);
    143 
    144  int32_t stage = 0;
    145  int32_t *bf0, *bf1;
    146  int32_t step[16];
    147 
    148  // stage 0;
    149 
    150  // stage 1;
    151  stage++;
    152  bf1 = output;
    153  bf1[0] = input[0];
    154  bf1[1] = input[8];
    155  bf1[2] = input[4];
    156  bf1[3] = input[12];
    157  bf1[4] = input[2];
    158  bf1[5] = input[10];
    159  bf1[6] = input[6];
    160  bf1[7] = input[14];
    161  bf1[8] = input[1];
    162  bf1[9] = input[9];
    163  bf1[10] = input[5];
    164  bf1[11] = input[13];
    165  bf1[12] = input[3];
    166  bf1[13] = input[11];
    167  bf1[14] = input[7];
    168  bf1[15] = input[15];
    169  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    170 
    171  // stage 2
    172  stage++;
    173  bf0 = output;
    174  bf1 = step;
    175  bf1[0] = bf0[0];
    176  bf1[1] = bf0[1];
    177  bf1[2] = bf0[2];
    178  bf1[3] = bf0[3];
    179  bf1[4] = bf0[4];
    180  bf1[5] = bf0[5];
    181  bf1[6] = bf0[6];
    182  bf1[7] = bf0[7];
    183  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
    184  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
    185  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
    186  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
    187  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
    188  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
    189  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
    190  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
    191  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    192 
    193  // stage 3
    194  stage++;
    195  bf0 = step;
    196  bf1 = output;
    197  bf1[0] = bf0[0];
    198  bf1[1] = bf0[1];
    199  bf1[2] = bf0[2];
    200  bf1[3] = bf0[3];
    201  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
    202  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
    203  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
    204  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
    205  bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
    206  bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
    207  bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
    208  bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
    209  bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
    210  bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
    211  bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
    212  bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
    213  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    214 
    215  // stage 4
    216  stage++;
    217  bf0 = output;
    218  bf1 = step;
    219  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    220  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
    221  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
    222  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
    223  bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
    224  bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
    225  bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
    226  bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
    227  bf1[8] = bf0[8];
    228  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    229  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    230  bf1[11] = bf0[11];
    231  bf1[12] = bf0[12];
    232  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
    233  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
    234  bf1[15] = bf0[15];
    235  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    236 
    237  // stage 5
    238  stage++;
    239  bf0 = step;
    240  bf1 = output;
    241  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
    242  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
    243  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
    244  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
    245  bf1[4] = bf0[4];
    246  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    247  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    248  bf1[7] = bf0[7];
    249  bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
    250  bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
    251  bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
    252  bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
    253  bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
    254  bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
    255  bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
    256  bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
    257  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    258 
    259  // stage 6
    260  stage++;
    261  bf0 = output;
    262  bf1 = step;
    263  bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
    264  bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
    265  bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
    266  bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
    267  bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
    268  bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
    269  bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
    270  bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
    271  bf1[8] = bf0[8];
    272  bf1[9] = bf0[9];
    273  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    274  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    275  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    276  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    277  bf1[14] = bf0[14];
    278  bf1[15] = bf0[15];
    279  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    280 
    281  // stage 7
    282  stage++;
    283  bf0 = step;
    284  bf1 = output;
    285  bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
    286  bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
    287  bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
    288  bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
    289  bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
    290  bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
    291  bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
    292  bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
    293  bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
    294  bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
    295  bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
    296  bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
    297  bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
    298  bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
    299  bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
    300  bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
    301 }
    302 
    303 void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit,
    304                const int8_t *stage_range) {
    305  assert(output != input);
    306  const int32_t size = 32;
    307  const int32_t *cospi = cospi_arr(cos_bit);
    308 
    309  int32_t stage = 0;
    310  int32_t *bf0, *bf1;
    311  int32_t step[32];
    312 
    313  // stage 0;
    314 
    315  // stage 1;
    316  stage++;
    317  bf1 = output;
    318  bf1[0] = input[0];
    319  bf1[1] = input[16];
    320  bf1[2] = input[8];
    321  bf1[3] = input[24];
    322  bf1[4] = input[4];
    323  bf1[5] = input[20];
    324  bf1[6] = input[12];
    325  bf1[7] = input[28];
    326  bf1[8] = input[2];
    327  bf1[9] = input[18];
    328  bf1[10] = input[10];
    329  bf1[11] = input[26];
    330  bf1[12] = input[6];
    331  bf1[13] = input[22];
    332  bf1[14] = input[14];
    333  bf1[15] = input[30];
    334  bf1[16] = input[1];
    335  bf1[17] = input[17];
    336  bf1[18] = input[9];
    337  bf1[19] = input[25];
    338  bf1[20] = input[5];
    339  bf1[21] = input[21];
    340  bf1[22] = input[13];
    341  bf1[23] = input[29];
    342  bf1[24] = input[3];
    343  bf1[25] = input[19];
    344  bf1[26] = input[11];
    345  bf1[27] = input[27];
    346  bf1[28] = input[7];
    347  bf1[29] = input[23];
    348  bf1[30] = input[15];
    349  bf1[31] = input[31];
    350  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    351 
    352  // stage 2
    353  stage++;
    354  bf0 = output;
    355  bf1 = step;
    356  bf1[0] = bf0[0];
    357  bf1[1] = bf0[1];
    358  bf1[2] = bf0[2];
    359  bf1[3] = bf0[3];
    360  bf1[4] = bf0[4];
    361  bf1[5] = bf0[5];
    362  bf1[6] = bf0[6];
    363  bf1[7] = bf0[7];
    364  bf1[8] = bf0[8];
    365  bf1[9] = bf0[9];
    366  bf1[10] = bf0[10];
    367  bf1[11] = bf0[11];
    368  bf1[12] = bf0[12];
    369  bf1[13] = bf0[13];
    370  bf1[14] = bf0[14];
    371  bf1[15] = bf0[15];
    372  bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
    373  bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
    374  bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
    375  bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
    376  bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
    377  bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
    378  bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
    379  bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
    380  bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
    381  bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
    382  bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
    383  bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
    384  bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
    385  bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
    386  bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
    387  bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
    388  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    389 
    390  // stage 3
    391  stage++;
    392  bf0 = step;
    393  bf1 = output;
    394  bf1[0] = bf0[0];
    395  bf1[1] = bf0[1];
    396  bf1[2] = bf0[2];
    397  bf1[3] = bf0[3];
    398  bf1[4] = bf0[4];
    399  bf1[5] = bf0[5];
    400  bf1[6] = bf0[6];
    401  bf1[7] = bf0[7];
    402  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
    403  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
    404  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
    405  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
    406  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
    407  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
    408  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
    409  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
    410  bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
    411  bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
    412  bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
    413  bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
    414  bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
    415  bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
    416  bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
    417  bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
    418  bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
    419  bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
    420  bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
    421  bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
    422  bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
    423  bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
    424  bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
    425  bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
    426  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    427 
    428  // stage 4
    429  stage++;
    430  bf0 = output;
    431  bf1 = step;
    432  bf1[0] = bf0[0];
    433  bf1[1] = bf0[1];
    434  bf1[2] = bf0[2];
    435  bf1[3] = bf0[3];
    436  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
    437  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
    438  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
    439  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
    440  bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
    441  bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
    442  bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
    443  bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
    444  bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
    445  bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
    446  bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
    447  bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
    448  bf1[16] = bf0[16];
    449  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
    450  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
    451  bf1[19] = bf0[19];
    452  bf1[20] = bf0[20];
    453  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
    454  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
    455  bf1[23] = bf0[23];
    456  bf1[24] = bf0[24];
    457  bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
    458  bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
    459  bf1[27] = bf0[27];
    460  bf1[28] = bf0[28];
    461  bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
    462  bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
    463  bf1[31] = bf0[31];
    464  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    465 
    466  // stage 5
    467  stage++;
    468  bf0 = step;
    469  bf1 = output;
    470  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    471  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
    472  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
    473  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
    474  bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
    475  bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
    476  bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
    477  bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
    478  bf1[8] = bf0[8];
    479  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    480  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    481  bf1[11] = bf0[11];
    482  bf1[12] = bf0[12];
    483  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
    484  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
    485  bf1[15] = bf0[15];
    486  bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
    487  bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
    488  bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
    489  bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
    490  bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
    491  bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
    492  bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
    493  bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
    494  bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
    495  bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
    496  bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
    497  bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
    498  bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
    499  bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
    500  bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
    501  bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
    502  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    503 
    504  // stage 6
    505  stage++;
    506  bf0 = output;
    507  bf1 = step;
    508  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
    509  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
    510  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
    511  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
    512  bf1[4] = bf0[4];
    513  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    514  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    515  bf1[7] = bf0[7];
    516  bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
    517  bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
    518  bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
    519  bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
    520  bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
    521  bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
    522  bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
    523  bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
    524  bf1[16] = bf0[16];
    525  bf1[17] = bf0[17];
    526  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
    527  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
    528  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
    529  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
    530  bf1[22] = bf0[22];
    531  bf1[23] = bf0[23];
    532  bf1[24] = bf0[24];
    533  bf1[25] = bf0[25];
    534  bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
    535  bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
    536  bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
    537  bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
    538  bf1[30] = bf0[30];
    539  bf1[31] = bf0[31];
    540  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    541 
    542  // stage 7
    543  stage++;
    544  bf0 = step;
    545  bf1 = output;
    546  bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
    547  bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
    548  bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
    549  bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
    550  bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
    551  bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
    552  bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
    553  bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
    554  bf1[8] = bf0[8];
    555  bf1[9] = bf0[9];
    556  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    557  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    558  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    559  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    560  bf1[14] = bf0[14];
    561  bf1[15] = bf0[15];
    562  bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
    563  bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
    564  bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
    565  bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
    566  bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
    567  bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
    568  bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
    569  bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
    570  bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
    571  bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
    572  bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
    573  bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
    574  bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
    575  bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
    576  bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
    577  bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
    578  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    579 
    580  // stage 8
    581  stage++;
    582  bf0 = output;
    583  bf1 = step;
    584  bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
    585  bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
    586  bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
    587  bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
    588  bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
    589  bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
    590  bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
    591  bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
    592  bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
    593  bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
    594  bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
    595  bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
    596  bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
    597  bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
    598  bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
    599  bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
    600  bf1[16] = bf0[16];
    601  bf1[17] = bf0[17];
    602  bf1[18] = bf0[18];
    603  bf1[19] = bf0[19];
    604  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
    605  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
    606  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
    607  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
    608  bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
    609  bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
    610  bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
    611  bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
    612  bf1[28] = bf0[28];
    613  bf1[29] = bf0[29];
    614  bf1[30] = bf0[30];
    615  bf1[31] = bf0[31];
    616  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    617 
    618  // stage 9
    619  stage++;
    620  bf0 = step;
    621  bf1 = output;
    622  bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
    623  bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
    624  bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
    625  bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
    626  bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
    627  bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
    628  bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
    629  bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
    630  bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
    631  bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
    632  bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
    633  bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
    634  bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
    635  bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
    636  bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
    637  bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
    638  bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
    639  bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
    640  bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
    641  bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
    642  bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
    643  bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
    644  bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
    645  bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
    646  bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
    647  bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
    648  bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
    649  bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
    650  bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
    651  bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
    652  bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
    653  bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
    654 }
    655 
    // 4-point inverse ADST (one of the 1-D inverse transforms in this file).
    //
    // Reads input[0..3] and writes output[0..3]. All four inputs are copied
    // into locals before anything is stored to output.
    //   cos_bit      selects the sinpi table via sinpi_arr(cos_bit) and is the
    //                shift amount of the final round_shift() calls.
    //   stage_range  bit widths handed to range_check_value(); entries 1..6
    //                are read.
    // NOTE(review): sinpi_arr(), range_check_value() and round_shift() are
    // defined outside this chunk. The code uses the return value of
    // range_check_value() as the checked quantity, so it presumably returns
    // its first argument unchanged - confirm in av1_txfm.h.
    656 void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
    657                const int8_t *stage_range) {
    658  int bit = cos_bit;
    659  const int32_t *sinpi = sinpi_arr(bit);
    660  int32_t s0, s1, s2, s3, s4, s5, s6, s7;
    661 
    662  int32_t x0 = input[0];
    663  int32_t x1 = input[1];
    664  int32_t x2 = input[2];
    665  int32_t x3 = input[3];
    666 
    667  if (!(x0 | x1 | x2 | x3)) {  // all four inputs are zero: emit zeros
    668    output[0] = output[1] = output[2] = output[3] = 0;
    669    return;
    670  }
    671 
    672  assert(sinpi[1] + sinpi[2] == sinpi[4]);  // sanity check on the table
    673 
    674  // stage 1: products of the inputs with sinpi[1..4]; each check is made
    //          against stage_range[1] + bit
    675  s0 = range_check_value(sinpi[1] * x0, stage_range[1] + bit);
    676  s1 = range_check_value(sinpi[2] * x0, stage_range[1] + bit);
    677  s2 = range_check_value(sinpi[3] * x1, stage_range[1] + bit);
    678  s3 = range_check_value(sinpi[4] * x2, stage_range[1] + bit);
    679  s4 = range_check_value(sinpi[1] * x2, stage_range[1] + bit);
    680  s5 = range_check_value(sinpi[2] * x3, stage_range[1] + bit);
    681  s6 = range_check_value(sinpi[4] * x3, stage_range[1] + bit);
    682 
    683  // stage 2: combine the raw inputs (checked against stage_range[2] only,
    //          without the extra `bit`)
    684  // NOTICE: (x0 - x2) here may use one extra bit compared to the
    685  // opt_range_row/col specified in av1_gen_inv_stage_range()
    686  s7 = range_check_value((x0 - x2) + x3, stage_range[2]);
    687 
    688  // stage 3: s3 takes over the x1 product (old s2) before s2 is replaced
    //          by sinpi[3] * s7, so the order of the last two lines matters
    689  s0 = range_check_value(s0 + s3, stage_range[3] + bit);
    690  s1 = range_check_value(s1 - s4, stage_range[3] + bit);
    691  s3 = range_check_value(s2, stage_range[3] + bit);
    692  s2 = range_check_value(sinpi[3] * s7, stage_range[3] + bit);
    693 
    694  // stage 4: fold the x3 products into s0 and s1
    695  s0 = range_check_value(s0 + s5, stage_range[4] + bit);
    696  s1 = range_check_value(s1 - s6, stage_range[4] + bit);
    697 
    698  // stage 5: x0..x3 are reused to hold the not-yet-shifted results
    699  x0 = range_check_value(s0 + s3, stage_range[5] + bit);
    700  x1 = range_check_value(s1 + s3, stage_range[5] + bit);
    701  x2 = range_check_value(s2, stage_range[5] + bit);
    702  x3 = range_check_value(s0 + s1, stage_range[5] + bit);
    703 
    704  // stage 6: x3 ends up as s0 + s1 - s3
    705  x3 = range_check_value(x3 - s3, stage_range[6] + bit);
    706 
    // Final step: round_shift() each result by `bit` and store it.
    707  output[0] = round_shift(x0, bit);
    708  output[1] = round_shift(x1, bit);
    709  output[2] = round_shift(x2, bit);
    710  output[3] = round_shift(x3, bit);
    711 }
    712 
    // 8-point inverse ADST, written as seven numbered stages.
    //
    // input and output must be different buffers (asserted). Intermediate
    // values alternate between output[] and the local step[] array: odd stages
    // write output[], even stages write step[].
    //   cos_bit      selects the cospi table via cospi_arr(cos_bit) and is
    //                passed to every half_btf() call.
    //   stage_range  stage_range[stage] is passed to clamp_value() (stages 3
    //                and 5) and to av1_range_check_buf() (stages 1..6).
    // NOTE(review): half_btf(), clamp_value() and av1_range_check_buf() are
    // defined outside this chunk. half_btf(w0, in0, w1, in1, bit) presumably
    // returns the rounded, bit-shifted sum w0*in0 + w1*in1 - confirm in
    // av1_txfm.h.
    713 void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
    714                const int8_t *stage_range) {
    715  assert(output != input);
    716  const int32_t size = 8;
    717  const int32_t *cospi = cospi_arr(cos_bit);
    718 
    719  int32_t stage = 0;
    720  int32_t *bf0, *bf1;
    721  int32_t step[8];
    722 
    723  // stage 0;
    724 
    725  // stage 1: permute the input into output[]
    726  stage++;
    727  bf1 = output;
    728  bf1[0] = input[7];
    729  bf1[1] = input[0];
    730  bf1[2] = input[5];
    731  bf1[3] = input[2];
    732  bf1[4] = input[3];
    733  bf1[5] = input[4];
    734  bf1[6] = input[1];
    735  bf1[7] = input[6];
    736  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    737 
    738  // stage 2: half_btf() on each pair (2k, 2k+1), output[] -> step[]
    739  stage++;
    740  bf0 = output;
    741  bf1 = step;
    742  bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
    743  bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
    744  bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
    745  bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
    746  bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
    747  bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
    748  bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
    749  bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
    750  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    751 
    752  // stage 3: clamped sum/difference of elements 4 apart, step[] -> output[]
    753  stage++;
    754  bf0 = step;
    755  bf1 = output;
    756  bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
    757  bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
    758  bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
    759  bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
    760  bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
    761  bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
    762  bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
    763  bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
    764  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    765 
    766  // stage 4: copy 0..3; half_btf() with cospi[16]/cospi[48] on (4,5), (6,7)
    767  stage++;
    768  bf0 = output;
    769  bf1 = step;
    770  bf1[0] = bf0[0];
    771  bf1[1] = bf0[1];
    772  bf1[2] = bf0[2];
    773  bf1[3] = bf0[3];
    774  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    775  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
    776  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    777  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
    778  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    779 
    780  // stage 5: clamped sum/difference of elements 2 apart in each group of 4
    781  stage++;
    782  bf0 = step;
    783  bf1 = output;
    784  bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
    785  bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
    786  bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
    787  bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
    788  bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
    789  bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
    790  bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
    791  bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
    792  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    793 
    794  // stage 6: half_btf() with cospi[32] on (2,3) and (6,7); the rest copied
    795  stage++;
    796  bf0 = output;
    797  bf1 = step;
    798  bf1[0] = bf0[0];
    799  bf1[1] = bf0[1];
    800  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    801  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
    802  bf1[4] = bf0[4];
    803  bf1[5] = bf0[5];
    804  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    805  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
    806  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    807 
    808  // stage 7: final reorder step[] -> output[], odd-indexed outputs negated.
    //          `stage` is not incremented here and no range check follows.
    809  bf0 = step;
    810  bf1 = output;
    811  bf1[0] = bf0[0];
    812  bf1[1] = -bf0[4];
    813  bf1[2] = bf0[6];
    814  bf1[3] = -bf0[2];
    815  bf1[4] = bf0[3];
    816  bf1[5] = -bf0[7];
    817  bf1[6] = bf0[5];
    818  bf1[7] = -bf0[1];
    819 }
    820 
    // 16-point inverse ADST, written as nine numbered stages.
    //
    // input and output must be different buffers (asserted). Intermediate
    // values alternate between output[] and the local step[] array: odd stages
    // write output[], even stages write step[].
    //   cos_bit      selects the cospi table via cospi_arr(cos_bit) and is
    //                passed to every half_btf() call.
    //   stage_range  stage_range[stage] is passed to clamp_value() (stages 3,
    //                5 and 7) and to av1_range_check_buf() (stages 1..8).
    // NOTE(review): half_btf(), clamp_value() and av1_range_check_buf() are
    // defined outside this chunk. half_btf(w0, in0, w1, in1, bit) presumably
    // returns the rounded, bit-shifted sum w0*in0 + w1*in1 - confirm in
    // av1_txfm.h.
    821 void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
    822                 const int8_t *stage_range) {
    823  assert(output != input);
    824  const int32_t size = 16;
    825  const int32_t *cospi = cospi_arr(cos_bit);
    826 
    827  int32_t stage = 0;
    828  int32_t *bf0, *bf1;
    829  int32_t step[16];
    830 
    831  // stage 0;
    832 
    833  // stage 1: permute the input into output[]
    834  stage++;
    835  bf1 = output;
    836  bf1[0] = input[15];
    837  bf1[1] = input[0];
    838  bf1[2] = input[13];
    839  bf1[3] = input[2];
    840  bf1[4] = input[11];
    841  bf1[5] = input[4];
    842  bf1[6] = input[9];
    843  bf1[7] = input[6];
    844  bf1[8] = input[7];
    845  bf1[9] = input[8];
    846  bf1[10] = input[5];
    847  bf1[11] = input[10];
    848  bf1[12] = input[3];
    849  bf1[13] = input[12];
    850  bf1[14] = input[1];
    851  bf1[15] = input[14];
    852  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    853 
    854  // stage 2: half_btf() on each pair (2k, 2k+1), output[] -> step[]
    855  stage++;
    856  bf0 = output;
    857  bf1 = step;
    858  bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
    859  bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
    860  bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
    861  bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
    862  bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
    863  bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
    864  bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
    865  bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
    866  bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
    867  bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
    868  bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
    869  bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
    870  bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
    871  bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
    872  bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
    873  bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
    874  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    875 
    876  // stage 3: clamped sum/difference of elements 8 apart, step[] -> output[]
    877  stage++;
    878  bf0 = step;
    879  bf1 = output;
    880  bf1[0] = clamp_value(bf0[0] + bf0[8], stage_range[stage]);
    881  bf1[1] = clamp_value(bf0[1] + bf0[9], stage_range[stage]);
    882  bf1[2] = clamp_value(bf0[2] + bf0[10], stage_range[stage]);
    883  bf1[3] = clamp_value(bf0[3] + bf0[11], stage_range[stage]);
    884  bf1[4] = clamp_value(bf0[4] + bf0[12], stage_range[stage]);
    885  bf1[5] = clamp_value(bf0[5] + bf0[13], stage_range[stage]);
    886  bf1[6] = clamp_value(bf0[6] + bf0[14], stage_range[stage]);
    887  bf1[7] = clamp_value(bf0[7] + bf0[15], stage_range[stage]);
    888  bf1[8] = clamp_value(bf0[0] - bf0[8], stage_range[stage]);
    889  bf1[9] = clamp_value(bf0[1] - bf0[9], stage_range[stage]);
    890  bf1[10] = clamp_value(bf0[2] - bf0[10], stage_range[stage]);
    891  bf1[11] = clamp_value(bf0[3] - bf0[11], stage_range[stage]);
    892  bf1[12] = clamp_value(bf0[4] - bf0[12], stage_range[stage]);
    893  bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]);
    894  bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]);
    895  bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]);
    896  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    897 
    898  // stage 4: copy 0..7; half_btf() on the pairs (2k, 2k+1) within 8..15
    899  stage++;
    900  bf0 = output;
    901  bf1 = step;
    902  bf1[0] = bf0[0];
    903  bf1[1] = bf0[1];
    904  bf1[2] = bf0[2];
    905  bf1[3] = bf0[3];
    906  bf1[4] = bf0[4];
    907  bf1[5] = bf0[5];
    908  bf1[6] = bf0[6];
    909  bf1[7] = bf0[7];
    910  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
    911  bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
    912  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
    913  bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
    914  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
    915  bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
    916  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
    917  bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
    918  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    919 
    920  // stage 5: clamped sum/difference of elements 4 apart in each group of 8
    921  stage++;
    922  bf0 = step;
    923  bf1 = output;
    924  bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
    925  bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
    926  bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
    927  bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
    928  bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
    929  bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
    930  bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
    931  bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
    932  bf1[8] = clamp_value(bf0[8] + bf0[12], stage_range[stage]);
    933  bf1[9] = clamp_value(bf0[9] + bf0[13], stage_range[stage]);
    934  bf1[10] = clamp_value(bf0[10] + bf0[14], stage_range[stage]);
    935  bf1[11] = clamp_value(bf0[11] + bf0[15], stage_range[stage]);
    936  bf1[12] = clamp_value(bf0[8] - bf0[12], stage_range[stage]);
    937  bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]);
    938  bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]);
    939  bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]);
    940  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    941 
    942  // stage 6: copy 0..3 and 8..11; half_btf() with cospi[16]/cospi[48] on
    //          the pairs within 4..7 and 12..15
    943  stage++;
    944  bf0 = output;
    945  bf1 = step;
    946  bf1[0] = bf0[0];
    947  bf1[1] = bf0[1];
    948  bf1[2] = bf0[2];
    949  bf1[3] = bf0[3];
    950  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    951  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
    952  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    953  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
    954  bf1[8] = bf0[8];
    955  bf1[9] = bf0[9];
    956  bf1[10] = bf0[10];
    957  bf1[11] = bf0[11];
    958  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
    959  bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
    960  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
    961  bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
    962  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    963 
    964  // stage 7: clamped sum/difference of elements 2 apart in each group of 4
    965  stage++;
    966  bf0 = step;
    967  bf1 = output;
    968  bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
    969  bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
    970  bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
    971  bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
    972  bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
    973  bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
    974  bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
    975  bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
    976  bf1[8] = clamp_value(bf0[8] + bf0[10], stage_range[stage]);
    977  bf1[9] = clamp_value(bf0[9] + bf0[11], stage_range[stage]);
    978  bf1[10] = clamp_value(bf0[8] - bf0[10], stage_range[stage]);
    979  bf1[11] = clamp_value(bf0[9] - bf0[11], stage_range[stage]);
    980  bf1[12] = clamp_value(bf0[12] + bf0[14], stage_range[stage]);
    981  bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]);
    982  bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]);
    983  bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]);
    984  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
    985 
    986  // stage 8: half_btf() with cospi[32] on (2,3), (6,7), (10,11), (14,15);
    //          the rest copied
    987  stage++;
    988  bf0 = output;
    989  bf1 = step;
    990  bf1[0] = bf0[0];
    991  bf1[1] = bf0[1];
    992  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    993  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
    994  bf1[4] = bf0[4];
    995  bf1[5] = bf0[5];
    996  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    997  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
    998  bf1[8] = bf0[8];
    999  bf1[9] = bf0[9];
   1000  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
   1001  bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
   1002  bf1[12] = bf0[12];
   1003  bf1[13] = bf0[13];
   1004  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
   1005  bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
   1006  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1007 
   1008  // stage 9: final reorder step[] -> output[], odd-indexed outputs negated.
    //          `stage` is not incremented here and no range check follows.
   1009  bf0 = step;
   1010  bf1 = output;
   1011  bf1[0] = bf0[0];
   1012  bf1[1] = -bf0[8];
   1013  bf1[2] = bf0[12];
   1014  bf1[3] = -bf0[4];
   1015  bf1[4] = bf0[6];
   1016  bf1[5] = -bf0[14];
   1017  bf1[6] = bf0[10];
   1018  bf1[7] = -bf0[2];
   1019  bf1[8] = bf0[3];
   1020  bf1[9] = -bf0[11];
   1021  bf1[10] = bf0[15];
   1022  bf1[11] = -bf0[7];
   1023  bf1[12] = bf0[5];
   1024  bf1[13] = -bf0[13];
   1025  bf1[14] = bf0[9];
   1026  bf1[15] = -bf0[1];
   1027 }
   1028 
   // 4-point identity transform: each of the 4 outputs is
   // round_shift(NewSqrt2 * input[i], NewSqrt2Bits), with the product formed
   // in int64_t.
   //   cos_bit      unused.
   //   stage_range  only stage_range[0] is read, and only inside the assert.
   // NOTE(review): NewSqrt2, NewSqrt2Bits and round_shift() are defined outside
   // this chunk; by name this is a fixed-point multiply by sqrt(2) - confirm
   // in av1_txfm.h.
   1029 void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
   1030                      const int8_t *stage_range) {
   1031  (void)cos_bit;
   1032  (void)stage_range;  // still referenced by the assert below; the cast
        // presumably keeps builds without asserts free of unused warnings
   1033  for (int i = 0; i < 4; ++i) {
   1034    output[i] = round_shift((int64_t)NewSqrt2 * input[i], NewSqrt2Bits);
   1035  }
   1036  assert(stage_range[0] + NewSqrt2Bits <= 32);  // checked after the loop
   1037 }
   1038 
   // 8-point identity transform: output[i] = input[i] * 2 for i = 0..7.
   // The doubling is done in int64_t and the result is cast back to int32_t.
   //   cos_bit, stage_range  unused.
   1039 void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
   1040                      const int8_t *stage_range) {
   1041  (void)cos_bit;
   1042  (void)stage_range;
   1043  for (int i = 0; i < 8; ++i) output[i] = (int32_t)((int64_t)input[i] * 2);
   1044 }
   1045 
   // 16-point identity transform: each of the 16 outputs is
   // round_shift(NewSqrt2 * 2 * input[i], NewSqrt2Bits), with the product
   // formed in int64_t.
   //   cos_bit      unused.
   //   stage_range  only stage_range[0] is read, and only inside the assert.
   // NOTE(review): NewSqrt2, NewSqrt2Bits and round_shift() are defined outside
   // this chunk; by name this is a fixed-point multiply by 2*sqrt(2) - confirm
   // in av1_txfm.h.
   1046 void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
   1047                       const int8_t *stage_range) {
   1048  (void)cos_bit;
   1049  (void)stage_range;  // still referenced by the assert below; the cast
        // presumably keeps builds without asserts free of unused warnings
   1050  for (int i = 0; i < 16; ++i)
   1051    output[i] = round_shift((int64_t)NewSqrt2 * 2 * input[i], NewSqrt2Bits);
   1052  assert(stage_range[0] + NewSqrt2Bits <= 32);  // checked after the loop
   1053 }
   1054 
   // 32-point identity transform: output[i] = input[i] * 4 for i = 0..31.
   // The multiply is done in int64_t and the result is cast back to int32_t.
   //   cos_bit, stage_range  unused.
   1055 void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
   1056                       const int8_t *stage_range) {
   1057  (void)cos_bit;
   1058  (void)stage_range;
   1059  for (int i = 0; i < 32; ++i) output[i] = (int32_t)((int64_t)input[i] * 4);
   1060 }
   1061 
   1062 void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit,
   1063                const int8_t *stage_range) {
   1064  assert(output != input);
   1065  const int32_t size = 64;
   1066  const int32_t *cospi = cospi_arr(cos_bit);
   1067 
   1068  int32_t stage = 0;
   1069  int32_t *bf0, *bf1;
   1070  int32_t step[64];
   1071 
   1072  // stage 0;
   1073 
   1074  // stage 1;
   1075  stage++;
   1076  bf1 = output;
   1077  bf1[0] = input[0];
   1078  bf1[1] = input[32];
   1079  bf1[2] = input[16];
   1080  bf1[3] = input[48];
   1081  bf1[4] = input[8];
   1082  bf1[5] = input[40];
   1083  bf1[6] = input[24];
   1084  bf1[7] = input[56];
   1085  bf1[8] = input[4];
   1086  bf1[9] = input[36];
   1087  bf1[10] = input[20];
   1088  bf1[11] = input[52];
   1089  bf1[12] = input[12];
   1090  bf1[13] = input[44];
   1091  bf1[14] = input[28];
   1092  bf1[15] = input[60];
   1093  bf1[16] = input[2];
   1094  bf1[17] = input[34];
   1095  bf1[18] = input[18];
   1096  bf1[19] = input[50];
   1097  bf1[20] = input[10];
   1098  bf1[21] = input[42];
   1099  bf1[22] = input[26];
   1100  bf1[23] = input[58];
   1101  bf1[24] = input[6];
   1102  bf1[25] = input[38];
   1103  bf1[26] = input[22];
   1104  bf1[27] = input[54];
   1105  bf1[28] = input[14];
   1106  bf1[29] = input[46];
   1107  bf1[30] = input[30];
   1108  bf1[31] = input[62];
   1109  bf1[32] = input[1];
   1110  bf1[33] = input[33];
   1111  bf1[34] = input[17];
   1112  bf1[35] = input[49];
   1113  bf1[36] = input[9];
   1114  bf1[37] = input[41];
   1115  bf1[38] = input[25];
   1116  bf1[39] = input[57];
   1117  bf1[40] = input[5];
   1118  bf1[41] = input[37];
   1119  bf1[42] = input[21];
   1120  bf1[43] = input[53];
   1121  bf1[44] = input[13];
   1122  bf1[45] = input[45];
   1123  bf1[46] = input[29];
   1124  bf1[47] = input[61];
   1125  bf1[48] = input[3];
   1126  bf1[49] = input[35];
   1127  bf1[50] = input[19];
   1128  bf1[51] = input[51];
   1129  bf1[52] = input[11];
   1130  bf1[53] = input[43];
   1131  bf1[54] = input[27];
   1132  bf1[55] = input[59];
   1133  bf1[56] = input[7];
   1134  bf1[57] = input[39];
   1135  bf1[58] = input[23];
   1136  bf1[59] = input[55];
   1137  bf1[60] = input[15];
   1138  bf1[61] = input[47];
   1139  bf1[62] = input[31];
   1140  bf1[63] = input[63];
   1141  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1142 
   1143  // stage 2
   1144  stage++;
   1145  bf0 = output;
   1146  bf1 = step;
   1147  bf1[0] = bf0[0];
   1148  bf1[1] = bf0[1];
   1149  bf1[2] = bf0[2];
   1150  bf1[3] = bf0[3];
   1151  bf1[4] = bf0[4];
   1152  bf1[5] = bf0[5];
   1153  bf1[6] = bf0[6];
   1154  bf1[7] = bf0[7];
   1155  bf1[8] = bf0[8];
   1156  bf1[9] = bf0[9];
   1157  bf1[10] = bf0[10];
   1158  bf1[11] = bf0[11];
   1159  bf1[12] = bf0[12];
   1160  bf1[13] = bf0[13];
   1161  bf1[14] = bf0[14];
   1162  bf1[15] = bf0[15];
   1163  bf1[16] = bf0[16];
   1164  bf1[17] = bf0[17];
   1165  bf1[18] = bf0[18];
   1166  bf1[19] = bf0[19];
   1167  bf1[20] = bf0[20];
   1168  bf1[21] = bf0[21];
   1169  bf1[22] = bf0[22];
   1170  bf1[23] = bf0[23];
   1171  bf1[24] = bf0[24];
   1172  bf1[25] = bf0[25];
   1173  bf1[26] = bf0[26];
   1174  bf1[27] = bf0[27];
   1175  bf1[28] = bf0[28];
   1176  bf1[29] = bf0[29];
   1177  bf1[30] = bf0[30];
   1178  bf1[31] = bf0[31];
   1179  bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit);
   1180  bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit);
   1181  bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit);
   1182  bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit);
   1183  bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit);
   1184  bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit);
   1185  bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit);
   1186  bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit);
   1187  bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit);
   1188  bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit);
   1189  bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit);
   1190  bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit);
   1191  bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit);
   1192  bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit);
   1193  bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit);
   1194  bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit);
   1195  bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit);
   1196  bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit);
   1197  bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit);
   1198  bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit);
   1199  bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit);
   1200  bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit);
   1201  bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit);
   1202  bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit);
   1203  bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit);
   1204  bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit);
   1205  bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit);
   1206  bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit);
   1207  bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit);
   1208  bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit);
   1209  bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit);
   1210  bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit);
   1211  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1212 
   1213  // stage 3
   1214  stage++;
   1215  bf0 = step;
   1216  bf1 = output;
   1217  bf1[0] = bf0[0];
   1218  bf1[1] = bf0[1];
   1219  bf1[2] = bf0[2];
   1220  bf1[3] = bf0[3];
   1221  bf1[4] = bf0[4];
   1222  bf1[5] = bf0[5];
   1223  bf1[6] = bf0[6];
   1224  bf1[7] = bf0[7];
   1225  bf1[8] = bf0[8];
   1226  bf1[9] = bf0[9];
   1227  bf1[10] = bf0[10];
   1228  bf1[11] = bf0[11];
   1229  bf1[12] = bf0[12];
   1230  bf1[13] = bf0[13];
   1231  bf1[14] = bf0[14];
   1232  bf1[15] = bf0[15];
   1233  bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
   1234  bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
   1235  bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
   1236  bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
   1237  bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
   1238  bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
   1239  bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
   1240  bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
   1241  bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
   1242  bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
   1243  bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
   1244  bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
   1245  bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
   1246  bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
   1247  bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
   1248  bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
   1249  bf1[32] = clamp_value(bf0[32] + bf0[33], stage_range[stage]);
   1250  bf1[33] = clamp_value(bf0[32] - bf0[33], stage_range[stage]);
   1251  bf1[34] = clamp_value(-bf0[34] + bf0[35], stage_range[stage]);
   1252  bf1[35] = clamp_value(bf0[34] + bf0[35], stage_range[stage]);
   1253  bf1[36] = clamp_value(bf0[36] + bf0[37], stage_range[stage]);
   1254  bf1[37] = clamp_value(bf0[36] - bf0[37], stage_range[stage]);
   1255  bf1[38] = clamp_value(-bf0[38] + bf0[39], stage_range[stage]);
   1256  bf1[39] = clamp_value(bf0[38] + bf0[39], stage_range[stage]);
   1257  bf1[40] = clamp_value(bf0[40] + bf0[41], stage_range[stage]);
   1258  bf1[41] = clamp_value(bf0[40] - bf0[41], stage_range[stage]);
   1259  bf1[42] = clamp_value(-bf0[42] + bf0[43], stage_range[stage]);
   1260  bf1[43] = clamp_value(bf0[42] + bf0[43], stage_range[stage]);
   1261  bf1[44] = clamp_value(bf0[44] + bf0[45], stage_range[stage]);
   1262  bf1[45] = clamp_value(bf0[44] - bf0[45], stage_range[stage]);
   1263  bf1[46] = clamp_value(-bf0[46] + bf0[47], stage_range[stage]);
   1264  bf1[47] = clamp_value(bf0[46] + bf0[47], stage_range[stage]);
   1265  bf1[48] = clamp_value(bf0[48] + bf0[49], stage_range[stage]);
   1266  bf1[49] = clamp_value(bf0[48] - bf0[49], stage_range[stage]);
   1267  bf1[50] = clamp_value(-bf0[50] + bf0[51], stage_range[stage]);
   1268  bf1[51] = clamp_value(bf0[50] + bf0[51], stage_range[stage]);
   1269  bf1[52] = clamp_value(bf0[52] + bf0[53], stage_range[stage]);
   1270  bf1[53] = clamp_value(bf0[52] - bf0[53], stage_range[stage]);
   1271  bf1[54] = clamp_value(-bf0[54] + bf0[55], stage_range[stage]);
   1272  bf1[55] = clamp_value(bf0[54] + bf0[55], stage_range[stage]);
   1273  bf1[56] = clamp_value(bf0[56] + bf0[57], stage_range[stage]);
   1274  bf1[57] = clamp_value(bf0[56] - bf0[57], stage_range[stage]);
   1275  bf1[58] = clamp_value(-bf0[58] + bf0[59], stage_range[stage]);
   1276  bf1[59] = clamp_value(bf0[58] + bf0[59], stage_range[stage]);
   1277  bf1[60] = clamp_value(bf0[60] + bf0[61], stage_range[stage]);
   1278  bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]);
   1279  bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]);
   1280  bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]);
   1281  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1282 
   1283  // stage 4
   1284  stage++;
   1285  bf0 = output;
   1286  bf1 = step;
   1287  bf1[0] = bf0[0];
   1288  bf1[1] = bf0[1];
   1289  bf1[2] = bf0[2];
   1290  bf1[3] = bf0[3];
   1291  bf1[4] = bf0[4];
   1292  bf1[5] = bf0[5];
   1293  bf1[6] = bf0[6];
   1294  bf1[7] = bf0[7];
   1295  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
   1296  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
   1297  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
   1298  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
   1299  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
   1300  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
   1301  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
   1302  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
   1303  bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
   1304  bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
   1305  bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
   1306  bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
   1307  bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
   1308  bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
   1309  bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
   1310  bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
   1311  bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
   1312  bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
   1313  bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
   1314  bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
   1315  bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
   1316  bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
   1317  bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
   1318  bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
   1319  bf1[32] = bf0[32];
   1320  bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
   1321  bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
   1322  bf1[35] = bf0[35];
   1323  bf1[36] = bf0[36];
   1324  bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
   1325  bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
   1326  bf1[39] = bf0[39];
   1327  bf1[40] = bf0[40];
   1328  bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
   1329  bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
   1330  bf1[43] = bf0[43];
   1331  bf1[44] = bf0[44];
   1332  bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
   1333  bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
   1334  bf1[47] = bf0[47];
   1335  bf1[48] = bf0[48];
   1336  bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit);
   1337  bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit);
   1338  bf1[51] = bf0[51];
   1339  bf1[52] = bf0[52];
   1340  bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit);
   1341  bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit);
   1342  bf1[55] = bf0[55];
   1343  bf1[56] = bf0[56];
   1344  bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit);
   1345  bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit);
   1346  bf1[59] = bf0[59];
   1347  bf1[60] = bf0[60];
   1348  bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit);
   1349  bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit);
   1350  bf1[63] = bf0[63];
   1351  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1352 
   1353  // stage 5
   1354  stage++;
   1355  bf0 = step;
   1356  bf1 = output;
   1357  bf1[0] = bf0[0];
   1358  bf1[1] = bf0[1];
   1359  bf1[2] = bf0[2];
   1360  bf1[3] = bf0[3];
   1361  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
   1362  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
   1363  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
   1364  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
   1365  bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
   1366  bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
   1367  bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
   1368  bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
   1369  bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
   1370  bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
   1371  bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
   1372  bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
   1373  bf1[16] = bf0[16];
   1374  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
   1375  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
   1376  bf1[19] = bf0[19];
   1377  bf1[20] = bf0[20];
   1378  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
   1379  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
   1380  bf1[23] = bf0[23];
   1381  bf1[24] = bf0[24];
   1382  bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
   1383  bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
   1384  bf1[27] = bf0[27];
   1385  bf1[28] = bf0[28];
   1386  bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
   1387  bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
   1388  bf1[31] = bf0[31];
   1389  bf1[32] = clamp_value(bf0[32] + bf0[35], stage_range[stage]);
   1390  bf1[33] = clamp_value(bf0[33] + bf0[34], stage_range[stage]);
   1391  bf1[34] = clamp_value(bf0[33] - bf0[34], stage_range[stage]);
   1392  bf1[35] = clamp_value(bf0[32] - bf0[35], stage_range[stage]);
   1393  bf1[36] = clamp_value(-bf0[36] + bf0[39], stage_range[stage]);
   1394  bf1[37] = clamp_value(-bf0[37] + bf0[38], stage_range[stage]);
   1395  bf1[38] = clamp_value(bf0[37] + bf0[38], stage_range[stage]);
   1396  bf1[39] = clamp_value(bf0[36] + bf0[39], stage_range[stage]);
   1397  bf1[40] = clamp_value(bf0[40] + bf0[43], stage_range[stage]);
   1398  bf1[41] = clamp_value(bf0[41] + bf0[42], stage_range[stage]);
   1399  bf1[42] = clamp_value(bf0[41] - bf0[42], stage_range[stage]);
   1400  bf1[43] = clamp_value(bf0[40] - bf0[43], stage_range[stage]);
   1401  bf1[44] = clamp_value(-bf0[44] + bf0[47], stage_range[stage]);
   1402  bf1[45] = clamp_value(-bf0[45] + bf0[46], stage_range[stage]);
   1403  bf1[46] = clamp_value(bf0[45] + bf0[46], stage_range[stage]);
   1404  bf1[47] = clamp_value(bf0[44] + bf0[47], stage_range[stage]);
   1405  bf1[48] = clamp_value(bf0[48] + bf0[51], stage_range[stage]);
   1406  bf1[49] = clamp_value(bf0[49] + bf0[50], stage_range[stage]);
   1407  bf1[50] = clamp_value(bf0[49] - bf0[50], stage_range[stage]);
   1408  bf1[51] = clamp_value(bf0[48] - bf0[51], stage_range[stage]);
   1409  bf1[52] = clamp_value(-bf0[52] + bf0[55], stage_range[stage]);
   1410  bf1[53] = clamp_value(-bf0[53] + bf0[54], stage_range[stage]);
   1411  bf1[54] = clamp_value(bf0[53] + bf0[54], stage_range[stage]);
   1412  bf1[55] = clamp_value(bf0[52] + bf0[55], stage_range[stage]);
   1413  bf1[56] = clamp_value(bf0[56] + bf0[59], stage_range[stage]);
   1414  bf1[57] = clamp_value(bf0[57] + bf0[58], stage_range[stage]);
   1415  bf1[58] = clamp_value(bf0[57] - bf0[58], stage_range[stage]);
   1416  bf1[59] = clamp_value(bf0[56] - bf0[59], stage_range[stage]);
   1417  bf1[60] = clamp_value(-bf0[60] + bf0[63], stage_range[stage]);
   1418  bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]);
   1419  bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]);
   1420  bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]);
   1421  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1422 
   1423  // stage 6
   1424  stage++;
   1425  bf0 = output;
   1426  bf1 = step;
   1427  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
   1428  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
   1429  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
   1430  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
   1431  bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
   1432  bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
   1433  bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
   1434  bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
   1435  bf1[8] = bf0[8];
   1436  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
   1437  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
   1438  bf1[11] = bf0[11];
   1439  bf1[12] = bf0[12];
   1440  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
   1441  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
   1442  bf1[15] = bf0[15];
   1443  bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
   1444  bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
   1445  bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
   1446  bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
   1447  bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
   1448  bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
   1449  bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
   1450  bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
   1451  bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
   1452  bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
   1453  bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
   1454  bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
   1455  bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
   1456  bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
   1457  bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
   1458  bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
   1459  bf1[32] = bf0[32];
   1460  bf1[33] = bf0[33];
   1461  bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
   1462  bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
   1463  bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
   1464  bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
   1465  bf1[38] = bf0[38];
   1466  bf1[39] = bf0[39];
   1467  bf1[40] = bf0[40];
   1468  bf1[41] = bf0[41];
   1469  bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
   1470  bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
   1471  bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
   1472  bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
   1473  bf1[46] = bf0[46];
   1474  bf1[47] = bf0[47];
   1475  bf1[48] = bf0[48];
   1476  bf1[49] = bf0[49];
   1477  bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit);
   1478  bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit);
   1479  bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit);
   1480  bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit);
   1481  bf1[54] = bf0[54];
   1482  bf1[55] = bf0[55];
   1483  bf1[56] = bf0[56];
   1484  bf1[57] = bf0[57];
   1485  bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit);
   1486  bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit);
   1487  bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit);
   1488  bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit);
   1489  bf1[62] = bf0[62];
   1490  bf1[63] = bf0[63];
   1491  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1492 
   1493  // stage 7
   1494  stage++;
   1495  bf0 = step;
   1496  bf1 = output;
   1497  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
   1498  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
   1499  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
   1500  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
   1501  bf1[4] = bf0[4];
   1502  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
   1503  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
   1504  bf1[7] = bf0[7];
   1505  bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
   1506  bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
   1507  bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
   1508  bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
   1509  bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
   1510  bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
   1511  bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
   1512  bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
   1513  bf1[16] = bf0[16];
   1514  bf1[17] = bf0[17];
   1515  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
   1516  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
   1517  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
   1518  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
   1519  bf1[22] = bf0[22];
   1520  bf1[23] = bf0[23];
   1521  bf1[24] = bf0[24];
   1522  bf1[25] = bf0[25];
   1523  bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
   1524  bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
   1525  bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
   1526  bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
   1527  bf1[30] = bf0[30];
   1528  bf1[31] = bf0[31];
   1529  bf1[32] = clamp_value(bf0[32] + bf0[39], stage_range[stage]);
   1530  bf1[33] = clamp_value(bf0[33] + bf0[38], stage_range[stage]);
   1531  bf1[34] = clamp_value(bf0[34] + bf0[37], stage_range[stage]);
   1532  bf1[35] = clamp_value(bf0[35] + bf0[36], stage_range[stage]);
   1533  bf1[36] = clamp_value(bf0[35] - bf0[36], stage_range[stage]);
   1534  bf1[37] = clamp_value(bf0[34] - bf0[37], stage_range[stage]);
   1535  bf1[38] = clamp_value(bf0[33] - bf0[38], stage_range[stage]);
   1536  bf1[39] = clamp_value(bf0[32] - bf0[39], stage_range[stage]);
   1537  bf1[40] = clamp_value(-bf0[40] + bf0[47], stage_range[stage]);
   1538  bf1[41] = clamp_value(-bf0[41] + bf0[46], stage_range[stage]);
   1539  bf1[42] = clamp_value(-bf0[42] + bf0[45], stage_range[stage]);
   1540  bf1[43] = clamp_value(-bf0[43] + bf0[44], stage_range[stage]);
   1541  bf1[44] = clamp_value(bf0[43] + bf0[44], stage_range[stage]);
   1542  bf1[45] = clamp_value(bf0[42] + bf0[45], stage_range[stage]);
   1543  bf1[46] = clamp_value(bf0[41] + bf0[46], stage_range[stage]);
   1544  bf1[47] = clamp_value(bf0[40] + bf0[47], stage_range[stage]);
   1545  bf1[48] = clamp_value(bf0[48] + bf0[55], stage_range[stage]);
   1546  bf1[49] = clamp_value(bf0[49] + bf0[54], stage_range[stage]);
   1547  bf1[50] = clamp_value(bf0[50] + bf0[53], stage_range[stage]);
   1548  bf1[51] = clamp_value(bf0[51] + bf0[52], stage_range[stage]);
   1549  bf1[52] = clamp_value(bf0[51] - bf0[52], stage_range[stage]);
   1550  bf1[53] = clamp_value(bf0[50] - bf0[53], stage_range[stage]);
   1551  bf1[54] = clamp_value(bf0[49] - bf0[54], stage_range[stage]);
   1552  bf1[55] = clamp_value(bf0[48] - bf0[55], stage_range[stage]);
   1553  bf1[56] = clamp_value(-bf0[56] + bf0[63], stage_range[stage]);
   1554  bf1[57] = clamp_value(-bf0[57] + bf0[62], stage_range[stage]);
   1555  bf1[58] = clamp_value(-bf0[58] + bf0[61], stage_range[stage]);
   1556  bf1[59] = clamp_value(-bf0[59] + bf0[60], stage_range[stage]);
   1557  bf1[60] = clamp_value(bf0[59] + bf0[60], stage_range[stage]);
   1558  bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]);
   1559  bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]);
   1560  bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]);
   1561  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1562 
   1563  // stage 8
   1564  stage++;
   1565  bf0 = output;
   1566  bf1 = step;
   1567  bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
   1568  bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
   1569  bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
   1570  bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
   1571  bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
   1572  bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
   1573  bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
   1574  bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
   1575  bf1[8] = bf0[8];
   1576  bf1[9] = bf0[9];
   1577  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
   1578  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
   1579  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
   1580  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
   1581  bf1[14] = bf0[14];
   1582  bf1[15] = bf0[15];
   1583  bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
   1584  bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
   1585  bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
   1586  bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
   1587  bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
   1588  bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
   1589  bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
   1590  bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
   1591  bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
   1592  bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
   1593  bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
   1594  bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
   1595  bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
   1596  bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
   1597  bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
   1598  bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
   1599  bf1[32] = bf0[32];
   1600  bf1[33] = bf0[33];
   1601  bf1[34] = bf0[34];
   1602  bf1[35] = bf0[35];
   1603  bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
   1604  bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
   1605  bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
   1606  bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
   1607  bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
   1608  bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
   1609  bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
   1610  bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
   1611  bf1[44] = bf0[44];
   1612  bf1[45] = bf0[45];
   1613  bf1[46] = bf0[46];
   1614  bf1[47] = bf0[47];
   1615  bf1[48] = bf0[48];
   1616  bf1[49] = bf0[49];
   1617  bf1[50] = bf0[50];
   1618  bf1[51] = bf0[51];
   1619  bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit);
   1620  bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit);
   1621  bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit);
   1622  bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit);
   1623  bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit);
   1624  bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit);
   1625  bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit);
   1626  bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit);
   1627  bf1[60] = bf0[60];
   1628  bf1[61] = bf0[61];
   1629  bf1[62] = bf0[62];
   1630  bf1[63] = bf0[63];
   1631  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1632 
   1633  // stage 9
   1634  stage++;
   1635  bf0 = step;
   1636  bf1 = output;
   1637  bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
   1638  bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
   1639  bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
   1640  bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
   1641  bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
   1642  bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
   1643  bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
   1644  bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
   1645  bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
   1646  bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
   1647  bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
   1648  bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
   1649  bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
   1650  bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
   1651  bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
   1652  bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
   1653  bf1[16] = bf0[16];
   1654  bf1[17] = bf0[17];
   1655  bf1[18] = bf0[18];
   1656  bf1[19] = bf0[19];
   1657  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
   1658  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
   1659  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
   1660  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
   1661  bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
   1662  bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
   1663  bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
   1664  bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
   1665  bf1[28] = bf0[28];
   1666  bf1[29] = bf0[29];
   1667  bf1[30] = bf0[30];
   1668  bf1[31] = bf0[31];
   1669  bf1[32] = clamp_value(bf0[32] + bf0[47], stage_range[stage]);
   1670  bf1[33] = clamp_value(bf0[33] + bf0[46], stage_range[stage]);
   1671  bf1[34] = clamp_value(bf0[34] + bf0[45], stage_range[stage]);
   1672  bf1[35] = clamp_value(bf0[35] + bf0[44], stage_range[stage]);
   1673  bf1[36] = clamp_value(bf0[36] + bf0[43], stage_range[stage]);
   1674  bf1[37] = clamp_value(bf0[37] + bf0[42], stage_range[stage]);
   1675  bf1[38] = clamp_value(bf0[38] + bf0[41], stage_range[stage]);
   1676  bf1[39] = clamp_value(bf0[39] + bf0[40], stage_range[stage]);
   1677  bf1[40] = clamp_value(bf0[39] - bf0[40], stage_range[stage]);
   1678  bf1[41] = clamp_value(bf0[38] - bf0[41], stage_range[stage]);
   1679  bf1[42] = clamp_value(bf0[37] - bf0[42], stage_range[stage]);
   1680  bf1[43] = clamp_value(bf0[36] - bf0[43], stage_range[stage]);
   1681  bf1[44] = clamp_value(bf0[35] - bf0[44], stage_range[stage]);
   1682  bf1[45] = clamp_value(bf0[34] - bf0[45], stage_range[stage]);
   1683  bf1[46] = clamp_value(bf0[33] - bf0[46], stage_range[stage]);
   1684  bf1[47] = clamp_value(bf0[32] - bf0[47], stage_range[stage]);
   1685  bf1[48] = clamp_value(-bf0[48] + bf0[63], stage_range[stage]);
   1686  bf1[49] = clamp_value(-bf0[49] + bf0[62], stage_range[stage]);
   1687  bf1[50] = clamp_value(-bf0[50] + bf0[61], stage_range[stage]);
   1688  bf1[51] = clamp_value(-bf0[51] + bf0[60], stage_range[stage]);
   1689  bf1[52] = clamp_value(-bf0[52] + bf0[59], stage_range[stage]);
   1690  bf1[53] = clamp_value(-bf0[53] + bf0[58], stage_range[stage]);
   1691  bf1[54] = clamp_value(-bf0[54] + bf0[57], stage_range[stage]);
   1692  bf1[55] = clamp_value(-bf0[55] + bf0[56], stage_range[stage]);
   1693  bf1[56] = clamp_value(bf0[55] + bf0[56], stage_range[stage]);
   1694  bf1[57] = clamp_value(bf0[54] + bf0[57], stage_range[stage]);
   1695  bf1[58] = clamp_value(bf0[53] + bf0[58], stage_range[stage]);
   1696  bf1[59] = clamp_value(bf0[52] + bf0[59], stage_range[stage]);
   1697  bf1[60] = clamp_value(bf0[51] + bf0[60], stage_range[stage]);
   1698  bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]);
   1699  bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]);
   1700  bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]);
   1701  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1702 
   1703  // stage 10
   1704  stage++;
   1705  bf0 = output;
   1706  bf1 = step;
   1707  bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
   1708  bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
   1709  bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
   1710  bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
   1711  bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
   1712  bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
   1713  bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
   1714  bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
   1715  bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
   1716  bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
   1717  bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
   1718  bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
   1719  bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
   1720  bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
   1721  bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
   1722  bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
   1723  bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
   1724  bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
   1725  bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
   1726  bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
   1727  bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
   1728  bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
   1729  bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
   1730  bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
   1731  bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
   1732  bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
   1733  bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
   1734  bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
   1735  bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
   1736  bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
   1737  bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
   1738  bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
   1739  bf1[32] = bf0[32];
   1740  bf1[33] = bf0[33];
   1741  bf1[34] = bf0[34];
   1742  bf1[35] = bf0[35];
   1743  bf1[36] = bf0[36];
   1744  bf1[37] = bf0[37];
   1745  bf1[38] = bf0[38];
   1746  bf1[39] = bf0[39];
   1747  bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
   1748  bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
   1749  bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
   1750  bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
   1751  bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
   1752  bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
   1753  bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
   1754  bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
   1755  bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
   1756  bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
   1757  bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
   1758  bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
   1759  bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
   1760  bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
   1761  bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
   1762  bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
   1763  bf1[56] = bf0[56];
   1764  bf1[57] = bf0[57];
   1765  bf1[58] = bf0[58];
   1766  bf1[59] = bf0[59];
   1767  bf1[60] = bf0[60];
   1768  bf1[61] = bf0[61];
   1769  bf1[62] = bf0[62];
   1770  bf1[63] = bf0[63];
   1771  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
   1772 
   1773  // stage 11
   1774  stage++;
   1775  bf0 = step;
   1776  bf1 = output;
   1777  bf1[0] = clamp_value(bf0[0] + bf0[63], stage_range[stage]);
   1778  bf1[1] = clamp_value(bf0[1] + bf0[62], stage_range[stage]);
   1779  bf1[2] = clamp_value(bf0[2] + bf0[61], stage_range[stage]);
   1780  bf1[3] = clamp_value(bf0[3] + bf0[60], stage_range[stage]);
   1781  bf1[4] = clamp_value(bf0[4] + bf0[59], stage_range[stage]);
   1782  bf1[5] = clamp_value(bf0[5] + bf0[58], stage_range[stage]);
   1783  bf1[6] = clamp_value(bf0[6] + bf0[57], stage_range[stage]);
   1784  bf1[7] = clamp_value(bf0[7] + bf0[56], stage_range[stage]);
   1785  bf1[8] = clamp_value(bf0[8] + bf0[55], stage_range[stage]);
   1786  bf1[9] = clamp_value(bf0[9] + bf0[54], stage_range[stage]);
   1787  bf1[10] = clamp_value(bf0[10] + bf0[53], stage_range[stage]);
   1788  bf1[11] = clamp_value(bf0[11] + bf0[52], stage_range[stage]);
   1789  bf1[12] = clamp_value(bf0[12] + bf0[51], stage_range[stage]);
   1790  bf1[13] = clamp_value(bf0[13] + bf0[50], stage_range[stage]);
   1791  bf1[14] = clamp_value(bf0[14] + bf0[49], stage_range[stage]);
   1792  bf1[15] = clamp_value(bf0[15] + bf0[48], stage_range[stage]);
   1793  bf1[16] = clamp_value(bf0[16] + bf0[47], stage_range[stage]);
   1794  bf1[17] = clamp_value(bf0[17] + bf0[46], stage_range[stage]);
   1795  bf1[18] = clamp_value(bf0[18] + bf0[45], stage_range[stage]);
   1796  bf1[19] = clamp_value(bf0[19] + bf0[44], stage_range[stage]);
   1797  bf1[20] = clamp_value(bf0[20] + bf0[43], stage_range[stage]);
   1798  bf1[21] = clamp_value(bf0[21] + bf0[42], stage_range[stage]);
   1799  bf1[22] = clamp_value(bf0[22] + bf0[41], stage_range[stage]);
   1800  bf1[23] = clamp_value(bf0[23] + bf0[40], stage_range[stage]);
   1801  bf1[24] = clamp_value(bf0[24] + bf0[39], stage_range[stage]);
   1802  bf1[25] = clamp_value(bf0[25] + bf0[38], stage_range[stage]);
   1803  bf1[26] = clamp_value(bf0[26] + bf0[37], stage_range[stage]);
   1804  bf1[27] = clamp_value(bf0[27] + bf0[36], stage_range[stage]);
   1805  bf1[28] = clamp_value(bf0[28] + bf0[35], stage_range[stage]);
   1806  bf1[29] = clamp_value(bf0[29] + bf0[34], stage_range[stage]);
   1807  bf1[30] = clamp_value(bf0[30] + bf0[33], stage_range[stage]);
   1808  bf1[31] = clamp_value(bf0[31] + bf0[32], stage_range[stage]);
   1809  bf1[32] = clamp_value(bf0[31] - bf0[32], stage_range[stage]);
   1810  bf1[33] = clamp_value(bf0[30] - bf0[33], stage_range[stage]);
   1811  bf1[34] = clamp_value(bf0[29] - bf0[34], stage_range[stage]);
   1812  bf1[35] = clamp_value(bf0[28] - bf0[35], stage_range[stage]);
   1813  bf1[36] = clamp_value(bf0[27] - bf0[36], stage_range[stage]);
   1814  bf1[37] = clamp_value(bf0[26] - bf0[37], stage_range[stage]);
   1815  bf1[38] = clamp_value(bf0[25] - bf0[38], stage_range[stage]);
   1816  bf1[39] = clamp_value(bf0[24] - bf0[39], stage_range[stage]);
   1817  bf1[40] = clamp_value(bf0[23] - bf0[40], stage_range[stage]);
   1818  bf1[41] = clamp_value(bf0[22] - bf0[41], stage_range[stage]);
   1819  bf1[42] = clamp_value(bf0[21] - bf0[42], stage_range[stage]);
   1820  bf1[43] = clamp_value(bf0[20] - bf0[43], stage_range[stage]);
   1821  bf1[44] = clamp_value(bf0[19] - bf0[44], stage_range[stage]);
   1822  bf1[45] = clamp_value(bf0[18] - bf0[45], stage_range[stage]);
   1823  bf1[46] = clamp_value(bf0[17] - bf0[46], stage_range[stage]);
   1824  bf1[47] = clamp_value(bf0[16] - bf0[47], stage_range[stage]);
   1825  bf1[48] = clamp_value(bf0[15] - bf0[48], stage_range[stage]);
   1826  bf1[49] = clamp_value(bf0[14] - bf0[49], stage_range[stage]);
   1827  bf1[50] = clamp_value(bf0[13] - bf0[50], stage_range[stage]);
   1828  bf1[51] = clamp_value(bf0[12] - bf0[51], stage_range[stage]);
   1829  bf1[52] = clamp_value(bf0[11] - bf0[52], stage_range[stage]);
   1830  bf1[53] = clamp_value(bf0[10] - bf0[53], stage_range[stage]);
   1831  bf1[54] = clamp_value(bf0[9] - bf0[54], stage_range[stage]);
   1832  bf1[55] = clamp_value(bf0[8] - bf0[55], stage_range[stage]);
   1833  bf1[56] = clamp_value(bf0[7] - bf0[56], stage_range[stage]);
   1834  bf1[57] = clamp_value(bf0[6] - bf0[57], stage_range[stage]);
   1835  bf1[58] = clamp_value(bf0[5] - bf0[58], stage_range[stage]);
   1836  bf1[59] = clamp_value(bf0[4] - bf0[59], stage_range[stage]);
   1837  bf1[60] = clamp_value(bf0[3] - bf0[60], stage_range[stage]);
   1838  bf1[61] = clamp_value(bf0[2] - bf0[61], stage_range[stage]);
   1839  bf1[62] = clamp_value(bf0[1] - bf0[62], stage_range[stage]);
   1840  bf1[63] = clamp_value(bf0[0] - bf0[63], stage_range[stage]);
   1841 }