tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

fft_common.h (78125B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AOM_DSP_FFT_COMMON_H_
     13 #define AOM_AOM_DSP_FFT_COMMON_H_
     14 
     15 #ifdef __cplusplus
     16 extern "C" {
     17 #endif
     18 
     19 /*!\brief A function pointer for computing 1d fft and ifft.
     20 *
     21 * The function will point to an implementation for a specific transform size,
     22 * and may perform the transforms using vectorized instructions.
     23 *
     24 * For a non-vectorized forward transform of size n, the input and output
     25 * buffers will be size n. The output takes advantage of conjugate symmetry and
     26 * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
     27 * (r_{j}, i_{j}) is the complex output for index j. i_0 and i_{n/2} are not
        * stored: for real input they are always zero.
     28 *
     29 * An inverse transform will assume that the complex "input" is packed
     30 * similarly. Its output will be real.
     31 *
     32 * Non-vectorized transforms (e.g., on a single row) would use a stride = 1.
     33 *
     34 * Vectorized implementations are parallelized along the columns so that the fft
     35 * can be performed on multiple columns at a time. In such cases the data block
     36 * for input and output is typically square (n x n) and the stride will
     37 * correspond to the spacing between rows. At minimum, the input size must be
     38 * n x simd_vector_length.
     39 *
     40 * \param[in]  input   Input buffer. See above for size restrictions.
     41 * \param[out] output  Output buffer. See above for size restrictions.
     42 * \param[in]  stride  Spacing, in floats, between consecutive rows (vectorized)
     43 *                     or between consecutive samples (single row, stride = 1)
     44 */
     45 typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
     46                                  int stride);
     47 
     48 // Declare some of the forward non-vectorized transforms which are used in some
     49 // of the vectorized implementations. Each one matches aom_fft_1d_func_t.
     50 void aom_fft1d_2_float(const float *input, float *output, int stride);
     51 void aom_fft1d_4_float(const float *input, float *output, int stride);
     52 void aom_fft1d_8_float(const float *input, float *output, int stride);
     53 void aom_fft1d_16_float(const float *input, float *output, int stride);
     54 void aom_fft1d_32_float(const float *input, float *output, int stride);
     55 
     56 /*!\brief Function pointer for transposing a matrix of floats.
     57 *
        * NOTE(review): whether input and output may alias is not stated in this
        * header -- confirm with the implementations before transposing in place.
        *
     58 * \param[in]  input  Input buffer (size n x n)
     59 * \param[out] output Output buffer (size n x n)
     60 * \param[in]  n      Extent of one dimension of the square matrix.
     61 */
     62 typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
     63                                         int n);
     64 
     65 /*!\brief Function pointer for re-arranging intermediate 2d transform results.
     66 *
     67 * After re-arrangement, the real and imaginary components will be packed
     68 * tightly next to each other.
        * NOTE(review): presumably stored as adjacent (real, imaginary) pairs, which
        * would explain the doubled output size -- confirm with the implementations.
     69 *
     70 * \param[in]  input  Input buffer (size n x n)
     71 * \param[out] output Output buffer (size 2 x n x n)
     72 * \param[in]  n      Extent of one dimension of the square matrix.
     73 */
     74 typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);
     75 
     76 /*!\brief Performs a 2d fft with the given functions.
     77 *
     78 * This generator function allows for multiple different implementations of 2d
     79 * fft with different vector operations, without having to redefine the main
     80 * body multiple times.
     81 *
     82 * \param[in]  input     Input buffer to run the transform on (size n x n)
     83 * \param[out] temp      Working buffer for computing the transform (size n x n)
     84 * \param[out] output    Output buffer (size 2 x n x n)
        * \param[in]  n         Extent of one dimension of the square input block
     85 * \param[in]  tform     Forward transform function
     86 * \param[in]  transpose Transpose function (for n x n matrix)
     87 * \param[in]  unpack    Unpack function used to massage outputs to correct form
     88 * \param[in]  vec_size  Vector size (the transform is done vec_size units at
     89 *                       a time)
     90 */
     91 void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
     92                    aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
     93                    aom_fft_unpack_func_t unpack, int vec_size);
     94 
     95 /*!\brief Perform a 2d inverse fft with the given helper functions.
     96 *
     97 * \param[in]  input      Input buffer to run the transform on (size 2 x n x n)
     98 * \param[out] temp       Working buffer for computations (size 2 x n x n)
     99 * \param[out] output     Output buffer (size n x n)
        * \param[in]  n          Extent of one dimension of the square output block
    100 * \param[in]  fft_single Forward transform function (non vectorized)
    101 * \param[in]  fft_multi  Forward transform function (vectorized)
    102 * \param[in]  ifft_multi Inverse transform function (vectorized)
    103 * \param[in]  transpose  Transpose function (for n x n matrix)
    104 * \param[in]  vec_size   Vector size (the transform is done vec_size
    105 *                        units at a time)
    106 */
    107 void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
    108                     aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
    109                     aom_fft_1d_func_t ifft_multi,
    110                     aom_fft_transpose_func_t transpose, int vec_size);
    111 #ifdef __cplusplus
    112 }
    113 #endif
    114 
    115 // The macros below define 1D fft/ifft for different data types and for
    116 // different simd vector intrinsic types.
    117 
    118 #define GEN_FFT_2(ret, suffix, T, T_VEC, load, store)               \
    119  ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \
    120    const T_VEC i0 = load(input + 0 * stride);                      \
    121    const T_VEC i1 = load(input + 1 * stride);                      \
    122    store(output + 0 * stride, i0 + i1);                            \
    123    store(output + 1 * stride, i0 - i1);                            \
    124  }
    125 
    126 #define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
    127  ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) {       \
    128    const T_VEC kWeight0 = constant(0.0f);                                \
    129    const T_VEC i0 = load(input + 0 * stride);                            \
    130    const T_VEC i1 = load(input + 1 * stride);                            \
    131    const T_VEC i2 = load(input + 2 * stride);                            \
    132    const T_VEC i3 = load(input + 3 * stride);                            \
    133    const T_VEC w0 = add(i0, i2);                                         \
    134    const T_VEC w1 = sub(i0, i2);                                         \
    135    const T_VEC w2 = add(i1, i3);                                         \
    136    const T_VEC w3 = sub(i1, i3);                                         \
    137    store(output + 0 * stride, add(w0, w2));                              \
    138    store(output + 1 * stride, w1);                                       \
    139    store(output + 2 * stride, sub(w0, w2));                              \
    140    store(output + 3 * stride, sub(kWeight0, w3));                        \
    141  }
    142 
    143 #define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
    144  ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) {            \
    145    const T_VEC kWeight0 = constant(0.0f);                                     \
    146    const T_VEC kWeight2 = constant(0.707107f);                                \
    147    const T_VEC i0 = load(input + 0 * stride);                                 \
    148    const T_VEC i1 = load(input + 1 * stride);                                 \
    149    const T_VEC i2 = load(input + 2 * stride);                                 \
    150    const T_VEC i3 = load(input + 3 * stride);                                 \
    151    const T_VEC i4 = load(input + 4 * stride);                                 \
    152    const T_VEC i5 = load(input + 5 * stride);                                 \
    153    const T_VEC i6 = load(input + 6 * stride);                                 \
    154    const T_VEC i7 = load(input + 7 * stride);                                 \
    155    const T_VEC w0 = add(i0, i4);                                              \
    156    const T_VEC w1 = sub(i0, i4);                                              \
    157    const T_VEC w2 = add(i2, i6);                                              \
    158    const T_VEC w3 = sub(i2, i6);                                              \
    159    const T_VEC w4 = add(w0, w2);                                              \
    160    const T_VEC w5 = sub(w0, w2);                                              \
    161    const T_VEC w7 = add(i1, i5);                                              \
    162    const T_VEC w8 = sub(i1, i5);                                              \
    163    const T_VEC w9 = add(i3, i7);                                              \
    164    const T_VEC w10 = sub(i3, i7);                                             \
    165    const T_VEC w11 = add(w7, w9);                                             \
    166    const T_VEC w12 = sub(w7, w9);                                             \
    167    store(output + 0 * stride, add(w4, w11));                                  \
    168    store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10))));          \
    169    store(output + 2 * stride, w5);                                            \
    170    store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10))));          \
    171    store(output + 4 * stride, sub(w4, w11));                                  \
    172    store(output + 5 * stride,                                                 \
    173          sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8))));                \
    174    store(output + 6 * stride, sub(kWeight0, w12));                            \
    175    store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8))));          \
    176  }
    177 
    178 #define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
    179                   mul)                                                    \
    180  ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) {       \
    181    const T_VEC kWeight0 = constant(0.0f);                                 \
    182    const T_VEC kWeight2 = constant(0.707107f);                            \
    183    const T_VEC kWeight3 = constant(0.92388f);                             \
    184    const T_VEC kWeight4 = constant(0.382683f);                            \
    185    const T_VEC i0 = load(input + 0 * stride);                             \
    186    const T_VEC i1 = load(input + 1 * stride);                             \
    187    const T_VEC i2 = load(input + 2 * stride);                             \
    188    const T_VEC i3 = load(input + 3 * stride);                             \
    189    const T_VEC i4 = load(input + 4 * stride);                             \
    190    const T_VEC i5 = load(input + 5 * stride);                             \
    191    const T_VEC i6 = load(input + 6 * stride);                             \
    192    const T_VEC i7 = load(input + 7 * stride);                             \
    193    const T_VEC i8 = load(input + 8 * stride);                             \
    194    const T_VEC i9 = load(input + 9 * stride);                             \
    195    const T_VEC i10 = load(input + 10 * stride);                           \
    196    const T_VEC i11 = load(input + 11 * stride);                           \
    197    const T_VEC i12 = load(input + 12 * stride);                           \
    198    const T_VEC i13 = load(input + 13 * stride);                           \
    199    const T_VEC i14 = load(input + 14 * stride);                           \
    200    const T_VEC i15 = load(input + 15 * stride);                           \
    201    const T_VEC w0 = add(i0, i8);                                          \
    202    const T_VEC w1 = sub(i0, i8);                                          \
    203    const T_VEC w2 = add(i4, i12);                                         \
    204    const T_VEC w3 = sub(i4, i12);                                         \
    205    const T_VEC w4 = add(w0, w2);                                          \
    206    const T_VEC w5 = sub(w0, w2);                                          \
    207    const T_VEC w7 = add(i2, i10);                                         \
    208    const T_VEC w8 = sub(i2, i10);                                         \
    209    const T_VEC w9 = add(i6, i14);                                         \
    210    const T_VEC w10 = sub(i6, i14);                                        \
    211    const T_VEC w11 = add(w7, w9);                                         \
    212    const T_VEC w12 = sub(w7, w9);                                         \
    213    const T_VEC w14 = add(w4, w11);                                        \
    214    const T_VEC w15 = sub(w4, w11);                                        \
    215    const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))),           \
    216                           sub(sub(kWeight0, w3),                          \
    217                               mul(kWeight2, add(w10, w8))) };             \
    218    const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))),           \
    219                           sub(w3, mul(kWeight2, add(w10, w8))) };         \
    220    const T_VEC w19 = add(i1, i9);                                         \
    221    const T_VEC w20 = sub(i1, i9);                                         \
    222    const T_VEC w21 = add(i5, i13);                                        \
    223    const T_VEC w22 = sub(i5, i13);                                        \
    224    const T_VEC w23 = add(w19, w21);                                       \
    225    const T_VEC w24 = sub(w19, w21);                                       \
    226    const T_VEC w26 = add(i3, i11);                                        \
    227    const T_VEC w27 = sub(i3, i11);                                        \
    228    const T_VEC w28 = add(i7, i15);                                        \
    229    const T_VEC w29 = sub(i7, i15);                                        \
    230    const T_VEC w30 = add(w26, w28);                                       \
    231    const T_VEC w31 = sub(w26, w28);                                       \
    232    const T_VEC w33 = add(w23, w30);                                       \
    233    const T_VEC w34 = sub(w23, w30);                                       \
    234    const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))),         \
    235                           sub(sub(kWeight0, w22),                         \
    236                               mul(kWeight2, add(w29, w27))) };            \
    237    const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))),         \
    238                           sub(w22, mul(kWeight2, add(w29, w27))) };       \
    239    store(output + 0 * stride, add(w14, w33));                             \
    240    store(output + 1 * stride,                                             \
    241          add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \
    242    store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31))));     \
    243    store(output + 3 * stride,                                             \
    244          add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \
    245    store(output + 4 * stride, w15);                                       \
    246    store(output + 5 * stride,                                             \
    247          add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])),            \
    248                          mul(kWeight3, w37[1]))));                        \
    249    store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31))));     \
    250    store(output + 7 * stride,                                             \
    251          add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])),            \
    252                          mul(kWeight4, w35[1]))));                        \
    253    store(output + 8 * stride, sub(w14, w33));                             \
    254    store(output + 9 * stride,                                             \
    255          add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \
    256    store(output + 10 * stride,                                            \
    257          sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24))));          \
    258    store(output + 11 * stride,                                            \
    259          add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \
    260    store(output + 12 * stride, sub(kWeight0, w34));                       \
    261    store(output + 13 * stride,                                            \
    262          sub(sub(kWeight0, w18[1]),                                       \
    263              sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))));         \
    264    store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24))));   \
    265    store(output + 15 * stride,                                            \
    266          sub(sub(kWeight0, w16[1]),                                       \
    267              sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))));         \
    268  }
    269 
    270 #define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub,   \
    271                   mul)                                                      \
    272  ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) {         \
    273    const T_VEC kWeight0 = constant(0.0f);                                   \
    274    const T_VEC kWeight2 = constant(0.707107f);                              \
    275    const T_VEC kWeight3 = constant(0.92388f);                               \
    276    const T_VEC kWeight4 = constant(0.382683f);                              \
    277    const T_VEC kWeight5 = constant(0.980785f);                              \
    278    const T_VEC kWeight6 = constant(0.19509f);                               \
    279    const T_VEC kWeight7 = constant(0.83147f);                               \
    280    const T_VEC kWeight8 = constant(0.55557f);                               \
    281    const T_VEC i0 = load(input + 0 * stride);                               \
    282    const T_VEC i1 = load(input + 1 * stride);                               \
    283    const T_VEC i2 = load(input + 2 * stride);                               \
    284    const T_VEC i3 = load(input + 3 * stride);                               \
    285    const T_VEC i4 = load(input + 4 * stride);                               \
    286    const T_VEC i5 = load(input + 5 * stride);                               \
    287    const T_VEC i6 = load(input + 6 * stride);                               \
    288    const T_VEC i7 = load(input + 7 * stride);                               \
    289    const T_VEC i8 = load(input + 8 * stride);                               \
    290    const T_VEC i9 = load(input + 9 * stride);                               \
    291    const T_VEC i10 = load(input + 10 * stride);                             \
    292    const T_VEC i11 = load(input + 11 * stride);                             \
    293    const T_VEC i12 = load(input + 12 * stride);                             \
    294    const T_VEC i13 = load(input + 13 * stride);                             \
    295    const T_VEC i14 = load(input + 14 * stride);                             \
    296    const T_VEC i15 = load(input + 15 * stride);                             \
    297    const T_VEC i16 = load(input + 16 * stride);                             \
    298    const T_VEC i17 = load(input + 17 * stride);                             \
    299    const T_VEC i18 = load(input + 18 * stride);                             \
    300    const T_VEC i19 = load(input + 19 * stride);                             \
    301    const T_VEC i20 = load(input + 20 * stride);                             \
    302    const T_VEC i21 = load(input + 21 * stride);                             \
    303    const T_VEC i22 = load(input + 22 * stride);                             \
    304    const T_VEC i23 = load(input + 23 * stride);                             \
    305    const T_VEC i24 = load(input + 24 * stride);                             \
    306    const T_VEC i25 = load(input + 25 * stride);                             \
    307    const T_VEC i26 = load(input + 26 * stride);                             \
    308    const T_VEC i27 = load(input + 27 * stride);                             \
    309    const T_VEC i28 = load(input + 28 * stride);                             \
    310    const T_VEC i29 = load(input + 29 * stride);                             \
    311    const T_VEC i30 = load(input + 30 * stride);                             \
    312    const T_VEC i31 = load(input + 31 * stride);                             \
    313    const T_VEC w0 = add(i0, i16);                                           \
    314    const T_VEC w1 = sub(i0, i16);                                           \
    315    const T_VEC w2 = add(i8, i24);                                           \
    316    const T_VEC w3 = sub(i8, i24);                                           \
    317    const T_VEC w4 = add(w0, w2);                                            \
    318    const T_VEC w5 = sub(w0, w2);                                            \
    319    const T_VEC w7 = add(i4, i20);                                           \
    320    const T_VEC w8 = sub(i4, i20);                                           \
    321    const T_VEC w9 = add(i12, i28);                                          \
    322    const T_VEC w10 = sub(i12, i28);                                         \
    323    const T_VEC w11 = add(w7, w9);                                           \
    324    const T_VEC w12 = sub(w7, w9);                                           \
    325    const T_VEC w14 = add(w4, w11);                                          \
    326    const T_VEC w15 = sub(w4, w11);                                          \
    327    const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))),             \
    328                           sub(sub(kWeight0, w3),                            \
    329                               mul(kWeight2, add(w10, w8))) };               \
    330    const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))),             \
    331                           sub(w3, mul(kWeight2, add(w10, w8))) };           \
    332    const T_VEC w19 = add(i2, i18);                                          \
    333    const T_VEC w20 = sub(i2, i18);                                          \
    334    const T_VEC w21 = add(i10, i26);                                         \
    335    const T_VEC w22 = sub(i10, i26);                                         \
    336    const T_VEC w23 = add(w19, w21);                                         \
    337    const T_VEC w24 = sub(w19, w21);                                         \
    338    const T_VEC w26 = add(i6, i22);                                          \
    339    const T_VEC w27 = sub(i6, i22);                                          \
    340    const T_VEC w28 = add(i14, i30);                                         \
    341    const T_VEC w29 = sub(i14, i30);                                         \
    342    const T_VEC w30 = add(w26, w28);                                         \
    343    const T_VEC w31 = sub(w26, w28);                                         \
    344    const T_VEC w33 = add(w23, w30);                                         \
    345    const T_VEC w34 = sub(w23, w30);                                         \
    346    const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))),           \
    347                           sub(sub(kWeight0, w22),                           \
    348                               mul(kWeight2, add(w29, w27))) };              \
    349    const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))),           \
    350                           sub(w22, mul(kWeight2, add(w29, w27))) };         \
    351    const T_VEC w38 = add(w14, w33);                                         \
    352    const T_VEC w39 = sub(w14, w33);                                         \
    353    const T_VEC w40[2] = {                                                   \
    354      add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))),        \
    355      add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))         \
    356    };                                                                       \
    357    const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))),            \
    358                           sub(sub(kWeight0, w12),                           \
    359                               mul(kWeight2, add(w31, w24))) };              \
    360    const T_VEC w42[2] = {                                                   \
    361      add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))),        \
    362      add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))         \
    363    };                                                                       \
    364    const T_VEC w44[2] = {                                                   \
    365      add(w18[0],                                                            \
    366          sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \
    367      sub(sub(kWeight0, w18[1]),                                             \
    368          sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))                 \
    369    };                                                                       \
    370    const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))),            \
    371                           sub(w12, mul(kWeight2, add(w31, w24))) };         \
    372    const T_VEC w46[2] = {                                                   \
    373      add(w16[0],                                                            \
    374          sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \
    375      sub(sub(kWeight0, w16[1]),                                             \
    376          sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))                 \
    377    };                                                                       \
    378    const T_VEC w47 = add(i1, i17);                                          \
    379    const T_VEC w48 = sub(i1, i17);                                          \
    380    const T_VEC w49 = add(i9, i25);                                          \
    381    const T_VEC w50 = sub(i9, i25);                                          \
    382    const T_VEC w51 = add(w47, w49);                                         \
    383    const T_VEC w52 = sub(w47, w49);                                         \
    384    const T_VEC w54 = add(i5, i21);                                          \
    385    const T_VEC w55 = sub(i5, i21);                                          \
    386    const T_VEC w56 = add(i13, i29);                                         \
    387    const T_VEC w57 = sub(i13, i29);                                         \
    388    const T_VEC w58 = add(w54, w56);                                         \
    389    const T_VEC w59 = sub(w54, w56);                                         \
    390    const T_VEC w61 = add(w51, w58);                                         \
    391    const T_VEC w62 = sub(w51, w58);                                         \
    392    const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))),           \
    393                           sub(sub(kWeight0, w50),                           \
    394                               mul(kWeight2, add(w57, w55))) };              \
    395    const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))),           \
    396                           sub(w50, mul(kWeight2, add(w57, w55))) };         \
    397    const T_VEC w66 = add(i3, i19);                                          \
    398    const T_VEC w67 = sub(i3, i19);                                          \
    399    const T_VEC w68 = add(i11, i27);                                         \
    400    const T_VEC w69 = sub(i11, i27);                                         \
    401    const T_VEC w70 = add(w66, w68);                                         \
    402    const T_VEC w71 = sub(w66, w68);                                         \
    403    const T_VEC w73 = add(i7, i23);                                          \
    404    const T_VEC w74 = sub(i7, i23);                                          \
    405    const T_VEC w75 = add(i15, i31);                                         \
    406    const T_VEC w76 = sub(i15, i31);                                         \
    407    const T_VEC w77 = add(w73, w75);                                         \
    408    const T_VEC w78 = sub(w73, w75);                                         \
    409    const T_VEC w80 = add(w70, w77);                                         \
    410    const T_VEC w81 = sub(w70, w77);                                         \
    411    const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))),           \
    412                           sub(sub(kWeight0, w69),                           \
    413                               mul(kWeight2, add(w76, w74))) };              \
    414    const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))),           \
    415                           sub(w69, mul(kWeight2, add(w76, w74))) };         \
    416    const T_VEC w85 = add(w61, w80);                                         \
    417    const T_VEC w86 = sub(w61, w80);                                         \
    418    const T_VEC w87[2] = {                                                   \
    419      add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))),        \
    420      add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0])))         \
    421    };                                                                       \
    422    const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))),           \
    423                           sub(sub(kWeight0, w59),                           \
    424                               mul(kWeight2, add(w78, w71))) };              \
    425    const T_VEC w89[2] = {                                                   \
    426      add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))),        \
    427      add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0])))         \
    428    };                                                                       \
    429    const T_VEC w91[2] = {                                                   \
    430      add(w65[0],                                                            \
    431          sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \
    432      sub(sub(kWeight0, w65[1]),                                             \
    433          sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1])))                 \
    434    };                                                                       \
    435    const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))),           \
    436                           sub(w59, mul(kWeight2, add(w78, w71))) };         \
    437    const T_VEC w93[2] = {                                                   \
    438      add(w63[0],                                                            \
    439          sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \
    440      sub(sub(kWeight0, w63[1]),                                             \
    441          sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1])))                 \
    442    };                                                                       \
    443    store(output + 0 * stride, add(w38, w85));                               \
    444    store(output + 1 * stride,                                               \
    445          add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1]))));   \
    446    store(output + 2 * stride,                                               \
    447          add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1]))));   \
    448    store(output + 3 * stride,                                               \
    449          add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1]))));   \
    450    store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81))));      \
    451    store(output + 5 * stride,                                               \
    452          add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1]))));   \
    453    store(output + 6 * stride,                                               \
    454          add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1]))));   \
    455    store(output + 7 * stride,                                               \
    456          add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1]))));   \
    457    store(output + 8 * stride, w39);                                         \
    458    store(output + 9 * stride,                                               \
    459          add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])),              \
    460                          mul(kWeight5, w93[1]))));                          \
    461    store(output + 10 * stride,                                              \
    462          add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])),              \
    463                          mul(kWeight3, w92[1]))));                          \
    464    store(output + 11 * stride,                                              \
    465          add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])),              \
    466                          mul(kWeight7, w91[1]))));                          \
    467    store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81))));     \
    468    store(output + 13 * stride,                                              \
    469          add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])),              \
    470                          mul(kWeight8, w89[1]))));                          \
    471    store(output + 14 * stride,                                              \
    472          add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])),              \
    473                          mul(kWeight4, w88[1]))));                          \
    474    store(output + 15 * stride,                                              \
    475          add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])),              \
    476                          mul(kWeight6, w87[1]))));                          \
    477    store(output + 16 * stride, sub(w38, w85));                              \
    478    store(output + 17 * stride,                                              \
    479          add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0]))));   \
    480    store(output + 18 * stride,                                              \
    481          add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0]))));   \
    482    store(output + 19 * stride,                                              \
    483          add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0]))));   \
    484    store(output + 20 * stride,                                              \
    485          sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62))));            \
    486    store(output + 21 * stride,                                              \
    487          add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0]))));   \
    488    store(output + 22 * stride,                                              \
    489          add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0]))));   \
    490    store(output + 23 * stride,                                              \
    491          add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0]))));   \
    492    store(output + 24 * stride, sub(kWeight0, w86));                         \
    493    store(output + 25 * stride,                                              \
    494          sub(sub(kWeight0, w46[1]),                                         \
    495              sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1]))));           \
    496    store(output + 26 * stride,                                              \
    497          sub(sub(kWeight0, w45[1]),                                         \
    498              sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1]))));           \
    499    store(output + 27 * stride,                                              \
    500          sub(sub(kWeight0, w44[1]),                                         \
    501              sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1]))));           \
    502    store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62))));     \
    503    store(output + 29 * stride,                                              \
    504          sub(sub(kWeight0, w42[1]),                                         \
    505              sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1]))));           \
    506    store(output + 30 * stride,                                              \
    507          sub(sub(kWeight0, w41[1]),                                         \
    508              sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1]))));           \
    509    store(output + 31 * stride,                                              \
    510          sub(sub(kWeight0, w40[1]),                                         \
    511              sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1]))));           \
    512  }
    513 
    /* GEN_IFFT_2 expands to the definition of aom_ifft1d_2_<suffix>, a 2-point
     * inverse 1d fft.
     *
     * Macro arguments:
     *   ret    - text placed in front of the function name (the return type).
     *            The generated body has no return statement, so this is
     *            presumably void, optionally with storage-class specifiers.
     *   suffix - token pasted onto "aom_ifft1d_2_" to form the function name.
     *   T      - element type of the input and output pointers.
     *   T_VEC  - type of the locals that hold the loaded values.
     *   load   - applied as load(input + k * stride) to read element k.
     *   store  - invoked as store(output + k * stride, value) to write element k.
     *
     * The generated function writes in0 + in1 to output element 0 and in0 - in1
     * to output element 1; no scaling is applied. The sum and difference are
     * formed with the infix + and - operators rather than with caller-supplied
     * add/sub operations, so T_VEC has to be a type those operators accept.
     */
    514 #define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store)               \
    515  ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \
    516    const T_VEC i0 = load(input + 0 * stride);                       \
    517    const T_VEC i1 = load(input + 1 * stride);                       \
    518    store(output + 0 * stride, i0 + i1);                             \
    519    store(output + 1 * stride, i0 - i1);                             \
    520  }
    521 
    /* GEN_IFFT_4 expands to the definition of aom_ifft1d_4_<suffix>, a 4-point
     * inverse 1d fft.
     *
     * Macro arguments:
     *   ret      - text placed in front of the function name (the return type).
     *              The generated body has no return statement, so this is
     *              presumably void, optionally with storage-class specifiers.
     *   suffix   - token pasted onto "aom_ifft1d_4_" to form the function name.
     *   T        - element type of the input and output pointers.
     *   T_VEC    - type of every intermediate value.
     *   load     - applied as load(input + k * stride) to read element k.
     *   store    - invoked as store(output + k * stride, value).
     *   constant - invoked as constant(0.0f) to build the zero value kWeight0.
     *   add, sub - two-operand addition and subtraction on T_VEC values.
     *
     * Following the packing described in the function-pointer comment at the top
     * of this header, i0..i2 are the real parts r_0..r_2 and i3 is the imaginary
     * part i_1. Negation is spelled sub(kWeight0, x). The values stored are:
     *   out0 = i0 + i2 + 2 * i1
     *   out1 = i0 - i2 - 2 * i3
     *   out2 = i0 + i2 - 2 * i1
     *   out3 = i0 - i2 + 2 * i3
     * i.e. no 1/4 scaling is applied. w4[1] and w5[0] (each of the form x - x)
     * are computed but are not read by any store.
     */
    522 #define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
    523  ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) {       \
    524    const T_VEC kWeight0 = constant(0.0f);                                 \
    525    const T_VEC i0 = load(input + 0 * stride);                             \
    526    const T_VEC i1 = load(input + 1 * stride);                             \
    527    const T_VEC i2 = load(input + 2 * stride);                             \
    528    const T_VEC i3 = load(input + 3 * stride);                             \
    529    const T_VEC w2 = add(i0, i2);                                          \
    530    const T_VEC w3 = sub(i0, i2);                                          \
    531    const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) };                      \
    532    const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) };       \
    533    store(output + 0 * stride, add(w2, w4[0]));                            \
    534    store(output + 1 * stride, add(w3, w5[1]));                            \
    535    store(output + 2 * stride, sub(w2, w4[0]));                            \
    536    store(output + 3 * stride, sub(w3, w5[1]));                            \
    537  }
    538 
    /* GEN_IFFT_8 expands to the definition of aom_ifft1d_8_<suffix>, an 8-point
     * inverse 1d fft written as straight-line code (no loops or branches).
     *
     * Macro arguments:
     *   ret      - text placed in front of the function name (the return type).
     *              The generated body has no return statement, so this is
     *              presumably void, optionally with storage-class specifiers.
     *   suffix   - token pasted onto "aom_ifft1d_8_" to form the function name.
     *   T        - element type of the input and output pointers.
     *   T_VEC    - type of every intermediate value.
     *   load     - applied as load(input + k * stride) to read element k.
     *   store    - invoked as store(output + k * stride, value).
     *   constant - builds a T_VEC from a float literal; used for 0.0f
     *              (kWeight0) and 0.707107f (kWeight2, approximately 1/sqrt(2)).
     *   add, sub, mul - two-operand arithmetic on T_VEC values.
     *
     * Following the packing described in the function-pointer comment at the top
     * of this header, i0..i4 are the real parts r_0..r_4 and i5..i7 are the
     * imaginary parts i_1..i_3. Each wN[2] pair holds two T_VEC values (they
     * read as the real and imaginary halves of an intermediate term); negation
     * is spelled sub(kWeight0, x). All eight inputs are loaded before the first
     * store. Output element 0 is i0 + i4 + 2 * (i1 + i2 + i3), so no 1/8 scaling
     * is applied. Some pair elements (for example w10[1] and w19[0]) are
     * computed but are not read by any store.
     */
    539 #define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
    540                   mul)                                                    \
    541  ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) {       \
    542    const T_VEC kWeight0 = constant(0.0f);                                 \
    543    const T_VEC kWeight2 = constant(0.707107f);                            \
    544    const T_VEC i0 = load(input + 0 * stride);                             \
    545    const T_VEC i1 = load(input + 1 * stride);                             \
    546    const T_VEC i2 = load(input + 2 * stride);                             \
    547    const T_VEC i3 = load(input + 3 * stride);                             \
    548    const T_VEC i4 = load(input + 4 * stride);                             \
    549    const T_VEC i5 = load(input + 5 * stride);                             \
    550    const T_VEC i6 = load(input + 6 * stride);                             \
    551    const T_VEC i7 = load(input + 7 * stride);                             \
    552    const T_VEC w6 = add(i0, i4);                                          \
    553    const T_VEC w7 = sub(i0, i4);                                          \
    554    const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) };                      \
    555    const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) };       \
    556    const T_VEC w10[2] = { add(w6, w8[0]), w8[1] };                        \
    557    const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) };         \
    558    const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) };         \
    559    const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] };                        \
    560    const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) };                     \
    561    const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) };      \
    562    const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) };                     \
    563    const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) };      \
    564    const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) };     \
    565    const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) };     \
    566    const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) };     \
    567    const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) };     \
    568    store(output + 0 * stride, add(w10[0], w18[0]));                       \
    569    store(output + 1 * stride,                                             \
    570          add(w12[0], mul(kWeight2, add(w20[0], w20[1]))));                \
    571    store(output + 2 * stride, add(w11[0], w19[1]));                       \
    572    store(output + 3 * stride,                                             \
    573          sub(w13[0], mul(kWeight2, sub(w21[0], w21[1]))));                \
    574    store(output + 4 * stride, sub(w10[0], w18[0]));                       \
    575    store(output + 5 * stride,                                             \
    576          add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])),            \
    577                          mul(kWeight2, w20[1]))));                        \
    578    store(output + 6 * stride, sub(w11[0], w19[1]));                       \
    579    store(output + 7 * stride,                                             \
    580          add(w13[0], mul(kWeight2, sub(w21[0], w21[1]))));                \
    581  }
    582 
    /* GEN_IFFT_16 expands to the definition of aom_ifft1d_16_<suffix>, a
     * 16-point inverse 1d fft written as straight-line code (no loops or
     * branches).
     *
     * Macro arguments:
     *   ret      - text placed in front of the function name (the return type).
     *              The generated body has no return statement, so this is
     *              presumably void, optionally with storage-class specifiers.
     *   suffix   - token pasted onto "aom_ifft1d_16_" to form the function name.
     *   T        - element type of the input and output pointers.
     *   T_VEC    - type of every intermediate value.
     *   load     - applied as load(input + k * stride) to read element k.
     *   store    - invoked as store(output + k * stride, value).
     *   constant - builds a T_VEC from a float literal; used for 0.0f
     *              (kWeight0), 0.707107f (kWeight2, approximately 1/sqrt(2)),
     *              0.92388f (kWeight3, approximately cos(pi/8)) and 0.382683f
     *              (kWeight4, approximately sin(pi/8)).
     *   add, sub, mul - two-operand arithmetic on T_VEC values.
     *
     * Following the packing described in the function-pointer comment at the top
     * of this header, i0..i8 are the real parts r_0..r_8 and i9..i15 are the
     * imaginary parts i_1..i_7. Each wN[2] pair holds two T_VEC values (they
     * read as the real and imaginary halves of an intermediate term); negation
     * is spelled sub(kWeight0, x). All sixteen inputs are loaded before the
     * first store. Output element 0 is i0 + i8 + 2 * (i1 + ... + i7), so no 1/16
     * scaling is applied.
     */
    583 #define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub,   \
    584                    mul)                                                      \
    585  ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) {         \
    586    const T_VEC kWeight0 = constant(0.0f);                                    \
    587    const T_VEC kWeight2 = constant(0.707107f);                               \
    588    const T_VEC kWeight3 = constant(0.92388f);                                \
    589    const T_VEC kWeight4 = constant(0.382683f);                               \
    590    const T_VEC i0 = load(input + 0 * stride);                                \
    591    const T_VEC i1 = load(input + 1 * stride);                                \
    592    const T_VEC i2 = load(input + 2 * stride);                                \
    593    const T_VEC i3 = load(input + 3 * stride);                                \
    594    const T_VEC i4 = load(input + 4 * stride);                                \
    595    const T_VEC i5 = load(input + 5 * stride);                                \
    596    const T_VEC i6 = load(input + 6 * stride);                                \
    597    const T_VEC i7 = load(input + 7 * stride);                                \
    598    const T_VEC i8 = load(input + 8 * stride);                                \
    599    const T_VEC i9 = load(input + 9 * stride);                                \
    600    const T_VEC i10 = load(input + 10 * stride);                              \
    601    const T_VEC i11 = load(input + 11 * stride);                              \
    602    const T_VEC i12 = load(input + 12 * stride);                              \
    603    const T_VEC i13 = load(input + 13 * stride);                              \
    604    const T_VEC i14 = load(input + 14 * stride);                              \
    605    const T_VEC i15 = load(input + 15 * stride);                              \
    606    const T_VEC w14 = add(i0, i8);                                            \
    607    const T_VEC w15 = sub(i0, i8);                                            \
    608    const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) };                      \
    609    const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) };       \
    610    const T_VEC w18[2] = { add(w14, w16[0]), w16[1] };                        \
    611    const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) };         \
    612    const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) };         \
    613    const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] };                        \
    614    const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) };                      \
    615    const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) };       \
    616    const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) };                      \
    617    const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) };       \
    618    const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) };        \
    619    const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) };        \
    620    const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) };        \
    621    const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) };        \
    622    const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) };        \
    623    const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) };        \
    624    const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))),   \
    625                           add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \
    626    const T_VEC w33[2] = { add(w20[0],                                        \
    627                               sub(sub(kWeight0, mul(kWeight2, w28[0])),      \
    628                                   mul(kWeight2, w28[1]))),                   \
    629                           add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \
    630    const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) };        \
    631    const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) };        \
    632    const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))),   \
    633                           sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
    634    const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))),   \
    635                           add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
    636    const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) };                       \
    637    const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) };        \
    638    const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) };                      \
    639    const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) };       \
    640    const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) };        \
    641    const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) };        \
    642    const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) };        \
    643    const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) };        \
    644    const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) };                      \
    645    const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) };       \
    646    const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) };                       \
    647    const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) };        \
    648    const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) };        \
    649    const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) };        \
    650    const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) };        \
    651    const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) };        \
    652    const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) };        \
    653    const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) };        \
    654    const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))),   \
    655                           add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \
    656    const T_VEC w57[2] = { add(w44[0],                                        \
    657                               sub(sub(kWeight0, mul(kWeight2, w52[0])),      \
    658                                   mul(kWeight2, w52[1]))),                   \
    659                           add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \
    660    const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) };        \
    661    const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) };        \
    662    const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))),   \
    663                           sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
    664    const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))),   \
    665                           add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
    666    store(output + 0 * stride, add(w30[0], w54[0]));                          \
    667    store(output + 1 * stride,                                                \
    668          add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1]))));    \
    669    store(output + 2 * stride,                                                \
    670          add(w34[0], mul(kWeight2, add(w58[0], w58[1]))));                   \
    671    store(output + 3 * stride,                                                \
    672          add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1]))));    \
    673    store(output + 4 * stride, add(w31[0], w55[1]));                          \
    674    store(output + 5 * stride,                                                \
    675          sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1]))));    \
    676    store(output + 6 * stride,                                                \
    677          sub(w35[0], mul(kWeight2, sub(w59[0], w59[1]))));                   \
    678    store(output + 7 * stride,                                                \
    679          sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1]))));    \
    680    store(output + 8 * stride, sub(w30[0], w54[0]));                          \
    681    store(output + 9 * stride,                                                \
    682          add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])),               \
    683                          mul(kWeight4, w56[1]))));                           \
    684    store(output + 10 * stride,                                               \
    685          add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])),               \
    686                          mul(kWeight2, w58[1]))));                           \
    687    store(output + 11 * stride,                                               \
    688          add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])),               \
    689                          mul(kWeight3, w60[1]))));                           \
    690    store(output + 12 * stride, sub(w31[0], w55[1]));                         \
    691    store(output + 13 * stride,                                               \
    692          add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1]))));    \
    693    store(output + 14 * stride,                                               \
    694          add(w35[0], mul(kWeight2, sub(w59[0], w59[1]))));                   \
    695    store(output + 15 * stride,                                               \
    696          add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1]))));    \
    697  }
    698 #define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub,    \
    699                    mul)                                                       \
    700  ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) {          \
    701    const T_VEC kWeight0 = constant(0.0f);                                     \
    702    const T_VEC kWeight2 = constant(0.707107f);                                \
    703    const T_VEC kWeight3 = constant(0.92388f);                                 \
    704    const T_VEC kWeight4 = constant(0.382683f);                                \
    705    const T_VEC kWeight5 = constant(0.980785f);                                \
    706    const T_VEC kWeight6 = constant(0.19509f);                                 \
    707    const T_VEC kWeight7 = constant(0.83147f);                                 \
    708    const T_VEC kWeight8 = constant(0.55557f);                                 \
    709    const T_VEC i0 = load(input + 0 * stride);                                 \
    710    const T_VEC i1 = load(input + 1 * stride);                                 \
    711    const T_VEC i2 = load(input + 2 * stride);                                 \
    712    const T_VEC i3 = load(input + 3 * stride);                                 \
    713    const T_VEC i4 = load(input + 4 * stride);                                 \
    714    const T_VEC i5 = load(input + 5 * stride);                                 \
    715    const T_VEC i6 = load(input + 6 * stride);                                 \
    716    const T_VEC i7 = load(input + 7 * stride);                                 \
    717    const T_VEC i8 = load(input + 8 * stride);                                 \
    718    const T_VEC i9 = load(input + 9 * stride);                                 \
    719    const T_VEC i10 = load(input + 10 * stride);                               \
    720    const T_VEC i11 = load(input + 11 * stride);                               \
    721    const T_VEC i12 = load(input + 12 * stride);                               \
    722    const T_VEC i13 = load(input + 13 * stride);                               \
    723    const T_VEC i14 = load(input + 14 * stride);                               \
    724    const T_VEC i15 = load(input + 15 * stride);                               \
    725    const T_VEC i16 = load(input + 16 * stride);                               \
    726    const T_VEC i17 = load(input + 17 * stride);                               \
    727    const T_VEC i18 = load(input + 18 * stride);                               \
    728    const T_VEC i19 = load(input + 19 * stride);                               \
    729    const T_VEC i20 = load(input + 20 * stride);                               \
    730    const T_VEC i21 = load(input + 21 * stride);                               \
    731    const T_VEC i22 = load(input + 22 * stride);                               \
    732    const T_VEC i23 = load(input + 23 * stride);                               \
    733    const T_VEC i24 = load(input + 24 * stride);                               \
    734    const T_VEC i25 = load(input + 25 * stride);                               \
    735    const T_VEC i26 = load(input + 26 * stride);                               \
    736    const T_VEC i27 = load(input + 27 * stride);                               \
    737    const T_VEC i28 = load(input + 28 * stride);                               \
    738    const T_VEC i29 = load(input + 29 * stride);                               \
    739    const T_VEC i30 = load(input + 30 * stride);                               \
    740    const T_VEC i31 = load(input + 31 * stride);                               \
    741    const T_VEC w30 = add(i0, i16);                                            \
    742    const T_VEC w31 = sub(i0, i16);                                            \
    743    const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) };                       \
    744    const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) };        \
    745    const T_VEC w34[2] = { add(w30, w32[0]), w32[1] };                         \
    746    const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) };          \
    747    const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) };          \
    748    const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] };                         \
    749    const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) };                      \
    750    const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) };       \
    751    const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) };                      \
    752    const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) };       \
    753    const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) };         \
    754    const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) };         \
    755    const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) };         \
    756    const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) };         \
    757    const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) };         \
    758    const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) };         \
    759    const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))),    \
    760                           add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) };  \
    761    const T_VEC w49[2] = { add(w36[0],                                         \
    762                               sub(sub(kWeight0, mul(kWeight2, w44[0])),       \
    763                                   mul(kWeight2, w44[1]))),                    \
    764                           add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) };  \
    765    const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) };         \
    766    const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) };         \
    767    const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))),    \
    768                           sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) };  \
    769    const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))),    \
    770                           add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) };  \
    771    const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) };                      \
    772    const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) };       \
    773    const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) };                      \
    774    const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) };       \
    775    const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) };         \
    776    const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) };         \
    777    const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) };         \
    778    const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) };         \
    779    const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) };                      \
    780    const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) };       \
    781    const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) };                      \
    782    const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) };       \
    783    const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) };         \
    784    const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) };         \
    785    const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) };         \
    786    const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) };         \
    787    const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) };         \
    788    const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) };         \
    789    const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))),    \
    790                           add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) };  \
    791    const T_VEC w73[2] = { add(w60[0],                                         \
    792                               sub(sub(kWeight0, mul(kWeight2, w68[0])),       \
    793                                   mul(kWeight2, w68[1]))),                    \
    794                           add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) };  \
    795    const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) };         \
    796    const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) };         \
    797    const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))),    \
    798                           sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) };  \
    799    const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))),    \
    800                           add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) };  \
    801    const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) };         \
    802    const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) };         \
    803    const T_VEC w80[2] = {                                                     \
    804      add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))),          \
    805      add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0])))           \
    806    };                                                                         \
    807    const T_VEC w81[2] = {                                                     \
    808      add(w48[0],                                                              \
    809          sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))),   \
    810      add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1])))           \
    811    };                                                                         \
    812    const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))),    \
    813                           add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) };  \
    814    const T_VEC w83[2] = { add(w50[0],                                         \
    815                               sub(sub(kWeight0, mul(kWeight2, w74[0])),       \
    816                                   mul(kWeight2, w74[1]))),                    \
    817                           add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) };  \
    818    const T_VEC w84[2] = {                                                     \
    819      add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))),          \
    820      add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0])))           \
    821    };                                                                         \
    822    const T_VEC w85[2] = {                                                     \
    823      add(w52[0],                                                              \
    824          sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))),   \
    825      add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1])))           \
    826    };                                                                         \
    827    const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) };         \
    828    const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) };         \
    829    const T_VEC w88[2] = {                                                     \
    830      sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))),          \
    831      add(w49[1],                                                              \
    832          sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0])))    \
    833    };                                                                         \
    834    const T_VEC w89[2] = {                                                     \
    835      add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))),          \
    836      add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0])))           \
    837    };                                                                         \
    838    const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))),    \
    839                           sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) };  \
    840    const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))),    \
    841                           add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) };  \
    842    const T_VEC w92[2] = {                                                     \
    843      sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))),          \
    844      add(w53[1],                                                              \
    845          sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0])))    \
    846    };                                                                         \
    847    const T_VEC w93[2] = {                                                     \
    848      add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))),          \
    849      add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0])))           \
    850    };                                                                         \
    851    const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) };                      \
    852    const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) };       \
    853    const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) };                       \
    854    const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) };        \
    855    const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) };         \
    856    const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) };         \
    857    const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) };        \
    858    const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) };        \
    859    const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) };                     \
    860    const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) };      \
    861    const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) };                     \
    862    const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) };      \
    863    const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) };    \
    864    const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) };    \
    865    const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) };    \
    866    const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) };    \
    867    const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) };      \
    868    const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) };      \
    869    const T_VEC w112[2] = {                                                    \
    870      add(w100[0], mul(kWeight2, add(w108[0], w108[1]))),                      \
    871      add(w100[1], mul(kWeight2, sub(w108[1], w108[0])))                       \
    872    };                                                                         \
    873    const T_VEC w113[2] = {                                                    \
    874      add(w100[0],                                                             \
    875          sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \
    876      add(w100[1], mul(kWeight2, sub(w108[0], w108[1])))                       \
    877    };                                                                         \
    878    const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) };      \
    879    const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) };      \
    880    const T_VEC w116[2] = {                                                    \
    881      sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))),                      \
    882      sub(w101[1], mul(kWeight2, add(w109[1], w109[0])))                       \
    883    };                                                                         \
    884    const T_VEC w117[2] = {                                                    \
    885      add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))),                      \
    886      add(w101[1], mul(kWeight2, add(w109[1], w109[0])))                       \
    887    };                                                                         \
    888    const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) };                     \
    889    const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) };      \
    890    const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) };                     \
    891    const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) };      \
    892    const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) };    \
    893    const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) };    \
    894    const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) };    \
    895    const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) };    \
    896    const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) };                      \
    897    const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) };       \
    898    const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) };                     \
    899    const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) };      \
    900    const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) };    \
    901    const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) };    \
    902    const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) };    \
    903    const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) };    \
    904    const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) };    \
    905    const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) };    \
    906    const T_VEC w136[2] = {                                                    \
    907      add(w124[0], mul(kWeight2, add(w132[0], w132[1]))),                      \
    908      add(w124[1], mul(kWeight2, sub(w132[1], w132[0])))                       \
    909    };                                                                         \
    910    const T_VEC w137[2] = {                                                    \
    911      add(w124[0],                                                             \
    912          sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \
    913      add(w124[1], mul(kWeight2, sub(w132[0], w132[1])))                       \
    914    };                                                                         \
    915    const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) };    \
    916    const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) };    \
    917    const T_VEC w140[2] = {                                                    \
    918      sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))),                      \
    919      sub(w125[1], mul(kWeight2, add(w133[1], w133[0])))                       \
    920    };                                                                         \
    921    const T_VEC w141[2] = {                                                    \
    922      add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))),                      \
    923      add(w125[1], mul(kWeight2, add(w133[1], w133[0])))                       \
    924    };                                                                         \
    925    const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) };    \
    926    const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) };    \
    927    const T_VEC w144[2] = {                                                    \
    928      add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))),       \
    929      add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0])))        \
    930    };                                                                         \
    931    const T_VEC w145[2] = {                                                    \
    932      add(w112[0],                                                             \
    933          sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \
    934      add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1])))        \
    935    };                                                                         \
    936    const T_VEC w146[2] = {                                                    \
    937      add(w114[0], mul(kWeight2, add(w138[0], w138[1]))),                      \
    938      add(w114[1], mul(kWeight2, sub(w138[1], w138[0])))                       \
    939    };                                                                         \
    940    const T_VEC w147[2] = {                                                    \
    941      add(w114[0],                                                             \
    942          sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \
    943      add(w114[1], mul(kWeight2, sub(w138[0], w138[1])))                       \
    944    };                                                                         \
    945    const T_VEC w148[2] = {                                                    \
    946      add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))),       \
    947      add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0])))        \
    948    };                                                                         \
    949    const T_VEC w149[2] = {                                                    \
    950      add(w116[0],                                                             \
    951          sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \
    952      add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1])))        \
    953    };                                                                         \
    954    const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) };    \
    955    const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) };    \
    956    const T_VEC w152[2] = {                                                    \
    957      sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))),       \
    958      add(w113[1],                                                             \
    959          sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0])))  \
    960    };                                                                         \
    961    const T_VEC w153[2] = {                                                    \
    962      add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))),       \
    963      add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0])))        \
    964    };                                                                         \
    965    const T_VEC w154[2] = {                                                    \
    966      sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))),                      \
    967      sub(w115[1], mul(kWeight2, add(w139[1], w139[0])))                       \
    968    };                                                                         \
    969    const T_VEC w155[2] = {                                                    \
    970      add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))),                      \
    971      add(w115[1], mul(kWeight2, add(w139[1], w139[0])))                       \
    972    };                                                                         \
    973    const T_VEC w156[2] = {                                                    \
    974      sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))),       \
    975      add(w117[1],                                                             \
    976          sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0])))  \
    977    };                                                                         \
    978    const T_VEC w157[2] = {                                                    \
    979      add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))),       \
    980      add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0])))        \
    981    };                                                                         \
    982    store(output + 0 * stride, add(w78[0], w142[0]));                          \
    983    store(output + 1 * stride,                                                 \
    984          add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1]))));   \
    985    store(output + 2 * stride,                                                 \
    986          add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1]))));   \
    987    store(output + 3 * stride,                                                 \
    988          add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1]))));   \
    989    store(output + 4 * stride,                                                 \
    990          add(w86[0], mul(kWeight2, add(w150[0], w150[1]))));                  \
    991    store(output + 5 * stride,                                                 \
    992          add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1]))));   \
    993    store(output + 6 * stride,                                                 \
    994          add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1]))));   \
    995    store(output + 7 * stride,                                                 \
    996          add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1]))));   \
    997    store(output + 8 * stride, add(w79[0], w143[1]));                          \
    998    store(output + 9 * stride,                                                 \
    999          sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1]))));   \
   1000    store(output + 10 * stride,                                                \
   1001          sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1]))));   \
   1002    store(output + 11 * stride,                                                \
   1003          sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1]))));   \
   1004    store(output + 12 * stride,                                                \
   1005          sub(w87[0], mul(kWeight2, sub(w151[0], w151[1]))));                  \
   1006    store(output + 13 * stride,                                                \
   1007          sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1]))));   \
   1008    store(output + 14 * stride,                                                \
   1009          sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1]))));   \
   1010    store(output + 15 * stride,                                                \
   1011          sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1]))));   \
   1012    store(output + 16 * stride, sub(w78[0], w142[0]));                         \
   1013    store(output + 17 * stride,                                                \
   1014          add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])),               \
   1015                          mul(kWeight6, w144[1]))));                           \
   1016    store(output + 18 * stride,                                                \
   1017          add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])),               \
   1018                          mul(kWeight4, w146[1]))));                           \
   1019    store(output + 19 * stride,                                                \
   1020          add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])),               \
   1021                          mul(kWeight8, w148[1]))));                           \
   1022    store(output + 20 * stride,                                                \
   1023          add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])),               \
   1024                          mul(kWeight2, w150[1]))));                           \
   1025    store(output + 21 * stride,                                                \
   1026          add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])),               \
   1027                          mul(kWeight7, w152[1]))));                           \
   1028    store(output + 22 * stride,                                                \
   1029          add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])),               \
   1030                          mul(kWeight3, w154[1]))));                           \
   1031    store(output + 23 * stride,                                                \
   1032          add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])),               \
   1033                          mul(kWeight5, w156[1]))));                           \
   1034    store(output + 24 * stride, sub(w79[0], w143[1]));                         \
   1035    store(output + 25 * stride,                                                \
   1036          add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1]))));   \
   1037    store(output + 26 * stride,                                                \
   1038          add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1]))));   \
   1039    store(output + 27 * stride,                                                \
   1040          add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1]))));   \
   1041    store(output + 28 * stride,                                                \
   1042          add(w87[0], mul(kWeight2, sub(w151[0], w151[1]))));                  \
   1043    store(output + 29 * stride,                                                \
   1044          add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1]))));   \
   1045    store(output + 30 * stride,                                                \
   1046          add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1]))));   \
   1047    store(output + 31 * stride,                                                \
   1048          add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1]))));   \
   1049  }
   1050 
   1051 #endif  // AOM_AOM_DSP_FFT_COMMON_H_