tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

av1_fwd_txfm2d_hwy.h (120844B)


      1 /*
      2 * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM2D_HWY_H_
     13 #define AOM_AV1_ENCODER_AV1_FWD_TXFM2D_HWY_H_
     14 
     15 #include <stdint.h>
     16 
     17 #include "config/aom_config.h"
     18 #include "config/av1_rtcd.h"
     19 #include "third_party/highway/hwy/highway.h"
     20 #include "aom_dsp/txfm_common.h"
     21 #include "av1/common/av1_txfm.h"
     22 #include "av1/common/enums.h"
     23 #include "av1/encoder/av1_fwd_txfm1d.h"
     24 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
     25 
     26 #define FOR_EACH_TXFM2D(X, suffix) \
     27  X(4, 4, suffix)                  \
     28  X(8, 8, suffix)                  \
     29  X(16, 16, suffix)                \
     30  X(32, 32, suffix)                \
     31  X(64, 64, suffix)                \
     32  X(4, 8, suffix)                  \
     33  X(8, 4, suffix)                  \
     34  X(8, 16, suffix)                 \
     35  X(16, 8, suffix)                 \
     36  X(16, 32, suffix)                \
     37  X(32, 16, suffix)                \
     38  X(32, 64, suffix)                \
     39  X(64, 32, suffix)                \
     40  X(4, 16, suffix)                 \
     41  X(16, 4, suffix)                 \
     42  X(8, 32, suffix)                 \
     43  X(32, 8, suffix)                 \
     44  X(16, 64, suffix)                \
     45  X(64, 16, suffix)
     46 
     47 #if HWY_CXX_LANG >= 201703L
     48 #define CONSTEXPR_IF constexpr
     49 #else
     50 #define CONSTEXPR_IF
     51 #endif
     52 
     53 HWY_BEFORE_NAMESPACE();
     54 
     55 namespace {
     56 namespace HWY_NAMESPACE {
     57 
     58 namespace hn = hwy::HWY_NAMESPACE;
     59 
     60 constexpr int8_t kForwardTransformShift[TX_SIZES_ALL][3] = {
     61  { 2, 0, 0 },    //
     62  { 2, -1, 0 },   //
     63  { 2, -2, 0 },   //
     64  { 2, -4, 0 },   //
     65  { 0, -2, -2 },  //
     66  { 2, -1, 0 },   //
     67  { 2, -1, 0 },   //
     68  { 2, -2, 0 },   //
     69  { 2, -2, 0 },   //
     70  { 2, -4, 0 },   //
     71  { 2, -4, 0 },   //
     72  { 0, -2, -2 },  //
     73  { 2, -4, -2 },  //
     74  { 2, -1, 0 },   //
     75  { 2, -1, 0 },   //
     76  { 2, -2, 0 },   //
     77  { 2, -2, 0 },   //
     78  { 0, -2, 0 },   //
     79  { 2, -4, 0 },   //
     80 };
     81 
     82 constexpr int kTxSizeWideLog2[TX_SIZES_ALL] = {
     83  2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6,
     84 };
     85 
     86 // Transform block height in log2
     87 constexpr int kTxSizeHighLog2[TX_SIZES_ALL] = {
     88  2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4,
     89 };
     90 
     91 constexpr bool kApplyRectScaleList[TX_SIZES_ALL] = {
     92  false, false, false, false, false, true,  true,  true,  true,  true,
     93  true,  true,  true,  false, false, false, false, false, false,
     94 };
     95 
     96 constexpr int8_t kForwardCosBitCol[MAX_TXWH_IDX /*txw_idx*/]
     97                                  [MAX_TXWH_IDX /*txh_idx*/] = {
     98                                    { 13, 13, 13, 0, 0 },
     99                                    { 13, 13, 13, 12, 0 },
    100                                    { 13, 13, 13, 12, 13 },
    101                                    { 0, 13, 13, 12, 13 },
    102                                    { 0, 0, 13, 12, 13 }
    103                                  };
    104 
    105 constexpr int8_t kForwardCosBitRow[MAX_TXWH_IDX /*txw_idx*/]
    106                                  [MAX_TXWH_IDX /*txh_idx*/] = {
    107                                    { 13, 13, 12, 0, 0 },
    108                                    { 13, 13, 13, 12, 0 },
    109                                    { 13, 13, 12, 13, 12 },
    110                                    { 0, 12, 13, 12, 11 },
    111                                    { 0, 0, 12, 11, 10 }
    112                                  };
    113 
    114 // Transform block width in pixels
    115 constexpr int8_t kTxSizeWide[TX_SIZES_ALL] = {
    116  4, 8, 16, 32, 64, 4, 8, 8, 16, 16, 32, 32, 64, 4, 16, 8, 32, 16, 64,
    117 };
    118 
    119 // Transform block height in pixels
    120 constexpr int8_t kTxSizeHigh[TX_SIZES_ALL] = {
    121  4, 8, 16, 32, 64, 8, 4, 16, 8, 32, 16, 64, 32, 16, 4, 32, 8, 64, 16,
    122 };
    123 
    124 constexpr int GetTxwIndex(TX_SIZE tx_size) {
    125  return kTxSizeWideLog2[tx_size] - kTxSizeWideLog2[0];
    126 }
    127 
    128 constexpr int GetTxhIndex(TX_SIZE tx_size) {
    129  return kTxSizeHighLog2[tx_size] - kTxSizeHighLog2[0];
    130 }
    131 
    132 template <typename D>
    133 HWY_ATTR HWY_INLINE hn::VFromD<D> SetPair(D int_tag, int a, int b) {
    134  return hn::BitCast(
    135      int_tag,
    136      hn::Set(hn::RepartitionToWide<D>(),
    137              static_cast<int32_t>(
    138                  static_cast<uint16_t>(a) |
    139                  (static_cast<uint32_t>(static_cast<uint16_t>(b)) << 16))));
    140 }
    141 
    142 template <size_t LaneSize>
    143 struct ButterflyTraits {};
    144 
    145 template <>
    146 struct ButterflyTraits<2> {
    147  template <typename D>
    148  HWY_ATTR HWY_INLINE static void Whole(
    149      D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0,
    150      const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out0,
    151      hn::TFromD<D> *HWY_RESTRICT out1, int bit,
    152      hn::VFromD<hn::Repartition<int32_t, D>> round) {
    153    constexpr hn::RepartitionToWide<D> int32_tag;
    154    const auto ww0 = SetPair(int_tag, w0, w1);
    155    const auto ww1 = SetPair(int_tag, w1, -w0);
    156    const auto i0 = hn::Load(int_tag, in0);
    157    const auto i1 = hn::Load(int_tag, in1);
    158    const auto t0 = hn::InterleaveLower(int_tag, i0, i1);
    159    const auto t1 = hn::InterleaveUpper(int_tag, i0, i1);
    160    const auto u0 = hn::WidenMulPairwiseAdd(int32_tag, t0, ww0);
    161    const auto u1 = hn::WidenMulPairwiseAdd(int32_tag, t1, ww0);
    162    const auto v0 = hn::WidenMulPairwiseAdd(int32_tag, t0, ww1);
    163    const auto v1 = hn::WidenMulPairwiseAdd(int32_tag, t1, ww1);
    164    const auto c0 = hn::ShiftRightSame(hn::Add(u0, round), bit);
    165    const auto c1 = hn::ShiftRightSame(hn::Add(u1, round), bit);
    166    const auto d0 = hn::ShiftRightSame(hn::Add(v0, round), bit);
    167    const auto d1 = hn::ShiftRightSame(hn::Add(v1, round), bit);
    168    hn::Store(hn::ReorderDemote2To(int_tag, c0, c1), int_tag, out0);
    169    hn::Store(hn::ReorderDemote2To(int_tag, d0, d1), int_tag, out1);
    170  }
    171 
    172  template <typename D>
    173  HWY_ATTR HWY_INLINE static void Half(
    174      D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0,
    175      const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out,
    176      int bit, hn::VFromD<hn::Repartition<int32_t, D>> round) {
    177    constexpr hn::RepartitionToWide<D> int32_tag;
    178    const auto i0 = hn::Load(int_tag, in0);
    179    const auto i1 = hn::Load(int_tag, in1);
    180    const auto t0 = hn::InterleaveLower(int_tag, i0, i1);
    181    const auto t1 = hn::InterleaveUpper(int_tag, i0, i1);
    182    const auto ww0 = SetPair(int_tag, w0, w1);
    183    const auto u0 = hn::WidenMulPairwiseAdd(int32_tag, t0, ww0);
    184    const auto u1 = hn::WidenMulPairwiseAdd(int32_tag, t1, ww0);
    185    const auto c0 = hn::ShiftRightSame(hn::Add(u0, round), bit);
    186    const auto c1 = hn::ShiftRightSame(hn::Add(u1, round), bit);
    187    hn::Store(hn::ReorderDemote2To(int_tag, c0, c1), int_tag, out);
    188  }
    189 };
    190 
    191 template <>
    192 struct ButterflyTraits<4> {
    193  template <typename D>
    194  HWY_ATTR HWY_INLINE static void Whole(
    195      D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0,
    196      const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out0,
    197      hn::TFromD<D> *HWY_RESTRICT out1, int bit,
    198      hn::VFromD<hn::Repartition<int32_t, D>> round) {
    199    const auto i0 = hn::Load(int_tag, in0);
    200    const auto i1 = hn::Load(int_tag, in1);
    201    const auto ww0 = hn::Set(int_tag, w0);
    202    const auto ww1 = hn::Set(int_tag, w1);
    203    const auto in1_w1 = hn::Mul(i1, ww1);
    204    const auto o0 = hn::MulAdd(i0, ww0, in1_w1);
    205    hn::Store(hn::ShiftRightSame(hn::Add(o0, round), bit), int_tag, out0);
    206    const auto in1_w0 = hn::Mul(i1, ww0);
    207    const auto o1 = hn::MulSub(i0, ww1, in1_w0);
    208    hn::Store(hn::ShiftRightSame(hn::Add(o1, round), bit), int_tag, out1);
    209  }
    210 
    211  template <typename D>
    212  HWY_ATTR HWY_INLINE static void Half(
    213      D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0,
    214      const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out,
    215      int bit, hn::VFromD<hn::Repartition<int32_t, D>> round) {
    216    const auto i0 = hn::Load(int_tag, in0);
    217    const auto i1 = hn::Load(int_tag, in1);
    218    const auto ww0 = hn::Set(int_tag, w0);
    219    const auto ww1 = hn::Set(int_tag, w1);
    220    const auto in1_w1 = hn::Mul(i1, ww1);
    221    const auto o0 = hn::MulAdd(i0, ww0, in1_w1);
    222    hn::Store(hn::ShiftRightSame(hn::Add(o0, round), bit), int_tag, out);
    223  }
    224 };
    225 
    226 template <typename D>
    227 HWY_ATTR HWY_INLINE void Butterfly(
    228    D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0,
    229    const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out0,
    230    hn::TFromD<D> *HWY_RESTRICT out1, int bit,
    231    hn::VFromD<hn::Repartition<int32_t, D>> round) {
    232  ButterflyTraits<sizeof(hn::TFromD<D>)>::Whole(int_tag, w0, w1, in0, in1, out0,
    233                                                out1, bit, round);
    234 }
    235 
    236 template <typename D>
    237 HWY_ATTR HWY_INLINE void HalfButterfly(
    238    D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0,
    239    const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out,
    240    int bit, hn::VFromD<hn::Repartition<int32_t, D>> round) {
    241  ButterflyTraits<sizeof(hn::TFromD<D>)>::Half(int_tag, w0, w1, in0, in1, out,
    242                                               bit, round);
    243 }
    244 
    245 template <typename D>
    246 HWY_ATTR HWY_INLINE void AddSub(D int_tag, const hn::TFromD<D> *in0,
    247                                const hn::TFromD<D> *in1,
    248                                hn::TFromD<D> *out_add,
    249                                hn::TFromD<D> *out_sub) {
    250  const auto i0 = hn::Load(int_tag, in0);
    251  const auto i1 = hn::Load(int_tag, in1);
    252  if CONSTEXPR_IF (sizeof(hn::TFromD<D>) == 2) {
    253    hn::Store(hn::SaturatedAdd(i0, i1), int_tag, out_add);
    254    hn::Store(hn::SaturatedSub(i0, i1), int_tag, out_sub);
    255  } else {
    256    hn::Store(hn::Add(i0, i1), int_tag, out_add);
    257    hn::Store(hn::Sub(i0, i1), int_tag, out_sub);
    258  }
    259 }
    260 
    261 template <size_t LaneSize, size_t NumLanes>
    262 struct Fdct4Traits {
    263  template <typename D>
    264  HWY_ATTR HWY_INLINE static void Fdct4(D int_tag,
    265                                        hn::TFromD<D> *HWY_RESTRICT in,
    266                                        const int8_t cos_bit, size_t instride) {
    267    using T = hn::TFromD<D>;
    268    constexpr size_t kNumLanes = hn::MaxLanes(int_tag);
    269    HWY_ALIGN_MAX T buf0[4 * kNumLanes];
    270    const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit);
    271    constexpr hn::Repartition<int32_t, D> int32_tag;
    272    const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1));
    273    AddSub(int_tag, &in[0 * instride], &in[3 * instride], &buf0[0 * kNumLanes],
    274           &buf0[3 * kNumLanes]);
    275    AddSub(int_tag, &in[1 * instride], &in[2 * instride], &buf0[1 * kNumLanes],
    276           &buf0[2 * kNumLanes]);
    277    Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes],
    278              &buf0[1 * kNumLanes], &in[0 * instride], &in[2 * instride],
    279              cos_bit, round);
    280    Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes],
    281              &buf0[2 * kNumLanes], &in[1 * instride], &in[3 * instride],
    282              cos_bit, round);
    283  }
    284 };
    285 
    286 template <>
    287 struct Fdct4Traits<2, 4> {
    288  template <typename D>
    289  HWY_ATTR HWY_INLINE static void Fdct4(D int_tag,
    290                                        hn::TFromD<D> *HWY_RESTRICT in,
    291                                        const int8_t cos_bit, size_t instride) {
    292    const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit);
    293    constexpr hn::FixedTag<hn::TFromD<D>, 8> demote_tag;
    294    constexpr hn::Repartition<int32_t, decltype(demote_tag)> int32_tag;
    295    const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1));
    296    const auto cospi_p32_p32 = SetPair(int_tag, cospi[32], cospi[32]);
    297    const auto cospi_p32_m32 = SetPair(int_tag, cospi[32], -cospi[32]);
    298    const auto cospi_p16_p48 = SetPair(int_tag, cospi[16], cospi[48]);
    299    const auto cospi_p48_m16 = SetPair(int_tag, cospi[48], -cospi[16]);
    300    const auto i0 = hn::Load(int_tag, &in[0 * instride]);
    301    const auto i1 = hn::Load(int_tag, &in[1 * instride]);
    302    const auto i2 = hn::Load(int_tag, &in[2 * instride]);
    303    const auto i3 = hn::Load(int_tag, &in[3 * instride]);
    304    const auto u0 = hn::InterleaveLower(int_tag, i0, i1);
    305    const auto u1 = hn::InterleaveLower(int_tag, i3, i2);
    306    const auto v0 = hn::Add(u0, u1);
    307    const auto v1 = hn::Sub(u0, u1);
    308    const auto x0 = hn::WidenMulPairwiseAdd(int32_tag, v0, cospi_p32_p32);
    309    const auto x1 = hn::WidenMulPairwiseAdd(int32_tag, v0, cospi_p32_m32);
    310    const auto x2 = hn::WidenMulPairwiseAdd(int32_tag, v1, cospi_p16_p48);
    311    const auto x3 = hn::WidenMulPairwiseAdd(int32_tag, v1, cospi_p48_m16);
    312    const auto v0w0 = hn::ShiftRightSame(hn::Add(x0, round), cos_bit);
    313    const auto v0w1 = hn::ShiftRightSame(hn::Add(x1, round), cos_bit);
    314    const auto v1w0 = hn::ShiftRightSame(hn::Add(x2, round), cos_bit);
    315    const auto v1w1 = hn::ShiftRightSame(hn::Add(x3, round), cos_bit);
    316    const auto o0 = hn::ReorderDemote2To(demote_tag, v0w0, v0w1);
    317    const auto o1 = hn::ReorderDemote2To(demote_tag, v1w0, v1w1);
    318    hn::Store(o0, demote_tag, &in[0 * instride]);
    319    hn::Store(o1, demote_tag, &in[1 * instride]);
    320    hn::Store(hn::ShiftRightLanes<4>(demote_tag, o0), demote_tag,
    321              &in[2 * instride]);
    322    hn::Store(hn::ShiftRightLanes<4>(demote_tag, o1), demote_tag,
    323              &in[3 * instride]);
    324  }
    325 };
    326 
    327 template <typename D>
    328 HWY_ATTR HWY_INLINE void Fdct4(D int_tag, hn::TFromD<D> *HWY_RESTRICT in,
    329                               const int8_t cos_bit, size_t instride) {
    330  Fdct4Traits<sizeof(hn::TFromD<D>), hn::MaxLanes(int_tag)>::Fdct4(
    331      int_tag, in, cos_bit, instride);
    332 }
    333 
    334 template <typename D>
    335 HWY_ATTR HWY_INLINE void Fdct8(D int_tag, hn::TFromD<D> *HWY_RESTRICT in,
    336                               const int8_t cos_bit, size_t instride) {
    337  constexpr size_t kNumLanes = hn::MaxLanes(int_tag);
    338  HWY_ALIGN_MAX hn::TFromD<D> buf0[8 * kNumLanes];
    339  HWY_ALIGN_MAX hn::TFromD<D> buf1[8 * kNumLanes];
    340  const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit);
    341  const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1));
    342 
    343  // Even 8 points 0, 2, ..., 14
    344  // stage 0
    345  // stage 1
    346  // buf0/buf1
    347  AddSub(int_tag, &in[0 * instride], &in[7 * instride], &buf0[0 * kNumLanes],
    348         &buf1[7 * kNumLanes]);
    349  // buf0/buf0
    350  AddSub(int_tag, &in[1 * instride], &in[6 * instride], &buf0[1 * kNumLanes],
    351         &buf0[6 * kNumLanes]);
    352  // buf0/buf0
    353  AddSub(int_tag, &in[2 * instride], &in[5 * instride], &buf0[2 * kNumLanes],
    354         &buf0[5 * kNumLanes]);
    355  // buf0/buf1
    356  AddSub(int_tag, &in[3 * instride], &in[4 * instride], &buf0[3 * kNumLanes],
    357         &buf1[4 * kNumLanes]);
    358 
    359  // stage 2
    360  for (size_t i = 0; i < 2; ++i) {
    361    AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(3 - i) * kNumLanes],
    362           &buf1[i * kNumLanes], &buf1[(3 - i) * kNumLanes]);
    363  }
    364 
    365  Butterfly(int_tag, -cospi[32], cospi[32], &buf0[5 * kNumLanes],
    366            &buf0[6 * kNumLanes], &buf1[5 * kNumLanes], &buf1[6 * kNumLanes],
    367            cos_bit, round);
    368 
    369  // stage 3
    370  // type 0
    371  Butterfly(int_tag, cospi[32], cospi[32], &buf1[0 * kNumLanes],
    372            &buf1[1 * kNumLanes], &in[0 * instride], &in[4 * instride], cos_bit,
    373            round);
    374 
    375  // type 1
    376  Butterfly(int_tag, cospi[16], cospi[48], &buf1[3 * kNumLanes],
    377            &buf1[2 * kNumLanes], &in[2 * instride], &in[6 * instride], cos_bit,
    378            round);
    379 
    380  AddSub(int_tag, &buf1[4 * kNumLanes], &buf1[5 * kNumLanes],
    381         &buf0[4 * kNumLanes], &buf0[5 * kNumLanes]);
    382  AddSub(int_tag, &buf1[7 * kNumLanes], &buf1[6 * kNumLanes],
    383         &buf0[7 * kNumLanes], &buf0[6 * kNumLanes]);
    384 
    385  // stage 4
    386  // stage 5
    387  Butterfly(int_tag, cospi[8], cospi[56], &buf0[7 * kNumLanes],
    388            &buf0[4 * kNumLanes], &in[1 * instride], &in[7 * instride], cos_bit,
    389            round);
    390  Butterfly(int_tag, cospi[40], cospi[24], &buf0[6 * kNumLanes],
    391            &buf0[5 * kNumLanes], &in[5 * instride], &in[3 * instride], cos_bit,
    392            round);
    393 }
    394 
    395 template <typename D>
    396 HWY_ATTR HWY_INLINE void Fdct16(D int_tag, hn::TFromD<D> *HWY_RESTRICT in,
    397                                const int8_t cos_bit, size_t instride) {
    398  constexpr size_t kNumLanes = hn::MaxLanes(int_tag);
    399  HWY_ALIGN_MAX hn::TFromD<D> buf0[16 * kNumLanes];
    400  HWY_ALIGN_MAX hn::TFromD<D> buf1[16 * kNumLanes];
    401  const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit);
    402  const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1));
    403 
    404  // Calculate the column 0, 1, 2, 3
    405  // stage 0
    406  // stage 1
    407  for (size_t i = 0; i < 8; ++i) {
    408    AddSub(int_tag, &in[i * instride], &in[(15 - i) * instride],
    409           &buf0[i * kNumLanes], &buf0[(15 - i) * kNumLanes]);
    410  }
    411 
    412  // stage 2
    413  for (size_t i = 0; i < 4; ++i) {
    414    AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(7 - i) * kNumLanes],
    415           &buf1[i * kNumLanes], &buf1[(7 - i) * kNumLanes]);
    416  }
    417 
    418  Butterfly(int_tag, -cospi[32], cospi[32], &buf0[10 * kNumLanes],
    419            &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes],
    420            cos_bit, round);
    421  Butterfly(int_tag, -cospi[32], cospi[32], &buf0[11 * kNumLanes],
    422            &buf0[12 * kNumLanes], &buf1[11 * kNumLanes], &buf1[12 * kNumLanes],
    423            cos_bit, round);
    424 
    425  // stage 3
    426  for (size_t i = 0; i < 2; ++i) {
    427    AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(3 - i) * kNumLanes],
    428           &buf0[i * kNumLanes], &buf0[(3 - i) * kNumLanes]);
    429  }
    430 
    431  Butterfly(int_tag, -cospi[32], cospi[32], &buf1[5 * kNumLanes],
    432            &buf1[6 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes],
    433            cos_bit, round);
    434 
    435  for (size_t i = 0; i < 2; ++i) {
    436    AddSub(int_tag, &buf0[(8 + i) * kNumLanes], &buf1[(11 - i) * kNumLanes],
    437           &buf0[(8 + i) * kNumLanes], &buf0[(11 - i) * kNumLanes]);
    438  }
    439  for (size_t i = 0; i < 2; ++i) {
    440    AddSub(int_tag, &buf0[(15 - i) * kNumLanes], &buf1[(12 + i) * kNumLanes],
    441           &buf0[(15 - i) * kNumLanes], &buf0[(12 + i) * kNumLanes]);
    442  }
    443 
    444  // stage 4
    445  Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes],
    446            &buf0[1 * kNumLanes], &in[0 * instride], &in[8 * instride], cos_bit,
    447            round);
    448 
    449  Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes],
    450            &buf0[2 * kNumLanes], &in[4 * instride], &in[12 * instride],
    451            cos_bit, round);
    452 
    453  AddSub(int_tag, &buf1[4 * kNumLanes], &buf0[5 * kNumLanes],
    454         &buf1[4 * kNumLanes], &buf1[5 * kNumLanes]);
    455  AddSub(int_tag, &buf1[7 * kNumLanes], &buf0[6 * kNumLanes],
    456         &buf1[7 * kNumLanes], &buf1[6 * kNumLanes]);
    457 
    458  Butterfly(int_tag, -cospi[16], cospi[48], &buf0[9 * kNumLanes],
    459            &buf0[14 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes],
    460            cos_bit, round);
    461  Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[10 * kNumLanes],
    462            &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes],
    463            cos_bit, round);
    464 
    465  // stage 5
    466  Butterfly(int_tag, cospi[8], cospi[56], &buf1[7 * kNumLanes],
    467            &buf1[4 * kNumLanes], &in[2 * instride], &in[14 * instride],
    468            cos_bit, round);
    469  Butterfly(int_tag, cospi[40], cospi[24], &buf1[6 * kNumLanes],
    470            &buf1[5 * kNumLanes], &in[10 * instride], &in[6 * instride],
    471            cos_bit, round);
    472 
    473  AddSub(int_tag, &buf0[8 * kNumLanes], &buf1[9 * kNumLanes],
    474         &buf0[8 * kNumLanes], &buf0[9 * kNumLanes]);
    475  AddSub(int_tag, &buf0[11 * kNumLanes], &buf1[10 * kNumLanes],
    476         &buf0[11 * kNumLanes], &buf0[10 * kNumLanes]);
    477  AddSub(int_tag, &buf0[12 * kNumLanes], &buf1[13 * kNumLanes],
    478         &buf0[12 * kNumLanes], &buf0[13 * kNumLanes]);
    479  AddSub(int_tag, &buf0[15 * kNumLanes], &buf1[14 * kNumLanes],
    480         &buf0[15 * kNumLanes], &buf0[14 * kNumLanes]);
    481 
    482  // stage 6
    483  Butterfly(int_tag, cospi[4], cospi[60], &buf0[15 * kNumLanes],
    484            &buf0[8 * kNumLanes], &in[1 * instride], &in[15 * instride],
    485            cos_bit, round);
    486  Butterfly(int_tag, cospi[36], cospi[28], &buf0[14 * kNumLanes],
    487            &buf0[9 * kNumLanes], &in[9 * instride], &in[7 * instride], cos_bit,
    488            round);
    489  Butterfly(int_tag, cospi[20], cospi[44], &buf0[13 * kNumLanes],
    490            &buf0[10 * kNumLanes], &in[5 * instride], &in[11 * instride],
    491            cos_bit, round);
    492  Butterfly(int_tag, cospi[52], cospi[12], &buf0[12 * kNumLanes],
    493            &buf0[11 * kNumLanes], &in[13 * instride], &in[3 * instride],
    494            cos_bit, round);
    495 }
    496 
    497 template <typename D>
    498 HWY_ATTR HWY_INLINE void Fdct32(D int_tag, hn::TFromD<D> *HWY_RESTRICT in,
    499                                const int8_t cos_bit, size_t instride) {
    500  constexpr size_t kNumLanes = hn::MaxLanes(int_tag);
    501  HWY_ALIGN_MAX hn::TFromD<D> buf0[32 * kNumLanes];
    502  HWY_ALIGN_MAX hn::TFromD<D> buf1[32 * kNumLanes];
    503  const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit);
    504  const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1));
    505  // stage 0
    506  // stage 1
    507  for (size_t i = 0; i < 16; ++i) {
    508    AddSub(int_tag, &in[i * instride], &in[(31 - i) * instride],
    509           &buf1[i * kNumLanes], &buf1[(31 - i) * kNumLanes]);
    510  }
    511 
    512  // stage 2
    513  for (size_t i = 0; i < 8; ++i) {
    514    AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(15 - i) * kNumLanes],
    515           &buf0[i * kNumLanes], &buf0[(15 - i) * kNumLanes]);
    516  }
    517 
    518  Butterfly(int_tag, -cospi[32], cospi[32], &buf1[20 * kNumLanes],
    519            &buf1[27 * kNumLanes], &buf0[20 * kNumLanes], &buf0[27 * kNumLanes],
    520            cos_bit, round);
    521  Butterfly(int_tag, -cospi[32], cospi[32], &buf1[21 * kNumLanes],
    522            &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes],
    523            cos_bit, round);
    524  Butterfly(int_tag, -cospi[32], cospi[32], &buf1[22 * kNumLanes],
    525            &buf1[25 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes],
    526            cos_bit, round);
    527  Butterfly(int_tag, -cospi[32], cospi[32], &buf1[23 * kNumLanes],
    528            &buf1[24 * kNumLanes], &buf0[23 * kNumLanes], &buf0[24 * kNumLanes],
    529            cos_bit, round);
    530 
    531  // stage 3
    532  for (size_t i = 0; i < 4; ++i) {
    533    AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(7 - i) * kNumLanes],
    534           &buf1[i * kNumLanes], &buf1[(7 - i) * kNumLanes]);
    535  }
    536 
    537  Butterfly(int_tag, -cospi[32], cospi[32], &buf0[10 * kNumLanes],
    538            &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes],
    539            cos_bit, round);
    540  Butterfly(int_tag, -cospi[32], cospi[32], &buf0[11 * kNumLanes],
    541            &buf0[12 * kNumLanes], &buf1[11 * kNumLanes], &buf1[12 * kNumLanes],
    542            cos_bit, round);
    543 
    544  for (size_t i = 0; i < 4; ++i) {
    545    AddSub(int_tag, &buf1[(16 + i) * kNumLanes], &buf0[(23 - i) * kNumLanes],
    546           &buf1[(16 + i) * kNumLanes], &buf1[(23 - i) * kNumLanes]);
    547  }
    548  for (size_t i = 0; i < 4; ++i) {
    549    AddSub(int_tag, &buf1[(31 - i) * kNumLanes], &buf0[(24 + i) * kNumLanes],
    550           &buf1[(31 - i) * kNumLanes], &buf1[(24 + i) * kNumLanes]);
    551  }
    552 
    553  // stage 4
    554  for (size_t i = 0; i < 2; ++i) {
    555    AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(3 - i) * kNumLanes],
    556           &buf0[i * kNumLanes], &buf0[(3 - i) * kNumLanes]);
    557  }
    558 
    559  Butterfly(int_tag, -cospi[32], cospi[32], &buf1[5 * kNumLanes],
    560            &buf1[6 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes],
    561            cos_bit, round);
    562 
    563  for (size_t i = 0; i < 2; ++i) {
    564    AddSub(int_tag, &buf0[(8 + i) * kNumLanes], &buf1[(11 - i) * kNumLanes],
    565           &buf0[(8 + i) * kNumLanes], &buf0[(11 - i) * kNumLanes]);
    566  }
    567  for (size_t i = 0; i < 2; ++i) {
    568    AddSub(int_tag, &buf0[(15 - i) * kNumLanes], &buf1[(12 + i) * kNumLanes],
    569           &buf0[(15 - i) * kNumLanes], &buf0[(12 + i) * kNumLanes]);
    570  }
    571 
    572  Butterfly(int_tag, -cospi[16], cospi[48], &buf1[18 * kNumLanes],
    573            &buf1[29 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes],
    574            cos_bit, round);
    575  Butterfly(int_tag, -cospi[16], cospi[48], &buf1[19 * kNumLanes],
    576            &buf1[28 * kNumLanes], &buf0[19 * kNumLanes], &buf0[28 * kNumLanes],
    577            cos_bit, round);
    578  Butterfly(int_tag, -cospi[48], -cospi[16], &buf1[20 * kNumLanes],
    579            &buf1[27 * kNumLanes], &buf0[20 * kNumLanes], &buf0[27 * kNumLanes],
    580            cos_bit, round);
    581  Butterfly(int_tag, -cospi[48], -cospi[16], &buf1[21 * kNumLanes],
    582            &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes],
    583            cos_bit, round);
    584 
    585  // stage 5
    586  Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes],
    587            &buf0[1 * kNumLanes], &in[0 * instride], &in[16 * instride],
    588            cos_bit, round);
    589  Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes],
    590            &buf0[2 * kNumLanes], &in[8 * instride], &in[24 * instride],
    591            cos_bit, round);
    592  AddSub(int_tag, &buf1[4 * kNumLanes], &buf0[5 * kNumLanes],
    593         &buf1[4 * kNumLanes], &buf1[5 * kNumLanes]);
    594  AddSub(int_tag, &buf1[7 * kNumLanes], &buf0[6 * kNumLanes],
    595         &buf1[7 * kNumLanes], &buf1[6 * kNumLanes]);
    596  Butterfly(int_tag, -cospi[16], cospi[48], &buf0[9 * kNumLanes],
    597            &buf0[14 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes],
    598            cos_bit, round);
    599  Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[10 * kNumLanes],
    600            &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes],
    601            cos_bit, round);
    602 
    603  AddSub(int_tag, &buf1[16 * kNumLanes], &buf0[19 * kNumLanes],
    604         &buf1[16 * kNumLanes], &buf1[19 * kNumLanes]);
    605  AddSub(int_tag, &buf1[17 * kNumLanes], &buf0[18 * kNumLanes],
    606         &buf1[17 * kNumLanes], &buf1[18 * kNumLanes]);
    607  AddSub(int_tag, &buf1[23 * kNumLanes], &buf0[20 * kNumLanes],
    608         &buf1[23 * kNumLanes], &buf1[20 * kNumLanes]);
    609  AddSub(int_tag, &buf1[22 * kNumLanes], &buf0[21 * kNumLanes],
    610         &buf1[22 * kNumLanes], &buf1[21 * kNumLanes]);
    611  AddSub(int_tag, &buf1[24 * kNumLanes], &buf0[27 * kNumLanes],
    612         &buf1[24 * kNumLanes], &buf1[27 * kNumLanes]);
    613  AddSub(int_tag, &buf1[25 * kNumLanes], &buf0[26 * kNumLanes],
    614         &buf1[25 * kNumLanes], &buf1[26 * kNumLanes]);
    615  AddSub(int_tag, &buf1[31 * kNumLanes], &buf0[28 * kNumLanes],
    616         &buf1[31 * kNumLanes], &buf1[28 * kNumLanes]);
    617  AddSub(int_tag, &buf1[30 * kNumLanes], &buf0[29 * kNumLanes],
    618         &buf1[30 * kNumLanes], &buf1[29 * kNumLanes]);
    619 
    620  // stage 6
    621  Butterfly(int_tag, cospi[8], cospi[56], &buf1[7 * kNumLanes],
    622            &buf1[4 * kNumLanes], &in[4 * instride], &in[28 * instride],
    623            cos_bit, round);
    624  Butterfly(int_tag, cospi[40], cospi[24], &buf1[6 * kNumLanes],
    625            &buf1[5 * kNumLanes], &in[20 * instride], &in[12 * instride],
    626            cos_bit, round);
    627  AddSub(int_tag, &buf0[8 * kNumLanes], &buf1[9 * kNumLanes],
    628         &buf0[8 * kNumLanes], &buf0[9 * kNumLanes]);
    629  AddSub(int_tag, &buf0[11 * kNumLanes], &buf1[10 * kNumLanes],
    630         &buf0[11 * kNumLanes], &buf0[10 * kNumLanes]);
    631  AddSub(int_tag, &buf0[12 * kNumLanes], &buf1[13 * kNumLanes],
    632         &buf0[12 * kNumLanes], &buf0[13 * kNumLanes]);
    633  AddSub(int_tag, &buf0[15 * kNumLanes], &buf1[14 * kNumLanes],
    634         &buf0[15 * kNumLanes], &buf0[14 * kNumLanes]);
    635  Butterfly(int_tag, -cospi[8], cospi[56], &buf1[17 * kNumLanes],
    636            &buf1[30 * kNumLanes], &buf0[17 * kNumLanes], &buf0[30 * kNumLanes],
    637            cos_bit, round);
    638  Butterfly(int_tag, -cospi[56], -cospi[8], &buf1[18 * kNumLanes],
    639            &buf1[29 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes],
    640            cos_bit, round);
    641  Butterfly(int_tag, -cospi[40], cospi[24], &buf1[21 * kNumLanes],
    642            &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes],
    643            cos_bit, round);
    644  Butterfly(int_tag, -cospi[24], -cospi[40], &buf1[22 * kNumLanes],
    645            &buf1[25 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes],
    646            cos_bit, round);
    647 
    648  // stage 7
    649  Butterfly(int_tag, cospi[4], cospi[60], &buf0[15 * kNumLanes],
    650            &buf0[8 * kNumLanes], &in[2 * instride], &in[30 * instride],
    651            cos_bit, round);
    652  Butterfly(int_tag, cospi[36], cospi[28], &buf0[14 * kNumLanes],
    653            &buf0[9 * kNumLanes], &in[18 * instride], &in[14 * instride],
    654            cos_bit, round);
    655  Butterfly(int_tag, cospi[20], cospi[44], &buf0[13 * kNumLanes],
    656            &buf0[10 * kNumLanes], &in[10 * instride], &in[22 * instride],
    657            cos_bit, round);
    658  Butterfly(int_tag, cospi[52], cospi[12], &buf0[12 * kNumLanes],
    659            &buf0[11 * kNumLanes], &in[26 * instride], &in[6 * instride],
    660            cos_bit, round);
    661  AddSub(int_tag, &buf1[16 * kNumLanes], &buf0[17 * kNumLanes],
    662         &buf1[16 * kNumLanes], &buf1[17 * kNumLanes]);
    663  AddSub(int_tag, &buf1[19 * kNumLanes], &buf0[18 * kNumLanes],
    664         &buf1[19 * kNumLanes], &buf1[18 * kNumLanes]);
    665  AddSub(int_tag, &buf1[20 * kNumLanes], &buf0[21 * kNumLanes],
    666         &buf1[20 * kNumLanes], &buf1[21 * kNumLanes]);
    667  AddSub(int_tag, &buf1[23 * kNumLanes], &buf0[22 * kNumLanes],
    668         &buf1[23 * kNumLanes], &buf1[22 * kNumLanes]);
    669  AddSub(int_tag, &buf1[24 * kNumLanes], &buf0[25 * kNumLanes],
    670         &buf1[24 * kNumLanes], &buf1[25 * kNumLanes]);
    671  AddSub(int_tag, &buf1[27 * kNumLanes], &buf0[26 * kNumLanes],
    672         &buf1[27 * kNumLanes], &buf1[26 * kNumLanes]);
    673  AddSub(int_tag, &buf1[28 * kNumLanes], &buf0[29 * kNumLanes],
    674         &buf1[28 * kNumLanes], &buf1[29 * kNumLanes]);
    675  AddSub(int_tag, &buf1[31 * kNumLanes], &buf0[30 * kNumLanes],
    676         &buf1[31 * kNumLanes], &buf1[30 * kNumLanes]);
    677 
    678  // stage 8 & 9
    679  Butterfly(int_tag, cospi[2], cospi[62], &buf1[31 * kNumLanes],
    680            &buf1[16 * kNumLanes], &in[1 * instride], &in[31 * instride],
    681            cos_bit, round);
    682  Butterfly(int_tag, cospi[34], cospi[30], &buf1[30 * kNumLanes],
    683            &buf1[17 * kNumLanes], &in[17 * instride], &in[15 * instride],
    684            cos_bit, round);
    685  Butterfly(int_tag, cospi[18], cospi[46], &buf1[29 * kNumLanes],
    686            &buf1[18 * kNumLanes], &in[9 * instride], &in[23 * instride],
    687            cos_bit, round);
    688  Butterfly(int_tag, cospi[50], cospi[14], &buf1[28 * kNumLanes],
    689            &buf1[19 * kNumLanes], &in[25 * instride], &in[7 * instride],
    690            cos_bit, round);
    691  Butterfly(int_tag, cospi[10], cospi[54], &buf1[27 * kNumLanes],
    692            &buf1[20 * kNumLanes], &in[5 * instride], &in[27 * instride],
    693            cos_bit, round);
    694  Butterfly(int_tag, cospi[42], cospi[22], &buf1[26 * kNumLanes],
    695            &buf1[21 * kNumLanes], &in[21 * instride], &in[11 * instride],
    696            cos_bit, round);
    697  Butterfly(int_tag, cospi[26], cospi[38], &buf1[25 * kNumLanes],
    698            &buf1[22 * kNumLanes], &in[13 * instride], &in[19 * instride],
    699            cos_bit, round);
    700  Butterfly(int_tag, cospi[58], cospi[6], &buf1[24 * kNumLanes],
    701            &buf1[23 * kNumLanes], &in[29 * instride], &in[3 * instride],
    702            cos_bit, round);
    703 
    704  // stage 9 was fused with prior stages.
    705 }
    706 
    707 template <size_t InStride, size_t OutStride, typename D>
    708 HWY_ATTR HWY_NOINLINE void Fdct64(D int_tag, hn::TFromD<D> *HWY_RESTRICT in,
    709                                  const int8_t cos_bit) {
    710  constexpr size_t kNumLanes = hn::MaxLanes(int_tag);
    711  constexpr size_t kNumBytes = kNumLanes * sizeof(hn::TFromD<D>);
    712  HWY_ALIGN_MAX hn::TFromD<D> buf0[64 * kNumLanes];
    713  HWY_ALIGN_MAX hn::TFromD<D> buf1[64 * kNumLanes];
    714  const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit);
    715  const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1));
    716 
    717  // stage 1
    718 #if HWY_TARGET == HWY_SSE4
    719  // For whatever reason, some compilers don't unroll this when building for
    720  // SSE4; help them along.
    721  HWY_UNROLL(32)
    722 #endif
    723  for (size_t i = 0; i < 32; ++i) {
    724    AddSub(int_tag, &in[i * InStride], &in[(63 - i) * InStride],
    725           &buf0[i * kNumLanes], &buf0[(63 - i) * kNumLanes]);
    726  }
    727 
    728  // stage 2
    729  for (size_t i = 0; i < 16; ++i) {
    730    AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(31 - i) * kNumLanes],
    731           &buf1[i * kNumLanes], &buf1[(31 - i) * kNumLanes]);
    732  }
    733  for (size_t i = 0; i < 8; ++i) {
    734    Butterfly(int_tag, -cospi[32], cospi[32], &buf0[(40 + i) * kNumLanes],
    735              &buf0[(55 - i) * kNumLanes], &buf1[(40 + i) * kNumLanes],
    736              &buf1[(55 - i) * kNumLanes], cos_bit, round);
    737  }
    738 
    739  // stage 3
    740  for (size_t i = 0; i < 8; ++i) {
    741    AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(15 - i) * kNumLanes],
    742           &buf0[i * kNumLanes], &buf0[(15 - i) * kNumLanes]);
    743  }
    744  for (size_t i = 0; i < 4; ++i) {
    745    Butterfly(int_tag, -cospi[32], cospi[32], &buf1[(20 + i) * kNumLanes],
    746              &buf1[(27 - i) * kNumLanes], &buf0[(20 + i) * kNumLanes],
    747              &buf0[(27 - i) * kNumLanes], cos_bit, round);
    748  }
    749  for (size_t i = 0; i < 8; ++i) {
    750    AddSub(int_tag, &buf0[(32 + i) * kNumLanes], &buf1[(47 - i) * kNumLanes],
    751           &buf0[(32 + i) * kNumLanes], &buf0[(47 - i) * kNumLanes]);
    752  }
    753  for (size_t i = 0; i < 8; ++i) {
    754    AddSub(int_tag, &buf0[(63 - i) * kNumLanes], &buf1[(48 + i) * kNumLanes],
    755           &buf0[(63 - i) * kNumLanes], &buf0[(48 + i) * kNumLanes]);
    756  }
    757  // stage 4
    758  for (size_t i = 0; i < 4; ++i) {
    759    AddSub(int_tag, &buf0[(0 + i) * kNumLanes], &buf0[(7 - i) * kNumLanes],
    760           &buf1[(0 + i) * kNumLanes], &buf1[(7 - i) * kNumLanes]);
    761  }
    762  for (size_t i = 0; i < 2; ++i) {
    763    Butterfly(int_tag, -cospi[32], cospi[32], &buf0[(10 + i) * kNumLanes],
    764              &buf0[(13 - i) * kNumLanes], &buf1[(10 + i) * kNumLanes],
    765              &buf1[(13 - i) * kNumLanes], cos_bit, round);
    766  }
    767  for (size_t i = 0; i < 4; ++i) {
    768    AddSub(int_tag, &buf1[(16 + i) * kNumLanes], &buf0[(23 - i) * kNumLanes],
    769           &buf1[(16 + i) * kNumLanes], &buf1[(23 - i) * kNumLanes]);
    770  }
    771  for (size_t i = 0; i < 4; ++i) {
    772    AddSub(int_tag, &buf1[(31 - i) * kNumLanes], &buf0[(24 + i) * kNumLanes],
    773           &buf1[(31 - i) * kNumLanes], &buf1[(24 + i) * kNumLanes]);
    774  }
    775  for (size_t i = 0; i < 4; ++i) {
    776    Butterfly(int_tag, -cospi[16], cospi[48], &buf0[(36 + i) * kNumLanes],
    777              &buf0[(59 - i) * kNumLanes], &buf1[(36 + i) * kNumLanes],
    778              &buf1[(59 - i) * kNumLanes], cos_bit, round);
    779  }
    780  for (size_t i = 4; i < 8; ++i) {
    781    Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[(36 + i) * kNumLanes],
    782              &buf0[(59 - i) * kNumLanes], &buf1[(36 + i) * kNumLanes],
    783              &buf1[(59 - i) * kNumLanes], cos_bit, round);
    784  }
    785  // stage 5
    786  for (size_t i = 0; i < 2; ++i) {
    787    AddSub(int_tag, &buf1[(0 + i) * kNumLanes], &buf1[(3 - i) * kNumLanes],
    788           &buf0[(0 + i) * kNumLanes], &buf0[(3 - i) * kNumLanes]);
    789  }
    790  Butterfly(int_tag, -cospi[32], cospi[32], &buf1[5 * kNumLanes],
    791            &buf1[6 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes],
    792            cos_bit, round);
    793  for (size_t i = 0; i < 2; ++i) {
    794    AddSub(int_tag, &buf0[(8 + i) * kNumLanes], &buf1[(11 - i) * kNumLanes],
    795           &buf0[(8 + i) * kNumLanes], &buf0[(11 - i) * kNumLanes]);
    796  }
    797  for (size_t i = 0; i < 2; ++i) {
    798    AddSub(int_tag, &buf0[(15 - i) * kNumLanes], &buf1[(12 + i) * kNumLanes],
    799           &buf0[(15 - i) * kNumLanes], &buf0[(12 + i) * kNumLanes]);
    800  }
    801  for (size_t i = 0; i < 2; ++i) {
    802    Butterfly(int_tag, -cospi[16], cospi[48], &buf1[(18 + i) * kNumLanes],
    803              &buf1[(29 - i) * kNumLanes], &buf0[(18 + i) * kNumLanes],
    804              &buf0[(29 - i) * kNumLanes], cos_bit, round);
    805  }
    806  for (size_t i = 2; i < 4; ++i) {
    807    Butterfly(int_tag, -cospi[48], -cospi[16], &buf1[(18 + i) * kNumLanes],
    808              &buf1[(29 - i) * kNumLanes], &buf0[(18 + i) * kNumLanes],
    809              &buf0[(29 - i) * kNumLanes], cos_bit, round);
    810  }
    811  for (size_t i = 0; i < 4; ++i) {
    812    AddSub(int_tag, &buf0[(32 + i) * kNumLanes], &buf1[(39 - i) * kNumLanes],
    813           &buf0[(32 + i) * kNumLanes], &buf0[(39 - i) * kNumLanes]);
    814  }
    815  for (size_t i = 0; i < 4; ++i) {
    816    AddSub(int_tag, &buf0[(47 - i) * kNumLanes], &buf1[(40 + i) * kNumLanes],
    817           &buf0[(47 - i) * kNumLanes], &buf0[(40 + i) * kNumLanes]);
    818  }
    819  for (size_t i = 0; i < 4; ++i) {
    820    AddSub(int_tag, &buf0[(48 + i) * kNumLanes], &buf1[(55 - i) * kNumLanes],
    821           &buf0[(48 + i) * kNumLanes], &buf0[(55 - i) * kNumLanes]);
    822  }
    823  for (size_t i = 0; i < 4; ++i) {
    824    AddSub(int_tag, &buf0[(63 - i) * kNumLanes], &buf1[(56 + i) * kNumLanes],
    825           &buf0[(63 - i) * kNumLanes], &buf0[(56 + i) * kNumLanes]);
    826  }
    827  // stage 6
    828  Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes],
    829            &buf0[1 * kNumLanes], &buf1[0 * kNumLanes], &buf1[1 * kNumLanes],
    830            cos_bit, round);
    831  Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes],
    832            &buf0[2 * kNumLanes], &buf1[2 * kNumLanes], &buf1[3 * kNumLanes],
    833            cos_bit, round);
    834  AddSub(int_tag, &buf1[4 * kNumLanes], &buf0[5 * kNumLanes],
    835         &buf1[4 * kNumLanes], &buf1[5 * kNumLanes]);
    836  AddSub(int_tag, &buf1[7 * kNumLanes], &buf0[6 * kNumLanes],
    837         &buf1[7 * kNumLanes], &buf1[6 * kNumLanes]);
    838  Butterfly(int_tag, -cospi[16], cospi[48], &buf0[9 * kNumLanes],
    839            &buf0[14 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes],
    840            cos_bit, round);
    841  Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[10 * kNumLanes],
    842            &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes],
    843            cos_bit, round);
    844  for (size_t i = 0; i < 2; ++i) {
    845    AddSub(int_tag, &buf1[(16 + i) * kNumLanes], &buf0[(19 - i) * kNumLanes],
    846           &buf1[(16 + i) * kNumLanes], &buf1[(19 - i) * kNumLanes]);
    847  }
    848  for (size_t i = 0; i < 2; ++i) {
    849    AddSub(int_tag, &buf1[(23 - i) * kNumLanes], &buf0[(20 + i) * kNumLanes],
    850           &buf1[(23 - i) * kNumLanes], &buf1[(20 + i) * kNumLanes]);
    851  }
    852  for (size_t i = 0; i < 2; ++i) {
    853    AddSub(int_tag, &buf1[(24 + i) * kNumLanes], &buf0[(27 - i) * kNumLanes],
    854           &buf1[(24 + i) * kNumLanes], &buf1[(27 - i) * kNumLanes]);
    855  }
    856  for (size_t i = 0; i < 2; ++i) {
    857    AddSub(int_tag, &buf1[(31 - i) * kNumLanes], &buf0[(28 + i) * kNumLanes],
    858           &buf1[(31 - i) * kNumLanes], &buf1[(28 + i) * kNumLanes]);
    859  }
    860  for (size_t i = 0; i < 2; ++i) {
    861    Butterfly(int_tag, -cospi[8], cospi[56], &buf0[(34 + i) * kNumLanes],
    862              &buf0[(61 - i) * kNumLanes], &buf1[(34 + i) * kNumLanes],
    863              &buf1[(61 - i) * kNumLanes], cos_bit, round);
    864  }
    865  for (size_t i = 2; i < 4; ++i) {
    866    Butterfly(int_tag, -cospi[56], -cospi[8], &buf0[(34 + i) * kNumLanes],
    867              &buf0[(61 - i) * kNumLanes], &buf1[(34 + i) * kNumLanes],
    868              &buf1[(61 - i) * kNumLanes], cos_bit, round);
    869  }
    870  for (size_t i = 0; i < 2; ++i) {
    871    Butterfly(int_tag, -cospi[40], cospi[24], &buf0[(42 + i) * kNumLanes],
    872              &buf0[(53 - i) * kNumLanes], &buf1[(42 + i) * kNumLanes],
    873              &buf1[(53 - i) * kNumLanes], cos_bit, round);
    874  }
    875  for (size_t i = 2; i < 4; ++i) {
    876    Butterfly(int_tag, -cospi[24], -cospi[40], &buf0[(42 + i) * kNumLanes],
    877              &buf0[(53 - i) * kNumLanes], &buf1[(42 + i) * kNumLanes],
    878              &buf1[(53 - i) * kNumLanes], cos_bit, round);
    879  }
    880  // stage 7
    881  Butterfly(int_tag, cospi[8], cospi[56], &buf1[7 * kNumLanes],
    882            &buf1[4 * kNumLanes], &buf0[4 * kNumLanes], &buf0[7 * kNumLanes],
    883            cos_bit, round);
    884  Butterfly(int_tag, cospi[40], cospi[24], &buf1[6 * kNumLanes],
    885            &buf1[5 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes],
    886            cos_bit, round);
    887  AddSub(int_tag, &buf0[8 * kNumLanes], &buf1[9 * kNumLanes],
    888         &buf0[8 * kNumLanes], &buf0[9 * kNumLanes]);
    889  AddSub(int_tag, &buf0[11 * kNumLanes], &buf1[10 * kNumLanes],
    890         &buf0[11 * kNumLanes], &buf0[10 * kNumLanes]);
    891  AddSub(int_tag, &buf0[12 * kNumLanes], &buf1[13 * kNumLanes],
    892         &buf0[12 * kNumLanes], &buf0[13 * kNumLanes]);
    893  AddSub(int_tag, &buf0[15 * kNumLanes], &buf1[14 * kNumLanes],
    894         &buf0[15 * kNumLanes], &buf0[14 * kNumLanes]);
    895  Butterfly(int_tag, -cospi[8], cospi[56], &buf1[17 * kNumLanes],
    896            &buf1[30 * kNumLanes], &buf0[17 * kNumLanes], &buf0[30 * kNumLanes],
    897            cos_bit, round);
    898  Butterfly(int_tag, -cospi[56], -cospi[8], &buf1[18 * kNumLanes],
    899            &buf1[29 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes],
    900            cos_bit, round);
    901  Butterfly(int_tag, -cospi[40], cospi[24], &buf1[21 * kNumLanes],
    902            &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes],
    903            cos_bit, round);
    904  Butterfly(int_tag, -cospi[24], -cospi[40], &buf1[22 * kNumLanes],
    905            &buf1[25 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes],
    906            cos_bit, round);
    907  for (size_t i = 0; i < 2; ++i) {
    908    AddSub(int_tag, &buf0[(32 + i) * kNumLanes], &buf1[(35 - i) * kNumLanes],
    909           &buf0[(32 + i) * kNumLanes], &buf0[(35 - i) * kNumLanes]);
    910  }
    911  for (size_t i = 0; i < 2; ++i) {
    912    AddSub(int_tag, &buf0[(39 - i) * kNumLanes], &buf1[(36 + i) * kNumLanes],
    913           &buf0[(39 - i) * kNumLanes], &buf0[(36 + i) * kNumLanes]);
    914  }
    915  for (size_t i = 0; i < 2; ++i) {
    916    AddSub(int_tag, &buf0[(40 + i) * kNumLanes], &buf1[(43 - i) * kNumLanes],
    917           &buf0[(40 + i) * kNumLanes], &buf0[(43 - i) * kNumLanes]);
    918  }
    919  for (size_t i = 0; i < 2; ++i) {
    920    AddSub(int_tag, &buf0[(47 - i) * kNumLanes], &buf1[(44 + i) * kNumLanes],
    921           &buf0[(47 - i) * kNumLanes], &buf0[(44 + i) * kNumLanes]);
    922  }
    923  for (size_t i = 0; i < 2; ++i) {
    924    AddSub(int_tag, &buf0[(48 + i) * kNumLanes], &buf1[(51 - i) * kNumLanes],
    925           &buf0[(48 + i) * kNumLanes], &buf0[(51 - i) * kNumLanes]);
    926  }
    927  for (size_t i = 0; i < 2; ++i) {
    928    AddSub(int_tag, &buf0[(55 - i) * kNumLanes], &buf1[(52 + i) * kNumLanes],
    929           &buf0[(55 - i) * kNumLanes], &buf0[(52 + i) * kNumLanes]);
    930  }
    931  for (size_t i = 0; i < 2; ++i) {
    932    AddSub(int_tag, &buf0[(56 + i) * kNumLanes], &buf1[(59 - i) * kNumLanes],
    933           &buf0[(56 + i) * kNumLanes], &buf0[(59 - i) * kNumLanes]);
    934  }
    935  for (size_t i = 0; i < 2; ++i) {
    936    AddSub(int_tag, &buf0[(63 - i) * kNumLanes], &buf1[(60 + i) * kNumLanes],
    937           &buf0[(63 - i) * kNumLanes], &buf0[(60 + i) * kNumLanes]);
    938  }
    939  // stage 8
    940  Butterfly(int_tag, cospi[4], cospi[60], &buf0[15 * kNumLanes],
    941            &buf0[8 * kNumLanes], &buf1[8 * kNumLanes], &buf1[15 * kNumLanes],
    942            cos_bit, round);
    943  Butterfly(int_tag, cospi[36], cospi[28], &buf0[14 * kNumLanes],
    944            &buf0[9 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes],
    945            cos_bit, round);
    946  Butterfly(int_tag, cospi[20], cospi[44], &buf0[13 * kNumLanes],
    947            &buf0[10 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes],
    948            cos_bit, round);
    949  Butterfly(int_tag, cospi[52], cospi[12], &buf0[12 * kNumLanes],
    950            &buf0[11 * kNumLanes], &buf1[11 * kNumLanes], &buf1[12 * kNumLanes],
    951            cos_bit, round);
    952  AddSub(int_tag, &buf1[16 * kNumLanes], &buf0[17 * kNumLanes],
    953         &buf1[16 * kNumLanes], &buf1[17 * kNumLanes]);
    954  AddSub(int_tag, &buf1[19 * kNumLanes], &buf0[18 * kNumLanes],
    955         &buf1[19 * kNumLanes], &buf1[18 * kNumLanes]);
    956  AddSub(int_tag, &buf1[20 * kNumLanes], &buf0[21 * kNumLanes],
    957         &buf1[20 * kNumLanes], &buf1[21 * kNumLanes]);
    958  AddSub(int_tag, &buf1[23 * kNumLanes], &buf0[22 * kNumLanes],
    959         &buf1[23 * kNumLanes], &buf1[22 * kNumLanes]);
    960  AddSub(int_tag, &buf1[24 * kNumLanes], &buf0[25 * kNumLanes],
    961         &buf1[24 * kNumLanes], &buf1[25 * kNumLanes]);
    962  AddSub(int_tag, &buf1[27 * kNumLanes], &buf0[26 * kNumLanes],
    963         &buf1[27 * kNumLanes], &buf1[26 * kNumLanes]);
    964  AddSub(int_tag, &buf1[28 * kNumLanes], &buf0[29 * kNumLanes],
    965         &buf1[28 * kNumLanes], &buf1[29 * kNumLanes]);
    966  AddSub(int_tag, &buf1[31 * kNumLanes], &buf0[30 * kNumLanes],
    967         &buf1[31 * kNumLanes], &buf1[30 * kNumLanes]);
    968  Butterfly(int_tag, -cospi[4], cospi[60], &buf0[33 * kNumLanes],
    969            &buf0[62 * kNumLanes], &buf1[33 * kNumLanes], &buf1[62 * kNumLanes],
    970            cos_bit, round);
    971  Butterfly(int_tag, -cospi[60], -cospi[4], &buf0[34 * kNumLanes],
    972            &buf0[61 * kNumLanes], &buf1[34 * kNumLanes], &buf1[61 * kNumLanes],
    973            cos_bit, round);
    974  Butterfly(int_tag, -cospi[36], cospi[28], &buf0[37 * kNumLanes],
    975            &buf0[58 * kNumLanes], &buf1[37 * kNumLanes], &buf1[58 * kNumLanes],
    976            cos_bit, round);
    977  Butterfly(int_tag, -cospi[28], -cospi[36], &buf0[38 * kNumLanes],
    978            &buf0[57 * kNumLanes], &buf1[38 * kNumLanes], &buf1[57 * kNumLanes],
    979            cos_bit, round);
    980  Butterfly(int_tag, -cospi[20], cospi[44], &buf0[41 * kNumLanes],
    981            &buf0[54 * kNumLanes], &buf1[41 * kNumLanes], &buf1[54 * kNumLanes],
    982            cos_bit, round);
    983  Butterfly(int_tag, -cospi[44], -cospi[20], &buf0[42 * kNumLanes],
    984            &buf0[53 * kNumLanes], &buf1[42 * kNumLanes], &buf1[53 * kNumLanes],
    985            cos_bit, round);
    986  Butterfly(int_tag, -cospi[52], cospi[12], &buf0[45 * kNumLanes],
    987            &buf0[50 * kNumLanes], &buf1[45 * kNumLanes], &buf1[50 * kNumLanes],
    988            cos_bit, round);
    989  Butterfly(int_tag, -cospi[12], -cospi[52], &buf0[46 * kNumLanes],
    990            &buf0[49 * kNumLanes], &buf1[46 * kNumLanes], &buf1[49 * kNumLanes],
    991            cos_bit, round);
    992  // stage 9
    993  Butterfly(int_tag, cospi[2], cospi[62], &buf1[31 * kNumLanes],
    994            &buf1[16 * kNumLanes], &buf0[16 * kNumLanes], &buf0[31 * kNumLanes],
    995            cos_bit, round);
    996  Butterfly(int_tag, cospi[34], cospi[30], &buf1[30 * kNumLanes],
    997            &buf1[17 * kNumLanes], &buf0[17 * kNumLanes], &buf0[30 * kNumLanes],
    998            cos_bit, round);
    999  Butterfly(int_tag, cospi[18], cospi[46], &buf1[29 * kNumLanes],
   1000            &buf1[18 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes],
   1001            cos_bit, round);
   1002  Butterfly(int_tag, cospi[50], cospi[14], &buf1[28 * kNumLanes],
   1003            &buf1[19 * kNumLanes], &buf0[19 * kNumLanes], &buf0[28 * kNumLanes],
   1004            cos_bit, round);
   1005  Butterfly(int_tag, cospi[10], cospi[54], &buf1[27 * kNumLanes],
   1006            &buf1[20 * kNumLanes], &buf0[20 * kNumLanes], &buf0[27 * kNumLanes],
   1007            cos_bit, round);
   1008  Butterfly(int_tag, cospi[42], cospi[22], &buf1[26 * kNumLanes],
   1009            &buf1[21 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes],
   1010            cos_bit, round);
   1011  Butterfly(int_tag, cospi[26], cospi[38], &buf1[25 * kNumLanes],
   1012            &buf1[22 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes],
   1013            cos_bit, round);
   1014  Butterfly(int_tag, cospi[58], cospi[6], &buf1[24 * kNumLanes],
   1015            &buf1[23 * kNumLanes], &buf0[23 * kNumLanes], &buf0[24 * kNumLanes],
   1016            cos_bit, round);
   1017  AddSub(int_tag, &buf0[32 * kNumLanes], &buf1[33 * kNumLanes],
   1018         &buf0[32 * kNumLanes], &buf0[33 * kNumLanes]);
   1019  AddSub(int_tag, &buf0[35 * kNumLanes], &buf1[34 * kNumLanes],
   1020         &buf0[35 * kNumLanes], &buf0[34 * kNumLanes]);
   1021  AddSub(int_tag, &buf0[36 * kNumLanes], &buf1[37 * kNumLanes],
   1022         &buf0[36 * kNumLanes], &buf0[37 * kNumLanes]);
   1023  AddSub(int_tag, &buf0[39 * kNumLanes], &buf1[38 * kNumLanes],
   1024         &buf0[39 * kNumLanes], &buf0[38 * kNumLanes]);
   1025  AddSub(int_tag, &buf0[40 * kNumLanes], &buf1[41 * kNumLanes],
   1026         &buf0[40 * kNumLanes], &buf0[41 * kNumLanes]);
   1027  AddSub(int_tag, &buf0[43 * kNumLanes], &buf1[42 * kNumLanes],
   1028         &buf0[43 * kNumLanes], &buf0[42 * kNumLanes]);
   1029  AddSub(int_tag, &buf0[44 * kNumLanes], &buf1[45 * kNumLanes],
   1030         &buf0[44 * kNumLanes], &buf0[45 * kNumLanes]);
   1031  AddSub(int_tag, &buf0[47 * kNumLanes], &buf1[46 * kNumLanes],
   1032         &buf0[47 * kNumLanes], &buf0[46 * kNumLanes]);
   1033  AddSub(int_tag, &buf0[48 * kNumLanes], &buf1[49 * kNumLanes],
   1034         &buf0[48 * kNumLanes], &buf0[49 * kNumLanes]);
   1035  AddSub(int_tag, &buf0[51 * kNumLanes], &buf1[50 * kNumLanes],
   1036         &buf0[51 * kNumLanes], &buf0[50 * kNumLanes]);
   1037  AddSub(int_tag, &buf0[52 * kNumLanes], &buf1[53 * kNumLanes],
   1038         &buf0[52 * kNumLanes], &buf0[53 * kNumLanes]);
   1039  AddSub(int_tag, &buf0[55 * kNumLanes], &buf1[54 * kNumLanes],
   1040         &buf0[55 * kNumLanes], &buf0[54 * kNumLanes]);
   1041  AddSub(int_tag, &buf0[56 * kNumLanes], &buf1[57 * kNumLanes],
   1042         &buf0[56 * kNumLanes], &buf0[57 * kNumLanes]);
   1043  AddSub(int_tag, &buf0[59 * kNumLanes], &buf1[58 * kNumLanes],
   1044         &buf0[59 * kNumLanes], &buf0[58 * kNumLanes]);
   1045  AddSub(int_tag, &buf0[60 * kNumLanes], &buf1[61 * kNumLanes],
   1046         &buf0[60 * kNumLanes], &buf0[61 * kNumLanes]);
   1047  AddSub(int_tag, &buf0[63 * kNumLanes], &buf1[62 * kNumLanes],
   1048         &buf0[63 * kNumLanes], &buf0[62 * kNumLanes]);
   1049  // stage 10
   1050  Butterfly(int_tag, cospi[1], cospi[63], &buf0[63 * kNumLanes],
   1051            &buf0[32 * kNumLanes], &buf1[32 * kNumLanes], &buf1[63 * kNumLanes],
   1052            cos_bit, round);
   1053  Butterfly(int_tag, cospi[33], cospi[31], &buf0[62 * kNumLanes],
   1054            &buf0[33 * kNumLanes], &buf1[33 * kNumLanes], &buf1[62 * kNumLanes],
   1055            cos_bit, round);
   1056  Butterfly(int_tag, cospi[17], cospi[47], &buf0[61 * kNumLanes],
   1057            &buf0[34 * kNumLanes], &buf1[34 * kNumLanes], &buf1[61 * kNumLanes],
   1058            cos_bit, round);
   1059  Butterfly(int_tag, cospi[49], cospi[15], &buf0[60 * kNumLanes],
   1060            &buf0[35 * kNumLanes], &buf1[35 * kNumLanes], &buf1[60 * kNumLanes],
   1061            cos_bit, round);
   1062  Butterfly(int_tag, cospi[9], cospi[55], &buf0[59 * kNumLanes],
   1063            &buf0[36 * kNumLanes], &buf1[36 * kNumLanes], &buf1[59 * kNumLanes],
   1064            cos_bit, round);
   1065  Butterfly(int_tag, cospi[41], cospi[23], &buf0[58 * kNumLanes],
   1066            &buf0[37 * kNumLanes], &buf1[37 * kNumLanes], &buf1[58 * kNumLanes],
   1067            cos_bit, round);
   1068  Butterfly(int_tag, cospi[25], cospi[39], &buf0[57 * kNumLanes],
   1069            &buf0[38 * kNumLanes], &buf1[38 * kNumLanes], &buf1[57 * kNumLanes],
   1070            cos_bit, round);
   1071  Butterfly(int_tag, cospi[57], cospi[7], &buf0[56 * kNumLanes],
   1072            &buf0[39 * kNumLanes], &buf1[39 * kNumLanes], &buf1[56 * kNumLanes],
   1073            cos_bit, round);
   1074  Butterfly(int_tag, cospi[05], cospi[59], &buf0[55 * kNumLanes],
   1075            &buf0[40 * kNumLanes], &buf1[40 * kNumLanes], &buf1[55 * kNumLanes],
   1076            cos_bit, round);
   1077  Butterfly(int_tag, cospi[37], cospi[27], &buf0[54 * kNumLanes],
   1078            &buf0[41 * kNumLanes], &buf1[41 * kNumLanes], &buf1[54 * kNumLanes],
   1079            cos_bit, round);
   1080  Butterfly(int_tag, cospi[21], cospi[43], &buf0[53 * kNumLanes],
   1081            &buf0[42 * kNumLanes], &buf1[42 * kNumLanes], &buf1[53 * kNumLanes],
   1082            cos_bit, round);
   1083  Butterfly(int_tag, cospi[53], cospi[11], &buf0[52 * kNumLanes],
   1084            &buf0[43 * kNumLanes], &buf1[43 * kNumLanes], &buf1[52 * kNumLanes],
   1085            cos_bit, round);
   1086  Butterfly(int_tag, cospi[13], cospi[51], &buf0[51 * kNumLanes],
   1087            &buf0[44 * kNumLanes], &buf1[44 * kNumLanes], &buf1[51 * kNumLanes],
   1088            cos_bit, round);
   1089  Butterfly(int_tag, cospi[45], cospi[19], &buf0[50 * kNumLanes],
   1090            &buf0[45 * kNumLanes], &buf1[45 * kNumLanes], &buf1[50 * kNumLanes],
   1091            cos_bit, round);
   1092  Butterfly(int_tag, cospi[29], cospi[35], &buf0[49 * kNumLanes],
   1093            &buf0[46 * kNumLanes], &buf1[46 * kNumLanes], &buf1[49 * kNumLanes],
   1094            cos_bit, round);
   1095  Butterfly(int_tag, cospi[61], cospi[3], &buf0[48 * kNumLanes],
   1096            &buf0[47 * kNumLanes], &buf1[47 * kNumLanes], &buf1[48 * kNumLanes],
   1097            cos_bit, round);
   1098 
   1099  // stage 11
   1100  hwy::CopyBytes<kNumBytes>(&buf1[0 * kNumLanes], &in[0 * OutStride]);
   1101  hwy::CopyBytes<kNumBytes>(&buf1[63 * kNumLanes], &in[63 * OutStride]);
   1102  hwy::CopyBytes<kNumBytes>(&buf1[32 * kNumLanes], &in[1 * OutStride]);
   1103  hwy::CopyBytes<kNumBytes>(&buf1[31 * kNumLanes], &in[62 * OutStride]);
   1104  hwy::CopyBytes<kNumBytes>(&buf0[16 * kNumLanes], &in[2 * OutStride]);
   1105  hwy::CopyBytes<kNumBytes>(&buf1[47 * kNumLanes], &in[61 * OutStride]);
   1106  hwy::CopyBytes<kNumBytes>(&buf1[48 * kNumLanes], &in[3 * OutStride]);
   1107  hwy::CopyBytes<kNumBytes>(&buf1[15 * kNumLanes], &in[60 * OutStride]);
   1108  hwy::CopyBytes<kNumBytes>(&buf1[8 * kNumLanes], &in[4 * OutStride]);
   1109  hwy::CopyBytes<kNumBytes>(&buf1[55 * kNumLanes], &in[59 * OutStride]);
   1110  hwy::CopyBytes<kNumBytes>(&buf1[40 * kNumLanes], &in[5 * OutStride]);
   1111  hwy::CopyBytes<kNumBytes>(&buf1[23 * kNumLanes], &in[58 * OutStride]);
   1112  hwy::CopyBytes<kNumBytes>(&buf0[24 * kNumLanes], &in[6 * OutStride]);
   1113  hwy::CopyBytes<kNumBytes>(&buf1[39 * kNumLanes], &in[57 * OutStride]);
   1114  hwy::CopyBytes<kNumBytes>(&buf1[56 * kNumLanes], &in[7 * OutStride]);
   1115  hwy::CopyBytes<kNumBytes>(&buf1[7 * kNumLanes], &in[56 * OutStride]);
   1116  hwy::CopyBytes<kNumBytes>(&buf0[4 * kNumLanes], &in[8 * OutStride]);
   1117  hwy::CopyBytes<kNumBytes>(&buf1[59 * kNumLanes], &in[55 * OutStride]);
   1118  hwy::CopyBytes<kNumBytes>(&buf1[36 * kNumLanes], &in[9 * OutStride]);
   1119  hwy::CopyBytes<kNumBytes>(&buf1[27 * kNumLanes], &in[54 * OutStride]);
   1120  hwy::CopyBytes<kNumBytes>(&buf0[20 * kNumLanes], &in[10 * OutStride]);
   1121  hwy::CopyBytes<kNumBytes>(&buf1[43 * kNumLanes], &in[53 * OutStride]);
   1122  hwy::CopyBytes<kNumBytes>(&buf1[52 * kNumLanes], &in[11 * OutStride]);
   1123  hwy::CopyBytes<kNumBytes>(&buf1[11 * kNumLanes], &in[52 * OutStride]);
   1124  hwy::CopyBytes<kNumBytes>(&buf1[12 * kNumLanes], &in[12 * OutStride]);
   1125  hwy::CopyBytes<kNumBytes>(&buf1[51 * kNumLanes], &in[51 * OutStride]);
   1126  hwy::CopyBytes<kNumBytes>(&buf1[44 * kNumLanes], &in[13 * OutStride]);
   1127  hwy::CopyBytes<kNumBytes>(&buf1[19 * kNumLanes], &in[50 * OutStride]);
   1128  hwy::CopyBytes<kNumBytes>(&buf0[28 * kNumLanes], &in[14 * OutStride]);
   1129  hwy::CopyBytes<kNumBytes>(&buf1[35 * kNumLanes], &in[49 * OutStride]);
   1130  hwy::CopyBytes<kNumBytes>(&buf1[60 * kNumLanes], &in[15 * OutStride]);
   1131  hwy::CopyBytes<kNumBytes>(&buf1[3 * kNumLanes], &in[48 * OutStride]);
   1132  hwy::CopyBytes<kNumBytes>(&buf1[2 * kNumLanes], &in[16 * OutStride]);
   1133  hwy::CopyBytes<kNumBytes>(&buf1[61 * kNumLanes], &in[47 * OutStride]);
   1134  hwy::CopyBytes<kNumBytes>(&buf1[34 * kNumLanes], &in[17 * OutStride]);
   1135  hwy::CopyBytes<kNumBytes>(&buf1[29 * kNumLanes], &in[46 * OutStride]);
   1136  hwy::CopyBytes<kNumBytes>(&buf0[18 * kNumLanes], &in[18 * OutStride]);
   1137  hwy::CopyBytes<kNumBytes>(&buf1[45 * kNumLanes], &in[45 * OutStride]);
   1138  hwy::CopyBytes<kNumBytes>(&buf1[50 * kNumLanes], &in[19 * OutStride]);
   1139  hwy::CopyBytes<kNumBytes>(&buf1[13 * kNumLanes], &in[44 * OutStride]);
   1140  hwy::CopyBytes<kNumBytes>(&buf1[10 * kNumLanes], &in[20 * OutStride]);
   1141  hwy::CopyBytes<kNumBytes>(&buf1[53 * kNumLanes], &in[43 * OutStride]);
   1142  hwy::CopyBytes<kNumBytes>(&buf1[42 * kNumLanes], &in[21 * OutStride]);
   1143  hwy::CopyBytes<kNumBytes>(&buf1[21 * kNumLanes], &in[42 * OutStride]);
   1144  hwy::CopyBytes<kNumBytes>(&buf0[26 * kNumLanes], &in[22 * OutStride]);
   1145  hwy::CopyBytes<kNumBytes>(&buf1[37 * kNumLanes], &in[41 * OutStride]);
   1146  hwy::CopyBytes<kNumBytes>(&buf1[58 * kNumLanes], &in[23 * OutStride]);
   1147  hwy::CopyBytes<kNumBytes>(&buf1[5 * kNumLanes], &in[40 * OutStride]);
   1148  hwy::CopyBytes<kNumBytes>(&buf0[6 * kNumLanes], &in[24 * OutStride]);
   1149  hwy::CopyBytes<kNumBytes>(&buf1[57 * kNumLanes], &in[39 * OutStride]);
   1150  hwy::CopyBytes<kNumBytes>(&buf1[38 * kNumLanes], &in[25 * OutStride]);
   1151  hwy::CopyBytes<kNumBytes>(&buf1[25 * kNumLanes], &in[38 * OutStride]);
   1152  hwy::CopyBytes<kNumBytes>(&buf0[22 * kNumLanes], &in[26 * OutStride]);
   1153  hwy::CopyBytes<kNumBytes>(&buf1[41 * kNumLanes], &in[37 * OutStride]);
   1154  hwy::CopyBytes<kNumBytes>(&buf1[54 * kNumLanes], &in[27 * OutStride]);
   1155  hwy::CopyBytes<kNumBytes>(&buf1[9 * kNumLanes], &in[36 * OutStride]);
   1156  hwy::CopyBytes<kNumBytes>(&buf1[14 * kNumLanes], &in[28 * OutStride]);
   1157  hwy::CopyBytes<kNumBytes>(&buf1[49 * kNumLanes], &in[35 * OutStride]);
   1158  hwy::CopyBytes<kNumBytes>(&buf1[46 * kNumLanes], &in[29 * OutStride]);
   1159  hwy::CopyBytes<kNumBytes>(&buf1[17 * kNumLanes], &in[34 * OutStride]);
   1160  hwy::CopyBytes<kNumBytes>(&buf0[30 * kNumLanes], &in[30 * OutStride]);
   1161  hwy::CopyBytes<kNumBytes>(&buf1[33 * kNumLanes], &in[33 * OutStride]);
   1162  hwy::CopyBytes<kNumBytes>(&buf1[62 * kNumLanes], &in[31 * OutStride]);
   1163  hwy::CopyBytes<kNumBytes>(&buf1[1 * kNumLanes], &in[32 * OutStride]);
   1164 }
   1165 
   1166 template <size_t LaneSize, size_t NumLanes>
   1167 struct Fadst4Traits {
   1168  template <size_t Width, typename D>
   1169  HWY_ATTR HWY_INLINE static void Fadst4(D int_tag,
   1170                                         hn::TFromD<D> *HWY_RESTRICT in,
   1171                                         const int8_t cos_bit,
   1172                                         const size_t instride) {
   1173    const int32_t *HWY_RESTRICT const sinpi = sinpi_arr(cos_bit);
   1174    const auto round = hn::Set(int_tag, 1 << (cos_bit - 1));
   1175    const auto sinpi1 = hn::Set(int_tag, sinpi[1]);
   1176    const auto sinpi2 = hn::Set(int_tag, sinpi[2]);
   1177    const auto sinpi3 = hn::Set(int_tag, sinpi[3]);
   1178    const auto sinpi4 = hn::Set(int_tag, sinpi[4]);
   1179    const auto in0 = hn::Load(int_tag, &in[0 * instride]);
   1180    const auto in1 = hn::Load(int_tag, &in[1 * instride]);
   1181    const auto in2 = hn::Load(int_tag, &in[2 * instride]);
   1182    const auto in3 = hn::Load(int_tag, &in[3 * instride]);
   1183    auto s0 = hn::Mul(in0, sinpi1);
   1184    auto s1 = hn::Mul(in0, sinpi4);
   1185    auto s2 = hn::Mul(in1, sinpi2);
   1186    auto s3 = hn::Mul(in1, sinpi1);
   1187    auto s4 = hn::Mul(in2, sinpi3);
   1188    auto s5 = hn::Mul(in3, sinpi4);
   1189    auto s6 = hn::Mul(in3, sinpi2);
   1190    auto s7 = hn::Sub(hn::Add(in0, in1), in3);
   1191    auto x0 = hn::Add(hn::Add(s0, s2), s5);
   1192    auto x1 = hn::Mul(s7, sinpi3);
   1193    auto x2 = hn::Add(hn::Sub(s1, s3), s6);
   1194    auto x3 = s4;
   1195    s0 = hn::Add(x0, x3);
   1196    s1 = x1;
   1197    s2 = hn::Sub(x2, x3);
   1198    s3 = hn::Add(hn::Sub(x2, x0), x3);
   1199    auto u0 = hn::Add(s0, round);
   1200    u0 = hn::ShiftRightSame(u0, cos_bit);
   1201    auto u1 = hn::Add(s1, round);
   1202    u1 = hn::ShiftRightSame(u1, cos_bit);
   1203    auto u2 = hn::Add(s2, round);
   1204    u2 = hn::ShiftRightSame(u2, cos_bit);
   1205    auto u3 = hn::Add(s3, round);
   1206    u3 = hn::ShiftRightSame(u3, cos_bit);
   1207    hn::Store(u0, int_tag, &in[0 * instride]);
   1208    hn::Store(u1, int_tag, &in[1 * instride]);
   1209    hn::Store(u2, int_tag, &in[2 * instride]);
   1210    hn::Store(u3, int_tag, &in[3 * instride]);
   1211  }
   1212 };
   1213 
   1214 template <>
   1215 struct Fadst4Traits<2, 4> {
   1216  template <size_t Width, typename D>
   1217  HWY_ATTR HWY_INLINE static void Fadst4(D int_tag,
   1218                                         hn::TFromD<D> *HWY_RESTRICT in,
   1219                                         const int8_t cos_bit,
   1220                                         const size_t instride) {
   1221    (void)int_tag;
   1222    const int32_t *HWY_RESTRICT const sinpi = sinpi_arr(cos_bit);
   1223    constexpr hn::FixedTag<hn::TFromD<D>, 8> demote_tag;
   1224    constexpr hn::RepartitionToWide<decltype(demote_tag)> int32_tag;
   1225    const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1));
   1226    const auto sinpi_p01_p02 = SetPair(demote_tag, sinpi[1], sinpi[2]);
   1227    const auto sinpi_p04_m01 = SetPair(demote_tag, sinpi[4], -sinpi[1]);
   1228    const auto sinpi_p03_p04 = SetPair(demote_tag, sinpi[3], sinpi[4]);
   1229    const auto sinpi_m03_p02 = SetPair(demote_tag, -sinpi[3], sinpi[2]);
   1230    const auto sinpi_p03_p03 = hn::Set(demote_tag, sinpi[3]);
   1231    const auto in0 = hn::Load(demote_tag, &in[0 * instride]);
   1232    const auto in1 = hn::Load(demote_tag, &in[1 * instride]);
   1233    const auto in2 = hn::Load(demote_tag, &in[2 * instride]);
   1234    const auto in3 = hn::Load(demote_tag, &in[3 * instride]);
   1235    const auto in7 = hn::Add(in0, in1);
   1236    auto u0 = hn::InterleaveLower(in0, in1);
   1237    auto u1 = hn::InterleaveLower(in2, in3);
   1238    auto u2 = hn::InterleaveLower(in7, hn::Zero(demote_tag));
   1239    auto u3 = hn::InterleaveLower(in2, hn::Zero(demote_tag));
   1240    auto u4 = hn::InterleaveLower(in3, hn::Zero(demote_tag));
   1241    auto v0 = hn::WidenMulPairwiseAdd(int32_tag, u0, sinpi_p01_p02);  // s0 + s2
   1242    auto v1 = hn::WidenMulPairwiseAdd(int32_tag, u1, sinpi_p03_p04);  // s4 + s5
   1243    auto v2 = hn::WidenMulPairwiseAdd(int32_tag, u2, sinpi_p03_p03);  // x1
   1244    auto v3 = hn::WidenMulPairwiseAdd(int32_tag, u0, sinpi_p04_m01);  // s1 - s3
   1245    auto v4 =
   1246        hn::WidenMulPairwiseAdd(int32_tag, u1, sinpi_m03_p02);  // -s4 + s6
   1247    auto v5 = hn::WidenMulPairwiseAdd(int32_tag, u3, sinpi_p03_p03);  // s4
   1248    auto v6 = hn::WidenMulPairwiseAdd(int32_tag, u4, sinpi_p03_p03);
   1249    auto w0 = hn::Add(v0, v1);
   1250    auto w1 = hn::Sub(v2, v6);
   1251    auto w2 = hn::Add(v3, v4);
   1252    auto w3 = hn::Sub(w2, w0);
   1253    auto w4 = hn::ShiftLeft<2>(v5);
   1254    auto w5 = hn::Sub(w4, v5);
   1255    auto w6 = hn::Add(w3, w5);
   1256    v0 = hn::Add(w0, round);
   1257    v1 = hn::Add(w1, round);
   1258    v2 = hn::Add(w2, round);
   1259    v3 = hn::Add(w6, round);
   1260    w0 = hn::ShiftRightSame(v0, cos_bit);
   1261    w1 = hn::ShiftRightSame(v1, cos_bit);
   1262    w2 = hn::ShiftRightSame(v2, cos_bit);
   1263    w3 = hn::ShiftRightSame(v3, cos_bit);
   1264    auto o0 = hn::ReorderDemote2To(demote_tag, w0, w2);
   1265    auto o1 = hn::ReorderDemote2To(demote_tag, w1, w3);
   1266    hn::Store(o0, demote_tag, &in[0 * instride]);
   1267    hn::Store(o1, demote_tag, &in[1 * instride]);
   1268    hn::Store(hn::ShiftRightLanes<4>(demote_tag, o0), demote_tag,
   1269              &in[2 * instride]);
   1270    hn::Store(hn::ShiftRightLanes<4>(demote_tag, o1), demote_tag,
   1271              &in[3 * instride]);
   1272  }
   1273 };
   1274 
   1275 template <size_t NumLanes>
   1276 struct Fadst4Traits<2, NumLanes> {
   1277  template <size_t Width, typename D>
   1278  HWY_ATTR HWY_INLINE static void Fadst4(D int_tag,
   1279                                         hn::TFromD<D> *HWY_RESTRICT in,
   1280                                         const int8_t cos_bit,
   1281                                         const size_t instride) {
   1282    const int32_t *HWY_RESTRICT const sinpi = sinpi_arr(cos_bit);
   1283    constexpr hn::RepartitionToWide<D> int32_tag;
   1284    const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1));
   1285    const auto sinpi_p01_p02 = SetPair(int_tag, sinpi[1], sinpi[2]);
   1286    const auto sinpi_p04_m01 = SetPair(int_tag, sinpi[4], -sinpi[1]);
   1287    const auto sinpi_p03_p04 = SetPair(int_tag, sinpi[3], sinpi[4]);
   1288    const auto sinpi_m03_p02 = SetPair(int_tag, -sinpi[3], sinpi[2]);
   1289    const auto sinpi_p03_p03 = hn::Set(int_tag, sinpi[3]);
   1290    const auto in0 = hn::Load(int_tag, &in[0 * instride]);
   1291    const auto in1 = hn::Load(int_tag, &in[1 * instride]);
   1292    const auto in2 = hn::Load(int_tag, &in[2 * instride]);
   1293    const auto in3 = hn::Load(int_tag, &in[3 * instride]);
   1294    const auto in7 = hn::Add(in0, in1);
   1295    auto ul0 = hn::InterleaveLower(int_tag, in0, in1);
   1296    auto uh0 = hn::InterleaveUpper(int_tag, in0, in1);
   1297    auto ul1 = hn::InterleaveLower(int_tag, in2, in3);
   1298    auto uh1 = hn::InterleaveUpper(int_tag, in2, in3);
   1299    auto ul2 = hn::InterleaveLower(int_tag, in7, hn::Zero(int_tag));
   1300    auto uh2 = hn::InterleaveUpper(int_tag, in7, hn::Zero(int_tag));
   1301    auto ul3 = hn::InterleaveLower(int_tag, in2, hn::Zero(int_tag));
   1302    auto uh3 = hn::InterleaveUpper(int_tag, in2, hn::Zero(int_tag));
   1303    auto ul4 = hn::InterleaveLower(int_tag, in3, hn::Zero(int_tag));
   1304    auto uh4 = hn::InterleaveUpper(int_tag, in3, hn::Zero(int_tag));
   1305    auto vl0 =
   1306        hn::WidenMulPairwiseAdd(int32_tag, ul0, sinpi_p01_p02);  // s0 + s2
   1307    auto vh0 =
   1308        hn::WidenMulPairwiseAdd(int32_tag, uh0, sinpi_p01_p02);  // s0 + s2
   1309    auto vl1 =
   1310        hn::WidenMulPairwiseAdd(int32_tag, ul1, sinpi_p03_p04);  // s4 + s5
   1311    auto vh1 =
   1312        hn::WidenMulPairwiseAdd(int32_tag, uh1, sinpi_p03_p04);  // s4 + s5
   1313    auto vl2 = hn::WidenMulPairwiseAdd(int32_tag, ul2, sinpi_p03_p03);  // x1
   1314    auto vh2 = hn::WidenMulPairwiseAdd(int32_tag, uh2, sinpi_p03_p03);  // x1
   1315    auto vl3 =
   1316        hn::WidenMulPairwiseAdd(int32_tag, ul0, sinpi_p04_m01);  // s1 - s3
   1317    auto vh3 =
   1318        hn::WidenMulPairwiseAdd(int32_tag, uh0, sinpi_p04_m01);  // s1 - s3
   1319    auto vl4 =
   1320        hn::WidenMulPairwiseAdd(int32_tag, ul1, sinpi_m03_p02);  // -s4 + s6
   1321    auto vh4 =
   1322        hn::WidenMulPairwiseAdd(int32_tag, uh1, sinpi_m03_p02);  // -s4 + s6
   1323    auto vl5 = hn::WidenMulPairwiseAdd(int32_tag, ul3, sinpi_p03_p03);  // s4
   1324    auto vh5 = hn::WidenMulPairwiseAdd(int32_tag, uh3, sinpi_p03_p03);  // s4
   1325    auto vl6 = hn::WidenMulPairwiseAdd(int32_tag, ul4, sinpi_p03_p03);
   1326    auto vh6 = hn::WidenMulPairwiseAdd(int32_tag, uh4, sinpi_p03_p03);
   1327    auto wl0 = hn::Add(vl0, vl1);
   1328    auto wh0 = hn::Add(vh0, vh1);
   1329    auto wl1 = hn::Sub(vl2, vl6);
   1330    auto wh1 = hn::Sub(vh2, vh6);
   1331    auto wl2 = hn::Add(vl3, vl4);
   1332    auto wh2 = hn::Add(vh3, vh4);
   1333    auto wl3 = hn::Sub(wl2, wl0);
   1334    auto wh3 = hn::Sub(wh2, wh0);
   1335    auto wl4 = hn::ShiftLeft<2>(vl5);
   1336    auto wh4 = hn::ShiftLeft<2>(vh5);
   1337    auto wl5 = hn::Sub(wl4, vl5);
   1338    auto wh5 = hn::Sub(wh4, vh5);
   1339    auto wl6 = hn::Add(wl3, wl5);
   1340    auto wh6 = hn::Add(wh3, wh5);
   1341    vl0 = hn::Add(wl0, round);
   1342    vh0 = hn::Add(wh0, round);
   1343    vl1 = hn::Add(wl1, round);
   1344    vh1 = hn::Add(wh1, round);
   1345    vl2 = hn::Add(wl2, round);
   1346    vh2 = hn::Add(wh2, round);
   1347    vl3 = hn::Add(wl6, round);
   1348    vh3 = hn::Add(wh6, round);
   1349    wl0 = hn::ShiftRightSame(vl0, cos_bit);
   1350    wh0 = hn::ShiftRightSame(vh0, cos_bit);
   1351    wl1 = hn::ShiftRightSame(vl1, cos_bit);
   1352    wh1 = hn::ShiftRightSame(vh1, cos_bit);
   1353    wl2 = hn::ShiftRightSame(vl2, cos_bit);
   1354    wh2 = hn::ShiftRightSame(vh2, cos_bit);
   1355    wl3 = hn::ShiftRightSame(vl3, cos_bit);
   1356    wh3 = hn::ShiftRightSame(vh3, cos_bit);
   1357    auto o0 = hn::ReorderDemote2To(int_tag, wl0, wh0);
   1358    auto o1 = hn::ReorderDemote2To(int_tag, wl1, wh1);
   1359    auto o2 = hn::ReorderDemote2To(int_tag, wl2, wh2);
   1360    auto o3 = hn::ReorderDemote2To(int_tag, wl3, wh3);
   1361    hn::Store(o0, int_tag, &in[0 * instride]);
   1362    hn::Store(o1, int_tag, &in[1 * instride]);
   1363    hn::Store(o2, int_tag, &in[2 * instride]);
   1364    hn::Store(o3, int_tag, &in[3 * instride]);
   1365  }
   1366 };
   1367 
   1368 template <size_t Width, typename D>
   1369 HWY_ATTR HWY_INLINE void Fadst4(D int_tag, hn::TFromD<D> *HWY_RESTRICT in,
   1370                                const int8_t cos_bit, const size_t instride) {
   1371  Fadst4Traits<sizeof(hn::TFromD<D>),
   1372               hn::MaxLanes(int_tag)>::template Fadst4<Width>(int_tag, in,
   1373                                                              cos_bit,
   1374                                                              instride);
   1375 }
   1376 
   1377 template <size_t Width, typename D>
   1378 HWY_ATTR HWY_INLINE void Fadst8(D int_tag, hn::TFromD<D> *HWY_RESTRICT in,
   1379                                const int8_t cos_bit, const size_t instride) {
   1380  constexpr size_t kNumLanes = hn::MaxLanes(int_tag);
   1381  constexpr size_t kNumBytes = kNumLanes * sizeof(hn::TFromD<D>);
   1382  HWY_ALIGN_MAX hn::TFromD<D> buf0[8 * kNumLanes];
   1383  HWY_ALIGN_MAX hn::TFromD<D> buf1[8 * kNumLanes];
   1384  const int32_t *HWY_RESTRICT cospi = cospi_arr(cos_bit);
   1385  const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1));
   1386 
   1387  // stage 0
   1388  // stage 1
   1389  hn::Store(hn::Load(int_tag, &in[0 * instride]), int_tag,
   1390            &buf0[0 * kNumLanes]);
   1391  hn::Store(hn::Neg(hn::Load(int_tag, &in[7 * instride])), int_tag,
   1392            &buf0[1 * kNumLanes]);
   1393  hn::Store(hn::Neg(hn::Load(int_tag, &in[3 * instride])), int_tag,
   1394            &buf0[2 * kNumLanes]);
   1395  hn::Store(hn::Load(int_tag, &in[4 * instride]), int_tag,
   1396            &buf0[3 * kNumLanes]);
   1397  hn::Store(hn::Neg(hn::Load(int_tag, &in[1 * instride])), int_tag,
   1398            &buf0[4 * kNumLanes]);
   1399  hn::Store(hn::Load(int_tag, &in[6 * instride]), int_tag,
   1400            &buf0[5 * kNumLanes]);
   1401  hn::Store(hn::Load(int_tag, &in[2 * instride]), int_tag,
   1402            &buf0[6 * kNumLanes]);
   1403  hn::Store(hn::Neg(hn::Load(int_tag, &in[5 * instride])), int_tag,
   1404            &buf0[7 * kNumLanes]);
   1405 
   1406  // stage 2
   1407  hwy::CopyBytes<2 * kNumBytes>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]);
   1408  Butterfly(int_tag, cospi[32], cospi[32], &buf0[2 * kNumLanes],
   1409            &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], &buf1[3 * kNumLanes],
   1410            cos_bit, round);
   1411  hwy::CopyBytes<2 * kNumBytes>(&buf0[4 * kNumLanes], &buf1[4 * kNumLanes]);
   1412  Butterfly(int_tag, cospi[32], cospi[32], &buf0[6 * kNumLanes],
   1413            &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], &buf1[7 * kNumLanes],
   1414            cos_bit, round);
   1415 
   1416  // stage 3
   1417  for (size_t j = 0; j < 8; j += 4) {
   1418    for (size_t i = 0; i < 2; ++i) {
   1419      AddSub(int_tag, &buf1[(0 + i + j) * kNumLanes],
   1420             &buf1[(2 + i + j) * kNumLanes], &buf0[(0 + i + j) * kNumLanes],
   1421             &buf0[(2 + i + j) * kNumLanes]);
   1422    }
   1423  }
   1424 
   1425  // stage 4
   1426  hwy::CopyBytes<4 * kNumBytes>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]);
   1427  HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[4 * kNumLanes],
   1428                &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round);
   1429  HalfButterfly(int_tag, cospi[48], -cospi[16], &buf0[4 * kNumLanes],
   1430                &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round);
   1431  HalfButterfly(int_tag, -cospi[48], cospi[16], &buf0[6 * kNumLanes],
   1432                &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round);
   1433  HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[6 * kNumLanes],
   1434                &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round);
   1435 
   1436  // stage 5
   1437  for (size_t i = 0; i < 4; ++i) {
   1438    AddSub(int_tag, &buf1[(0 + i) * kNumLanes], &buf1[(4 + i) * kNumLanes],
   1439           &buf0[(0 + i) * kNumLanes], &buf0[(4 + i) * kNumLanes]);
   1440  }
   1441 
   1442  // stage 6
   1443  HalfButterfly(int_tag, cospi[4], cospi[60], &buf0[0 * kNumLanes],
   1444                &buf0[1 * kNumLanes], &buf1[0 * kNumLanes], cos_bit, round);
   1445  HalfButterfly(int_tag, cospi[60], -cospi[4], &buf0[0 * kNumLanes],
   1446                &buf0[1 * kNumLanes], &buf1[1 * kNumLanes], cos_bit, round);
   1447  HalfButterfly(int_tag, cospi[20], cospi[44], &buf0[2 * kNumLanes],
   1448                &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], cos_bit, round);
   1449  HalfButterfly(int_tag, cospi[44], -cospi[20], &buf0[2 * kNumLanes],
   1450                &buf0[3 * kNumLanes], &buf1[3 * kNumLanes], cos_bit, round);
   1451  HalfButterfly(int_tag, cospi[36], cospi[28], &buf0[4 * kNumLanes],
   1452                &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round);
   1453  HalfButterfly(int_tag, cospi[28], -cospi[36], &buf0[4 * kNumLanes],
   1454                &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round);
   1455  HalfButterfly(int_tag, cospi[52], cospi[12], &buf0[6 * kNumLanes],
   1456                &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round);
   1457  HalfButterfly(int_tag, cospi[12], -cospi[52], &buf0[6 * kNumLanes],
   1458                &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round);
   1459 
   1460  // stage 7
   1461  hwy::CopyBytes<kNumBytes>(&buf1[1 * kNumLanes], &in[0 * instride]);
   1462  hwy::CopyBytes<kNumBytes>(&buf1[6 * kNumLanes], &in[1 * instride]);
   1463  hwy::CopyBytes<kNumBytes>(&buf1[3 * kNumLanes], &in[2 * instride]);
   1464  hwy::CopyBytes<kNumBytes>(&buf1[4 * kNumLanes], &in[3 * instride]);
   1465  hwy::CopyBytes<kNumBytes>(&buf1[5 * kNumLanes], &in[4 * instride]);
   1466  hwy::CopyBytes<kNumBytes>(&buf1[2 * kNumLanes], &in[5 * instride]);
   1467  hwy::CopyBytes<kNumBytes>(&buf1[7 * kNumLanes], &in[6 * instride]);
   1468  hwy::CopyBytes<kNumBytes>(&buf1[0 * kNumLanes], &in[7 * instride]);
   1469 }
   1470 
   1471 template <size_t Width, typename D>
   1472 HWY_ATTR HWY_INLINE void Fadst16(D int_tag, hn::TFromD<D> *HWY_RESTRICT in,
   1473                                 const int8_t cos_bit, const size_t instride) {
   1474  constexpr size_t kNumLanes = hn::MaxLanes(int_tag);
   1475  constexpr size_t kNumBytes = kNumLanes * sizeof(hn::TFromD<D>);
   1476  HWY_ALIGN_MAX hn::TFromD<D> buf0[16 * kNumLanes];
   1477  HWY_ALIGN_MAX hn::TFromD<D> buf1[16 * kNumLanes];
   1478  const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit);
   1479  const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1));
   1480 
   1481  // stage 0
   1482  // stage 1
   1483  hn::Store(hn::Load(int_tag, &in[0 * instride]), int_tag,
   1484            &buf0[0 * kNumLanes]);
   1485  hn::Store(hn::Neg(hn::Load(int_tag, &in[15 * instride])), int_tag,
   1486            &buf0[1 * kNumLanes]);
   1487  hn::Store(hn::Neg(hn::Load(int_tag, &in[7 * instride])), int_tag,
   1488            &buf0[2 * kNumLanes]);
   1489  hn::Store(hn::Load(int_tag, &in[8 * instride]), int_tag,
   1490            &buf0[3 * kNumLanes]);
   1491  hn::Store(hn::Neg(hn::Load(int_tag, &in[3 * instride])), int_tag,
   1492            &buf0[4 * kNumLanes]);
   1493  hn::Store(hn::Load(int_tag, &in[12 * instride]), int_tag,
   1494            &buf0[5 * kNumLanes]);
   1495  hn::Store(hn::Load(int_tag, &in[4 * instride]), int_tag,
   1496            &buf0[6 * kNumLanes]);
   1497  hn::Store(hn::Neg(hn::Load(int_tag, &in[11 * instride])), int_tag,
   1498            &buf0[7 * kNumLanes]);
   1499  hn::Store(hn::Neg(hn::Load(int_tag, &in[1 * instride])), int_tag,
   1500            &buf0[8 * kNumLanes]);
   1501  hn::Store(hn::Load(int_tag, &in[14 * instride]), int_tag,
   1502            &buf0[9 * kNumLanes]);
   1503  hn::Store(hn::Load(int_tag, &in[6 * instride]), int_tag,
   1504            &buf0[10 * kNumLanes]);
   1505  hn::Store(hn::Neg(hn::Load(int_tag, &in[9 * instride])), int_tag,
   1506            &buf0[11 * kNumLanes]);
   1507  hn::Store(hn::Load(int_tag, &in[2 * instride]), int_tag,
   1508            &buf0[12 * kNumLanes]);
   1509  hn::Store(hn::Neg(hn::Load(int_tag, &in[13 * instride])), int_tag,
   1510            &buf0[13 * kNumLanes]);
   1511  hn::Store(hn::Neg(hn::Load(int_tag, &in[5 * instride])), int_tag,
   1512            &buf0[14 * kNumLanes]);
   1513  hn::Store(hn::Load(int_tag, &in[10 * instride]), int_tag,
   1514            &buf0[15 * kNumLanes]);
   1515 
   1516  // stage 2
   1517  hwy::CopyBytes<kNumBytes * 2>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]);
   1518  Butterfly(int_tag, cospi[32], cospi[32], &buf0[2 * kNumLanes],
   1519            &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], &buf1[3 * kNumLanes],
   1520            cos_bit, round);
   1521  hwy::CopyBytes<kNumBytes * 2>(&buf0[4 * kNumLanes], &buf1[4 * kNumLanes]);
   1522  Butterfly(int_tag, cospi[32], cospi[32], &buf0[6 * kNumLanes],
   1523            &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], &buf1[7 * kNumLanes],
   1524            cos_bit, round);
   1525  hwy::CopyBytes<kNumBytes * 2>(&buf0[8 * kNumLanes], &buf1[8 * kNumLanes]);
   1526  Butterfly(int_tag, cospi[32], cospi[32], &buf0[10 * kNumLanes],
   1527            &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], &buf1[11 * kNumLanes],
   1528            cos_bit, round);
   1529  hwy::CopyBytes<kNumBytes * 2>(&buf0[12 * kNumLanes], &buf1[12 * kNumLanes]);
   1530  Butterfly(int_tag, cospi[32], cospi[32], &buf0[14 * kNumLanes],
   1531            &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], &buf1[15 * kNumLanes],
   1532            cos_bit, round);
   1533 
   1534  // stage 3
   1535  for (size_t j = 0; j < 16; j += 4) {
   1536    for (size_t i = 0; i < 2; ++i) {
   1537      AddSub(int_tag, &buf1[(0 + i + j) * kNumLanes],
   1538             &buf1[(2 + i + j) * kNumLanes], &buf0[(0 + i + j) * kNumLanes],
   1539             &buf0[(2 + i + j) * kNumLanes]);
   1540    }
   1541  }
   1542 
   1543  // stage 4
   1544  hwy::CopyBytes<kNumBytes * 4>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]);
   1545  HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[4 * kNumLanes],
   1546                &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round);
   1547  HalfButterfly(int_tag, cospi[48], -cospi[16], &buf0[4 * kNumLanes],
   1548                &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round);
   1549  HalfButterfly(int_tag, -cospi[48], cospi[16], &buf0[6 * kNumLanes],
   1550                &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round);
   1551  HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[6 * kNumLanes],
   1552                &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round);
   1553  hwy::CopyBytes<kNumBytes * 4>(&buf0[8 * kNumLanes], &buf1[8 * kNumLanes]);
   1554  HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[12 * kNumLanes],
   1555                &buf0[13 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round);
   1556  HalfButterfly(int_tag, cospi[48], -cospi[16], &buf0[12 * kNumLanes],
   1557                &buf0[13 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round);
   1558  HalfButterfly(int_tag, -cospi[48], cospi[16], &buf0[14 * kNumLanes],
   1559                &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round);
   1560  HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[14 * kNumLanes],
   1561                &buf0[15 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round);
   1562 
   1563  // stage 5
   1564  for (size_t j = 0; j < 16; j += 8) {
   1565    for (size_t i = 0; i < 4; ++i) {
   1566      AddSub(int_tag, &buf1[(0 + i + j) * kNumLanes],
   1567             &buf1[(4 + i + j) * kNumLanes], &buf0[(0 + i + j) * kNumLanes],
   1568             &buf0[(4 + i + j) * kNumLanes]);
   1569    }
   1570  }
   1571 
   1572  // stage 6
   1573  hwy::CopyBytes<kNumBytes * 8>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]);
   1574  HalfButterfly(int_tag, cospi[8], cospi[56], &buf0[8 * kNumLanes],
   1575                &buf0[9 * kNumLanes], &buf1[8 * kNumLanes], cos_bit, round);
   1576  HalfButterfly(int_tag, cospi[56], -cospi[8], &buf0[8 * kNumLanes],
   1577                &buf0[9 * kNumLanes], &buf1[9 * kNumLanes], cos_bit, round);
   1578  HalfButterfly(int_tag, cospi[40], cospi[24], &buf0[10 * kNumLanes],
   1579                &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], cos_bit, round);
   1580  HalfButterfly(int_tag, cospi[24], -cospi[40], &buf0[10 * kNumLanes],
   1581                &buf0[11 * kNumLanes], &buf1[11 * kNumLanes], cos_bit, round);
   1582  HalfButterfly(int_tag, -cospi[56], cospi[8], &buf0[12 * kNumLanes],
   1583                &buf0[13 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round);
   1584  HalfButterfly(int_tag, cospi[8], cospi[56], &buf0[12 * kNumLanes],
   1585                &buf0[13 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round);
   1586  HalfButterfly(int_tag, -cospi[24], cospi[40], &buf0[14 * kNumLanes],
   1587                &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round);
   1588  HalfButterfly(int_tag, cospi[40], cospi[24], &buf0[14 * kNumLanes],
   1589                &buf0[15 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round);
   1590 
   1591  // stage 7
   1592  for (size_t i = 0; i < 8; ++i) {
   1593    AddSub(int_tag, &buf1[(0 + i) * kNumLanes], &buf1[(8 + i) * kNumLanes],
   1594           &buf0[(0 + i) * kNumLanes], &buf0[(8 + i) * kNumLanes]);
   1595  }
   1596 
   1597  // stage 8
   1598  HalfButterfly(int_tag, cospi[2], cospi[62], &buf0[0 * kNumLanes],
   1599                &buf0[1 * kNumLanes], &buf1[0 * kNumLanes], cos_bit, round);
   1600  HalfButterfly(int_tag, cospi[62], -cospi[2], &buf0[0 * kNumLanes],
   1601                &buf0[1 * kNumLanes], &buf1[1 * kNumLanes], cos_bit, round);
   1602  HalfButterfly(int_tag, cospi[10], cospi[54], &buf0[2 * kNumLanes],
   1603                &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], cos_bit, round);
   1604  HalfButterfly(int_tag, cospi[54], -cospi[10], &buf0[2 * kNumLanes],
   1605                &buf0[3 * kNumLanes], &buf1[3 * kNumLanes], cos_bit, round);
   1606  HalfButterfly(int_tag, cospi[18], cospi[46], &buf0[4 * kNumLanes],
   1607                &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round);
   1608  HalfButterfly(int_tag, cospi[46], -cospi[18], &buf0[4 * kNumLanes],
   1609                &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round);
   1610  HalfButterfly(int_tag, cospi[26], cospi[38], &buf0[6 * kNumLanes],
   1611                &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round);
   1612  HalfButterfly(int_tag, cospi[38], -cospi[26], &buf0[6 * kNumLanes],
   1613                &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round);
   1614  HalfButterfly(int_tag, cospi[34], cospi[30], &buf0[8 * kNumLanes],
   1615                &buf0[9 * kNumLanes], &buf1[8 * kNumLanes], cos_bit, round);
   1616  HalfButterfly(int_tag, cospi[30], -cospi[34], &buf0[8 * kNumLanes],
   1617                &buf0[9 * kNumLanes], &buf1[9 * kNumLanes], cos_bit, round);
   1618  HalfButterfly(int_tag, cospi[42], cospi[22], &buf0[10 * kNumLanes],
   1619                &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], cos_bit, round);
   1620  HalfButterfly(int_tag, cospi[22], -cospi[42], &buf0[10 * kNumLanes],
   1621                &buf0[11 * kNumLanes], &buf1[11 * kNumLanes], cos_bit, round);
   1622  HalfButterfly(int_tag, cospi[50], cospi[14], &buf0[12 * kNumLanes],
   1623                &buf0[13 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round);
   1624  HalfButterfly(int_tag, cospi[14], -cospi[50], &buf0[12 * kNumLanes],
   1625                &buf0[13 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round);
   1626  HalfButterfly(int_tag, cospi[58], cospi[6], &buf0[14 * kNumLanes],
   1627                &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round);
   1628  HalfButterfly(int_tag, cospi[6], -cospi[58], &buf0[14 * kNumLanes],
   1629                &buf0[15 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round);
   1630 
   1631  // stage 9
   1632  hwy::CopyBytes<kNumBytes>(&buf1[1 * kNumLanes], &in[0 * instride]);
   1633  hwy::CopyBytes<kNumBytes>(&buf1[14 * kNumLanes], &in[1 * instride]);
   1634  hwy::CopyBytes<kNumBytes>(&buf1[3 * kNumLanes], &in[2 * instride]);
   1635  hwy::CopyBytes<kNumBytes>(&buf1[12 * kNumLanes], &in[3 * instride]);
   1636  hwy::CopyBytes<kNumBytes>(&buf1[5 * kNumLanes], &in[4 * instride]);
   1637  hwy::CopyBytes<kNumBytes>(&buf1[10 * kNumLanes], &in[5 * instride]);
   1638  hwy::CopyBytes<kNumBytes>(&buf1[7 * kNumLanes], &in[6 * instride]);
   1639  hwy::CopyBytes<kNumBytes>(&buf1[8 * kNumLanes], &in[7 * instride]);
   1640  hwy::CopyBytes<kNumBytes>(&buf1[9 * kNumLanes], &in[8 * instride]);
   1641  hwy::CopyBytes<kNumBytes>(&buf1[6 * kNumLanes], &in[9 * instride]);
   1642  hwy::CopyBytes<kNumBytes>(&buf1[11 * kNumLanes], &in[10 * instride]);
   1643  hwy::CopyBytes<kNumBytes>(&buf1[4 * kNumLanes], &in[11 * instride]);
   1644  hwy::CopyBytes<kNumBytes>(&buf1[13 * kNumLanes], &in[12 * instride]);
   1645  hwy::CopyBytes<kNumBytes>(&buf1[2 * kNumLanes], &in[13 * instride]);
   1646  hwy::CopyBytes<kNumBytes>(&buf1[15 * kNumLanes], &in[14 * instride]);
   1647  hwy::CopyBytes<kNumBytes>(&buf1[0 * kNumLanes], &in[15 * instride]);
   1648 }
   1649 
   1650 template <size_t Width, typename D>
   1651 HWY_ATTR HWY_INLINE void IdtxAdd2(D tag, hn::TFromD<D> *HWY_RESTRICT in) {
   1652  for (size_t x = 0; x < Width; x += hn::MaxLanes(tag)) {
   1653    auto v = hn::Load(tag, &in[x]);
   1654    hn::Store(hn::Add(v, v), tag, &in[x]);
   1655  }
   1656 }
   1657 
   1658 template <size_t Width, int Shift, typename D>
   1659 HWY_ATTR HWY_INLINE void IdtxShift(D tag, hn::TFromD<D> *HWY_RESTRICT in) {
   1660  for (size_t x = 0; x < Width; x += hn::MaxLanes(tag)) {
   1661    hn::Store(hn::ShiftLeft<Shift>(hn::Load(tag, &in[x])), tag, &in[x]);
   1662  }
   1663 }
   1664 
   1665 template <int Scale, typename D>
   1666 HWY_ATTR HWY_INLINE void PromoteScale2x16ByNewSqrt2(
   1667    D tag, hn::VFromD<D> v, hn::VFromD<hn::RepartitionToWide<D>> &out0,
   1668    hn::VFromD<hn::RepartitionToWide<D>> &out1) {
   1669  constexpr hn::RepartitionToWide<D> int32_tag;
   1670  auto one = hn::Set(tag, 1);
   1671  auto scale_rounding = SetPair(tag, Scale * NewSqrt2, 1 << (NewSqrt2Bits - 1));
   1672  auto a0 = hn::InterleaveLower(tag, v, one);
   1673  auto a1 = hn::InterleaveUpper(tag, v, one);
   1674  out0 = hn::ShiftRight<NewSqrt2Bits>(
   1675      hn::WidenMulPairwiseAdd(int32_tag, a0, scale_rounding));
   1676  out1 = hn::ShiftRight<NewSqrt2Bits>(
   1677      hn::WidenMulPairwiseAdd(int32_tag, a1, scale_rounding));
   1678 }
   1679 
   1680 template <size_t LaneSize, size_t NumLanes>
   1681 struct ScaleByNewSqrt2Traits {
   1682  template <int Scale, typename D>
   1683  HWY_ATTR HWY_INLINE static hn::VFromD<D> ScaleByNewSqrt2(D tag,
   1684                                                           hn::VFromD<D> v) {
   1685    auto fact = hn::Set(tag, Scale * NewSqrt2);
   1686    auto offset = hn::Set(tag, 1 << (NewSqrt2Bits - 1));
   1687    return hn::ShiftRight<NewSqrt2Bits>(hn::MulAdd(v, fact, offset));
   1688  }
   1689 };
   1690 
   1691 template <>
   1692 struct ScaleByNewSqrt2Traits<2, 4> {
   1693  template <int Scale, typename D>
   1694  HWY_ATTR HWY_INLINE static hn::VFromD<D> ScaleByNewSqrt2(D tag,
   1695                                                           hn::VFromD<D> v) {
   1696    auto one = hn::Set(tag, 1);
   1697    auto scale_rounding =
   1698        SetPair(tag, Scale * NewSqrt2, 1 << (NewSqrt2Bits - 1));
   1699    constexpr hn::Rebind<int32_t, D> int32_tag;
   1700    auto a = hn::InterleaveLower(tag, v, one);
   1701    auto b = hn::ShiftRight<NewSqrt2Bits>(
   1702        hn::WidenMulPairwiseAdd(int32_tag, a, scale_rounding));
   1703    return hn::DemoteTo(tag, b);
   1704  }
   1705 };
   1706 
   1707 template <size_t NumLanes>
   1708 struct ScaleByNewSqrt2Traits<2, NumLanes> {
   1709  template <int Scale, typename D>
   1710  HWY_ATTR HWY_INLINE static hn::VFromD<D> ScaleByNewSqrt2(D tag,
   1711                                                           hn::VFromD<D> v) {
   1712    hn::VFromD<hn::RepartitionToWide<D>> b0, b1;
   1713    PromoteScale2x16ByNewSqrt2<Scale>(tag, v, b0, b1);
   1714    return hn::ReorderDemote2To(tag, b0, b1);
   1715  }
   1716 };
   1717 
   1718 template <int Scale, typename D>
   1719 HWY_ATTR HWY_INLINE hn::VFromD<D> ScaleByNewSqrt2(D tag, hn::VFromD<D> v) {
   1720  return ScaleByNewSqrt2Traits<sizeof(hn::TFromD<D>), hn::MaxLanes(tag)>::
   1721      template ScaleByNewSqrt2<Scale>(tag, v);
   1722 }
   1723 
   1724 template <size_t Width, int Scale, typename D>
   1725 HWY_ATTR HWY_INLINE void IdtxSqrt2(D tag, hn::TFromD<D> *HWY_RESTRICT in) {
   1726  for (size_t x = 0; x < Width; x += hn::MaxLanes(tag)) {
   1727    hn::Store(ScaleByNewSqrt2<Scale>(tag, hn::Load(tag, &in[x])), tag, &in[x]);
   1728  }
   1729 }
   1730 
   1731 template <size_t Width, size_t Stride, typename T>
   1732 HWY_ATTR void FdctNx4Block(T *HWY_RESTRICT in, int8_t cos_bit) {
   1733  constexpr auto int_tag = hn::CappedTag<T, Width>();
   1734  for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) {
   1735    Fdct4(int_tag, &in[i], cos_bit, Stride);
   1736  }
   1737 }
   1738 
   1739 template <size_t Width, size_t Stride, typename T>
   1740 HWY_ATTR void FdctNx8Block(T *HWY_RESTRICT in, int8_t cos_bit) {
   1741  constexpr auto int_tag = hn::CappedTag<T, Stride>();
   1742  for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) {
   1743    Fdct8(int_tag, &in[i], cos_bit, Stride);
   1744  }
   1745 }
   1746 
   1747 template <size_t Width, size_t Stride, typename T>
   1748 HWY_ATTR void FdctNx16Block(T *HWY_RESTRICT in, int8_t cos_bit) {
   1749  constexpr auto int_tag = hn::CappedTag<T, Stride>();
   1750  for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) {
   1751    Fdct16(int_tag, &in[i], cos_bit, Stride);
   1752  }
   1753 }
   1754 
   1755 template <size_t Width, size_t Stride, typename T>
   1756 HWY_ATTR void FdctNx32Block(T *HWY_RESTRICT in, int8_t cos_bit) {
   1757  constexpr auto int_tag = hn::CappedTag<T, Stride>();
   1758  for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) {
   1759    Fdct32(int_tag, &in[i], cos_bit, Stride);
   1760  }
   1761 }
   1762 
   1763 template <size_t InWidth, size_t InStride, size_t OutWidth, size_t OutStride,
   1764          typename T>
   1765 HWY_ATTR void FdctNx64Block(T *HWY_RESTRICT in, int8_t cos_bit) {
   1766  constexpr auto int_tag = hn::CappedTag<T, InWidth>();
   1767  for (size_t i = 0; i < OutWidth; i += hn::MaxLanes(int_tag)) {
   1768    Fdct64<InStride, OutStride>(int_tag, &in[i], cos_bit);
   1769  }
   1770 }
   1771 
   1772 template <size_t Width, size_t Stride, typename T>
   1773 HWY_ATTR HWY_INLINE void FadstNx4Block(T *HWY_RESTRICT in, int8_t cos_bit) {
   1774  constexpr auto int_tag = hn::CappedTag<T, Width>();
   1775  for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) {
   1776    Fadst4<Width>(int_tag, &in[i], cos_bit, Stride);
   1777  }
   1778 }
   1779 
   1780 template <size_t Width, size_t Stride, typename T>
   1781 HWY_ATTR void FadstNx8Block(T *HWY_RESTRICT in, int8_t cos_bit) {
   1782  constexpr auto int_tag = hn::CappedTag<T, Stride>();
   1783  for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) {
   1784    Fadst8<Width>(int_tag, &in[i], cos_bit, Stride);
   1785  }
   1786 }
   1787 
   1788 template <size_t Width, size_t Stride, typename T>
   1789 HWY_ATTR void FadstNx16Block(T *HWY_RESTRICT in, int8_t cos_bit) {
   1790  constexpr auto int_tag = hn::CappedTag<T, Stride>();
   1791  for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) {
   1792    Fadst16<Width>(int_tag, &in[i], cos_bit, Stride);
   1793  }
   1794 }
   1795 
   1796 template <size_t Width, size_t Stride, size_t BlockHeight, typename T>
   1797 HWY_ATTR void IdtxAdd2Block(T *HWY_RESTRICT in, int8_t cos_bit) {
   1798  (void)cos_bit;
   1799  constexpr auto int_tag = hn::CappedTag<T, Width>();
   1800  for (size_t y = 0; y < BlockHeight; ++y) {
   1801    IdtxAdd2<Width>(int_tag, &in[y * Stride]);
   1802  }
   1803 }
   1804 
   1805 template <size_t Width, size_t Stride, size_t BlockHeight, int Scale,
   1806          typename T>
   1807 HWY_ATTR void IdtxSqrt2Block(T *HWY_RESTRICT in, int8_t cos_bit) {
   1808  (void)cos_bit;
   1809  constexpr auto int_tag = hn::CappedTag<T, Width>();
   1810  for (size_t y = 0; y < BlockHeight; ++y) {
   1811    IdtxSqrt2<Width, Scale>(int_tag, &in[y * Stride]);
   1812  }
   1813 }
   1814 
   1815 template <size_t Width, size_t Stride, size_t BlockHeight, int Shift,
   1816          typename T>
   1817 HWY_ATTR void IdtxShiftBlock(T *HWY_RESTRICT in, int8_t cos_bit) {
   1818  (void)cos_bit;
   1819  constexpr auto int_tag = hn::CappedTag<T, Width>();
   1820  for (size_t y = 0; y < BlockHeight; ++y) {
   1821    IdtxShift<Width, Shift>(int_tag, &in[y * Stride]);
   1822  }
   1823 }
   1824 
   1825 template <typename T>
   1826 void TransformFail(T *in, int8_t cos_bit) {
   1827  (void)in;
   1828  (void)cos_bit;
   1829  assert(false && "Incorrect transform requested.");
   1830 }
   1831 
   1832 template <typename T>
   1833 using Transform1D = void (*)(T *in, int8_t cos_bit);
   1834 
   1835 template <bool PositiveOrZero>
   1836 struct RoundShiftTraits {};
   1837 
   1838 template <>
   1839 struct RoundShiftTraits<true> {
   1840  template <int Bit, typename D>
   1841  HWY_ATTR HWY_INLINE static hn::VFromD<D> Shift(D int_tag,
   1842                                                 hn::VFromD<D> value) {
   1843    (void)int_tag;
   1844    if CONSTEXPR_IF (Bit == 0) {
   1845      return value;
   1846    } else {
   1847      return hn::ShiftLeft<Bit>(value);
   1848    }
   1849  }
   1850 };
   1851 
   1852 template <>
   1853 struct RoundShiftTraits<false> {
   1854  template <int Bit, typename D>
   1855  HWY_ATTR HWY_INLINE static hn::VFromD<D> Shift(D int_tag,
   1856                                                 hn::VFromD<D> value) {
   1857    const auto round = hn::Set(int_tag, 1 << (-Bit - 1));
   1858    return hn::ShiftRight<-Bit>(hn::Add(value, round));
   1859  }
   1860 };
   1861 
   1862 template <int Bit, typename D>
   1863 HWY_ATTR HWY_INLINE hn::VFromD<D> RoundShift(D int_tag, hn::VFromD<D> value) {
   1864  return RoundShiftTraits<(Bit >= 0)>::template Shift<Bit>(int_tag, value);
   1865 }
   1866 
   1867 template <bool ApplyRectScale, typename D>
   1868 HWY_ATTR HWY_INLINE hn::VFromD<D> RectScale(D int_tag, hn::VFromD<D> v) {
   1869  if CONSTEXPR_IF (ApplyRectScale) {
   1870    return ScaleByNewSqrt2<1>(int_tag, v);
   1871  }
   1872  return v;
   1873 }
   1874 
   1875 template <bool IsSame>
   1876 struct MaybePromoteTraits {};
   1877 
   1878 template <>
   1879 struct MaybePromoteTraits<true> {
   1880  template <typename VIn, typename D>
   1881  HWY_ATTR HWY_INLINE static hn::VFromD<D> PromoteTo(D out_tag, VIn in) {
   1882    (void)out_tag;
   1883    return in;
   1884  }
   1885 
   1886  template <typename VIn, typename D>
   1887  HWY_ATTR HWY_INLINE static void PromoteStore2(D int_tag, VIn v,
   1888                                                hn::TFromD<D> *out) {
   1889    hn::StoreU(v, int_tag, out);
   1890  }
   1891 };
   1892 
   1893 template <>
   1894 struct MaybePromoteTraits<false> {
   1895  template <typename VIn, typename D>
   1896  HWY_ATTR HWY_INLINE static hn::VFromD<D> PromoteTo(D out_tag, VIn in) {
   1897    return hn::PromoteTo(out_tag, in);
   1898  }
   1899 
   1900  template <typename VIn, typename TOut, typename D>
   1901  HWY_ATTR HWY_INLINE static void PromoteStore2(D int_tag, VIn v, TOut *out) {
   1902    (void)int_tag;
   1903    constexpr hn::Repartition<TOut, D> store_tag;
   1904    hn::StoreU(hn::PromoteLowerTo(store_tag, v), store_tag, out);
   1905    hn::StoreU(hn::PromoteUpperTo(store_tag, v), store_tag,
   1906               out + hn::MaxLanes(store_tag));
   1907  }
   1908 };
   1909 
   1910 template <typename VIn, typename D>
   1911 HWY_ATTR HWY_INLINE hn::VFromD<D> MaybePromoteTo(D out_tag, VIn in) {
   1912  return MaybePromoteTraits<
   1913      std::is_same<hn::TFromD<D>, hn::TFromV<VIn>>::value>::PromoteTo(out_tag,
   1914                                                                      in);
   1915 }
   1916 
   1917 template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut>
   1918 HWY_ATTR HWY_INLINE void Transpose4(const TIn *HWY_RESTRICT in,
   1919                                    TOut *HWY_RESTRICT out, size_t instride,
   1920                                    size_t outstride) {
   1921  constexpr hn::FixedTag<TIn, 4> int_tag;
   1922  auto i0 = RectScale<ApplyRectScale>(
   1923      int_tag, RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[0 * instride])));
   1924  auto i1 = RectScale<ApplyRectScale>(
   1925      int_tag, RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[1 * instride])));
   1926  auto i2 = RectScale<ApplyRectScale>(
   1927      int_tag, RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[2 * instride])));
   1928  auto i3 = RectScale<ApplyRectScale>(
   1929      int_tag, RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[3 * instride])));
   1930  HWY_ALIGN_MAX TOut interleaved[16];
   1931  constexpr hn::FixedTag<TOut, 4> out_tag;
   1932  hn::StoreInterleaved4(MaybePromoteTo(out_tag, i0),
   1933                        MaybePromoteTo(out_tag, i1),
   1934                        MaybePromoteTo(out_tag, i2),
   1935                        MaybePromoteTo(out_tag, i3), out_tag, interleaved);
   1936  for (size_t i = 0; i < 4; ++i) {
   1937    hwy::CopyBytes<hn::MaxLanes(int_tag) * sizeof(*out)>(&interleaved[i * 4],
   1938                                                         &out[i * outstride]);
   1939  }
   1940 }
   1941 
   1942 template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut>
   1943 HWY_ATTR HWY_INLINE void Transpose8(const TIn *HWY_RESTRICT in,
   1944                                    TOut *HWY_RESTRICT out, size_t instride,
   1945                                    size_t outstride) {
   1946  constexpr hn::FixedTag<TIn, 8> int_tag;
   1947  constexpr hn::Rebind<TOut, decltype(int_tag)> out_tag;
   1948  // N.B. there isn't a StoreInterleaved8, so hand-code Transpose8.
   1949  constexpr hn::RepartitionToWide<decltype(out_tag)> wide_int_tag;
   1950  HWY_ALIGN_MAX hn::TFromD<decltype(wide_int_tag)> interleaved0[16];
   1951  HWY_ALIGN_MAX hn::TFromD<decltype(wide_int_tag)> interleaved1[16];
   1952  auto i0 = hn::Load(int_tag, &in[0 * instride]);
   1953  auto i1 = hn::Load(int_tag, &in[1 * instride]);
   1954  auto i2 = hn::Load(int_tag, &in[2 * instride]);
   1955  auto i3 = hn::Load(int_tag, &in[3 * instride]);
   1956  auto i4 = hn::Load(int_tag, &in[4 * instride]);
   1957  auto i5 = hn::Load(int_tag, &in[5 * instride]);
   1958  auto i6 = hn::Load(int_tag, &in[6 * instride]);
   1959  auto i7 = hn::Load(int_tag, &in[7 * instride]);
   1960  auto s0 = hn::Undefined(out_tag);
   1961  auto s1 = hn::Undefined(out_tag);
   1962  auto s2 = hn::Undefined(out_tag);
   1963  auto s3 = hn::Undefined(out_tag);
   1964  auto s4 = hn::Undefined(out_tag);
   1965  auto s5 = hn::Undefined(out_tag);
   1966  auto s6 = hn::Undefined(out_tag);
   1967  auto s7 = hn::Undefined(out_tag);
   1968  auto ip0 = MaybePromoteTo(out_tag, i0);
   1969  auto ip1 = MaybePromoteTo(out_tag, i1);
   1970  auto ip2 = MaybePromoteTo(out_tag, i2);
   1971  auto ip3 = MaybePromoteTo(out_tag, i3);
   1972  auto ip4 = MaybePromoteTo(out_tag, i4);
   1973  auto ip5 = MaybePromoteTo(out_tag, i5);
   1974  auto ip6 = MaybePromoteTo(out_tag, i6);
   1975  auto ip7 = MaybePromoteTo(out_tag, i7);
   1976  s0 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip0));
   1977  s1 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip1));
   1978  s2 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip2));
   1979  s3 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip3));
   1980  s4 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip4));
   1981  s5 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip5));
   1982  s6 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip6));
   1983  s7 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip7));
   1984  auto u0 = hn::ZipLower(wide_int_tag, s0, s1);
   1985  auto u1 = hn::ZipUpper(wide_int_tag, s0, s1);
   1986  auto u2 = hn::ZipLower(wide_int_tag, s2, s3);
   1987  auto u3 = hn::ZipUpper(wide_int_tag, s2, s3);
   1988  auto u4 = hn::ZipLower(wide_int_tag, s4, s5);
   1989  auto u5 = hn::ZipUpper(wide_int_tag, s4, s5);
   1990  auto u6 = hn::ZipLower(wide_int_tag, s6, s7);
   1991  auto u7 = hn::ZipUpper(wide_int_tag, s6, s7);
   1992  hn::StoreInterleaved4(u0, u2, u4, u6, wide_int_tag, interleaved0);
   1993  hn::StoreInterleaved4(u1, u3, u5, u7, wide_int_tag, interleaved1);
   1994  constexpr size_t kNumBytes = hn::MaxLanes(int_tag) * sizeof(*out);
   1995  if CONSTEXPR_IF (sizeof(TOut) == 2) {
   1996    hwy::CopyBytes<kNumBytes>(&interleaved0[0], &out[0 * outstride]);
   1997    hwy::CopyBytes<kNumBytes>(&interleaved0[4], &out[1 * outstride]);
   1998    hwy::CopyBytes<kNumBytes>(&interleaved0[8], &out[2 * outstride]);
   1999    hwy::CopyBytes<kNumBytes>(&interleaved0[12], &out[3 * outstride]);
   2000    hwy::CopyBytes<kNumBytes>(&interleaved1[0], &out[4 * outstride]);
   2001    hwy::CopyBytes<kNumBytes>(&interleaved1[4], &out[5 * outstride]);
   2002    hwy::CopyBytes<kNumBytes>(&interleaved1[8], &out[6 * outstride]);
   2003    hwy::CopyBytes<kNumBytes>(&interleaved1[12], &out[7 * outstride]);
   2004  } else {
   2005    hwy::CopyBytes<kNumBytes>(&interleaved0[0], &out[0 * outstride]);
   2006    hwy::CopyBytes<kNumBytes>(&interleaved0[4], &out[1 * outstride]);
   2007    hwy::CopyBytes<kNumBytes>(&interleaved1[0], &out[2 * outstride]);
   2008    hwy::CopyBytes<kNumBytes>(&interleaved1[4], &out[3 * outstride]);
   2009    hwy::CopyBytes<kNumBytes>(&interleaved0[8], &out[4 * outstride]);
   2010    hwy::CopyBytes<kNumBytes>(&interleaved0[12], &out[5 * outstride]);
   2011    hwy::CopyBytes<kNumBytes>(&interleaved1[8], &out[6 * outstride]);
   2012    hwy::CopyBytes<kNumBytes>(&interleaved1[12], &out[7 * outstride]);
   2013  }
   2014 }
   2015 
   2016 template <typename D>
   2017 HWY_ATTR HWY_INLINE hn::VFromD<D> LocalInterleaveEvenBlocks(D tag,
   2018                                                            hn::VFromD<D> a,
   2019                                                            hn::VFromD<D> b) {
   2020  static_assert(sizeof(hn::TFromD<D>) == 8,
   2021                "LocalInterleaveEvenBlocks requires 64-bit lanes.");
   2022  HWY_ALIGN static constexpr int64_t kIndices[] = { 0, 1, 8 + 0, 8 + 1,
   2023                                                    4, 5, 8 + 4, 8 + 5 };
   2024  auto indices = hn::SetTableIndices(tag, kIndices);
   2025  return hn::TwoTablesLookupLanes(tag, a, b, indices);
   2026 }
   2027 
   2028 template <typename D>
   2029 HWY_ATTR HWY_INLINE hn::VFromD<D> LocalInterleaveOddBlocks(D tag,
   2030                                                           hn::VFromD<D> a,
   2031                                                           hn::VFromD<D> b) {
   2032  static_assert(sizeof(hn::TFromD<D>) == 8,
   2033                "LocalInterleaveOddBlocks requires 64-bit lanes.");
   2034  HWY_ALIGN static constexpr int64_t kIndices[] = { 2, 3, 8 + 2, 8 + 3,
   2035                                                    6, 7, 8 + 6, 8 + 7 };
   2036  auto indices = hn::SetTableIndices(tag, kIndices);
   2037  return hn::TwoTablesLookupLanes(tag, a, b, indices);
   2038 }
   2039 
   2040 template <size_t LaneSize>
   2041 struct Transpose16Traits {};
   2042 
   2043 template <>
   2044 struct Transpose16Traits<2> {
   2045  template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut>
   2046  HWY_ATTR HWY_INLINE static void Transpose16(const TIn *HWY_RESTRICT in,
   2047                                              TOut *HWY_RESTRICT out,
   2048                                              size_t instride,
   2049                                              size_t outstride) {
   2050    constexpr hn::FixedTag<TIn, 16> int_tag;
   2051    static_assert(hn::MaxLanes(int_tag) == 16,
   2052                  "16-bit Transpose16 requires an 16-lane int_tag");
   2053    constexpr hn::RepartitionToWide<decltype(int_tag)> wide_int_tag;
   2054    constexpr hn::RepartitionToWide<decltype(wide_int_tag)> widex2_int_tag;
   2055    HWY_ALIGN_MAX hn::TFromD<decltype(wide_int_tag)>
   2056        y[16 * hn::MaxLanes(wide_int_tag)];
   2057    HWY_ALIGN_MAX hn::TFromD<decltype(widex2_int_tag)>
   2058        z[16 * hn::MaxLanes(widex2_int_tag)];
   2059    for (size_t i = 0; i < 16; i += 2) {
   2060      auto i0 = RectScale<ApplyRectScale>(
   2061          int_tag,
   2062          RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[(i + 0) * instride])));
   2063      auto i1 = RectScale<ApplyRectScale>(
   2064          int_tag,
   2065          RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[(i + 1) * instride])));
   2066      hn::Store(hn::ZipLower(wide_int_tag, i0, i1), wide_int_tag,
   2067                &y[(i + 0) * hn::MaxLanes(wide_int_tag)]);
   2068      hn::Store(hn::ZipUpper(wide_int_tag, i0, i1), wide_int_tag,
   2069                &y[(i + 1) * hn::MaxLanes(wide_int_tag)]);
   2070    }
   2071    for (size_t i = 0; i < 16; i += 4) {
   2072      for (size_t j = 0; j < 2; ++j) {
   2073        auto i0 = hn::Load(wide_int_tag,
   2074                           &y[(i + j + 0) * hn::MaxLanes(wide_int_tag)]);
   2075        auto i2 = hn::Load(wide_int_tag,
   2076                           &y[(i + j + 2) * hn::MaxLanes(wide_int_tag)]);
   2077        hn::Store(hn::ZipLower(widex2_int_tag, i0, i2), widex2_int_tag,
   2078                  &z[(i + j + 0) * hn::MaxLanes(widex2_int_tag)]);
   2079        hn::Store(hn::ZipUpper(widex2_int_tag, i0, i2), widex2_int_tag,
   2080                  &z[(i + j + 2) * hn::MaxLanes(widex2_int_tag)]);
   2081      }
   2082    }
   2083    for (size_t i = 0; i < 16; i += 8) {
   2084      for (size_t j = 0; j < 4; ++j) {
   2085        auto i0 = hn::Load(widex2_int_tag,
   2086                           &z[(i + j + 0) * hn::MaxLanes(widex2_int_tag)]);
   2087        auto i4 = hn::Load(widex2_int_tag,
   2088                           &z[(i + j + 4) * hn::MaxLanes(widex2_int_tag)]);
   2089        hn::Store(hn::InterleaveLower(widex2_int_tag, i0, i4), widex2_int_tag,
   2090                  &z[(i + j + 0) * hn::MaxLanes(widex2_int_tag)]);
   2091        hn::Store(hn::InterleaveUpper(widex2_int_tag, i0, i4), widex2_int_tag,
   2092                  &z[(i + j + 4) * hn::MaxLanes(widex2_int_tag)]);
   2093      }
   2094    }
   2095    static constexpr size_t kStoreIndex[] = { 0, 4,  2,  6,  1, 5,  3,  7,
   2096                                              8, 12, 10, 14, 9, 13, 11, 15 };
   2097    for (size_t j = 0; j < 8; ++j) {
   2098      auto i0 =
   2099          hn::Load(widex2_int_tag, &z[(j + 0) * hn::MaxLanes(widex2_int_tag)]);
   2100      auto i8 =
   2101          hn::Load(widex2_int_tag, &z[(j + 8) * hn::MaxLanes(widex2_int_tag)]);
   2102      hn::StoreU(
   2103          hn::BitCast(int_tag, hn::ConcatLowerLower(widex2_int_tag, i8, i0)),
   2104          int_tag, &out[kStoreIndex[j + 0] * outstride]);
   2105      hn::StoreU(
   2106          hn::BitCast(int_tag, hn::ConcatUpperUpper(widex2_int_tag, i8, i0)),
   2107          int_tag, &out[kStoreIndex[j + 8] * outstride]);
   2108    }
   2109  }
   2110 };
   2111 
   2112 template <>
   2113 struct Transpose16Traits<4> {
   2114  template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut>
   2115  HWY_ATTR HWY_INLINE static void Transpose16(const TIn *HWY_RESTRICT in,
   2116                                              TOut *HWY_RESTRICT out,
   2117                                              size_t instride,
   2118                                              size_t outstride) {
   2119    constexpr hn::FixedTag<TIn, 16> int_tag;
   2120    static_assert(hn::MaxLanes(int_tag) == 16,
   2121                  "32-bit Transpose16 requires an 16-lane int_tag");
   2122    constexpr hn::RepartitionToWide<decltype(int_tag)> wide_int_tag;
   2123    HWY_ALIGN_MAX hn::TFromD<decltype(wide_int_tag)>
   2124        z[16 * hn::MaxLanes(wide_int_tag)];
   2125    for (size_t i = 0; i < 16; i += 2) {
   2126      auto i0 = RectScale<ApplyRectScale>(
   2127          int_tag,
   2128          RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[(i + 0) * instride])));
   2129      auto i1 = RectScale<ApplyRectScale>(
   2130          int_tag,
   2131          RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[(i + 1) * instride])));
   2132      hn::Store(hn::ZipLower(wide_int_tag, i0, i1), wide_int_tag,
   2133                &z[(i + 0) * hn::MaxLanes(wide_int_tag)]);
   2134      hn::Store(hn::ZipUpper(wide_int_tag, i0, i1), wide_int_tag,
   2135                &z[(i + 1) * hn::MaxLanes(wide_int_tag)]);
   2136    }
   2137    for (size_t i = 0; i < 16; i += 4) {
   2138      for (size_t j = 0; j < 2; ++j) {
   2139        auto i0 = hn::Load(wide_int_tag,
   2140                           &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]);
   2141        auto i2 = hn::Load(wide_int_tag,
   2142                           &z[(i + j + 2) * hn::MaxLanes(wide_int_tag)]);
   2143        hn::Store(hn::InterleaveLower(wide_int_tag, i0, i2), wide_int_tag,
   2144                  &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]);
   2145        hn::Store(hn::InterleaveUpper(wide_int_tag, i0, i2), wide_int_tag,
   2146                  &z[(i + j + 2) * hn::MaxLanes(wide_int_tag)]);
   2147      }
   2148    }
   2149    for (size_t i = 0; i < 16; i += 8) {
   2150      for (size_t j = 0; j < 4; ++j) {
   2151        auto i0 = hn::Load(wide_int_tag,
   2152                           &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]);
   2153        auto i4 = hn::Load(wide_int_tag,
   2154                           &z[(i + j + 4) * hn::MaxLanes(wide_int_tag)]);
   2155        hn::Store(LocalInterleaveEvenBlocks(wide_int_tag, i0, i4), wide_int_tag,
   2156                  &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]);
   2157        hn::Store(LocalInterleaveOddBlocks(wide_int_tag, i0, i4), wide_int_tag,
   2158                  &z[(i + j + 4) * hn::MaxLanes(wide_int_tag)]);
   2159      }
   2160    }
   2161    static constexpr size_t kStoreIndex[] = { 0, 2,  1, 3,  4,  6,  5,  7,
   2162                                              8, 10, 9, 11, 12, 14, 13, 15 };
   2163    for (size_t j = 0; j < 8; ++j) {
   2164      auto i0 =
   2165          hn::Load(wide_int_tag, &z[(j + 0) * hn::MaxLanes(wide_int_tag)]);
   2166      auto i8 =
   2167          hn::Load(wide_int_tag, &z[(j + 8) * hn::MaxLanes(wide_int_tag)]);
   2168      hn::StoreU(
   2169          hn::BitCast(int_tag, hn::ConcatLowerLower(wide_int_tag, i8, i0)),
   2170          int_tag, &out[kStoreIndex[j + 0] * outstride]);
   2171      hn::StoreU(
   2172          hn::BitCast(int_tag, hn::ConcatUpperUpper(wide_int_tag, i8, i0)),
   2173          int_tag, &out[kStoreIndex[j + 8] * outstride]);
   2174    }
   2175  }
   2176 };
   2177 
   2178 template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut>
   2179 HWY_ATTR HWY_INLINE void Transpose16(const TIn *HWY_RESTRICT in,
   2180                                     TOut *HWY_RESTRICT out, size_t instride,
   2181                                     size_t outstride) {
   2182  static_assert(sizeof(TOut) == sizeof(TIn),
   2183                "Transpose16 does not directly support integer promotion.");
   2184  Transpose16Traits<sizeof(TIn)>::template Transpose16<Bit, ApplyRectScale>(
   2185      in, out, instride, outstride);
   2186 }
   2187 
   2188 template <size_t NumLanes, bool RequiresPromotion>
   2189 struct TransposeTraits {};
   2190 
   2191 template <>
   2192 struct TransposeTraits<16, true> {
   2193  template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale,
   2194            typename TIn, typename TOut>
   2195  HWY_ATTR HWY_INLINE static void Transpose(const TIn *HWY_RESTRICT in,
   2196                                            TOut *HWY_RESTRICT out,
   2197                                            size_t instride, size_t outstride) {
   2198    constexpr auto int_tag =
   2199        hn::CappedTag<TOut, AOMMIN(16, AOMMIN(Width, Height))>();
   2200    constexpr hn::Rebind<TIn, decltype(int_tag)> input_tag;
   2201    HWY_ALIGN_MAX hn::TFromD<decltype(int_tag)> p[16 * hn::MaxLanes(int_tag)];
   2202    for (size_t r = 0; r < Height; r += 16) {
   2203      for (size_t c = 0; c < Width; c += 16) {
   2204        for (size_t i = 0; i < 16; ++i) {
   2205          hn::Store(
   2206              hn::PromoteTo(int_tag,
   2207                            hn::Load(input_tag, &in[(r + i) * instride + c])),
   2208              int_tag, &p[i * hn::MaxLanes(int_tag)]);
   2209        }
   2210        Transpose16<Bit, ApplyRectScale>(p, &out[c * outstride + r],
   2211                                         hn::MaxLanes(int_tag), outstride);
   2212      }
   2213    }
   2214  }
   2215 };
   2216 
   2217 template <>
   2218 struct TransposeTraits<16, false> {
   2219  template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale,
   2220            typename TIn, typename TOut>
   2221  HWY_ATTR HWY_INLINE static void Transpose(const TIn *HWY_RESTRICT in,
   2222                                            TOut *HWY_RESTRICT out,
   2223                                            size_t instride, size_t outstride) {
   2224    for (size_t r = 0; r < Height; r += 16) {
   2225      for (size_t c = 0; c < Width; c += 16) {
   2226        Transpose16<Bit, ApplyRectScale>(&in[r * instride + c],
   2227                                         &out[c * outstride + r], instride,
   2228                                         outstride);
   2229      }
   2230    }
   2231  }
   2232 };
   2233 
   2234 template <bool RequiresPromotion>
   2235 struct TransposeTraits<8, RequiresPromotion> {
   2236  template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale,
   2237            typename TIn, typename TOut>
   2238  HWY_ATTR HWY_INLINE static void Transpose(const TIn *HWY_RESTRICT in,
   2239                                            TOut *HWY_RESTRICT out,
   2240                                            size_t instride, size_t outstride) {
   2241    for (size_t r = 0; r < Height; r += 8) {
   2242      for (size_t c = 0; c < Width; c += 8) {
   2243        Transpose8<Bit, ApplyRectScale>(&in[r * instride + c],
   2244                                        &out[c * outstride + r], instride,
   2245                                        outstride);
   2246      }
   2247    }
   2248  }
   2249 };
   2250 
   2251 template <bool RequiresPromotion>
   2252 struct TransposeTraits<4, RequiresPromotion> {
   2253  template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale,
   2254            typename TIn, typename TOut>
   2255  HWY_ATTR HWY_INLINE static void Transpose(const TIn *HWY_RESTRICT in,
   2256                                            TOut *HWY_RESTRICT out,
   2257                                            size_t instride, size_t outstride) {
   2258    for (size_t r = 0; r < Height; r += 4) {
   2259      for (size_t c = 0; c < Width; c += 4) {
   2260        Transpose4<Bit, ApplyRectScale>(&in[r * instride + c],
   2261                                        &out[c * outstride + r], instride,
   2262                                        outstride);
   2263      }
   2264    }
   2265  }
   2266 };
   2267 
   2268 template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale,
   2269          typename TIn, typename TOut>
   2270 HWY_ATTR HWY_INLINE void Transpose(const TIn *HWY_RESTRICT in,
   2271                                   TOut *HWY_RESTRICT out, size_t instride,
   2272                                   size_t outstride) {
   2273  constexpr auto int_tag =
   2274      hn::CappedTag<TOut, AOMMIN(16, AOMMIN(Width, Height))>();
   2275  TransposeTraits<hn::MaxLanes(int_tag), !std::is_same<TIn, TOut>::value>::
   2276      template Transpose<Width, Height, Bit, ApplyRectScale>(in, out, instride,
   2277                                                             outstride);
   2278 }
   2279 
   2280 template <size_t Width, size_t Height, int Shift, bool ApplyRectScale,
   2281          typename TIn, typename TOut>
   2282 HWY_ATTR HWY_INLINE void StoreBlock(const TIn *HWY_RESTRICT in, size_t instride,
   2283                                    TOut *HWY_RESTRICT out, size_t outstride) {
   2284  constexpr hn::CappedTag<TIn, Width> load_tag;
   2285  for (size_t r = 0; r < Height; ++r) {
   2286    for (size_t c = 0; c < Width; c += hn::MaxLanes(load_tag)) {
   2287      auto v = RectScale<ApplyRectScale>(
   2288          load_tag, RoundShift<Shift>(
   2289                        load_tag, hn::Load(load_tag, &in[r * instride + c])));
   2290      MaybePromoteTraits<std::is_same<TIn, TOut>::value>::PromoteStore2(
   2291          load_tag, v, &out[r * outstride + c]);
   2292    }
   2293  }
   2294 }
   2295 
   2296 template <int8_t Shift, size_t Width, bool FlipLeftRight, typename TInput,
   2297          typename TIn>
   2298 HWY_ATTR HWY_INLINE void LoadLine(const TInput *HWY_RESTRICT input,
   2299                                  TIn *HWY_RESTRICT in) {
   2300  constexpr hn::CappedTag<TIn, Width> store_tag;
   2301  constexpr hn::Rebind<TInput, decltype(store_tag)> load_tag;
   2302  for (size_t x = 0; x < Width / hn::MaxLanes(load_tag); ++x) {
   2303    auto v = hn::LoadU(load_tag, &input[x * hn::MaxLanes(load_tag)]);
   2304    if CONSTEXPR_IF (FlipLeftRight) {
   2305      v = hn::Reverse(load_tag, v);
   2306    }
   2307    auto vp = MaybePromoteTo(store_tag, v);
   2308    hn::Store(
   2309        hn::ShiftLeft<Shift>(vp), store_tag,
   2310        &in[(FlipLeftRight ? (Width / hn::MaxLanes(store_tag)) - x - 1 : x) *
   2311            hn::MaxLanes(store_tag)]);
   2312  }
   2313 }
   2314 
   2315 template <int8_t Shift, size_t Width, size_t OutStride, size_t Height,
   2316          bool FlipUpDown, bool FlipLeftRight, typename TInput, typename TIn>
   2317 HWY_ATTR HWY_INLINE void LoadBuffer(const TInput *HWY_RESTRICT input,
   2318                                    TIn *HWY_RESTRICT in, size_t stride) {
   2319  for (size_t y = 0; y < Height; ++y) {
   2320    LoadLine<Shift, Width, FlipLeftRight>(
   2321        input + y * stride, &in[(FlipUpDown ? Height - y - 1 : y) * OutStride]);
   2322  }
   2323 }
   2324 
   2325 template <size_t TransformWidth, size_t BlockWidth, size_t BlockHeight,
   2326          typename T>
   2327 HWY_ATTR HWY_FLATTEN HWY_INLINE void Transform4(TX_TYPE_1D tx_type, T *in,
   2328                                                int8_t cos_bit) {
   2329  switch (tx_type) {
   2330    case DCT_1D: FdctNx4Block<TransformWidth, BlockWidth>(in, cos_bit); break;
   2331    case IDTX_1D:
   2332      IdtxSqrt2Block<TransformWidth, BlockWidth, BlockHeight, 1>(in, cos_bit);
   2333      break;
   2334    default: FadstNx4Block<TransformWidth, BlockWidth>(in, cos_bit); break;
   2335  }
   2336 }
   2337 
   2338 template <size_t TransformWidth, size_t BlockWidth, size_t BlockHeight,
   2339          typename T>
   2340 HWY_ATTR HWY_FLATTEN HWY_INLINE void Transform8(TX_TYPE_1D tx_type, T *in,
   2341                                                int8_t cos_bit) {
   2342  switch (tx_type) {
   2343    case DCT_1D: FdctNx8Block<TransformWidth, BlockWidth>(in, cos_bit); break;
   2344    case IDTX_1D:
   2345      IdtxAdd2Block<TransformWidth, BlockWidth, BlockHeight>(in, cos_bit);
   2346      break;
   2347    default: FadstNx8Block<TransformWidth, BlockWidth>(in, cos_bit); break;
   2348  }
   2349 }
   2350 
   2351 template <size_t TransformWidth, size_t BlockWidth, size_t BlockHeight,
   2352          typename T>
   2353 HWY_ATTR HWY_INLINE void Transform16(TX_TYPE_1D tx_type, T *in,
   2354                                     int8_t cos_bit) {
   2355  static const Transform1D<T> kTransform[] = {
   2356    FdctNx16Block<TransformWidth, BlockWidth, T>,   // DCT_1D
   2357    FadstNx16Block<TransformWidth, BlockWidth, T>,  // ADST_1D
   2358    FadstNx16Block<TransformWidth, BlockWidth, T>,  // FLIPADST_1D
   2359    IdtxSqrt2Block<TransformWidth, BlockWidth, BlockHeight, 2, T>,  // IDTX_1D
   2360  };
   2361  kTransform[tx_type](in, cos_bit);
   2362 }
   2363 
   2364 template <size_t TransformWidth, size_t BlockWidth, size_t BlockHeight,
   2365          typename T>
   2366 HWY_ATTR HWY_INLINE void Transform32(TX_TYPE_1D tx_type, T *in,
   2367                                     int8_t cos_bit) {
   2368  static const Transform1D<T> kTransform[] = {
   2369    FdctNx32Block<TransformWidth, BlockWidth, T>,  // DCT_1D
   2370    TransformFail<T>,                              // ADST_1D
   2371    TransformFail<T>,                              // FLIPADST_1D
   2372    IdtxShiftBlock<TransformWidth, BlockWidth, BlockHeight, 2, T>,  // IDTX_1D
   2373  };
   2374  kTransform[tx_type](in, cos_bit);
   2375 }
   2376 
   2377 template <size_t TransformWidth, size_t BlockWidth, typename T>
   2378 HWY_ATTR HWY_INLINE void TransformFull64(TX_TYPE_1D tx_type, T *in,
   2379                                         int8_t cos_bit) {
   2380  (void)tx_type;
   2381  assert(tx_type == DCT_1D);
   2382  FdctNx64Block<TransformWidth, BlockWidth, TransformWidth, BlockWidth>(
   2383      in, cos_bit);
   2384 }
   2385 
   2386 template <size_t TransformWidth, size_t BlockWidth, size_t TransformHeight,
   2387          size_t BlockHeight, typename T>
   2388 HWY_ATTR HWY_INLINE void TransformBelow32(TX_TYPE_1D tx_type, T *in,
   2389                                          int8_t cos_bit) {
   2390  if CONSTEXPR_IF (TransformHeight == 4) {
   2391    Transform4<TransformWidth, BlockWidth, BlockHeight>(tx_type, in, cos_bit);
   2392  } else if CONSTEXPR_IF (TransformHeight == 8) {
   2393    Transform8<TransformWidth, BlockWidth, BlockHeight>(tx_type, in, cos_bit);
   2394  } else if CONSTEXPR_IF (TransformHeight == 16) {
   2395    Transform16<TransformWidth, BlockWidth, BlockHeight>(tx_type, in, cos_bit);
   2396  } else if CONSTEXPR_IF (TransformHeight == 32) {
   2397    Transform32<TransformWidth, BlockWidth, BlockHeight>(tx_type, in, cos_bit);
   2398  } else {
   2399    assert(false && "Unsupported transform size.");
   2400  }
   2401 }
   2402 
   2403 template <size_t TransformWidth, size_t BlockWidth, size_t TransformHeight,
   2404          size_t BlockHeight, typename T>
   2405 HWY_ATTR HWY_INLINE void RowTransform(TX_TYPE_1D tx_type, T *in,
   2406                                      int8_t cos_bit) {
   2407  if CONSTEXPR_IF (TransformWidth == 64 && TransformHeight == 64) {
   2408    assert(tx_type == DCT_1D);
   2409    // 64x64 only writes 32x32 of coefficients.
   2410    FdctNx64Block<TransformWidth, BlockWidth, 32, 32>(in, cos_bit);
   2411  } else if CONSTEXPR_IF (TransformHeight == 64) {
   2412    TransformFull64<TransformWidth, BlockWidth>(tx_type, in, cos_bit);
   2413  } else {
   2414    TransformBelow32<TransformWidth, BlockWidth, TransformHeight, BlockHeight>(
   2415        tx_type, in, cos_bit);
   2416  }
   2417 }
   2418 
   2419 template <TX_SIZE TxSize, typename T>
   2420 HWY_ATTR HWY_MAYBE_UNUSED void ForwardTransform2D(const int16_t *input,
   2421                                                  int32_t *output,
   2422                                                  size_t stride,
   2423                                                  TX_TYPE tx_type) {
   2424  constexpr size_t kWidth = kTxSizeWide[TxSize];
   2425  constexpr size_t kHeight = kTxSizeHigh[TxSize];
   2426  // Ensure the storage is aligned to the architecture's block width.
   2427  constexpr size_t kMinVectorSize =
   2428      hn::BlockDFromD<hn::ScalableTag<T>>().MaxBytes() / sizeof(uint8_t);
   2429  constexpr size_t kBlockWidth = AOMMAX(kMinVectorSize / sizeof(T), kWidth);
   2430  constexpr size_t kBlockHeight = AOMMAX(kMinVectorSize / sizeof(T), kHeight);
   2431  HWY_ALIGN_MAX T buf0[kBlockWidth * kBlockHeight];
   2432  constexpr bool kBigRectangle = (kBlockWidth == 64 && kBlockHeight >= 32) ||
   2433                                 (kBlockWidth >= 32 && kBlockHeight == 64);
   2434  using T2 = typename std::conditional<kBigRectangle, int32_t, T>::type;
   2435  HWY_ALIGN_MAX T2 buf1[kBlockWidth * kBlockHeight];
   2436  constexpr int8_t kShift[3] = { kForwardTransformShift[TxSize][0],
   2437                                 kForwardTransformShift[TxSize][1],
   2438                                 kForwardTransformShift[TxSize][2] };
   2439  constexpr int kTransformWidthIndex = GetTxwIndex(TxSize);
   2440  constexpr int kTransformHeightIndex = GetTxhIndex(TxSize);
   2441  constexpr int8_t cos_bit_col =
   2442      kForwardCosBitCol[kTransformWidthIndex][kTransformHeightIndex];
   2443  constexpr int8_t cos_bit_row =
   2444      kForwardCosBitRow[kTransformWidthIndex][kTransformHeightIndex];
   2445  const TX_TYPE_1D vertical_transform = vtx_tab[tx_type];
   2446  const TX_TYPE_1D horizontal_transform = htx_tab[tx_type];
   2447  constexpr bool kApplyRectScale = kApplyRectScaleList[TxSize];
   2448  switch ((vertical_transform == FLIPADST_1D ? 1 : 0) |
   2449          (horizontal_transform == FLIPADST_1D ? 2 : 0)) {
   2450    case 0:
   2451      LoadBuffer<kShift[0], kWidth, kBlockWidth, kHeight, false, false>(
   2452          input, buf0, stride);
   2453      break;
   2454    case 1:
   2455      LoadBuffer<kShift[0], kWidth, kBlockWidth, kHeight, true, false>(
   2456          input, buf0, stride);
   2457      break;
   2458    case 2:
   2459      LoadBuffer<kShift[0], kWidth, kBlockWidth, kHeight, false, true>(
   2460          input, buf0, stride);
   2461      break;
   2462    case 3:
   2463      LoadBuffer<kShift[0], kWidth, kBlockWidth, kHeight, true, true>(
   2464          input, buf0, stride);
   2465      break;
   2466  }
   2467  if CONSTEXPR_IF (kHeight == 64) {
   2468    TransformFull64<kWidth, kBlockWidth>(vertical_transform, buf0, cos_bit_col);
   2469  } else {
   2470    TransformBelow32<kWidth, kBlockWidth, kHeight, kBlockHeight>(
   2471        vertical_transform, buf0, cos_bit_col);
   2472  }
   2473  Transpose<kWidth, kHeight, kShift[1], false>(buf0, buf1, kBlockWidth,
   2474                                               kBlockHeight);
   2475  if CONSTEXPR_IF (kWidth == 64 && kHeight == 64) {
   2476    // 64x64 only writes 32x32 of coefficients.
   2477    assert(tx_type == DCT_1D);
   2478    FdctNx64Block<kHeight, kBlockHeight, 32, 32>(buf1, cos_bit_row);
   2479    StoreBlock<32, 32, kShift[2], kApplyRectScale>(buf1, 32, output, 32);
   2480  } else if CONSTEXPR_IF (kHeight == 64 && (kWidth == 16 || kWidth == 32)) {
   2481    // 32x64 and 16x64 coefficients are packed into Wx32, discarding the
   2482    // right-most results.
   2483    RowTransform<32, kBlockHeight, kWidth, kBlockWidth>(horizontal_transform,
   2484                                                        buf1, cos_bit_row);
   2485    StoreBlock<kHeight, kWidth, kShift[2], kApplyRectScale>(buf1, kBlockHeight,
   2486                                                            output, 32);
   2487  } else {
   2488    RowTransform<kHeight, kBlockHeight, kWidth, kBlockWidth>(
   2489        horizontal_transform, buf1, cos_bit_row);
   2490    StoreBlock<kHeight, kWidth, kShift[2], kApplyRectScale>(buf1, kBlockHeight,
   2491                                                            output, kHeight);
   2492  }
   2493  if CONSTEXPR_IF (kHeight <= 16 && kWidth == 64) {
   2494    hwy::ZeroBytes<kHeight * 32 * sizeof(*output)>(output + kHeight * 32);
   2495  }
   2496 }
   2497 
   2498 HWY_MAYBE_UNUSED void LowBitdepthForwardTransform2D(const int16_t *src_diff,
   2499                                                    tran_low_t *coeff,
   2500                                                    int diff_stride,
   2501                                                    TxfmParam *txfm_param) {
   2502  if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
   2503    assert(txfm_param->tx_type == DCT_DCT);
   2504    av1_fwht4x4(src_diff, coeff, diff_stride);
   2505    return;
   2506  }
   2507  using TransformFunction = decltype(&ForwardTransform2D<TX_4X4, int16_t>);
   2508  constexpr TransformFunction kTable[] = {
   2509 #define POINTER(w, h, _) &ForwardTransform2D<TX_##w##X##h, int16_t>,
   2510    FOR_EACH_TXFM2D(POINTER, _)
   2511 #undef POINTER
   2512  };
   2513  kTable[txfm_param->tx_size](src_diff, coeff, diff_stride,
   2514                              txfm_param->tx_type);
   2515 }
   2516 
   2517 }  // namespace HWY_NAMESPACE
   2518 }  // namespace
   2519 
   2520 HWY_AFTER_NAMESPACE();
   2521 
   // Declares and defines the C-linkage entry point
   // av1_fwd_txfm2d_<w>x<h>_<suffix>. It forwards to
   // HWY_NAMESPACE::ForwardTransform2D<TX_<w>X<h>, int32_t>; |bd| is accepted
   // but ignored.
   #define MAKE_HIGHBD_TXFM2D(w, h, suffix)                                \
     extern "C" void av1_fwd_txfm2d_##w##x##h##_##suffix(                  \
         const int16_t *input, int32_t *output, int stride,                \
         TX_TYPE tx_type, int bd);                                         \
     HWY_ATTR void av1_fwd_txfm2d_##w##x##h##_##suffix(                    \
         const int16_t *input, int32_t *output, int stride,                \
         TX_TYPE tx_type, int bd) {                                        \
       (void)bd;                                                           \
       HWY_NAMESPACE::ForwardTransform2D<TX_##w##X##h, int32_t>(           \
           input, output, static_cast<size_t>(stride), tx_type);           \
     }
   2533 
   // Declares and defines the C-linkage entry point
   // av1_lowbd_fwd_txfm2d_<w>x<h>_<suffix>. It forwards to
   // HWY_NAMESPACE::ForwardTransform2D<TX_<w>X<h>, int16_t>; |bd| is accepted
   // but ignored.
   #define MAKE_LOWBD_TXFM2D(w, h, suffix)                                 \
     extern "C" void av1_lowbd_fwd_txfm2d_##w##x##h##_##suffix(            \
         const int16_t *input, int32_t *output, int stride,                \
         TX_TYPE tx_type, int bd);                                         \
     HWY_ATTR void av1_lowbd_fwd_txfm2d_##w##x##h##_##suffix(              \
         const int16_t *input, int32_t *output, int stride,                \
         TX_TYPE tx_type, int bd) {                                        \
       (void)bd;                                                           \
       HWY_NAMESPACE::ForwardTransform2D<TX_##w##X##h, int16_t>(           \
           input, output, static_cast<size_t>(stride), tx_type);           \
     }
   2545 
   // Declares and defines the C-linkage entry point
   // av1_lowbd_fwd_txfm_<suffix>, which forwards all of its arguments to
   // HWY_NAMESPACE::LowBitdepthForwardTransform2D.
   #define MAKE_LOWBD_TXFM2D_DISPATCH(suffix)                              \
     extern "C" void av1_lowbd_fwd_txfm_##suffix(                          \
         const int16_t *src_diff, tran_low_t *coeff, int diff_stride,      \
         TxfmParam *txfm_param);                                           \
     HWY_ATTR void av1_lowbd_fwd_txfm_##suffix(                            \
         const int16_t *src_diff, tran_low_t *coeff, int diff_stride,      \
         TxfmParam *txfm_param) {                                          \
       HWY_NAMESPACE::LowBitdepthForwardTransform2D(                       \
           src_diff, coeff, diff_stride, txfm_param);                      \
     }
   2556 
   2557 #endif  // AOM_AV1_ENCODER_AV1_FWD_TXFM2D_HWY_H_