av1_fwd_txfm2d_hwy.h (120844B)
/*
 * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM2D_HWY_H_
#define AOM_AV1_ENCODER_AV1_FWD_TXFM2D_HWY_H_

#include <stdint.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "third_party/highway/hwy/highway.h"
#include "aom_dsp/txfm_common.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/enums.h"
#include "av1/encoder/av1_fwd_txfm1d.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"

// X-macro: expands X(a, b, suffix) once for each of the 19 size pairs below.
// The pairs are listed in the same order as kTxSizeWide / kTxSizeHigh, so
// (a, b) is presumably (width, height) -- confirm against the X callbacks.
#define FOR_EACH_TXFM2D(X, suffix) \
  X(4, 4, suffix)                  \
  X(8, 8, suffix)                  \
  X(16, 16, suffix)                \
  X(32, 32, suffix)                \
  X(64, 64, suffix)                \
  X(4, 8, suffix)                  \
  X(8, 4, suffix)                  \
  X(8, 16, suffix)                 \
  X(16, 8, suffix)                 \
  X(16, 32, suffix)                \
  X(32, 16, suffix)                \
  X(32, 64, suffix)                \
  X(64, 32, suffix)                \
  X(4, 16, suffix)                 \
  X(16, 4, suffix)                 \
  X(8, 32, suffix)                 \
  X(32, 8, suffix)                 \
  X(16, 64, suffix)                \
  X(64, 16, suffix)

// `if CONSTEXPR_IF (cond)` becomes `if constexpr (cond)` when the compiler
// offers C++17 and degrades to a plain `if` otherwise.
#if HWY_CXX_LANG >= 201703L
#define CONSTEXPR_IF constexpr
#else
#define CONSTEXPR_IF
#endif

HWY_BEFORE_NAMESPACE();

namespace {
namespace HWY_NAMESPACE {

namespace hn = hwy::HWY_NAMESPACE;

// Three shift amounts per transform size. Row i belongs to the size whose
// dimensions are kTxSizeWide[i] x kTxSizeHigh[i] (annotated as WxH below).
// NOTE(review): the consumers of this table are outside this part of the
// file; presumably the three entries are per-stage shifts -- confirm.
constexpr int8_t kForwardTransformShift[TX_SIZES_ALL][3] = {
  { 2, 0, 0 },    // 4x4
  { 2, -1, 0 },   // 8x8
  { 2, -2, 0 },   // 16x16
  { 2, -4, 0 },   // 32x32
  { 0, -2, -2 },  // 64x64
  { 2, -1, 0 },   // 4x8
  { 2, -1, 0 },   // 8x4
  { 2, -2, 0 },   // 8x16
  { 2, -2, 0 },   // 16x8
  { 2, -4, 0 },   // 16x32
  { 2, -4, 0 },   // 32x16
  { 0, -2, -2 },  // 32x64
  { 2, -4, -2 },  // 64x32
  { 2, -1, 0 },   // 4x16
  { 2, -1, 0 },   // 16x4
  { 2, -2, 0 },   // 8x32
  { 2, -2, 0 },   // 32x8
  { 0, -2, 0 },   // 16x64
  { 2, -4, 0 },   // 64x16
};

// Transform block width in log2
constexpr int kTxSizeWideLog2[TX_SIZES_ALL] = {
  2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6,
};

// Transform block height in log2
constexpr int kTxSizeHighLog2[TX_SIZES_ALL] = {
  2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4,
};

// True exactly for the sizes whose width and height differ by a factor of two
// (indices 5..12 in kTxSizeWide / kTxSizeHigh); false for square and 4:1 sizes.
constexpr bool kApplyRectScaleList[TX_SIZES_ALL] = {
  false, false, false, false, false, true,  true,  true,  true,  true,
  true,  true,  true,  false, false, false, false, false, false,
};

// Column-pass cos_bit, indexed by [GetTxwIndex()][GetTxhIndex()]. Zero entries
// sit at width/height combinations that have no row in kTxSizeWide/High.
constexpr int8_t kForwardCosBitCol[MAX_TXWH_IDX /*txw_idx*/]
                                  [MAX_TXWH_IDX /*txh_idx*/] = {
                                    { 13, 13, 13, 0, 0 },
                                    { 13, 13, 13, 12, 0 },
                                    { 13, 13, 13, 12, 13 },
                                    { 0, 13, 13, 12, 13 },
                                    { 0, 0, 13, 12, 13 }
                                  };

// Row-pass cos_bit, same indexing as kForwardCosBitCol.
constexpr int8_t kForwardCosBitRow[MAX_TXWH_IDX /*txw_idx*/]
                                  [MAX_TXWH_IDX /*txh_idx*/] = {
                                    { 13, 13, 12, 0, 0 },
                                    { 13, 13, 13, 12, 0 },
                                    { 13, 13, 12, 13, 12 },
                                    { 0, 12, 13, 12, 11 },
                                    { 0, 0, 12, 11, 10 }
                                  };

// Transform block width in pixels
constexpr int8_t kTxSizeWide[TX_SIZES_ALL] = {
  4, 8, 16, 32, 64, 4, 8, 8, 16, 16, 32, 32, 64, 4, 16, 8, 32, 16, 64,
};

// Transform block height in pixels
constexpr int8_t kTxSizeHigh[TX_SIZES_ALL] = {
  4, 8, 16, 32, 64, 8, 4, 16, 8, 32, 16, 64, 32, 16, 4, 32, 8, 64, 16,
};

// Zero-based width index of tx_size: log2(width) minus log2 of the smallest
// width (entry 0 of kTxSizeWideLog2, i.e. 2).
constexpr int GetTxwIndex(TX_SIZE tx_size) {
  return kTxSizeWideLog2[tx_size] - kTxSizeWideLog2[0];
}

// Zero-based height index of tx_size, computed like GetTxwIndex().
constexpr int GetTxhIndex(TX_SIZE tx_size) {
  return kTxSizeHighLog2[tx_size] - kTxSizeHighLog2[0];
}

// Returns a vector of type D in which every 32-bit element holds the low
// 16 bits of `a` in its low half and the low 16 bits of `b` in its high half.
// The value is broadcast at the wide (RepartitionToWide<D>) lane width and
// then bit-cast back to D.
template <typename D>
HWY_ATTR HWY_INLINE hn::VFromD<D> SetPair(D int_tag, int a, int b) {
  return hn::BitCast(
      int_tag,
      hn::Set(hn::RepartitionToWide<D>(),
              static_cast<int32_t>(
                  static_cast<uint16_t>(a) |
                  (static_cast<uint32_t>(static_cast<uint16_t>(b)) << 16))));
}

template <size_t
LaneSize>
struct ButterflyTraits {};  // Primary template: empty; see specialisations.

// Butterfly kernels for 2-byte lanes.
// Products are formed at 32-bit width with WidenMulPairwiseAdd and narrowed
// back to the 2-byte lane type with ReorderDemote2To.
template <>
struct ButterflyTraits<2> {
  // Two-output butterfly, one vector per operand:
  //   out0 = (in0 * w0 + in1 * w1 + round) >> bit
  //   out1 = (in0 * w1 - in1 * w0 + round) >> bit
  // `round` is supplied by the caller as a 32-bit vector.
  template <typename D>
  HWY_ATTR HWY_INLINE static void Whole(
      D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0,
      const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out0,
      hn::TFromD<D> *HWY_RESTRICT out1, int bit,
      hn::VFromD<hn::Repartition<int32_t, D>> round) {
    constexpr hn::RepartitionToWide<D> int32_tag;
    // Weight pairs laid out to match the interleaved (in0, in1) inputs.
    const auto ww0 = SetPair(int_tag, w0, w1);
    const auto ww1 = SetPair(int_tag, w1, -w0);
    const auto i0 = hn::Load(int_tag, in0);
    const auto i1 = hn::Load(int_tag, in1);
    // Pair up matching lanes of in0 and in1 so that one pairwise
    // multiply-add yields in0 * wa + in1 * wb per 32-bit element.
    const auto t0 = hn::InterleaveLower(int_tag, i0, i1);
    const auto t1 = hn::InterleaveUpper(int_tag, i0, i1);
    const auto u0 = hn::WidenMulPairwiseAdd(int32_tag, t0, ww0);
    const auto u1 = hn::WidenMulPairwiseAdd(int32_tag, t1, ww0);
    const auto v0 = hn::WidenMulPairwiseAdd(int32_tag, t0, ww1);
    const auto v1 = hn::WidenMulPairwiseAdd(int32_tag, t1, ww1);
    const auto c0 = hn::ShiftRightSame(hn::Add(u0, round), bit);
    const auto c1 = hn::ShiftRightSame(hn::Add(u1, round), bit);
    const auto d0 = hn::ShiftRightSame(hn::Add(v0, round), bit);
    const auto d1 = hn::ShiftRightSame(hn::Add(v1, round), bit);
    hn::Store(hn::ReorderDemote2To(int_tag, c0, c1), int_tag, out0);
    hn::Store(hn::ReorderDemote2To(int_tag, d0, d1), int_tag, out1);
  }

  // Single-output variant: computes only
  //   out = (in0 * w0 + in1 * w1 + round) >> bit
  template <typename D>
  HWY_ATTR HWY_INLINE static void Half(
      D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0,
      const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out,
      int bit, hn::VFromD<hn::Repartition<int32_t, D>> round) {
    constexpr hn::RepartitionToWide<D> int32_tag;
    const auto i0 = hn::Load(int_tag, in0);
    const auto i1 = hn::Load(int_tag, in1);
    const auto t0 = hn::InterleaveLower(int_tag, i0, i1);
    const auto t1 = hn::InterleaveUpper(int_tag, i0, i1);
    const auto ww0 = SetPair(int_tag, w0, w1);
    const auto u0 = hn::WidenMulPairwiseAdd(int32_tag, t0, ww0);
    const auto u1 = hn::WidenMulPairwiseAdd(int32_tag, t1, ww0);
    const auto c0 = hn::ShiftRightSame(hn::Add(u0, round), bit);
    const auto c1 = hn::ShiftRightSame(hn::Add(u1, round), bit);
    hn::Store(hn::ReorderDemote2To(int_tag, c0, c1), int_tag, out);
  }
};

// Butterfly kernels for 4-byte lanes.
// Everything is computed directly at lane width with Mul / MulAdd / MulSub;
// there is no widening or demotion step.
template <>
struct ButterflyTraits<4> {
  // Two-output butterfly:
  //   out0 = (in0 * w0 + in1 * w1 + round) >> bit
  //   out1 = (in0 * w1 - in1 * w0 + round) >> bit
  template <typename D>
  HWY_ATTR HWY_INLINE static void Whole(
      D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0,
      const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out0,
      hn::TFromD<D> *HWY_RESTRICT out1, int bit,
      hn::VFromD<hn::Repartition<int32_t, D>> round) {
    const auto i0 = hn::Load(int_tag, in0);
    const auto i1 = hn::Load(int_tag, in1);
    const auto ww0 = hn::Set(int_tag, w0);
    const auto ww1 = hn::Set(int_tag, w1);
    const auto in1_w1 = hn::Mul(i1, ww1);
    const auto o0 = hn::MulAdd(i0, ww0, in1_w1);
    hn::Store(hn::ShiftRightSame(hn::Add(o0, round), bit), int_tag, out0);
    const auto in1_w0 = hn::Mul(i1, ww0);
    const auto o1 = hn::MulSub(i0, ww1, in1_w0);
    hn::Store(hn::ShiftRightSame(hn::Add(o1, round), bit), int_tag, out1);
  }

  // Single-output variant: out = (in0 * w0 + in1 * w1 + round) >> bit.
  template <typename D>
  HWY_ATTR HWY_INLINE static void Half(
      D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0,
      const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out,
      int bit, hn::VFromD<hn::Repartition<int32_t, D>> round) {
    const auto i0 = hn::Load(int_tag, in0);
    const auto i1 = hn::Load(int_tag, in1);
    const auto ww0 = hn::Set(int_tag, w0);
    const auto ww1 = hn::Set(int_tag, w1);
    const auto in1_w1 = hn::Mul(i1, ww1);
    const auto o0 = hn::MulAdd(i0, ww0, in1_w1);
    hn::Store(hn::ShiftRightSame(hn::Add(o0, round), bit), int_tag, out);
  }
};

// Two-output butterfly on one vector per operand; dispatches on the lane size
// of D to ButterflyTraits<...>::Whole.
template <typename D>
HWY_ATTR HWY_INLINE void Butterfly(
    D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0,
const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out0,
    hn::TFromD<D> *HWY_RESTRICT out1, int bit,
    hn::VFromD<hn::Repartition<int32_t, D>> round) {
  ButterflyTraits<sizeof(hn::TFromD<D>)>::Whole(int_tag, w0, w1, in0, in1, out0,
                                                out1, bit, round);
}

// Single-output butterfly; dispatches on the lane size of D to
// ButterflyTraits<...>::Half.
template <typename D>
HWY_ATTR HWY_INLINE void HalfButterfly(
    D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0,
    const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out,
    int bit, hn::VFromD<hn::Repartition<int32_t, D>> round) {
  ButterflyTraits<sizeof(hn::TFromD<D>)>::Half(int_tag, w0, w1, in0, in1, out,
                                               bit, round);
}

// Stores in0 + in1 to out_add and in0 - in1 to out_sub, one vector each.
// 2-byte lanes use saturating arithmetic; other lane sizes use plain
// (wrapping) Add / Sub.
// The pointers are deliberately not HWY_RESTRICT: both inputs are loaded
// before anything is stored, so an output may alias an input.
template <typename D>
HWY_ATTR HWY_INLINE void AddSub(D int_tag, const hn::TFromD<D> *in0,
                                const hn::TFromD<D> *in1,
                                hn::TFromD<D> *out_add,
                                hn::TFromD<D> *out_sub) {
  const auto i0 = hn::Load(int_tag, in0);
  const auto i1 = hn::Load(int_tag, in1);
  if CONSTEXPR_IF (sizeof(hn::TFromD<D>) == 2) {
    hn::Store(hn::SaturatedAdd(i0, i1), int_tag, out_add);
    hn::Store(hn::SaturatedSub(i0, i1), int_tag, out_sub);
  } else {
    hn::Store(hn::Add(i0, i1), int_tag, out_add);
    hn::Store(hn::Sub(i0, i1), int_tag, out_sub);
  }
}

// Generic 4-point kernel, used for every (lane size, lane count) combination
// that has no dedicated specialisation.
template <size_t LaneSize, size_t NumLanes>
struct Fdct4Traits {
  // In-place 4-point transform over rows in[0], in[instride], in[2 * instride]
  // and in[3 * instride], one vector per row. cos_bit selects the cospi_arr()
  // table and is also the rounding shift.
  template <typename D>
  HWY_ATTR HWY_INLINE static void Fdct4(D int_tag,
                                        hn::TFromD<D> *HWY_RESTRICT in,
                                        const int8_t cos_bit, size_t instride) {
    using T = hn::TFromD<D>;
    constexpr size_t kNumLanes = hn::MaxLanes(int_tag);
    HWY_ALIGN_MAX T buf0[4 * kNumLanes];
    const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit);
    constexpr hn::Repartition<int32_t, D> int32_tag;
    // Rounding term: half of 2^cos_bit.
    const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1));
    // Sums of mirrored rows go to buf0[0..1], differences to buf0[3..2].
    AddSub(int_tag, &in[0 * instride], &in[3 * instride], &buf0[0 * kNumLanes],
           &buf0[3 * kNumLanes]);
    AddSub(int_tag, &in[1 * instride], &in[2 * instride], &buf0[1 * kNumLanes],
           &buf0[2 *
kNumLanes]);
    // Sums produce output rows 0 and 2, differences produce rows 1 and 3.
    Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes],
              &buf0[1 * kNumLanes], &in[0 * instride], &in[2 * instride],
              cos_bit, round);
    Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes],
              &buf0[2 * kNumLanes], &in[1 * instride], &in[3 * instride],
              cos_bit, round);
  }
};

// 4-point kernel for 2-byte lanes with a 4-lane tag.
// Rows are interleaved pairwise so that all four rows are handled with two
// pairwise multiply-adds per output pair instead of separate AddSub and
// Butterfly calls.
// NOTE(review): this path uses plain Add / Sub for the first stage, whereas
// the generic path goes through AddSub(), which saturates for 2-byte lanes --
// confirm the difference is intentional.
template <>
struct Fdct4Traits<2, 4> {
  template <typename D>
  HWY_ATTR HWY_INLINE static void Fdct4(D int_tag,
                                        hn::TFromD<D> *HWY_RESTRICT in,
                                        const int8_t cos_bit, size_t instride) {
    const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit);
    // 8-lane tag used for the narrowed results and the stores.
    constexpr hn::FixedTag<hn::TFromD<D>, 8> demote_tag;
    constexpr hn::Repartition<int32_t, decltype(demote_tag)> int32_tag;
    const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1));
    const auto cospi_p32_p32 = SetPair(int_tag, cospi[32], cospi[32]);
    const auto cospi_p32_m32 = SetPair(int_tag, cospi[32], -cospi[32]);
    const auto cospi_p16_p48 = SetPair(int_tag, cospi[16], cospi[48]);
    const auto cospi_p48_m16 = SetPair(int_tag, cospi[48], -cospi[16]);
    const auto i0 = hn::Load(int_tag, &in[0 * instride]);
    const auto i1 = hn::Load(int_tag, &in[1 * instride]);
    const auto i2 = hn::Load(int_tag, &in[2 * instride]);
    const auto i3 = hn::Load(int_tag, &in[3 * instride]);
    // u0 pairs rows (0, 1) and u1 pairs rows (3, 2), so v0 / v1 hold the
    // mirrored sums / differences as adjacent pairs.
    const auto u0 = hn::InterleaveLower(int_tag, i0, i1);
    const auto u1 = hn::InterleaveLower(int_tag, i3, i2);
    const auto v0 = hn::Add(u0, u1);
    const auto v1 = hn::Sub(u0, u1);
    const auto x0 = hn::WidenMulPairwiseAdd(int32_tag, v0, cospi_p32_p32);
    const auto x1 = hn::WidenMulPairwiseAdd(int32_tag, v0, cospi_p32_m32);
    const auto x2 = hn::WidenMulPairwiseAdd(int32_tag, v1, cospi_p16_p48);
    const auto x3 = hn::WidenMulPairwiseAdd(int32_tag, v1, cospi_p48_m16);
    const auto v0w0 = hn::ShiftRightSame(hn::Add(x0, round), cos_bit);
    const auto v0w1 = hn::ShiftRightSame(hn::Add(x1, round), cos_bit);
    const auto v1w0 = hn::ShiftRightSame(hn::Add(x2, round), cos_bit);
    const auto v1w1 = hn::ShiftRightSame(hn::Add(x3, round), cos_bit);
    const auto o0 = hn::ReorderDemote2To(demote_tag, v0w0, v0w1);
    const auto o1 = hn::ReorderDemote2To(demote_tag, v1w0, v1w1);
    // Rows 0 and 1 take the vectors as-is; rows 2 and 3 take them shifted
    // down by four lanes.
    // NOTE(review): each Store writes a full 8-lane vector at a row start;
    // presumably the caller's buffer tolerates the extra lanes -- confirm.
    hn::Store(o0, demote_tag, &in[0 * instride]);
    hn::Store(o1, demote_tag, &in[1 * instride]);
    hn::Store(hn::ShiftRightLanes<4>(demote_tag, o0), demote_tag,
              &in[2 * instride]);
    hn::Store(hn::ShiftRightLanes<4>(demote_tag, o1), demote_tag,
              &in[3 * instride]);
  }
};

// In-place 4-point transform; picks the Fdct4Traits implementation from the
// lane size and maximum lane count of D.
template <typename D>
HWY_ATTR HWY_INLINE void Fdct4(D int_tag, hn::TFromD<D> *HWY_RESTRICT in,
                               const int8_t cos_bit, size_t instride) {
  Fdct4Traits<sizeof(hn::TFromD<D>), hn::MaxLanes(int_tag)>::Fdct4(
      int_tag, in, cos_bit, instride);
}

// In-place 8-point transform over rows in[k * instride], k = 0..7, one vector
// per row. cos_bit selects the cospi_arr() table and is also the rounding
// shift. Intermediate values ping-pong between the stack buffers buf0 and
// buf1; the final butterflies write straight back into `in`.
template <typename D>
HWY_ATTR HWY_INLINE void Fdct8(D int_tag, hn::TFromD<D> *HWY_RESTRICT in,
                               const int8_t cos_bit, size_t instride) {
  constexpr size_t kNumLanes = hn::MaxLanes(int_tag);
  HWY_ALIGN_MAX hn::TFromD<D> buf0[8 * kNumLanes];
  HWY_ALIGN_MAX hn::TFromD<D> buf1[8 * kNumLanes];
  const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit);
  const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1));

  // stage 0
  // stage 1: add/sub of mirrored rows k and 7 - k. The "bufA/bufB" tags name
  // the buffers that receive the sum and the difference respectively.
  // buf0/buf1
  AddSub(int_tag, &in[0 * instride], &in[7 * instride], &buf0[0 * kNumLanes],
         &buf1[7 * kNumLanes]);
  // buf0/buf0
  AddSub(int_tag, &in[1 * instride], &in[6 * instride], &buf0[1 * kNumLanes],
         &buf0[6 * kNumLanes]);
  // buf0/buf0
  AddSub(int_tag, &in[2 * instride], &in[5 * instride], &buf0[2 * kNumLanes],
         &buf0[5 * kNumLanes]);
  // buf0/buf1
  AddSub(int_tag, &in[3 * instride], &in[4 * instride], &buf0[3 * kNumLanes],
         &buf1[4 * kNumLanes]);

  // stage 2
  for (size_t i = 0; i < 2; ++i) {
    AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(3 - i) * kNumLanes],
           &buf1[i * kNumLanes],
&buf1[(3 - i) * kNumLanes]);
  }

  Butterfly(int_tag, -cospi[32], cospi[32], &buf0[5 * kNumLanes],
            &buf0[6 * kNumLanes], &buf1[5 * kNumLanes], &buf1[6 * kNumLanes],
            cos_bit, round);

  // stage 3
  // type 0
  Butterfly(int_tag, cospi[32], cospi[32], &buf1[0 * kNumLanes],
            &buf1[1 * kNumLanes], &in[0 * instride], &in[4 * instride], cos_bit,
            round);

  // type 1
  Butterfly(int_tag, cospi[16], cospi[48], &buf1[3 * kNumLanes],
            &buf1[2 * kNumLanes], &in[2 * instride], &in[6 * instride], cos_bit,
            round);

  AddSub(int_tag, &buf1[4 * kNumLanes], &buf1[5 * kNumLanes],
         &buf0[4 * kNumLanes], &buf0[5 * kNumLanes]);
  AddSub(int_tag, &buf1[7 * kNumLanes], &buf1[6 * kNumLanes],
         &buf0[7 * kNumLanes], &buf0[6 * kNumLanes]);

  // stage 4
  // stage 5: the remaining butterflies write output rows 1, 7, 5 and 3.
  Butterfly(int_tag, cospi[8], cospi[56], &buf0[7 * kNumLanes],
            &buf0[4 * kNumLanes], &in[1 * instride], &in[7 * instride], cos_bit,
            round);
  Butterfly(int_tag, cospi[40], cospi[24], &buf0[6 * kNumLanes],
            &buf0[5 * kNumLanes], &in[5 * instride], &in[3 * instride], cos_bit,
            round);
}

// In-place 16-point transform over rows in[k * instride], k = 0..15, one
// vector per row. cos_bit selects the cospi_arr() table and is also the
// rounding shift. Intermediate values live in the stack buffers buf0 / buf1;
// several AddSub calls update a buffer row in place, which is safe because
// AddSub loads both inputs before it stores. Butterflies always read from one
// buffer and write to the other buffer or to `in`.
template <typename D>
HWY_ATTR HWY_INLINE void Fdct16(D int_tag, hn::TFromD<D> *HWY_RESTRICT in,
                                const int8_t cos_bit, size_t instride) {
  constexpr size_t kNumLanes = hn::MaxLanes(int_tag);
  HWY_ALIGN_MAX hn::TFromD<D> buf0[16 * kNumLanes];
  HWY_ALIGN_MAX hn::TFromD<D> buf1[16 * kNumLanes];
  const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit);
  const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1));

  // stage 0
  // stage 1: add/sub of mirrored rows i and 15 - i.
  for (size_t i = 0; i < 8; ++i) {
    AddSub(int_tag, &in[i * instride], &in[(15 - i) * instride],
           &buf0[i * kNumLanes], &buf0[(15 - i) * kNumLanes]);
  }

  // stage 2
  for (size_t i = 0; i < 4; ++i) {
    AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(7 - i) * kNumLanes],
           &buf1[i * kNumLanes], &buf1[(7 - i) * kNumLanes]);
  }

  Butterfly(int_tag, -cospi[32], cospi[32], &buf0[10 * kNumLanes],
            &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes],
            cos_bit, round);
  Butterfly(int_tag, -cospi[32], cospi[32], &buf0[11 * kNumLanes],
            &buf0[12 * kNumLanes], &buf1[11 * kNumLanes], &buf1[12 * kNumLanes],
            cos_bit, round);

  // stage 3
  for (size_t i = 0; i < 2; ++i) {
    AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(3 - i) * kNumLanes],
           &buf0[i * kNumLanes], &buf0[(3 - i) * kNumLanes]);
  }

  Butterfly(int_tag, -cospi[32], cospi[32], &buf1[5 * kNumLanes],
            &buf1[6 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes],
            cos_bit, round);

  // In-place on buf0: rows 8..9 and 14..15 still hold stage-1 values, rows
  // 10..13 of buf1 hold the stage-2 butterfly outputs.
  for (size_t i = 0; i < 2; ++i) {
    AddSub(int_tag, &buf0[(8 + i) * kNumLanes], &buf1[(11 - i) * kNumLanes],
           &buf0[(8 + i) * kNumLanes], &buf0[(11 - i) * kNumLanes]);
  }
  for (size_t i = 0; i < 2; ++i) {
    AddSub(int_tag, &buf0[(15 - i) * kNumLanes], &buf1[(12 + i) * kNumLanes],
           &buf0[(15 - i) * kNumLanes], &buf0[(12 + i) * kNumLanes]);
  }

  // stage 4
  Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes],
            &buf0[1 * kNumLanes], &in[0 * instride], &in[8 * instride], cos_bit,
            round);

  Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes],
            &buf0[2 * kNumLanes], &in[4 * instride], &in[12 * instride],
            cos_bit, round);

  AddSub(int_tag, &buf1[4 * kNumLanes], &buf0[5 * kNumLanes],
         &buf1[4 * kNumLanes], &buf1[5 * kNumLanes]);
  AddSub(int_tag, &buf1[7 * kNumLanes], &buf0[6 * kNumLanes],
         &buf1[7 * kNumLanes], &buf1[6 * kNumLanes]);

  Butterfly(int_tag, -cospi[16], cospi[48], &buf0[9 * kNumLanes],
            &buf0[14 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes],
            cos_bit, round);
  Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[10 * kNumLanes],
            &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes],
            cos_bit, round);

  // stage 5
  Butterfly(int_tag, cospi[8], cospi[56], &buf1[7 * kNumLanes],
            &buf1[4 * kNumLanes], &in[2 * instride], &in[14 * instride],
            cos_bit, round);
  Butterfly(int_tag, cospi[40], cospi[24], &buf1[6 * kNumLanes],
            &buf1[5 * kNumLanes], &in[10 * instride], &in[6 * instride],
            cos_bit, round);

  AddSub(int_tag, &buf0[8 * kNumLanes], &buf1[9 * kNumLanes],
         &buf0[8 * kNumLanes], &buf0[9 * kNumLanes]);
  AddSub(int_tag, &buf0[11 * kNumLanes], &buf1[10 * kNumLanes],
         &buf0[11 * kNumLanes], &buf0[10 * kNumLanes]);
  AddSub(int_tag, &buf0[12 * kNumLanes], &buf1[13 * kNumLanes],
         &buf0[12 * kNumLanes], &buf0[13 * kNumLanes]);
  AddSub(int_tag, &buf0[15 * kNumLanes], &buf1[14 * kNumLanes],
         &buf0[15 * kNumLanes], &buf0[14 * kNumLanes]);

  // stage 6: the remaining butterflies write the odd-numbered output rows.
  Butterfly(int_tag, cospi[4], cospi[60], &buf0[15 * kNumLanes],
            &buf0[8 * kNumLanes], &in[1 * instride], &in[15 * instride],
            cos_bit, round);
  Butterfly(int_tag, cospi[36], cospi[28], &buf0[14 * kNumLanes],
            &buf0[9 * kNumLanes], &in[9 * instride], &in[7 * instride], cos_bit,
            round);
  Butterfly(int_tag, cospi[20], cospi[44], &buf0[13 * kNumLanes],
            &buf0[10 * kNumLanes], &in[5 * instride], &in[11 * instride],
            cos_bit, round);
  Butterfly(int_tag, cospi[52], cospi[12], &buf0[12 * kNumLanes],
            &buf0[11 * kNumLanes], &in[13 * instride], &in[3 * instride],
            cos_bit, round);
}

// In-place 32-point transform over rows in[k * instride], k = 0..31, one
// vector per row. cos_bit selects the cospi_arr() table and is also the
// rounding shift. Intermediate values live in the stack buffers buf0 / buf1
// and the final butterflies of each branch write straight back into `in`.
template <typename D>
HWY_ATTR HWY_INLINE void Fdct32(D int_tag, hn::TFromD<D> *HWY_RESTRICT in,
                                const int8_t cos_bit, size_t instride) {
  constexpr size_t kNumLanes = hn::MaxLanes(int_tag);
  HWY_ALIGN_MAX hn::TFromD<D> buf0[32 * kNumLanes];
  HWY_ALIGN_MAX hn::TFromD<D> buf1[32 * kNumLanes];
  const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit);
  const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1));
  // stage 0
  // stage 1: add/sub of mirrored rows i and 31 - i.
  for (size_t i = 0; i < 16; ++i) {
    AddSub(int_tag, &in[i * instride], &in[(31 - i) * instride],
&buf1[i * kNumLanes], &buf1[(31 - i) * kNumLanes]);
  }

  // stage 2
  for (size_t i = 0; i < 8; ++i) {
    AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(15 - i) * kNumLanes],
           &buf0[i * kNumLanes], &buf0[(15 - i) * kNumLanes]);
  }

  Butterfly(int_tag, -cospi[32], cospi[32], &buf1[20 * kNumLanes],
            &buf1[27 * kNumLanes], &buf0[20 * kNumLanes], &buf0[27 * kNumLanes],
            cos_bit, round);
  Butterfly(int_tag, -cospi[32], cospi[32], &buf1[21 * kNumLanes],
            &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes],
            cos_bit, round);
  Butterfly(int_tag, -cospi[32], cospi[32], &buf1[22 * kNumLanes],
            &buf1[25 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes],
            cos_bit, round);
  Butterfly(int_tag, -cospi[32], cospi[32], &buf1[23 * kNumLanes],
            &buf1[24 * kNumLanes], &buf0[23 * kNumLanes], &buf0[24 * kNumLanes],
            cos_bit, round);

  // stage 3
  for (size_t i = 0; i < 4; ++i) {
    AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(7 - i) * kNumLanes],
           &buf1[i * kNumLanes], &buf1[(7 - i) * kNumLanes]);
  }

  Butterfly(int_tag, -cospi[32], cospi[32], &buf0[10 * kNumLanes],
            &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes],
            cos_bit, round);
  Butterfly(int_tag, -cospi[32], cospi[32], &buf0[11 * kNumLanes],
            &buf0[12 * kNumLanes], &buf1[11 * kNumLanes], &buf1[12 * kNumLanes],
            cos_bit, round);

  // From here on many AddSub calls update a buffer row in place (first input
  // and sum output are the same row). AddSub loads both inputs before it
  // stores, so the aliasing is safe.
  for (size_t i = 0; i < 4; ++i) {
    AddSub(int_tag, &buf1[(16 + i) * kNumLanes], &buf0[(23 - i) * kNumLanes],
           &buf1[(16 + i) * kNumLanes], &buf1[(23 - i) * kNumLanes]);
  }
  for (size_t i = 0; i < 4; ++i) {
    AddSub(int_tag, &buf1[(31 - i) * kNumLanes], &buf0[(24 + i) * kNumLanes],
           &buf1[(31 - i) * kNumLanes], &buf1[(24 + i) * kNumLanes]);
  }

  // stage 4
  for (size_t i = 0; i < 2; ++i) {
    AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(3 - i) * kNumLanes],
           &buf0[i * kNumLanes], &buf0[(3 - i) * kNumLanes]);
  }

  Butterfly(int_tag, -cospi[32], cospi[32], &buf1[5 * kNumLanes],
            &buf1[6 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes],
            cos_bit, round);

  for (size_t i = 0; i < 2; ++i) {
    AddSub(int_tag, &buf0[(8 + i) * kNumLanes], &buf1[(11 - i) * kNumLanes],
           &buf0[(8 + i) * kNumLanes], &buf0[(11 - i) * kNumLanes]);
  }
  for (size_t i = 0; i < 2; ++i) {
    AddSub(int_tag, &buf0[(15 - i) * kNumLanes], &buf1[(12 + i) * kNumLanes],
           &buf0[(15 - i) * kNumLanes], &buf0[(12 + i) * kNumLanes]);
  }

  Butterfly(int_tag, -cospi[16], cospi[48], &buf1[18 * kNumLanes],
            &buf1[29 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes],
            cos_bit, round);
  Butterfly(int_tag, -cospi[16], cospi[48], &buf1[19 * kNumLanes],
            &buf1[28 * kNumLanes], &buf0[19 * kNumLanes], &buf0[28 * kNumLanes],
            cos_bit, round);
  Butterfly(int_tag, -cospi[48], -cospi[16], &buf1[20 * kNumLanes],
            &buf1[27 * kNumLanes], &buf0[20 * kNumLanes], &buf0[27 * kNumLanes],
            cos_bit, round);
  Butterfly(int_tag, -cospi[48], -cospi[16], &buf1[21 * kNumLanes],
            &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes],
            cos_bit, round);

  // stage 5: first results are final; they go to output rows 0, 16, 8 and 24.
  Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes],
            &buf0[1 * kNumLanes], &in[0 * instride], &in[16 * instride],
            cos_bit, round);
  Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes],
            &buf0[2 * kNumLanes], &in[8 * instride], &in[24 * instride],
            cos_bit, round);
  AddSub(int_tag, &buf1[4 * kNumLanes], &buf0[5 * kNumLanes],
         &buf1[4 * kNumLanes], &buf1[5 * kNumLanes]);
  AddSub(int_tag, &buf1[7 * kNumLanes], &buf0[6 * kNumLanes],
         &buf1[7 * kNumLanes], &buf1[6 * kNumLanes]);
  Butterfly(int_tag, -cospi[16], cospi[48], &buf0[9 * kNumLanes],
            &buf0[14 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes],
            cos_bit, round);
  Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[10 * kNumLanes],
            &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes],
            cos_bit, round);

  AddSub(int_tag, &buf1[16 * kNumLanes], &buf0[19 * kNumLanes],
         &buf1[16 * kNumLanes], &buf1[19 * kNumLanes]);
  AddSub(int_tag, &buf1[17 * kNumLanes], &buf0[18 * kNumLanes],
         &buf1[17 * kNumLanes], &buf1[18 * kNumLanes]);
  AddSub(int_tag, &buf1[23 * kNumLanes], &buf0[20 * kNumLanes],
         &buf1[23 * kNumLanes], &buf1[20 * kNumLanes]);
  AddSub(int_tag, &buf1[22 * kNumLanes], &buf0[21 * kNumLanes],
         &buf1[22 * kNumLanes], &buf1[21 * kNumLanes]);
  AddSub(int_tag, &buf1[24 * kNumLanes], &buf0[27 * kNumLanes],
         &buf1[24 * kNumLanes], &buf1[27 * kNumLanes]);
  AddSub(int_tag, &buf1[25 * kNumLanes], &buf0[26 * kNumLanes],
         &buf1[25 * kNumLanes], &buf1[26 * kNumLanes]);
  AddSub(int_tag, &buf1[31 * kNumLanes], &buf0[28 * kNumLanes],
         &buf1[31 * kNumLanes], &buf1[28 * kNumLanes]);
  AddSub(int_tag, &buf1[30 * kNumLanes], &buf0[29 * kNumLanes],
         &buf1[30 * kNumLanes], &buf1[29 * kNumLanes]);

  // stage 6
  Butterfly(int_tag, cospi[8], cospi[56], &buf1[7 * kNumLanes],
            &buf1[4 * kNumLanes], &in[4 * instride], &in[28 * instride],
            cos_bit, round);
  Butterfly(int_tag, cospi[40], cospi[24], &buf1[6 * kNumLanes],
            &buf1[5 * kNumLanes], &in[20 * instride], &in[12 * instride],
            cos_bit, round);
  AddSub(int_tag, &buf0[8 * kNumLanes], &buf1[9 * kNumLanes],
         &buf0[8 * kNumLanes], &buf0[9 * kNumLanes]);
  AddSub(int_tag, &buf0[11 * kNumLanes], &buf1[10 * kNumLanes],
         &buf0[11 * kNumLanes], &buf0[10 * kNumLanes]);
  AddSub(int_tag, &buf0[12 * kNumLanes], &buf1[13 * kNumLanes],
         &buf0[12 * kNumLanes], &buf0[13 * kNumLanes]);
  AddSub(int_tag, &buf0[15 * kNumLanes], &buf1[14 * kNumLanes],
         &buf0[15 * kNumLanes], &buf0[14 * kNumLanes]);
  Butterfly(int_tag, -cospi[8], cospi[56], &buf1[17 * kNumLanes],
            &buf1[30 * kNumLanes], &buf0[17 * kNumLanes], &buf0[30 * kNumLanes],
            cos_bit, round);
  Butterfly(int_tag, -cospi[56], -cospi[8], &buf1[18 * kNumLanes],
            &buf1[29 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes],
            cos_bit, round);
  Butterfly(int_tag, -cospi[40], cospi[24], &buf1[21 * kNumLanes],
            &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes],
            cos_bit, round);
  Butterfly(int_tag, -cospi[24], -cospi[40], &buf1[22 * kNumLanes],
            &buf1[25 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes],
            cos_bit, round);

  // stage 7
  Butterfly(int_tag, cospi[4], cospi[60], &buf0[15 * kNumLanes],
            &buf0[8 * kNumLanes], &in[2 * instride], &in[30 * instride],
            cos_bit, round);
  Butterfly(int_tag, cospi[36], cospi[28], &buf0[14 * kNumLanes],
            &buf0[9 * kNumLanes], &in[18 * instride], &in[14 * instride],
            cos_bit, round);
  Butterfly(int_tag, cospi[20], cospi[44], &buf0[13 * kNumLanes],
            &buf0[10 * kNumLanes], &in[10 * instride], &in[22 * instride],
            cos_bit, round);
  Butterfly(int_tag, cospi[52], cospi[12], &buf0[12 * kNumLanes],
            &buf0[11 * kNumLanes], &in[26 * instride], &in[6 * instride],
            cos_bit, round);
  AddSub(int_tag, &buf1[16 * kNumLanes], &buf0[17 * kNumLanes],
         &buf1[16 * kNumLanes], &buf1[17 * kNumLanes]);
  AddSub(int_tag, &buf1[19 * kNumLanes], &buf0[18 * kNumLanes],
         &buf1[19 * kNumLanes], &buf1[18 * kNumLanes]);
  AddSub(int_tag, &buf1[20 * kNumLanes], &buf0[21 * kNumLanes],
         &buf1[20 * kNumLanes], &buf1[21 * kNumLanes]);
  AddSub(int_tag, &buf1[23 * kNumLanes], &buf0[22 * kNumLanes],
         &buf1[23 * kNumLanes], &buf1[22 * kNumLanes]);
  AddSub(int_tag, &buf1[24 * kNumLanes], &buf0[25 * kNumLanes],
         &buf1[24 * kNumLanes], &buf1[25 * kNumLanes]);
  AddSub(int_tag, &buf1[27 * kNumLanes], &buf0[26 * kNumLanes],
         &buf1[27 * kNumLanes], &buf1[26 * kNumLanes]);
  AddSub(int_tag, &buf1[28 * kNumLanes], &buf0[29 * kNumLanes],
         &buf1[28 * kNumLanes], &buf1[29 * kNumLanes]);
  AddSub(int_tag, &buf1[31 * kNumLanes], &buf0[30 * kNumLanes],
         &buf1[31 * kNumLanes], &buf1[30 * kNumLanes]);

  // stage 8 & 9: the last butterflies write the odd-numbered output rows
  // directly into `in`.
  Butterfly(int_tag, cospi[2], cospi[62], &buf1[31 * kNumLanes],
            &buf1[16 * kNumLanes], &in[1 * instride], &in[31 * instride],
            cos_bit, round);
  Butterfly(int_tag, cospi[34], cospi[30], &buf1[30 * kNumLanes],
            &buf1[17 * kNumLanes], &in[17 * instride], &in[15 * instride],
            cos_bit, round);
  Butterfly(int_tag, cospi[18], cospi[46], &buf1[29 * kNumLanes],
            &buf1[18 * kNumLanes], &in[9 * instride], &in[23 * instride],
            cos_bit, round);
  Butterfly(int_tag, cospi[50], cospi[14], &buf1[28 * kNumLanes],
            &buf1[19 * kNumLanes], &in[25 * instride], &in[7 * instride],
            cos_bit, round);
  Butterfly(int_tag, cospi[10], cospi[54], &buf1[27 * kNumLanes],
            &buf1[20 * kNumLanes], &in[5 * instride], &in[27 * instride],
            cos_bit, round);
  Butterfly(int_tag, cospi[42], cospi[22], &buf1[26 * kNumLanes],
            &buf1[21 * kNumLanes], &in[21 * instride], &in[11 * instride],
            cos_bit, round);
  Butterfly(int_tag, cospi[26], cospi[38], &buf1[25 * kNumLanes],
            &buf1[22 * kNumLanes], &in[13 * instride], &in[19 * instride],
            cos_bit, round);
  Butterfly(int_tag, cospi[58], cospi[6], &buf1[24 * kNumLanes],
            &buf1[23 * kNumLanes], &in[29 * instride], &in[3 * instride],
            cos_bit, round);

  // stage 9 was fused with prior stages.
705 } 706 707 template <size_t InStride, size_t OutStride, typename D> 708 HWY_ATTR HWY_NOINLINE void Fdct64(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, 709 const int8_t cos_bit) { 710 constexpr size_t kNumLanes = hn::MaxLanes(int_tag); 711 constexpr size_t kNumBytes = kNumLanes * sizeof(hn::TFromD<D>); 712 HWY_ALIGN_MAX hn::TFromD<D> buf0[64 * kNumLanes]; 713 HWY_ALIGN_MAX hn::TFromD<D> buf1[64 * kNumLanes]; 714 const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); 715 const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1)); 716 717 // stage 1 718 #if HWY_TARGET == HWY_SSE4 719 // For whatever reason, some compilers don't unroll this when building for 720 // SSE4; help them along. 721 HWY_UNROLL(32) 722 #endif 723 for (size_t i = 0; i < 32; ++i) { 724 AddSub(int_tag, &in[i * InStride], &in[(63 - i) * InStride], 725 &buf0[i * kNumLanes], &buf0[(63 - i) * kNumLanes]); 726 } 727 728 // stage 2 729 for (size_t i = 0; i < 16; ++i) { 730 AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(31 - i) * kNumLanes], 731 &buf1[i * kNumLanes], &buf1[(31 - i) * kNumLanes]); 732 } 733 for (size_t i = 0; i < 8; ++i) { 734 Butterfly(int_tag, -cospi[32], cospi[32], &buf0[(40 + i) * kNumLanes], 735 &buf0[(55 - i) * kNumLanes], &buf1[(40 + i) * kNumLanes], 736 &buf1[(55 - i) * kNumLanes], cos_bit, round); 737 } 738 739 // stage 3 740 for (size_t i = 0; i < 8; ++i) { 741 AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(15 - i) * kNumLanes], 742 &buf0[i * kNumLanes], &buf0[(15 - i) * kNumLanes]); 743 } 744 for (size_t i = 0; i < 4; ++i) { 745 Butterfly(int_tag, -cospi[32], cospi[32], &buf1[(20 + i) * kNumLanes], 746 &buf1[(27 - i) * kNumLanes], &buf0[(20 + i) * kNumLanes], 747 &buf0[(27 - i) * kNumLanes], cos_bit, round); 748 } 749 for (size_t i = 0; i < 8; ++i) { 750 AddSub(int_tag, &buf0[(32 + i) * kNumLanes], &buf1[(47 - i) * kNumLanes], 751 &buf0[(32 + i) * kNumLanes], &buf0[(47 - i) * kNumLanes]); 752 } 753 for (size_t i = 0; i < 8; ++i) { 754 AddSub(int_tag, 
&buf0[(63 - i) * kNumLanes], &buf1[(48 + i) * kNumLanes], 755 &buf0[(63 - i) * kNumLanes], &buf0[(48 + i) * kNumLanes]); 756 } 757 // stage 4 758 for (size_t i = 0; i < 4; ++i) { 759 AddSub(int_tag, &buf0[(0 + i) * kNumLanes], &buf0[(7 - i) * kNumLanes], 760 &buf1[(0 + i) * kNumLanes], &buf1[(7 - i) * kNumLanes]); 761 } 762 for (size_t i = 0; i < 2; ++i) { 763 Butterfly(int_tag, -cospi[32], cospi[32], &buf0[(10 + i) * kNumLanes], 764 &buf0[(13 - i) * kNumLanes], &buf1[(10 + i) * kNumLanes], 765 &buf1[(13 - i) * kNumLanes], cos_bit, round); 766 } 767 for (size_t i = 0; i < 4; ++i) { 768 AddSub(int_tag, &buf1[(16 + i) * kNumLanes], &buf0[(23 - i) * kNumLanes], 769 &buf1[(16 + i) * kNumLanes], &buf1[(23 - i) * kNumLanes]); 770 } 771 for (size_t i = 0; i < 4; ++i) { 772 AddSub(int_tag, &buf1[(31 - i) * kNumLanes], &buf0[(24 + i) * kNumLanes], 773 &buf1[(31 - i) * kNumLanes], &buf1[(24 + i) * kNumLanes]); 774 } 775 for (size_t i = 0; i < 4; ++i) { 776 Butterfly(int_tag, -cospi[16], cospi[48], &buf0[(36 + i) * kNumLanes], 777 &buf0[(59 - i) * kNumLanes], &buf1[(36 + i) * kNumLanes], 778 &buf1[(59 - i) * kNumLanes], cos_bit, round); 779 } 780 for (size_t i = 4; i < 8; ++i) { 781 Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[(36 + i) * kNumLanes], 782 &buf0[(59 - i) * kNumLanes], &buf1[(36 + i) * kNumLanes], 783 &buf1[(59 - i) * kNumLanes], cos_bit, round); 784 } 785 // stage 5 786 for (size_t i = 0; i < 2; ++i) { 787 AddSub(int_tag, &buf1[(0 + i) * kNumLanes], &buf1[(3 - i) * kNumLanes], 788 &buf0[(0 + i) * kNumLanes], &buf0[(3 - i) * kNumLanes]); 789 } 790 Butterfly(int_tag, -cospi[32], cospi[32], &buf1[5 * kNumLanes], 791 &buf1[6 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes], 792 cos_bit, round); 793 for (size_t i = 0; i < 2; ++i) { 794 AddSub(int_tag, &buf0[(8 + i) * kNumLanes], &buf1[(11 - i) * kNumLanes], 795 &buf0[(8 + i) * kNumLanes], &buf0[(11 - i) * kNumLanes]); 796 } 797 for (size_t i = 0; i < 2; ++i) { 798 AddSub(int_tag, &buf0[(15 - i) * 
kNumLanes], &buf1[(12 + i) * kNumLanes], 799 &buf0[(15 - i) * kNumLanes], &buf0[(12 + i) * kNumLanes]); 800 } 801 for (size_t i = 0; i < 2; ++i) { 802 Butterfly(int_tag, -cospi[16], cospi[48], &buf1[(18 + i) * kNumLanes], 803 &buf1[(29 - i) * kNumLanes], &buf0[(18 + i) * kNumLanes], 804 &buf0[(29 - i) * kNumLanes], cos_bit, round); 805 } 806 for (size_t i = 2; i < 4; ++i) { 807 Butterfly(int_tag, -cospi[48], -cospi[16], &buf1[(18 + i) * kNumLanes], 808 &buf1[(29 - i) * kNumLanes], &buf0[(18 + i) * kNumLanes], 809 &buf0[(29 - i) * kNumLanes], cos_bit, round); 810 } 811 for (size_t i = 0; i < 4; ++i) { 812 AddSub(int_tag, &buf0[(32 + i) * kNumLanes], &buf1[(39 - i) * kNumLanes], 813 &buf0[(32 + i) * kNumLanes], &buf0[(39 - i) * kNumLanes]); 814 } 815 for (size_t i = 0; i < 4; ++i) { 816 AddSub(int_tag, &buf0[(47 - i) * kNumLanes], &buf1[(40 + i) * kNumLanes], 817 &buf0[(47 - i) * kNumLanes], &buf0[(40 + i) * kNumLanes]); 818 } 819 for (size_t i = 0; i < 4; ++i) { 820 AddSub(int_tag, &buf0[(48 + i) * kNumLanes], &buf1[(55 - i) * kNumLanes], 821 &buf0[(48 + i) * kNumLanes], &buf0[(55 - i) * kNumLanes]); 822 } 823 for (size_t i = 0; i < 4; ++i) { 824 AddSub(int_tag, &buf0[(63 - i) * kNumLanes], &buf1[(56 + i) * kNumLanes], 825 &buf0[(63 - i) * kNumLanes], &buf0[(56 + i) * kNumLanes]); 826 } 827 // stage 6 828 Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes], 829 &buf0[1 * kNumLanes], &buf1[0 * kNumLanes], &buf1[1 * kNumLanes], 830 cos_bit, round); 831 Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes], 832 &buf0[2 * kNumLanes], &buf1[2 * kNumLanes], &buf1[3 * kNumLanes], 833 cos_bit, round); 834 AddSub(int_tag, &buf1[4 * kNumLanes], &buf0[5 * kNumLanes], 835 &buf1[4 * kNumLanes], &buf1[5 * kNumLanes]); 836 AddSub(int_tag, &buf1[7 * kNumLanes], &buf0[6 * kNumLanes], 837 &buf1[7 * kNumLanes], &buf1[6 * kNumLanes]); 838 Butterfly(int_tag, -cospi[16], cospi[48], &buf0[9 * kNumLanes], 839 &buf0[14 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * 
kNumLanes], 840 cos_bit, round); 841 Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[10 * kNumLanes], 842 &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], 843 cos_bit, round); 844 for (size_t i = 0; i < 2; ++i) { 845 AddSub(int_tag, &buf1[(16 + i) * kNumLanes], &buf0[(19 - i) * kNumLanes], 846 &buf1[(16 + i) * kNumLanes], &buf1[(19 - i) * kNumLanes]); 847 } 848 for (size_t i = 0; i < 2; ++i) { 849 AddSub(int_tag, &buf1[(23 - i) * kNumLanes], &buf0[(20 + i) * kNumLanes], 850 &buf1[(23 - i) * kNumLanes], &buf1[(20 + i) * kNumLanes]); 851 } 852 for (size_t i = 0; i < 2; ++i) { 853 AddSub(int_tag, &buf1[(24 + i) * kNumLanes], &buf0[(27 - i) * kNumLanes], 854 &buf1[(24 + i) * kNumLanes], &buf1[(27 - i) * kNumLanes]); 855 } 856 for (size_t i = 0; i < 2; ++i) { 857 AddSub(int_tag, &buf1[(31 - i) * kNumLanes], &buf0[(28 + i) * kNumLanes], 858 &buf1[(31 - i) * kNumLanes], &buf1[(28 + i) * kNumLanes]); 859 } 860 for (size_t i = 0; i < 2; ++i) { 861 Butterfly(int_tag, -cospi[8], cospi[56], &buf0[(34 + i) * kNumLanes], 862 &buf0[(61 - i) * kNumLanes], &buf1[(34 + i) * kNumLanes], 863 &buf1[(61 - i) * kNumLanes], cos_bit, round); 864 } 865 for (size_t i = 2; i < 4; ++i) { 866 Butterfly(int_tag, -cospi[56], -cospi[8], &buf0[(34 + i) * kNumLanes], 867 &buf0[(61 - i) * kNumLanes], &buf1[(34 + i) * kNumLanes], 868 &buf1[(61 - i) * kNumLanes], cos_bit, round); 869 } 870 for (size_t i = 0; i < 2; ++i) { 871 Butterfly(int_tag, -cospi[40], cospi[24], &buf0[(42 + i) * kNumLanes], 872 &buf0[(53 - i) * kNumLanes], &buf1[(42 + i) * kNumLanes], 873 &buf1[(53 - i) * kNumLanes], cos_bit, round); 874 } 875 for (size_t i = 2; i < 4; ++i) { 876 Butterfly(int_tag, -cospi[24], -cospi[40], &buf0[(42 + i) * kNumLanes], 877 &buf0[(53 - i) * kNumLanes], &buf1[(42 + i) * kNumLanes], 878 &buf1[(53 - i) * kNumLanes], cos_bit, round); 879 } 880 // stage 7 881 Butterfly(int_tag, cospi[8], cospi[56], &buf1[7 * kNumLanes], 882 &buf1[4 * kNumLanes], &buf0[4 * kNumLanes], &buf0[7 * 
kNumLanes], 883 cos_bit, round); 884 Butterfly(int_tag, cospi[40], cospi[24], &buf1[6 * kNumLanes], 885 &buf1[5 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes], 886 cos_bit, round); 887 AddSub(int_tag, &buf0[8 * kNumLanes], &buf1[9 * kNumLanes], 888 &buf0[8 * kNumLanes], &buf0[9 * kNumLanes]); 889 AddSub(int_tag, &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], 890 &buf0[11 * kNumLanes], &buf0[10 * kNumLanes]); 891 AddSub(int_tag, &buf0[12 * kNumLanes], &buf1[13 * kNumLanes], 892 &buf0[12 * kNumLanes], &buf0[13 * kNumLanes]); 893 AddSub(int_tag, &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], 894 &buf0[15 * kNumLanes], &buf0[14 * kNumLanes]); 895 Butterfly(int_tag, -cospi[8], cospi[56], &buf1[17 * kNumLanes], 896 &buf1[30 * kNumLanes], &buf0[17 * kNumLanes], &buf0[30 * kNumLanes], 897 cos_bit, round); 898 Butterfly(int_tag, -cospi[56], -cospi[8], &buf1[18 * kNumLanes], 899 &buf1[29 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes], 900 cos_bit, round); 901 Butterfly(int_tag, -cospi[40], cospi[24], &buf1[21 * kNumLanes], 902 &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], 903 cos_bit, round); 904 Butterfly(int_tag, -cospi[24], -cospi[40], &buf1[22 * kNumLanes], 905 &buf1[25 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes], 906 cos_bit, round); 907 for (size_t i = 0; i < 2; ++i) { 908 AddSub(int_tag, &buf0[(32 + i) * kNumLanes], &buf1[(35 - i) * kNumLanes], 909 &buf0[(32 + i) * kNumLanes], &buf0[(35 - i) * kNumLanes]); 910 } 911 for (size_t i = 0; i < 2; ++i) { 912 AddSub(int_tag, &buf0[(39 - i) * kNumLanes], &buf1[(36 + i) * kNumLanes], 913 &buf0[(39 - i) * kNumLanes], &buf0[(36 + i) * kNumLanes]); 914 } 915 for (size_t i = 0; i < 2; ++i) { 916 AddSub(int_tag, &buf0[(40 + i) * kNumLanes], &buf1[(43 - i) * kNumLanes], 917 &buf0[(40 + i) * kNumLanes], &buf0[(43 - i) * kNumLanes]); 918 } 919 for (size_t i = 0; i < 2; ++i) { 920 AddSub(int_tag, &buf0[(47 - i) * kNumLanes], &buf1[(44 + i) * kNumLanes], 921 &buf0[(47 - i) * 
kNumLanes], &buf0[(44 + i) * kNumLanes]); 922 } 923 for (size_t i = 0; i < 2; ++i) { 924 AddSub(int_tag, &buf0[(48 + i) * kNumLanes], &buf1[(51 - i) * kNumLanes], 925 &buf0[(48 + i) * kNumLanes], &buf0[(51 - i) * kNumLanes]); 926 } 927 for (size_t i = 0; i < 2; ++i) { 928 AddSub(int_tag, &buf0[(55 - i) * kNumLanes], &buf1[(52 + i) * kNumLanes], 929 &buf0[(55 - i) * kNumLanes], &buf0[(52 + i) * kNumLanes]); 930 } 931 for (size_t i = 0; i < 2; ++i) { 932 AddSub(int_tag, &buf0[(56 + i) * kNumLanes], &buf1[(59 - i) * kNumLanes], 933 &buf0[(56 + i) * kNumLanes], &buf0[(59 - i) * kNumLanes]); 934 } 935 for (size_t i = 0; i < 2; ++i) { 936 AddSub(int_tag, &buf0[(63 - i) * kNumLanes], &buf1[(60 + i) * kNumLanes], 937 &buf0[(63 - i) * kNumLanes], &buf0[(60 + i) * kNumLanes]); 938 } 939 // stage 8 940 Butterfly(int_tag, cospi[4], cospi[60], &buf0[15 * kNumLanes], 941 &buf0[8 * kNumLanes], &buf1[8 * kNumLanes], &buf1[15 * kNumLanes], 942 cos_bit, round); 943 Butterfly(int_tag, cospi[36], cospi[28], &buf0[14 * kNumLanes], 944 &buf0[9 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes], 945 cos_bit, round); 946 Butterfly(int_tag, cospi[20], cospi[44], &buf0[13 * kNumLanes], 947 &buf0[10 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], 948 cos_bit, round); 949 Butterfly(int_tag, cospi[52], cospi[12], &buf0[12 * kNumLanes], 950 &buf0[11 * kNumLanes], &buf1[11 * kNumLanes], &buf1[12 * kNumLanes], 951 cos_bit, round); 952 AddSub(int_tag, &buf1[16 * kNumLanes], &buf0[17 * kNumLanes], 953 &buf1[16 * kNumLanes], &buf1[17 * kNumLanes]); 954 AddSub(int_tag, &buf1[19 * kNumLanes], &buf0[18 * kNumLanes], 955 &buf1[19 * kNumLanes], &buf1[18 * kNumLanes]); 956 AddSub(int_tag, &buf1[20 * kNumLanes], &buf0[21 * kNumLanes], 957 &buf1[20 * kNumLanes], &buf1[21 * kNumLanes]); 958 AddSub(int_tag, &buf1[23 * kNumLanes], &buf0[22 * kNumLanes], 959 &buf1[23 * kNumLanes], &buf1[22 * kNumLanes]); 960 AddSub(int_tag, &buf1[24 * kNumLanes], &buf0[25 * kNumLanes], 961 &buf1[24 * 
kNumLanes], &buf1[25 * kNumLanes]); 962 AddSub(int_tag, &buf1[27 * kNumLanes], &buf0[26 * kNumLanes], 963 &buf1[27 * kNumLanes], &buf1[26 * kNumLanes]); 964 AddSub(int_tag, &buf1[28 * kNumLanes], &buf0[29 * kNumLanes], 965 &buf1[28 * kNumLanes], &buf1[29 * kNumLanes]); 966 AddSub(int_tag, &buf1[31 * kNumLanes], &buf0[30 * kNumLanes], 967 &buf1[31 * kNumLanes], &buf1[30 * kNumLanes]); 968 Butterfly(int_tag, -cospi[4], cospi[60], &buf0[33 * kNumLanes], 969 &buf0[62 * kNumLanes], &buf1[33 * kNumLanes], &buf1[62 * kNumLanes], 970 cos_bit, round); 971 Butterfly(int_tag, -cospi[60], -cospi[4], &buf0[34 * kNumLanes], 972 &buf0[61 * kNumLanes], &buf1[34 * kNumLanes], &buf1[61 * kNumLanes], 973 cos_bit, round); 974 Butterfly(int_tag, -cospi[36], cospi[28], &buf0[37 * kNumLanes], 975 &buf0[58 * kNumLanes], &buf1[37 * kNumLanes], &buf1[58 * kNumLanes], 976 cos_bit, round); 977 Butterfly(int_tag, -cospi[28], -cospi[36], &buf0[38 * kNumLanes], 978 &buf0[57 * kNumLanes], &buf1[38 * kNumLanes], &buf1[57 * kNumLanes], 979 cos_bit, round); 980 Butterfly(int_tag, -cospi[20], cospi[44], &buf0[41 * kNumLanes], 981 &buf0[54 * kNumLanes], &buf1[41 * kNumLanes], &buf1[54 * kNumLanes], 982 cos_bit, round); 983 Butterfly(int_tag, -cospi[44], -cospi[20], &buf0[42 * kNumLanes], 984 &buf0[53 * kNumLanes], &buf1[42 * kNumLanes], &buf1[53 * kNumLanes], 985 cos_bit, round); 986 Butterfly(int_tag, -cospi[52], cospi[12], &buf0[45 * kNumLanes], 987 &buf0[50 * kNumLanes], &buf1[45 * kNumLanes], &buf1[50 * kNumLanes], 988 cos_bit, round); 989 Butterfly(int_tag, -cospi[12], -cospi[52], &buf0[46 * kNumLanes], 990 &buf0[49 * kNumLanes], &buf1[46 * kNumLanes], &buf1[49 * kNumLanes], 991 cos_bit, round); 992 // stage 9 993 Butterfly(int_tag, cospi[2], cospi[62], &buf1[31 * kNumLanes], 994 &buf1[16 * kNumLanes], &buf0[16 * kNumLanes], &buf0[31 * kNumLanes], 995 cos_bit, round); 996 Butterfly(int_tag, cospi[34], cospi[30], &buf1[30 * kNumLanes], 997 &buf1[17 * kNumLanes], &buf0[17 * kNumLanes], &buf0[30 * 
kNumLanes], 998 cos_bit, round); 999 Butterfly(int_tag, cospi[18], cospi[46], &buf1[29 * kNumLanes], 1000 &buf1[18 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes], 1001 cos_bit, round); 1002 Butterfly(int_tag, cospi[50], cospi[14], &buf1[28 * kNumLanes], 1003 &buf1[19 * kNumLanes], &buf0[19 * kNumLanes], &buf0[28 * kNumLanes], 1004 cos_bit, round); 1005 Butterfly(int_tag, cospi[10], cospi[54], &buf1[27 * kNumLanes], 1006 &buf1[20 * kNumLanes], &buf0[20 * kNumLanes], &buf0[27 * kNumLanes], 1007 cos_bit, round); 1008 Butterfly(int_tag, cospi[42], cospi[22], &buf1[26 * kNumLanes], 1009 &buf1[21 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], 1010 cos_bit, round); 1011 Butterfly(int_tag, cospi[26], cospi[38], &buf1[25 * kNumLanes], 1012 &buf1[22 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes], 1013 cos_bit, round); 1014 Butterfly(int_tag, cospi[58], cospi[6], &buf1[24 * kNumLanes], 1015 &buf1[23 * kNumLanes], &buf0[23 * kNumLanes], &buf0[24 * kNumLanes], 1016 cos_bit, round); 1017 AddSub(int_tag, &buf0[32 * kNumLanes], &buf1[33 * kNumLanes], 1018 &buf0[32 * kNumLanes], &buf0[33 * kNumLanes]); 1019 AddSub(int_tag, &buf0[35 * kNumLanes], &buf1[34 * kNumLanes], 1020 &buf0[35 * kNumLanes], &buf0[34 * kNumLanes]); 1021 AddSub(int_tag, &buf0[36 * kNumLanes], &buf1[37 * kNumLanes], 1022 &buf0[36 * kNumLanes], &buf0[37 * kNumLanes]); 1023 AddSub(int_tag, &buf0[39 * kNumLanes], &buf1[38 * kNumLanes], 1024 &buf0[39 * kNumLanes], &buf0[38 * kNumLanes]); 1025 AddSub(int_tag, &buf0[40 * kNumLanes], &buf1[41 * kNumLanes], 1026 &buf0[40 * kNumLanes], &buf0[41 * kNumLanes]); 1027 AddSub(int_tag, &buf0[43 * kNumLanes], &buf1[42 * kNumLanes], 1028 &buf0[43 * kNumLanes], &buf0[42 * kNumLanes]); 1029 AddSub(int_tag, &buf0[44 * kNumLanes], &buf1[45 * kNumLanes], 1030 &buf0[44 * kNumLanes], &buf0[45 * kNumLanes]); 1031 AddSub(int_tag, &buf0[47 * kNumLanes], &buf1[46 * kNumLanes], 1032 &buf0[47 * kNumLanes], &buf0[46 * kNumLanes]); 1033 AddSub(int_tag, 
&buf0[48 * kNumLanes], &buf1[49 * kNumLanes], 1034 &buf0[48 * kNumLanes], &buf0[49 * kNumLanes]); 1035 AddSub(int_tag, &buf0[51 * kNumLanes], &buf1[50 * kNumLanes], 1036 &buf0[51 * kNumLanes], &buf0[50 * kNumLanes]); 1037 AddSub(int_tag, &buf0[52 * kNumLanes], &buf1[53 * kNumLanes], 1038 &buf0[52 * kNumLanes], &buf0[53 * kNumLanes]); 1039 AddSub(int_tag, &buf0[55 * kNumLanes], &buf1[54 * kNumLanes], 1040 &buf0[55 * kNumLanes], &buf0[54 * kNumLanes]); 1041 AddSub(int_tag, &buf0[56 * kNumLanes], &buf1[57 * kNumLanes], 1042 &buf0[56 * kNumLanes], &buf0[57 * kNumLanes]); 1043 AddSub(int_tag, &buf0[59 * kNumLanes], &buf1[58 * kNumLanes], 1044 &buf0[59 * kNumLanes], &buf0[58 * kNumLanes]); 1045 AddSub(int_tag, &buf0[60 * kNumLanes], &buf1[61 * kNumLanes], 1046 &buf0[60 * kNumLanes], &buf0[61 * kNumLanes]); 1047 AddSub(int_tag, &buf0[63 * kNumLanes], &buf1[62 * kNumLanes], 1048 &buf0[63 * kNumLanes], &buf0[62 * kNumLanes]); 1049 // stage 10 1050 Butterfly(int_tag, cospi[1], cospi[63], &buf0[63 * kNumLanes], 1051 &buf0[32 * kNumLanes], &buf1[32 * kNumLanes], &buf1[63 * kNumLanes], 1052 cos_bit, round); 1053 Butterfly(int_tag, cospi[33], cospi[31], &buf0[62 * kNumLanes], 1054 &buf0[33 * kNumLanes], &buf1[33 * kNumLanes], &buf1[62 * kNumLanes], 1055 cos_bit, round); 1056 Butterfly(int_tag, cospi[17], cospi[47], &buf0[61 * kNumLanes], 1057 &buf0[34 * kNumLanes], &buf1[34 * kNumLanes], &buf1[61 * kNumLanes], 1058 cos_bit, round); 1059 Butterfly(int_tag, cospi[49], cospi[15], &buf0[60 * kNumLanes], 1060 &buf0[35 * kNumLanes], &buf1[35 * kNumLanes], &buf1[60 * kNumLanes], 1061 cos_bit, round); 1062 Butterfly(int_tag, cospi[9], cospi[55], &buf0[59 * kNumLanes], 1063 &buf0[36 * kNumLanes], &buf1[36 * kNumLanes], &buf1[59 * kNumLanes], 1064 cos_bit, round); 1065 Butterfly(int_tag, cospi[41], cospi[23], &buf0[58 * kNumLanes], 1066 &buf0[37 * kNumLanes], &buf1[37 * kNumLanes], &buf1[58 * kNumLanes], 1067 cos_bit, round); 1068 Butterfly(int_tag, cospi[25], cospi[39], &buf0[57 * 
kNumLanes], 1069 &buf0[38 * kNumLanes], &buf1[38 * kNumLanes], &buf1[57 * kNumLanes], 1070 cos_bit, round); 1071 Butterfly(int_tag, cospi[57], cospi[7], &buf0[56 * kNumLanes], 1072 &buf0[39 * kNumLanes], &buf1[39 * kNumLanes], &buf1[56 * kNumLanes], 1073 cos_bit, round); 1074 Butterfly(int_tag, cospi[05], cospi[59], &buf0[55 * kNumLanes], 1075 &buf0[40 * kNumLanes], &buf1[40 * kNumLanes], &buf1[55 * kNumLanes], 1076 cos_bit, round); 1077 Butterfly(int_tag, cospi[37], cospi[27], &buf0[54 * kNumLanes], 1078 &buf0[41 * kNumLanes], &buf1[41 * kNumLanes], &buf1[54 * kNumLanes], 1079 cos_bit, round); 1080 Butterfly(int_tag, cospi[21], cospi[43], &buf0[53 * kNumLanes], 1081 &buf0[42 * kNumLanes], &buf1[42 * kNumLanes], &buf1[53 * kNumLanes], 1082 cos_bit, round); 1083 Butterfly(int_tag, cospi[53], cospi[11], &buf0[52 * kNumLanes], 1084 &buf0[43 * kNumLanes], &buf1[43 * kNumLanes], &buf1[52 * kNumLanes], 1085 cos_bit, round); 1086 Butterfly(int_tag, cospi[13], cospi[51], &buf0[51 * kNumLanes], 1087 &buf0[44 * kNumLanes], &buf1[44 * kNumLanes], &buf1[51 * kNumLanes], 1088 cos_bit, round); 1089 Butterfly(int_tag, cospi[45], cospi[19], &buf0[50 * kNumLanes], 1090 &buf0[45 * kNumLanes], &buf1[45 * kNumLanes], &buf1[50 * kNumLanes], 1091 cos_bit, round); 1092 Butterfly(int_tag, cospi[29], cospi[35], &buf0[49 * kNumLanes], 1093 &buf0[46 * kNumLanes], &buf1[46 * kNumLanes], &buf1[49 * kNumLanes], 1094 cos_bit, round); 1095 Butterfly(int_tag, cospi[61], cospi[3], &buf0[48 * kNumLanes], 1096 &buf0[47 * kNumLanes], &buf1[47 * kNumLanes], &buf1[48 * kNumLanes], 1097 cos_bit, round); 1098 1099 // stage 11 1100 hwy::CopyBytes<kNumBytes>(&buf1[0 * kNumLanes], &in[0 * OutStride]); 1101 hwy::CopyBytes<kNumBytes>(&buf1[63 * kNumLanes], &in[63 * OutStride]); 1102 hwy::CopyBytes<kNumBytes>(&buf1[32 * kNumLanes], &in[1 * OutStride]); 1103 hwy::CopyBytes<kNumBytes>(&buf1[31 * kNumLanes], &in[62 * OutStride]); 1104 hwy::CopyBytes<kNumBytes>(&buf0[16 * kNumLanes], &in[2 * OutStride]); 1105 
hwy::CopyBytes<kNumBytes>(&buf1[47 * kNumLanes], &in[61 * OutStride]); 1106 hwy::CopyBytes<kNumBytes>(&buf1[48 * kNumLanes], &in[3 * OutStride]); 1107 hwy::CopyBytes<kNumBytes>(&buf1[15 * kNumLanes], &in[60 * OutStride]); 1108 hwy::CopyBytes<kNumBytes>(&buf1[8 * kNumLanes], &in[4 * OutStride]); 1109 hwy::CopyBytes<kNumBytes>(&buf1[55 * kNumLanes], &in[59 * OutStride]); 1110 hwy::CopyBytes<kNumBytes>(&buf1[40 * kNumLanes], &in[5 * OutStride]); 1111 hwy::CopyBytes<kNumBytes>(&buf1[23 * kNumLanes], &in[58 * OutStride]); 1112 hwy::CopyBytes<kNumBytes>(&buf0[24 * kNumLanes], &in[6 * OutStride]); 1113 hwy::CopyBytes<kNumBytes>(&buf1[39 * kNumLanes], &in[57 * OutStride]); 1114 hwy::CopyBytes<kNumBytes>(&buf1[56 * kNumLanes], &in[7 * OutStride]); 1115 hwy::CopyBytes<kNumBytes>(&buf1[7 * kNumLanes], &in[56 * OutStride]); 1116 hwy::CopyBytes<kNumBytes>(&buf0[4 * kNumLanes], &in[8 * OutStride]); 1117 hwy::CopyBytes<kNumBytes>(&buf1[59 * kNumLanes], &in[55 * OutStride]); 1118 hwy::CopyBytes<kNumBytes>(&buf1[36 * kNumLanes], &in[9 * OutStride]); 1119 hwy::CopyBytes<kNumBytes>(&buf1[27 * kNumLanes], &in[54 * OutStride]); 1120 hwy::CopyBytes<kNumBytes>(&buf0[20 * kNumLanes], &in[10 * OutStride]); 1121 hwy::CopyBytes<kNumBytes>(&buf1[43 * kNumLanes], &in[53 * OutStride]); 1122 hwy::CopyBytes<kNumBytes>(&buf1[52 * kNumLanes], &in[11 * OutStride]); 1123 hwy::CopyBytes<kNumBytes>(&buf1[11 * kNumLanes], &in[52 * OutStride]); 1124 hwy::CopyBytes<kNumBytes>(&buf1[12 * kNumLanes], &in[12 * OutStride]); 1125 hwy::CopyBytes<kNumBytes>(&buf1[51 * kNumLanes], &in[51 * OutStride]); 1126 hwy::CopyBytes<kNumBytes>(&buf1[44 * kNumLanes], &in[13 * OutStride]); 1127 hwy::CopyBytes<kNumBytes>(&buf1[19 * kNumLanes], &in[50 * OutStride]); 1128 hwy::CopyBytes<kNumBytes>(&buf0[28 * kNumLanes], &in[14 * OutStride]); 1129 hwy::CopyBytes<kNumBytes>(&buf1[35 * kNumLanes], &in[49 * OutStride]); 1130 hwy::CopyBytes<kNumBytes>(&buf1[60 * kNumLanes], &in[15 * OutStride]); 1131 hwy::CopyBytes<kNumBytes>(&buf1[3 
* kNumLanes], &in[48 * OutStride]); 1132 hwy::CopyBytes<kNumBytes>(&buf1[2 * kNumLanes], &in[16 * OutStride]); 1133 hwy::CopyBytes<kNumBytes>(&buf1[61 * kNumLanes], &in[47 * OutStride]); 1134 hwy::CopyBytes<kNumBytes>(&buf1[34 * kNumLanes], &in[17 * OutStride]); 1135 hwy::CopyBytes<kNumBytes>(&buf1[29 * kNumLanes], &in[46 * OutStride]); 1136 hwy::CopyBytes<kNumBytes>(&buf0[18 * kNumLanes], &in[18 * OutStride]); 1137 hwy::CopyBytes<kNumBytes>(&buf1[45 * kNumLanes], &in[45 * OutStride]); 1138 hwy::CopyBytes<kNumBytes>(&buf1[50 * kNumLanes], &in[19 * OutStride]); 1139 hwy::CopyBytes<kNumBytes>(&buf1[13 * kNumLanes], &in[44 * OutStride]); 1140 hwy::CopyBytes<kNumBytes>(&buf1[10 * kNumLanes], &in[20 * OutStride]); 1141 hwy::CopyBytes<kNumBytes>(&buf1[53 * kNumLanes], &in[43 * OutStride]); 1142 hwy::CopyBytes<kNumBytes>(&buf1[42 * kNumLanes], &in[21 * OutStride]); 1143 hwy::CopyBytes<kNumBytes>(&buf1[21 * kNumLanes], &in[42 * OutStride]); 1144 hwy::CopyBytes<kNumBytes>(&buf0[26 * kNumLanes], &in[22 * OutStride]); 1145 hwy::CopyBytes<kNumBytes>(&buf1[37 * kNumLanes], &in[41 * OutStride]); 1146 hwy::CopyBytes<kNumBytes>(&buf1[58 * kNumLanes], &in[23 * OutStride]); 1147 hwy::CopyBytes<kNumBytes>(&buf1[5 * kNumLanes], &in[40 * OutStride]); 1148 hwy::CopyBytes<kNumBytes>(&buf0[6 * kNumLanes], &in[24 * OutStride]); 1149 hwy::CopyBytes<kNumBytes>(&buf1[57 * kNumLanes], &in[39 * OutStride]); 1150 hwy::CopyBytes<kNumBytes>(&buf1[38 * kNumLanes], &in[25 * OutStride]); 1151 hwy::CopyBytes<kNumBytes>(&buf1[25 * kNumLanes], &in[38 * OutStride]); 1152 hwy::CopyBytes<kNumBytes>(&buf0[22 * kNumLanes], &in[26 * OutStride]); 1153 hwy::CopyBytes<kNumBytes>(&buf1[41 * kNumLanes], &in[37 * OutStride]); 1154 hwy::CopyBytes<kNumBytes>(&buf1[54 * kNumLanes], &in[27 * OutStride]); 1155 hwy::CopyBytes<kNumBytes>(&buf1[9 * kNumLanes], &in[36 * OutStride]); 1156 hwy::CopyBytes<kNumBytes>(&buf1[14 * kNumLanes], &in[28 * OutStride]); 1157 hwy::CopyBytes<kNumBytes>(&buf1[49 * kNumLanes], &in[35 * 
OutStride]); 1158 hwy::CopyBytes<kNumBytes>(&buf1[46 * kNumLanes], &in[29 * OutStride]); 1159 hwy::CopyBytes<kNumBytes>(&buf1[17 * kNumLanes], &in[34 * OutStride]); 1160 hwy::CopyBytes<kNumBytes>(&buf0[30 * kNumLanes], &in[30 * OutStride]); 1161 hwy::CopyBytes<kNumBytes>(&buf1[33 * kNumLanes], &in[33 * OutStride]); 1162 hwy::CopyBytes<kNumBytes>(&buf1[62 * kNumLanes], &in[31 * OutStride]); 1163 hwy::CopyBytes<kNumBytes>(&buf1[1 * kNumLanes], &in[32 * OutStride]); 1164 } 1165 1166 template <size_t LaneSize, size_t NumLanes> 1167 struct Fadst4Traits { 1168 template <size_t Width, typename D> 1169 HWY_ATTR HWY_INLINE static void Fadst4(D int_tag, 1170 hn::TFromD<D> *HWY_RESTRICT in, 1171 const int8_t cos_bit, 1172 const size_t instride) { 1173 const int32_t *HWY_RESTRICT const sinpi = sinpi_arr(cos_bit); 1174 const auto round = hn::Set(int_tag, 1 << (cos_bit - 1)); 1175 const auto sinpi1 = hn::Set(int_tag, sinpi[1]); 1176 const auto sinpi2 = hn::Set(int_tag, sinpi[2]); 1177 const auto sinpi3 = hn::Set(int_tag, sinpi[3]); 1178 const auto sinpi4 = hn::Set(int_tag, sinpi[4]); 1179 const auto in0 = hn::Load(int_tag, &in[0 * instride]); 1180 const auto in1 = hn::Load(int_tag, &in[1 * instride]); 1181 const auto in2 = hn::Load(int_tag, &in[2 * instride]); 1182 const auto in3 = hn::Load(int_tag, &in[3 * instride]); 1183 auto s0 = hn::Mul(in0, sinpi1); 1184 auto s1 = hn::Mul(in0, sinpi4); 1185 auto s2 = hn::Mul(in1, sinpi2); 1186 auto s3 = hn::Mul(in1, sinpi1); 1187 auto s4 = hn::Mul(in2, sinpi3); 1188 auto s5 = hn::Mul(in3, sinpi4); 1189 auto s6 = hn::Mul(in3, sinpi2); 1190 auto s7 = hn::Sub(hn::Add(in0, in1), in3); 1191 auto x0 = hn::Add(hn::Add(s0, s2), s5); 1192 auto x1 = hn::Mul(s7, sinpi3); 1193 auto x2 = hn::Add(hn::Sub(s1, s3), s6); 1194 auto x3 = s4; 1195 s0 = hn::Add(x0, x3); 1196 s1 = x1; 1197 s2 = hn::Sub(x2, x3); 1198 s3 = hn::Add(hn::Sub(x2, x0), x3); 1199 auto u0 = hn::Add(s0, round); 1200 u0 = hn::ShiftRightSame(u0, cos_bit); 1201 auto u1 = hn::Add(s1, round); 
1202 u1 = hn::ShiftRightSame(u1, cos_bit); 1203 auto u2 = hn::Add(s2, round); 1204 u2 = hn::ShiftRightSame(u2, cos_bit); 1205 auto u3 = hn::Add(s3, round); 1206 u3 = hn::ShiftRightSame(u3, cos_bit); 1207 hn::Store(u0, int_tag, &in[0 * instride]); 1208 hn::Store(u1, int_tag, &in[1 * instride]); 1209 hn::Store(u2, int_tag, &in[2 * instride]); 1210 hn::Store(u3, int_tag, &in[3 * instride]); 1211 } 1212 }; 1213 1214 template <> 1215 struct Fadst4Traits<2, 4> { 1216 template <size_t Width, typename D> 1217 HWY_ATTR HWY_INLINE static void Fadst4(D int_tag, 1218 hn::TFromD<D> *HWY_RESTRICT in, 1219 const int8_t cos_bit, 1220 const size_t instride) { 1221 (void)int_tag; 1222 const int32_t *HWY_RESTRICT const sinpi = sinpi_arr(cos_bit); 1223 constexpr hn::FixedTag<hn::TFromD<D>, 8> demote_tag; 1224 constexpr hn::RepartitionToWide<decltype(demote_tag)> int32_tag; 1225 const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1)); 1226 const auto sinpi_p01_p02 = SetPair(demote_tag, sinpi[1], sinpi[2]); 1227 const auto sinpi_p04_m01 = SetPair(demote_tag, sinpi[4], -sinpi[1]); 1228 const auto sinpi_p03_p04 = SetPair(demote_tag, sinpi[3], sinpi[4]); 1229 const auto sinpi_m03_p02 = SetPair(demote_tag, -sinpi[3], sinpi[2]); 1230 const auto sinpi_p03_p03 = hn::Set(demote_tag, sinpi[3]); 1231 const auto in0 = hn::Load(demote_tag, &in[0 * instride]); 1232 const auto in1 = hn::Load(demote_tag, &in[1 * instride]); 1233 const auto in2 = hn::Load(demote_tag, &in[2 * instride]); 1234 const auto in3 = hn::Load(demote_tag, &in[3 * instride]); 1235 const auto in7 = hn::Add(in0, in1); 1236 auto u0 = hn::InterleaveLower(in0, in1); 1237 auto u1 = hn::InterleaveLower(in2, in3); 1238 auto u2 = hn::InterleaveLower(in7, hn::Zero(demote_tag)); 1239 auto u3 = hn::InterleaveLower(in2, hn::Zero(demote_tag)); 1240 auto u4 = hn::InterleaveLower(in3, hn::Zero(demote_tag)); 1241 auto v0 = hn::WidenMulPairwiseAdd(int32_tag, u0, sinpi_p01_p02); // s0 + s2 1242 auto v1 = hn::WidenMulPairwiseAdd(int32_tag, u1, 
sinpi_p03_p04); // s4 + s5 1243 auto v2 = hn::WidenMulPairwiseAdd(int32_tag, u2, sinpi_p03_p03); // x1 1244 auto v3 = hn::WidenMulPairwiseAdd(int32_tag, u0, sinpi_p04_m01); // s1 - s3 1245 auto v4 = 1246 hn::WidenMulPairwiseAdd(int32_tag, u1, sinpi_m03_p02); // -s4 + s6 1247 auto v5 = hn::WidenMulPairwiseAdd(int32_tag, u3, sinpi_p03_p03); // s4 1248 auto v6 = hn::WidenMulPairwiseAdd(int32_tag, u4, sinpi_p03_p03); 1249 auto w0 = hn::Add(v0, v1); 1250 auto w1 = hn::Sub(v2, v6); 1251 auto w2 = hn::Add(v3, v4); 1252 auto w3 = hn::Sub(w2, w0); 1253 auto w4 = hn::ShiftLeft<2>(v5); 1254 auto w5 = hn::Sub(w4, v5); 1255 auto w6 = hn::Add(w3, w5); 1256 v0 = hn::Add(w0, round); 1257 v1 = hn::Add(w1, round); 1258 v2 = hn::Add(w2, round); 1259 v3 = hn::Add(w6, round); 1260 w0 = hn::ShiftRightSame(v0, cos_bit); 1261 w1 = hn::ShiftRightSame(v1, cos_bit); 1262 w2 = hn::ShiftRightSame(v2, cos_bit); 1263 w3 = hn::ShiftRightSame(v3, cos_bit); 1264 auto o0 = hn::ReorderDemote2To(demote_tag, w0, w2); 1265 auto o1 = hn::ReorderDemote2To(demote_tag, w1, w3); 1266 hn::Store(o0, demote_tag, &in[0 * instride]); 1267 hn::Store(o1, demote_tag, &in[1 * instride]); 1268 hn::Store(hn::ShiftRightLanes<4>(demote_tag, o0), demote_tag, 1269 &in[2 * instride]); 1270 hn::Store(hn::ShiftRightLanes<4>(demote_tag, o1), demote_tag, 1271 &in[3 * instride]); 1272 } 1273 }; 1274 1275 template <size_t NumLanes> 1276 struct Fadst4Traits<2, NumLanes> { 1277 template <size_t Width, typename D> 1278 HWY_ATTR HWY_INLINE static void Fadst4(D int_tag, 1279 hn::TFromD<D> *HWY_RESTRICT in, 1280 const int8_t cos_bit, 1281 const size_t instride) { 1282 const int32_t *HWY_RESTRICT const sinpi = sinpi_arr(cos_bit); 1283 constexpr hn::RepartitionToWide<D> int32_tag; 1284 const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1)); 1285 const auto sinpi_p01_p02 = SetPair(int_tag, sinpi[1], sinpi[2]); 1286 const auto sinpi_p04_m01 = SetPair(int_tag, sinpi[4], -sinpi[1]); 1287 const auto sinpi_p03_p04 = SetPair(int_tag, 
sinpi[3], sinpi[4]); 1288 const auto sinpi_m03_p02 = SetPair(int_tag, -sinpi[3], sinpi[2]); 1289 const auto sinpi_p03_p03 = hn::Set(int_tag, sinpi[3]); 1290 const auto in0 = hn::Load(int_tag, &in[0 * instride]); 1291 const auto in1 = hn::Load(int_tag, &in[1 * instride]); 1292 const auto in2 = hn::Load(int_tag, &in[2 * instride]); 1293 const auto in3 = hn::Load(int_tag, &in[3 * instride]); 1294 const auto in7 = hn::Add(in0, in1); 1295 auto ul0 = hn::InterleaveLower(int_tag, in0, in1); 1296 auto uh0 = hn::InterleaveUpper(int_tag, in0, in1); 1297 auto ul1 = hn::InterleaveLower(int_tag, in2, in3); 1298 auto uh1 = hn::InterleaveUpper(int_tag, in2, in3); 1299 auto ul2 = hn::InterleaveLower(int_tag, in7, hn::Zero(int_tag)); 1300 auto uh2 = hn::InterleaveUpper(int_tag, in7, hn::Zero(int_tag)); 1301 auto ul3 = hn::InterleaveLower(int_tag, in2, hn::Zero(int_tag)); 1302 auto uh3 = hn::InterleaveUpper(int_tag, in2, hn::Zero(int_tag)); 1303 auto ul4 = hn::InterleaveLower(int_tag, in3, hn::Zero(int_tag)); 1304 auto uh4 = hn::InterleaveUpper(int_tag, in3, hn::Zero(int_tag)); 1305 auto vl0 = 1306 hn::WidenMulPairwiseAdd(int32_tag, ul0, sinpi_p01_p02); // s0 + s2 1307 auto vh0 = 1308 hn::WidenMulPairwiseAdd(int32_tag, uh0, sinpi_p01_p02); // s0 + s2 1309 auto vl1 = 1310 hn::WidenMulPairwiseAdd(int32_tag, ul1, sinpi_p03_p04); // s4 + s5 1311 auto vh1 = 1312 hn::WidenMulPairwiseAdd(int32_tag, uh1, sinpi_p03_p04); // s4 + s5 1313 auto vl2 = hn::WidenMulPairwiseAdd(int32_tag, ul2, sinpi_p03_p03); // x1 1314 auto vh2 = hn::WidenMulPairwiseAdd(int32_tag, uh2, sinpi_p03_p03); // x1 1315 auto vl3 = 1316 hn::WidenMulPairwiseAdd(int32_tag, ul0, sinpi_p04_m01); // s1 - s3 1317 auto vh3 = 1318 hn::WidenMulPairwiseAdd(int32_tag, uh0, sinpi_p04_m01); // s1 - s3 1319 auto vl4 = 1320 hn::WidenMulPairwiseAdd(int32_tag, ul1, sinpi_m03_p02); // -s4 + s6 1321 auto vh4 = 1322 hn::WidenMulPairwiseAdd(int32_tag, uh1, sinpi_m03_p02); // -s4 + s6 1323 auto vl5 = hn::WidenMulPairwiseAdd(int32_tag, ul3, 
sinpi_p03_p03); // s4 1324 auto vh5 = hn::WidenMulPairwiseAdd(int32_tag, uh3, sinpi_p03_p03); // s4 1325 auto vl6 = hn::WidenMulPairwiseAdd(int32_tag, ul4, sinpi_p03_p03); 1326 auto vh6 = hn::WidenMulPairwiseAdd(int32_tag, uh4, sinpi_p03_p03); 1327 auto wl0 = hn::Add(vl0, vl1); 1328 auto wh0 = hn::Add(vh0, vh1); 1329 auto wl1 = hn::Sub(vl2, vl6); 1330 auto wh1 = hn::Sub(vh2, vh6); 1331 auto wl2 = hn::Add(vl3, vl4); 1332 auto wh2 = hn::Add(vh3, vh4); 1333 auto wl3 = hn::Sub(wl2, wl0); 1334 auto wh3 = hn::Sub(wh2, wh0); 1335 auto wl4 = hn::ShiftLeft<2>(vl5); 1336 auto wh4 = hn::ShiftLeft<2>(vh5); 1337 auto wl5 = hn::Sub(wl4, vl5); 1338 auto wh5 = hn::Sub(wh4, vh5); 1339 auto wl6 = hn::Add(wl3, wl5); 1340 auto wh6 = hn::Add(wh3, wh5); 1341 vl0 = hn::Add(wl0, round); 1342 vh0 = hn::Add(wh0, round); 1343 vl1 = hn::Add(wl1, round); 1344 vh1 = hn::Add(wh1, round); 1345 vl2 = hn::Add(wl2, round); 1346 vh2 = hn::Add(wh2, round); 1347 vl3 = hn::Add(wl6, round); 1348 vh3 = hn::Add(wh6, round); 1349 wl0 = hn::ShiftRightSame(vl0, cos_bit); 1350 wh0 = hn::ShiftRightSame(vh0, cos_bit); 1351 wl1 = hn::ShiftRightSame(vl1, cos_bit); 1352 wh1 = hn::ShiftRightSame(vh1, cos_bit); 1353 wl2 = hn::ShiftRightSame(vl2, cos_bit); 1354 wh2 = hn::ShiftRightSame(vh2, cos_bit); 1355 wl3 = hn::ShiftRightSame(vl3, cos_bit); 1356 wh3 = hn::ShiftRightSame(vh3, cos_bit); 1357 auto o0 = hn::ReorderDemote2To(int_tag, wl0, wh0); 1358 auto o1 = hn::ReorderDemote2To(int_tag, wl1, wh1); 1359 auto o2 = hn::ReorderDemote2To(int_tag, wl2, wh2); 1360 auto o3 = hn::ReorderDemote2To(int_tag, wl3, wh3); 1361 hn::Store(o0, int_tag, &in[0 * instride]); 1362 hn::Store(o1, int_tag, &in[1 * instride]); 1363 hn::Store(o2, int_tag, &in[2 * instride]); 1364 hn::Store(o3, int_tag, &in[3 * instride]); 1365 } 1366 }; 1367 1368 template <size_t Width, typename D> 1369 HWY_ATTR HWY_INLINE void Fadst4(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, 1370 const int8_t cos_bit, const size_t instride) { 1371 
Fadst4Traits<sizeof(hn::TFromD<D>),
               hn::MaxLanes(int_tag)>::template Fadst4<Width>(int_tag, in,
                                                              cos_bit,
                                                              instride);
}

// Fadst8: in-place 8-row transform (by its name, the forward ADST-8; the
// arithmetic itself is delegated to Butterfly / HalfButterfly / AddSub, which
// are defined outside this view).
//   int_tag  - Highway tag selecting the lane type and vector width.
//   in       - rows 0..7 are read from and written back to in[k * instride],
//              one vector of MaxLanes(int_tag) lanes per row. Rows are read
//              with hn::Load, i.e. the aligned load.
//   cos_bit  - selects the cosine table through cospi_arr() and defines the
//              rounding constant 1 << (cos_bit - 1).
//   instride - distance, in elements, between consecutive rows of `in`.
// Width is not referenced in this body.
template <size_t Width, typename D>
HWY_ATTR HWY_INLINE void Fadst8(D int_tag, hn::TFromD<D> *HWY_RESTRICT in,
                                const int8_t cos_bit, const size_t instride) {
  constexpr size_t kNumLanes = hn::MaxLanes(int_tag);
  // Size in bytes of one row (one full vector) in the scratch buffers.
  constexpr size_t kNumBytes = kNumLanes * sizeof(hn::TFromD<D>);
  // Two ping-pong scratch buffers of 8 rows each; every stage reads one and
  // writes the other.
  HWY_ALIGN_MAX hn::TFromD<D> buf0[8 * kNumLanes];
  HWY_ALIGN_MAX hn::TFromD<D> buf1[8 * kNumLanes];
  const int32_t *HWY_RESTRICT cospi = cospi_arr(cos_bit);
  // Rounding term broadcast to 32-bit lanes and handed to the butterflies.
  const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1));

  // stage 0
  // stage 1
  // Permute the input rows into buf0 (order 0,7,3,4,1,6,2,5) and negate
  // rows 7, 3, 1 and 5.
  hn::Store(hn::Load(int_tag, &in[0 * instride]), int_tag,
            &buf0[0 * kNumLanes]);
  hn::Store(hn::Neg(hn::Load(int_tag, &in[7 * instride])), int_tag,
            &buf0[1 * kNumLanes]);
  hn::Store(hn::Neg(hn::Load(int_tag, &in[3 * instride])), int_tag,
            &buf0[2 * kNumLanes]);
  hn::Store(hn::Load(int_tag, &in[4 * instride]), int_tag,
            &buf0[3 * kNumLanes]);
  hn::Store(hn::Neg(hn::Load(int_tag, &in[1 * instride])), int_tag,
            &buf0[4 * kNumLanes]);
  hn::Store(hn::Load(int_tag, &in[6 * instride]), int_tag,
            &buf0[5 * kNumLanes]);
  hn::Store(hn::Load(int_tag, &in[2 * instride]), int_tag,
            &buf0[6 * kNumLanes]);
  hn::Store(hn::Neg(hn::Load(int_tag, &in[5 * instride])), int_tag,
            &buf0[7 * kNumLanes]);

  // stage 2
  // Rows 0-1 and 4-5 pass through unchanged; row pairs (2,3) and (6,7) go
  // through a cospi[32]/cospi[32] Butterfly.
  hwy::CopyBytes<2 * kNumBytes>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]);
  Butterfly(int_tag, cospi[32], cospi[32], &buf0[2 * kNumLanes],
            &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], &buf1[3 * kNumLanes],
            cos_bit, round);
  hwy::CopyBytes<2 * kNumBytes>(&buf0[4 * kNumLanes], &buf1[4 * kNumLanes]);
  Butterfly(int_tag, cospi[32], cospi[32], &buf0[6 * kNumLanes],
            &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], &buf1[7 * kNumLanes],
            cos_bit, round);

  // stage 3
  // Within each group of four rows, AddSub combines rows k and k + 2
  // (presumably sum into the first output, difference into the second;
  // confirm against AddSub's definition).
  for (size_t j = 0; j < 8; j += 4) {
    for (size_t i = 0; i < 2; ++i) {
      AddSub(int_tag, &buf1[(0 + i + j) * kNumLanes],
             &buf1[(2 + i + j) * kNumLanes], &buf0[(0 + i + j) * kNumLanes],
             &buf0[(2 + i + j) * kNumLanes]);
    }
  }

  // stage 4
  // Rows 0-3 pass through; rows 4-7 are rotated pairwise with the
  // cospi[16]/cospi[48] weights.
  hwy::CopyBytes<4 * kNumBytes>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]);
  HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[4 * kNumLanes],
                &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[48], -cospi[16], &buf0[4 * kNumLanes],
                &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, -cospi[48], cospi[16], &buf0[6 * kNumLanes],
                &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[6 * kNumLanes],
                &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round);

  // stage 5
  // AddSub combines row k with row k + 4.
  for (size_t i = 0; i < 4; ++i) {
    AddSub(int_tag, &buf1[(0 + i) * kNumLanes], &buf1[(4 + i) * kNumLanes],
           &buf0[(0 + i) * kNumLanes], &buf0[(4 + i) * kNumLanes]);
  }

  // stage 6
  // Final rotations: each adjacent row pair produces two outputs, one
  // HalfButterfly call per output row.
  HalfButterfly(int_tag, cospi[4], cospi[60], &buf0[0 * kNumLanes],
                &buf0[1 * kNumLanes], &buf1[0 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[60], -cospi[4], &buf0[0 * kNumLanes],
                &buf0[1 * kNumLanes], &buf1[1 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[20], cospi[44], &buf0[2 * kNumLanes],
                &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[44], -cospi[20], &buf0[2 * kNumLanes],
                &buf0[3 * kNumLanes], &buf1[3 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[36], cospi[28], &buf0[4 * kNumLanes],
                &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[28], -cospi[36], &buf0[4 * kNumLanes],
                &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[52], cospi[12], &buf0[6 * kNumLanes],
                &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[12], -cospi[52], &buf0[6 * kNumLanes],
                &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round);

  // stage 7
  // Write the results back over the input in the output order
  // 1,6,3,4,5,2,7,0.
  hwy::CopyBytes<kNumBytes>(&buf1[1 * kNumLanes], &in[0 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[6 * kNumLanes], &in[1 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[3 * kNumLanes], &in[2 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[4 * kNumLanes], &in[3 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[5 * kNumLanes], &in[4 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[2 * kNumLanes], &in[5 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[7 * kNumLanes], &in[6 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[0 * kNumLanes], &in[7 * instride]);
}

// Fadst16: in-place 16-row counterpart of Fadst8 (by its name, the forward
// ADST-16). Same parameters and conventions as Fadst8: rows live at
// in[k * instride], k = 0..15, one vector per row, and are overwritten with
// the result. Width is not referenced in this body.
template <size_t Width, typename D>
HWY_ATTR HWY_INLINE void Fadst16(D int_tag, hn::TFromD<D> *HWY_RESTRICT in,
                                 const int8_t cos_bit, const size_t instride) {
  constexpr size_t kNumLanes = hn::MaxLanes(int_tag);
  // Size in bytes of one row (one full vector) in the scratch buffers.
  constexpr size_t kNumBytes = kNumLanes * sizeof(hn::TFromD<D>);
  // Ping-pong scratch buffers, 16 rows each.
  HWY_ALIGN_MAX hn::TFromD<D> buf0[16 * kNumLanes];
  HWY_ALIGN_MAX hn::TFromD<D> buf1[16 * kNumLanes];
  const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit);
  // Rounding term broadcast to 32-bit lanes and handed to the butterflies.
  const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1));

  // stage 0
  // stage 1
  // Permute the input rows into buf0 and negate rows 15, 7, 3, 11, 1, 9, 13
  // and 5.
  hn::Store(hn::Load(int_tag, &in[0 * instride]), int_tag,
            &buf0[0 * kNumLanes]);
  hn::Store(hn::Neg(hn::Load(int_tag, &in[15 * instride])), int_tag,
            &buf0[1 * kNumLanes]);
  hn::Store(hn::Neg(hn::Load(int_tag, &in[7 * instride])), int_tag,
            &buf0[2 * kNumLanes]);
  hn::Store(hn::Load(int_tag, &in[8 * instride]), int_tag,
            &buf0[3 * kNumLanes]);
  hn::Store(hn::Neg(hn::Load(int_tag, &in[3 * instride])), int_tag,
            &buf0[4 * kNumLanes]);
  hn::Store(hn::Load(int_tag, &in[12 * instride]), int_tag,
            &buf0[5 * kNumLanes]);
  hn::Store(hn::Load(int_tag, &in[4 * instride]), int_tag,
            &buf0[6 * kNumLanes]);
  hn::Store(hn::Neg(hn::Load(int_tag, &in[11 * instride])), int_tag,
            &buf0[7 * kNumLanes]);
  hn::Store(hn::Neg(hn::Load(int_tag, &in[1 * instride])), int_tag,
            &buf0[8 * kNumLanes]);
  hn::Store(hn::Load(int_tag, &in[14 * instride]), int_tag,
            &buf0[9 * kNumLanes]);
  hn::Store(hn::Load(int_tag, &in[6 * instride]), int_tag,
            &buf0[10 * kNumLanes]);
  hn::Store(hn::Neg(hn::Load(int_tag, &in[9 * instride])), int_tag,
            &buf0[11 * kNumLanes]);
  hn::Store(hn::Load(int_tag, &in[2 * instride]), int_tag,
            &buf0[12 * kNumLanes]);
  hn::Store(hn::Neg(hn::Load(int_tag, &in[13 * instride])), int_tag,
            &buf0[13 * kNumLanes]);
  hn::Store(hn::Neg(hn::Load(int_tag, &in[5 * instride])), int_tag,
            &buf0[14 * kNumLanes]);
  hn::Store(hn::Load(int_tag, &in[10 * instride]), int_tag,
            &buf0[15 * kNumLanes]);

  // stage 2
  // In every group of four rows the first two pass through and the last two
  // go through a cospi[32]/cospi[32] Butterfly.
  hwy::CopyBytes<kNumBytes * 2>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]);
  Butterfly(int_tag, cospi[32], cospi[32], &buf0[2 * kNumLanes],
            &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], &buf1[3 * kNumLanes],
            cos_bit, round);
  hwy::CopyBytes<kNumBytes * 2>(&buf0[4 * kNumLanes], &buf1[4 * kNumLanes]);
  Butterfly(int_tag, cospi[32], cospi[32], &buf0[6 * kNumLanes],
            &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], &buf1[7 * kNumLanes],
            cos_bit, round);
  hwy::CopyBytes<kNumBytes * 2>(&buf0[8 * kNumLanes], &buf1[8 * kNumLanes]);
  Butterfly(int_tag, cospi[32], cospi[32], &buf0[10 * kNumLanes],
            &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], &buf1[11 * kNumLanes],
            cos_bit, round);
  hwy::CopyBytes<kNumBytes * 2>(&buf0[12 * kNumLanes], &buf1[12 * kNumLanes]);
  Butterfly(int_tag, cospi[32], cospi[32], &buf0[14 * kNumLanes],
            &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], &buf1[15 * kNumLanes],
            cos_bit, round);

  // stage 3
  // Within each group of four rows, AddSub combines rows k and k + 2.
  for (size_t j = 0; j < 16; j += 4) {
    for (size_t i = 0; i < 2; ++i) {
      AddSub(int_tag, &buf1[(0 + i + j) * kNumLanes],
             &buf1[(2 + i + j) * kNumLanes], &buf0[(0 + i + j) * kNumLanes],
             &buf0[(2 + i + j) * kNumLanes]);
    }
  }

  // stage 4
  // In each half (rows 0-7 and 8-15) the first four rows pass through and
  // the last four are rotated pairwise with the cospi[16]/cospi[48] weights.
  hwy::CopyBytes<kNumBytes * 4>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]);
  HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[4 * kNumLanes],
                &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[48], -cospi[16], &buf0[4 * kNumLanes],
                &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, -cospi[48], cospi[16], &buf0[6 * kNumLanes],
                &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[6 * kNumLanes],
                &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round);
  hwy::CopyBytes<kNumBytes * 4>(&buf0[8 * kNumLanes], &buf1[8 * kNumLanes]);
  HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[12 * kNumLanes],
                &buf0[13 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[48], -cospi[16], &buf0[12 * kNumLanes],
                &buf0[13 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, -cospi[48], cospi[16], &buf0[14 * kNumLanes],
                &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[14 * kNumLanes],
                &buf0[15 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round);

  // stage 5
  // Within each half, AddSub combines rows k and k + 4.
  for (size_t j = 0; j < 16; j += 8) {
    for (size_t i = 0; i < 4; ++i) {
      AddSub(int_tag, &buf1[(0 + i + j) * kNumLanes],
             &buf1[(4 + i + j) * kNumLanes], &buf0[(0 + i + j) * kNumLanes],
             &buf0[(4 + i + j) * kNumLanes]);
    }
  }

  // stage 6
  // Rows 0-7 pass through; rows 8-15 are rotated pairwise with the
  // cospi[8]/cospi[56] and cospi[40]/cospi[24] weights.
  hwy::CopyBytes<kNumBytes * 8>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]);
  HalfButterfly(int_tag, cospi[8], cospi[56], &buf0[8 * kNumLanes],
                &buf0[9 * kNumLanes], &buf1[8 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[56], -cospi[8], &buf0[8 * kNumLanes],
                &buf0[9 * kNumLanes], &buf1[9 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[40], cospi[24], &buf0[10 * kNumLanes],
                &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[24], -cospi[40], &buf0[10 * kNumLanes],
                &buf0[11 * kNumLanes], &buf1[11 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, -cospi[56], cospi[8], &buf0[12 * kNumLanes],
                &buf0[13 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[8], cospi[56], &buf0[12 * kNumLanes],
                &buf0[13 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, -cospi[24], cospi[40], &buf0[14 * kNumLanes],
                &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[40], cospi[24], &buf0[14 * kNumLanes],
                &buf0[15 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round);

  // stage 7
  // AddSub combines row k with row k + 8.
  for (size_t i = 0; i < 8; ++i) {
    AddSub(int_tag, &buf1[(0 + i) * kNumLanes], &buf1[(8 + i) * kNumLanes],
           &buf0[(0 + i) * kNumLanes], &buf0[(8 + i) * kNumLanes]);
  }

  // stage 8
  // Final rotations: each adjacent row pair produces two outputs, one
  // HalfButterfly call per output row.
  HalfButterfly(int_tag, cospi[2], cospi[62], &buf0[0 * kNumLanes],
                &buf0[1 * kNumLanes], &buf1[0 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[62], -cospi[2], &buf0[0 * kNumLanes],
                &buf0[1 * kNumLanes], &buf1[1 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[10], cospi[54], &buf0[2 * kNumLanes],
                &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[54], -cospi[10], &buf0[2 * kNumLanes],
                &buf0[3 * kNumLanes], &buf1[3 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[18], cospi[46], &buf0[4 * kNumLanes],
                &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[46], -cospi[18], &buf0[4 * kNumLanes],
                &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[26], cospi[38], &buf0[6 * kNumLanes],
                &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[38], -cospi[26], &buf0[6 * kNumLanes],
                &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[34], cospi[30], &buf0[8 * kNumLanes],
                &buf0[9 * kNumLanes], &buf1[8 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[30], -cospi[34], &buf0[8 * kNumLanes],
                &buf0[9 * kNumLanes], &buf1[9 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[42], cospi[22], &buf0[10 * kNumLanes],
                &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[22], -cospi[42], &buf0[10 * kNumLanes],
                &buf0[11 * kNumLanes], &buf1[11 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[50], cospi[14], &buf0[12 * kNumLanes],
                &buf0[13 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[14], -cospi[50], &buf0[12 * kNumLanes],
                &buf0[13 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[58], cospi[6], &buf0[14 * kNumLanes],
                &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round);
  HalfButterfly(int_tag, cospi[6], -cospi[58], &buf0[14 * kNumLanes],
                &buf0[15 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round);

  // stage 9
  // Write the results back over the input in the output order
  // 1,14,3,12,5,10,7,8,9,6,11,4,13,2,15,0.
  hwy::CopyBytes<kNumBytes>(&buf1[1 * kNumLanes], &in[0 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[14 * kNumLanes], &in[1 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[3 * kNumLanes], &in[2 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[12 * kNumLanes], &in[3 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[5 * kNumLanes], &in[4 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[10 * kNumLanes], &in[5 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[7 * kNumLanes], &in[6 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[8 * kNumLanes], &in[7 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[9 * kNumLanes], &in[8 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[6 * kNumLanes], &in[9 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[11 * kNumLanes], &in[10 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[4 * kNumLanes], &in[11 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[13 * kNumLanes], &in[12 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[2 * kNumLanes], &in[13 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[15 * kNumLanes], &in[14 * instride]);
  hwy::CopyBytes<kNumBytes>(&buf1[0 * kNumLanes], &in[15 * instride]);
}

// IdtxAdd2: doubles Width consecutive elements of `in` in place (v + v), one
// vector of MaxLanes(tag) lanes per iteration. Uses the aligned hn::Load /
// hn::Store.
template <size_t Width, typename D>
HWY_ATTR HWY_INLINE void IdtxAdd2(D tag, hn::TFromD<D> *HWY_RESTRICT in) {
  for (size_t x = 0; x < Width; x += hn::MaxLanes(tag)) {
    auto v = hn::Load(tag, &in[x]);
    hn::Store(hn::Add(v, v), tag, &in[x]);
  }
}

// IdtxShift: shifts Width consecutive elements of `in` left by the
// compile-time amount Shift, in place, one vector per iteration.
template <size_t Width, int Shift, typename D>
HWY_ATTR HWY_INLINE void IdtxShift(D tag, hn::TFromD<D> *HWY_RESTRICT in) {
  for (size_t x = 0; x < Width; x += hn::MaxLanes(tag)) {
    hn::Store(hn::ShiftLeft<Shift>(hn::Load(tag, &in[x])), tag, &in[x]);
  }
}

// PromoteScale2x16ByNewSqrt2: computes
//   (v * Scale * NewSqrt2 + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits
// in the widened lane type, returning the results in out0 and out1.
// Each input lane is paired with a constant 1 so that a single
// WidenMulPairwiseAdd against the (factor, rounding) pair performs both the
// multiply and the rounding add.
//   out0 - results for the lanes selected by InterleaveLower.
//   out1 - results for the lanes selected by InterleaveUpper.
// NOTE(review): SetPair is defined outside this view; it presumably builds a
// vector alternating its two arguments - confirm.
template <int Scale, typename D>
HWY_ATTR HWY_INLINE void PromoteScale2x16ByNewSqrt2(
    D tag, hn::VFromD<D> v, hn::VFromD<hn::RepartitionToWide<D>> &out0,
    hn::VFromD<hn::RepartitionToWide<D>> &out1) {
  constexpr hn::RepartitionToWide<D> int32_tag;
  auto one = hn::Set(tag, 1);
  auto scale_rounding = SetPair(tag, Scale * NewSqrt2, 1 << (NewSqrt2Bits - 1));
  auto a0 = hn::InterleaveLower(tag, v, one);
  auto a1 = hn::InterleaveUpper(tag, v, one);
  out0 = hn::ShiftRight<NewSqrt2Bits>(
      hn::WidenMulPairwiseAdd(int32_tag, a0, scale_rounding));
  out1 = hn::ShiftRight<NewSqrt2Bits>(
      hn::WidenMulPairwiseAdd(int32_tag, a1, scale_rounding));
}

// ScaleByNewSqrt2Traits selects an implementation of the rounded
// multiplication by Scale * NewSqrt2 based on the lane size in bytes and the
// lane count. The primary template multiplies, adds the rounding offset and
// shifts directly in the vector's own lane type.
template <size_t LaneSize, size_t NumLanes>
struct ScaleByNewSqrt2Traits {
  template <int Scale, typename D>
  HWY_ATTR HWY_INLINE static hn::VFromD<D> ScaleByNewSqrt2(D tag,
                                                           hn::VFromD<D> v) {
    auto fact = hn::Set(tag, Scale * NewSqrt2);
    auto offset = hn::Set(tag, 1 << (NewSqrt2Bits - 1));
    return hn::ShiftRight<NewSqrt2Bits>(hn::MulAdd(v, fact, offset));
  }
};

// 2-byte lanes, exactly 4 lanes: form the product in 32-bit lanes through
// WidenMulPairwiseAdd (see PromoteScale2x16ByNewSqrt2 for the pairing trick),
// then demote back to the input lane type.
template <>
struct ScaleByNewSqrt2Traits<2, 4> {
  template <int Scale, typename D>
  HWY_ATTR HWY_INLINE static hn::VFromD<D> ScaleByNewSqrt2(D tag,
                                                           hn::VFromD<D> v) {
    auto one = hn::Set(tag, 1);
    auto scale_rounding =
        SetPair(tag, Scale * NewSqrt2, 1 << (NewSqrt2Bits - 1));
    constexpr hn::Rebind<int32_t, D> int32_tag;
    auto a = hn::InterleaveLower(tag, v, one);
    auto b = hn::ShiftRight<NewSqrt2Bits>(
        hn::WidenMulPairwiseAdd(int32_tag, a, scale_rounding));
    return hn::DemoteTo(tag, b);
  }
};

// 2-byte lanes, any other lane count: widen both halves with
// PromoteScale2x16ByNewSqrt2 and recombine them with ReorderDemote2To.
template <size_t NumLanes>
struct ScaleByNewSqrt2Traits<2, NumLanes> {
  template <int Scale, typename D>
  HWY_ATTR HWY_INLINE static hn::VFromD<D> ScaleByNewSqrt2(D tag,
                                                           hn::VFromD<D> v) {
    hn::VFromD<hn::RepartitionToWide<D>> b0, b1;
    PromoteScale2x16ByNewSqrt2<Scale>(tag, v, b0, b1);
    return hn::ReorderDemote2To(tag, b0, b1);
  }
};

// ScaleByNewSqrt2: dispatches to the ScaleByNewSqrt2Traits specialization
// matching the lane size and lane count of `tag`.
template <int Scale, typename D>
HWY_ATTR HWY_INLINE hn::VFromD<D> ScaleByNewSqrt2(D tag, hn::VFromD<D> v) {
  return ScaleByNewSqrt2Traits<sizeof(hn::TFromD<D>), hn::MaxLanes(tag)>::
      template ScaleByNewSqrt2<Scale>(tag, v);
}

// IdtxSqrt2: applies ScaleByNewSqrt2<Scale> in place to Width consecutive
// elements of `in`, one vector per iteration.
template <size_t Width, int Scale, typename D>
HWY_ATTR HWY_INLINE void IdtxSqrt2(D tag, hn::TFromD<D> *HWY_RESTRICT in) {
  for (size_t x = 0; x < Width; x += hn::MaxLanes(tag)) {
    hn::Store(ScaleByNewSqrt2<Scale>(tag, hn::Load(tag, &in[x])), tag, &in[x]);
  }
}

template
<size_t Width, size_t Stride, typename T> 1732 HWY_ATTR void FdctNx4Block(T *HWY_RESTRICT in, int8_t cos_bit) { 1733 constexpr auto int_tag = hn::CappedTag<T, Width>(); 1734 for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { 1735 Fdct4(int_tag, &in[i], cos_bit, Stride); 1736 } 1737 } 1738 1739 template <size_t Width, size_t Stride, typename T> 1740 HWY_ATTR void FdctNx8Block(T *HWY_RESTRICT in, int8_t cos_bit) { 1741 constexpr auto int_tag = hn::CappedTag<T, Stride>(); 1742 for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { 1743 Fdct8(int_tag, &in[i], cos_bit, Stride); 1744 } 1745 } 1746 1747 template <size_t Width, size_t Stride, typename T> 1748 HWY_ATTR void FdctNx16Block(T *HWY_RESTRICT in, int8_t cos_bit) { 1749 constexpr auto int_tag = hn::CappedTag<T, Stride>(); 1750 for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { 1751 Fdct16(int_tag, &in[i], cos_bit, Stride); 1752 } 1753 } 1754 1755 template <size_t Width, size_t Stride, typename T> 1756 HWY_ATTR void FdctNx32Block(T *HWY_RESTRICT in, int8_t cos_bit) { 1757 constexpr auto int_tag = hn::CappedTag<T, Stride>(); 1758 for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { 1759 Fdct32(int_tag, &in[i], cos_bit, Stride); 1760 } 1761 } 1762 1763 template <size_t InWidth, size_t InStride, size_t OutWidth, size_t OutStride, 1764 typename T> 1765 HWY_ATTR void FdctNx64Block(T *HWY_RESTRICT in, int8_t cos_bit) { 1766 constexpr auto int_tag = hn::CappedTag<T, InWidth>(); 1767 for (size_t i = 0; i < OutWidth; i += hn::MaxLanes(int_tag)) { 1768 Fdct64<InStride, OutStride>(int_tag, &in[i], cos_bit); 1769 } 1770 } 1771 1772 template <size_t Width, size_t Stride, typename T> 1773 HWY_ATTR HWY_INLINE void FadstNx4Block(T *HWY_RESTRICT in, int8_t cos_bit) { 1774 constexpr auto int_tag = hn::CappedTag<T, Width>(); 1775 for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { 1776 Fadst4<Width>(int_tag, &in[i], cos_bit, Stride); 1777 } 1778 } 1779 1780 template <size_t Width, size_t Stride, 
typename T> 1781 HWY_ATTR void FadstNx8Block(T *HWY_RESTRICT in, int8_t cos_bit) { 1782 constexpr auto int_tag = hn::CappedTag<T, Stride>(); 1783 for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { 1784 Fadst8<Width>(int_tag, &in[i], cos_bit, Stride); 1785 } 1786 } 1787 1788 template <size_t Width, size_t Stride, typename T> 1789 HWY_ATTR void FadstNx16Block(T *HWY_RESTRICT in, int8_t cos_bit) { 1790 constexpr auto int_tag = hn::CappedTag<T, Stride>(); 1791 for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { 1792 Fadst16<Width>(int_tag, &in[i], cos_bit, Stride); 1793 } 1794 } 1795 1796 template <size_t Width, size_t Stride, size_t BlockHeight, typename T> 1797 HWY_ATTR void IdtxAdd2Block(T *HWY_RESTRICT in, int8_t cos_bit) { 1798 (void)cos_bit; 1799 constexpr auto int_tag = hn::CappedTag<T, Width>(); 1800 for (size_t y = 0; y < BlockHeight; ++y) { 1801 IdtxAdd2<Width>(int_tag, &in[y * Stride]); 1802 } 1803 } 1804 1805 template <size_t Width, size_t Stride, size_t BlockHeight, int Scale, 1806 typename T> 1807 HWY_ATTR void IdtxSqrt2Block(T *HWY_RESTRICT in, int8_t cos_bit) { 1808 (void)cos_bit; 1809 constexpr auto int_tag = hn::CappedTag<T, Width>(); 1810 for (size_t y = 0; y < BlockHeight; ++y) { 1811 IdtxSqrt2<Width, Scale>(int_tag, &in[y * Stride]); 1812 } 1813 } 1814 1815 template <size_t Width, size_t Stride, size_t BlockHeight, int Shift, 1816 typename T> 1817 HWY_ATTR void IdtxShiftBlock(T *HWY_RESTRICT in, int8_t cos_bit) { 1818 (void)cos_bit; 1819 constexpr auto int_tag = hn::CappedTag<T, Width>(); 1820 for (size_t y = 0; y < BlockHeight; ++y) { 1821 IdtxShift<Width, Shift>(int_tag, &in[y * Stride]); 1822 } 1823 } 1824 1825 template <typename T> 1826 void TransformFail(T *in, int8_t cos_bit) { 1827 (void)in; 1828 (void)cos_bit; 1829 assert(false && "Incorrect transform requested."); 1830 } 1831 1832 template <typename T> 1833 using Transform1D = void (*)(T *in, int8_t cos_bit); 1834 1835 template <bool PositiveOrZero> 1836 struct 
RoundShiftTraits {}; 1837 1838 template <> 1839 struct RoundShiftTraits<true> { 1840 template <int Bit, typename D> 1841 HWY_ATTR HWY_INLINE static hn::VFromD<D> Shift(D int_tag, 1842 hn::VFromD<D> value) { 1843 (void)int_tag; 1844 if CONSTEXPR_IF (Bit == 0) { 1845 return value; 1846 } else { 1847 return hn::ShiftLeft<Bit>(value); 1848 } 1849 } 1850 }; 1851 1852 template <> 1853 struct RoundShiftTraits<false> { 1854 template <int Bit, typename D> 1855 HWY_ATTR HWY_INLINE static hn::VFromD<D> Shift(D int_tag, 1856 hn::VFromD<D> value) { 1857 const auto round = hn::Set(int_tag, 1 << (-Bit - 1)); 1858 return hn::ShiftRight<-Bit>(hn::Add(value, round)); 1859 } 1860 }; 1861 1862 template <int Bit, typename D> 1863 HWY_ATTR HWY_INLINE hn::VFromD<D> RoundShift(D int_tag, hn::VFromD<D> value) { 1864 return RoundShiftTraits<(Bit >= 0)>::template Shift<Bit>(int_tag, value); 1865 } 1866 1867 template <bool ApplyRectScale, typename D> 1868 HWY_ATTR HWY_INLINE hn::VFromD<D> RectScale(D int_tag, hn::VFromD<D> v) { 1869 if CONSTEXPR_IF (ApplyRectScale) { 1870 return ScaleByNewSqrt2<1>(int_tag, v); 1871 } 1872 return v; 1873 } 1874 1875 template <bool IsSame> 1876 struct MaybePromoteTraits {}; 1877 1878 template <> 1879 struct MaybePromoteTraits<true> { 1880 template <typename VIn, typename D> 1881 HWY_ATTR HWY_INLINE static hn::VFromD<D> PromoteTo(D out_tag, VIn in) { 1882 (void)out_tag; 1883 return in; 1884 } 1885 1886 template <typename VIn, typename D> 1887 HWY_ATTR HWY_INLINE static void PromoteStore2(D int_tag, VIn v, 1888 hn::TFromD<D> *out) { 1889 hn::StoreU(v, int_tag, out); 1890 } 1891 }; 1892 1893 template <> 1894 struct MaybePromoteTraits<false> { 1895 template <typename VIn, typename D> 1896 HWY_ATTR HWY_INLINE static hn::VFromD<D> PromoteTo(D out_tag, VIn in) { 1897 return hn::PromoteTo(out_tag, in); 1898 } 1899 1900 template <typename VIn, typename TOut, typename D> 1901 HWY_ATTR HWY_INLINE static void PromoteStore2(D int_tag, VIn v, TOut *out) { 1902 (void)int_tag; 
1903 constexpr hn::Repartition<TOut, D> store_tag; 1904 hn::StoreU(hn::PromoteLowerTo(store_tag, v), store_tag, out); 1905 hn::StoreU(hn::PromoteUpperTo(store_tag, v), store_tag, 1906 out + hn::MaxLanes(store_tag)); 1907 } 1908 }; 1909 1910 template <typename VIn, typename D> 1911 HWY_ATTR HWY_INLINE hn::VFromD<D> MaybePromoteTo(D out_tag, VIn in) { 1912 return MaybePromoteTraits< 1913 std::is_same<hn::TFromD<D>, hn::TFromV<VIn>>::value>::PromoteTo(out_tag, 1914 in); 1915 } 1916 1917 template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut> 1918 HWY_ATTR HWY_INLINE void Transpose4(const TIn *HWY_RESTRICT in, 1919 TOut *HWY_RESTRICT out, size_t instride, 1920 size_t outstride) { 1921 constexpr hn::FixedTag<TIn, 4> int_tag; 1922 auto i0 = RectScale<ApplyRectScale>( 1923 int_tag, RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[0 * instride]))); 1924 auto i1 = RectScale<ApplyRectScale>( 1925 int_tag, RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[1 * instride]))); 1926 auto i2 = RectScale<ApplyRectScale>( 1927 int_tag, RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[2 * instride]))); 1928 auto i3 = RectScale<ApplyRectScale>( 1929 int_tag, RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[3 * instride]))); 1930 HWY_ALIGN_MAX TOut interleaved[16]; 1931 constexpr hn::FixedTag<TOut, 4> out_tag; 1932 hn::StoreInterleaved4(MaybePromoteTo(out_tag, i0), 1933 MaybePromoteTo(out_tag, i1), 1934 MaybePromoteTo(out_tag, i2), 1935 MaybePromoteTo(out_tag, i3), out_tag, interleaved); 1936 for (size_t i = 0; i < 4; ++i) { 1937 hwy::CopyBytes<hn::MaxLanes(int_tag) * sizeof(*out)>(&interleaved[i * 4], 1938 &out[i * outstride]); 1939 } 1940 } 1941 1942 template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut> 1943 HWY_ATTR HWY_INLINE void Transpose8(const TIn *HWY_RESTRICT in, 1944 TOut *HWY_RESTRICT out, size_t instride, 1945 size_t outstride) { 1946 constexpr hn::FixedTag<TIn, 8> int_tag; 1947 constexpr hn::Rebind<TOut, decltype(int_tag)> out_tag; 1948 // N.B. 
there isn't a StoreInterleaved8, so hand-code Transpose8. 1949 constexpr hn::RepartitionToWide<decltype(out_tag)> wide_int_tag; 1950 HWY_ALIGN_MAX hn::TFromD<decltype(wide_int_tag)> interleaved0[16]; 1951 HWY_ALIGN_MAX hn::TFromD<decltype(wide_int_tag)> interleaved1[16]; 1952 auto i0 = hn::Load(int_tag, &in[0 * instride]); 1953 auto i1 = hn::Load(int_tag, &in[1 * instride]); 1954 auto i2 = hn::Load(int_tag, &in[2 * instride]); 1955 auto i3 = hn::Load(int_tag, &in[3 * instride]); 1956 auto i4 = hn::Load(int_tag, &in[4 * instride]); 1957 auto i5 = hn::Load(int_tag, &in[5 * instride]); 1958 auto i6 = hn::Load(int_tag, &in[6 * instride]); 1959 auto i7 = hn::Load(int_tag, &in[7 * instride]); 1960 auto s0 = hn::Undefined(out_tag); 1961 auto s1 = hn::Undefined(out_tag); 1962 auto s2 = hn::Undefined(out_tag); 1963 auto s3 = hn::Undefined(out_tag); 1964 auto s4 = hn::Undefined(out_tag); 1965 auto s5 = hn::Undefined(out_tag); 1966 auto s6 = hn::Undefined(out_tag); 1967 auto s7 = hn::Undefined(out_tag); 1968 auto ip0 = MaybePromoteTo(out_tag, i0); 1969 auto ip1 = MaybePromoteTo(out_tag, i1); 1970 auto ip2 = MaybePromoteTo(out_tag, i2); 1971 auto ip3 = MaybePromoteTo(out_tag, i3); 1972 auto ip4 = MaybePromoteTo(out_tag, i4); 1973 auto ip5 = MaybePromoteTo(out_tag, i5); 1974 auto ip6 = MaybePromoteTo(out_tag, i6); 1975 auto ip7 = MaybePromoteTo(out_tag, i7); 1976 s0 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip0)); 1977 s1 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip1)); 1978 s2 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip2)); 1979 s3 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip3)); 1980 s4 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip4)); 1981 s5 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip5)); 1982 s6 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip6)); 1983 s7 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip7)); 1984 auto u0 = 
hn::ZipLower(wide_int_tag, s0, s1); 1985 auto u1 = hn::ZipUpper(wide_int_tag, s0, s1); 1986 auto u2 = hn::ZipLower(wide_int_tag, s2, s3); 1987 auto u3 = hn::ZipUpper(wide_int_tag, s2, s3); 1988 auto u4 = hn::ZipLower(wide_int_tag, s4, s5); 1989 auto u5 = hn::ZipUpper(wide_int_tag, s4, s5); 1990 auto u6 = hn::ZipLower(wide_int_tag, s6, s7); 1991 auto u7 = hn::ZipUpper(wide_int_tag, s6, s7); 1992 hn::StoreInterleaved4(u0, u2, u4, u6, wide_int_tag, interleaved0); 1993 hn::StoreInterleaved4(u1, u3, u5, u7, wide_int_tag, interleaved1); 1994 constexpr size_t kNumBytes = hn::MaxLanes(int_tag) * sizeof(*out); 1995 if CONSTEXPR_IF (sizeof(TOut) == 2) { 1996 hwy::CopyBytes<kNumBytes>(&interleaved0[0], &out[0 * outstride]); 1997 hwy::CopyBytes<kNumBytes>(&interleaved0[4], &out[1 * outstride]); 1998 hwy::CopyBytes<kNumBytes>(&interleaved0[8], &out[2 * outstride]); 1999 hwy::CopyBytes<kNumBytes>(&interleaved0[12], &out[3 * outstride]); 2000 hwy::CopyBytes<kNumBytes>(&interleaved1[0], &out[4 * outstride]); 2001 hwy::CopyBytes<kNumBytes>(&interleaved1[4], &out[5 * outstride]); 2002 hwy::CopyBytes<kNumBytes>(&interleaved1[8], &out[6 * outstride]); 2003 hwy::CopyBytes<kNumBytes>(&interleaved1[12], &out[7 * outstride]); 2004 } else { 2005 hwy::CopyBytes<kNumBytes>(&interleaved0[0], &out[0 * outstride]); 2006 hwy::CopyBytes<kNumBytes>(&interleaved0[4], &out[1 * outstride]); 2007 hwy::CopyBytes<kNumBytes>(&interleaved1[0], &out[2 * outstride]); 2008 hwy::CopyBytes<kNumBytes>(&interleaved1[4], &out[3 * outstride]); 2009 hwy::CopyBytes<kNumBytes>(&interleaved0[8], &out[4 * outstride]); 2010 hwy::CopyBytes<kNumBytes>(&interleaved0[12], &out[5 * outstride]); 2011 hwy::CopyBytes<kNumBytes>(&interleaved1[8], &out[6 * outstride]); 2012 hwy::CopyBytes<kNumBytes>(&interleaved1[12], &out[7 * outstride]); 2013 } 2014 } 2015 2016 template <typename D> 2017 HWY_ATTR HWY_INLINE hn::VFromD<D> LocalInterleaveEvenBlocks(D tag, 2018 hn::VFromD<D> a, 2019 hn::VFromD<D> b) { 2020 
static_assert(sizeof(hn::TFromD<D>) == 8, 2021 "LocalInterleaveEvenBlocks requires 64-bit lanes."); 2022 HWY_ALIGN static constexpr int64_t kIndices[] = { 0, 1, 8 + 0, 8 + 1, 2023 4, 5, 8 + 4, 8 + 5 }; 2024 auto indices = hn::SetTableIndices(tag, kIndices); 2025 return hn::TwoTablesLookupLanes(tag, a, b, indices); 2026 } 2027 2028 template <typename D> 2029 HWY_ATTR HWY_INLINE hn::VFromD<D> LocalInterleaveOddBlocks(D tag, 2030 hn::VFromD<D> a, 2031 hn::VFromD<D> b) { 2032 static_assert(sizeof(hn::TFromD<D>) == 8, 2033 "LocalInterleaveOddBlocks requires 64-bit lanes."); 2034 HWY_ALIGN static constexpr int64_t kIndices[] = { 2, 3, 8 + 2, 8 + 3, 2035 6, 7, 8 + 6, 8 + 7 }; 2036 auto indices = hn::SetTableIndices(tag, kIndices); 2037 return hn::TwoTablesLookupLanes(tag, a, b, indices); 2038 } 2039 2040 template <size_t LaneSize> 2041 struct Transpose16Traits {}; 2042 2043 template <> 2044 struct Transpose16Traits<2> { 2045 template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut> 2046 HWY_ATTR HWY_INLINE static void Transpose16(const TIn *HWY_RESTRICT in, 2047 TOut *HWY_RESTRICT out, 2048 size_t instride, 2049 size_t outstride) { 2050 constexpr hn::FixedTag<TIn, 16> int_tag; 2051 static_assert(hn::MaxLanes(int_tag) == 16, 2052 "16-bit Transpose16 requires an 16-lane int_tag"); 2053 constexpr hn::RepartitionToWide<decltype(int_tag)> wide_int_tag; 2054 constexpr hn::RepartitionToWide<decltype(wide_int_tag)> widex2_int_tag; 2055 HWY_ALIGN_MAX hn::TFromD<decltype(wide_int_tag)> 2056 y[16 * hn::MaxLanes(wide_int_tag)]; 2057 HWY_ALIGN_MAX hn::TFromD<decltype(widex2_int_tag)> 2058 z[16 * hn::MaxLanes(widex2_int_tag)]; 2059 for (size_t i = 0; i < 16; i += 2) { 2060 auto i0 = RectScale<ApplyRectScale>( 2061 int_tag, 2062 RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[(i + 0) * instride]))); 2063 auto i1 = RectScale<ApplyRectScale>( 2064 int_tag, 2065 RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[(i + 1) * instride]))); 2066 hn::Store(hn::ZipLower(wide_int_tag, i0, 
i1), wide_int_tag, 2067 &y[(i + 0) * hn::MaxLanes(wide_int_tag)]); 2068 hn::Store(hn::ZipUpper(wide_int_tag, i0, i1), wide_int_tag, 2069 &y[(i + 1) * hn::MaxLanes(wide_int_tag)]); 2070 } 2071 for (size_t i = 0; i < 16; i += 4) { 2072 for (size_t j = 0; j < 2; ++j) { 2073 auto i0 = hn::Load(wide_int_tag, 2074 &y[(i + j + 0) * hn::MaxLanes(wide_int_tag)]); 2075 auto i2 = hn::Load(wide_int_tag, 2076 &y[(i + j + 2) * hn::MaxLanes(wide_int_tag)]); 2077 hn::Store(hn::ZipLower(widex2_int_tag, i0, i2), widex2_int_tag, 2078 &z[(i + j + 0) * hn::MaxLanes(widex2_int_tag)]); 2079 hn::Store(hn::ZipUpper(widex2_int_tag, i0, i2), widex2_int_tag, 2080 &z[(i + j + 2) * hn::MaxLanes(widex2_int_tag)]); 2081 } 2082 } 2083 for (size_t i = 0; i < 16; i += 8) { 2084 for (size_t j = 0; j < 4; ++j) { 2085 auto i0 = hn::Load(widex2_int_tag, 2086 &z[(i + j + 0) * hn::MaxLanes(widex2_int_tag)]); 2087 auto i4 = hn::Load(widex2_int_tag, 2088 &z[(i + j + 4) * hn::MaxLanes(widex2_int_tag)]); 2089 hn::Store(hn::InterleaveLower(widex2_int_tag, i0, i4), widex2_int_tag, 2090 &z[(i + j + 0) * hn::MaxLanes(widex2_int_tag)]); 2091 hn::Store(hn::InterleaveUpper(widex2_int_tag, i0, i4), widex2_int_tag, 2092 &z[(i + j + 4) * hn::MaxLanes(widex2_int_tag)]); 2093 } 2094 } 2095 static constexpr size_t kStoreIndex[] = { 0, 4, 2, 6, 1, 5, 3, 7, 2096 8, 12, 10, 14, 9, 13, 11, 15 }; 2097 for (size_t j = 0; j < 8; ++j) { 2098 auto i0 = 2099 hn::Load(widex2_int_tag, &z[(j + 0) * hn::MaxLanes(widex2_int_tag)]); 2100 auto i8 = 2101 hn::Load(widex2_int_tag, &z[(j + 8) * hn::MaxLanes(widex2_int_tag)]); 2102 hn::StoreU( 2103 hn::BitCast(int_tag, hn::ConcatLowerLower(widex2_int_tag, i8, i0)), 2104 int_tag, &out[kStoreIndex[j + 0] * outstride]); 2105 hn::StoreU( 2106 hn::BitCast(int_tag, hn::ConcatUpperUpper(widex2_int_tag, i8, i0)), 2107 int_tag, &out[kStoreIndex[j + 8] * outstride]); 2108 } 2109 } 2110 }; 2111 2112 template <> 2113 struct Transpose16Traits<4> { 2114 template <int8_t Bit, bool ApplyRectScale, typename 
TIn, typename TOut> 2115 HWY_ATTR HWY_INLINE static void Transpose16(const TIn *HWY_RESTRICT in, 2116 TOut *HWY_RESTRICT out, 2117 size_t instride, 2118 size_t outstride) { 2119 constexpr hn::FixedTag<TIn, 16> int_tag; 2120 static_assert(hn::MaxLanes(int_tag) == 16, 2121 "32-bit Transpose16 requires an 16-lane int_tag"); 2122 constexpr hn::RepartitionToWide<decltype(int_tag)> wide_int_tag; 2123 HWY_ALIGN_MAX hn::TFromD<decltype(wide_int_tag)> 2124 z[16 * hn::MaxLanes(wide_int_tag)]; 2125 for (size_t i = 0; i < 16; i += 2) { 2126 auto i0 = RectScale<ApplyRectScale>( 2127 int_tag, 2128 RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[(i + 0) * instride]))); 2129 auto i1 = RectScale<ApplyRectScale>( 2130 int_tag, 2131 RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[(i + 1) * instride]))); 2132 hn::Store(hn::ZipLower(wide_int_tag, i0, i1), wide_int_tag, 2133 &z[(i + 0) * hn::MaxLanes(wide_int_tag)]); 2134 hn::Store(hn::ZipUpper(wide_int_tag, i0, i1), wide_int_tag, 2135 &z[(i + 1) * hn::MaxLanes(wide_int_tag)]); 2136 } 2137 for (size_t i = 0; i < 16; i += 4) { 2138 for (size_t j = 0; j < 2; ++j) { 2139 auto i0 = hn::Load(wide_int_tag, 2140 &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]); 2141 auto i2 = hn::Load(wide_int_tag, 2142 &z[(i + j + 2) * hn::MaxLanes(wide_int_tag)]); 2143 hn::Store(hn::InterleaveLower(wide_int_tag, i0, i2), wide_int_tag, 2144 &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]); 2145 hn::Store(hn::InterleaveUpper(wide_int_tag, i0, i2), wide_int_tag, 2146 &z[(i + j + 2) * hn::MaxLanes(wide_int_tag)]); 2147 } 2148 } 2149 for (size_t i = 0; i < 16; i += 8) { 2150 for (size_t j = 0; j < 4; ++j) { 2151 auto i0 = hn::Load(wide_int_tag, 2152 &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]); 2153 auto i4 = hn::Load(wide_int_tag, 2154 &z[(i + j + 4) * hn::MaxLanes(wide_int_tag)]); 2155 hn::Store(LocalInterleaveEvenBlocks(wide_int_tag, i0, i4), wide_int_tag, 2156 &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]); 2157 hn::Store(LocalInterleaveOddBlocks(wide_int_tag, i0, 
i4), wide_int_tag, 2158 &z[(i + j + 4) * hn::MaxLanes(wide_int_tag)]); 2159 } 2160 } 2161 static constexpr size_t kStoreIndex[] = { 0, 2, 1, 3, 4, 6, 5, 7, 2162 8, 10, 9, 11, 12, 14, 13, 15 }; 2163 for (size_t j = 0; j < 8; ++j) { 2164 auto i0 = 2165 hn::Load(wide_int_tag, &z[(j + 0) * hn::MaxLanes(wide_int_tag)]); 2166 auto i8 = 2167 hn::Load(wide_int_tag, &z[(j + 8) * hn::MaxLanes(wide_int_tag)]); 2168 hn::StoreU( 2169 hn::BitCast(int_tag, hn::ConcatLowerLower(wide_int_tag, i8, i0)), 2170 int_tag, &out[kStoreIndex[j + 0] * outstride]); 2171 hn::StoreU( 2172 hn::BitCast(int_tag, hn::ConcatUpperUpper(wide_int_tag, i8, i0)), 2173 int_tag, &out[kStoreIndex[j + 8] * outstride]); 2174 } 2175 } 2176 }; 2177 2178 template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut> 2179 HWY_ATTR HWY_INLINE void Transpose16(const TIn *HWY_RESTRICT in, 2180 TOut *HWY_RESTRICT out, size_t instride, 2181 size_t outstride) { 2182 static_assert(sizeof(TOut) == sizeof(TIn), 2183 "Transpose16 does not directly support integer promotion."); 2184 Transpose16Traits<sizeof(TIn)>::template Transpose16<Bit, ApplyRectScale>( 2185 in, out, instride, outstride); 2186 } 2187 2188 template <size_t NumLanes, bool RequiresPromotion> 2189 struct TransposeTraits {}; 2190 2191 template <> 2192 struct TransposeTraits<16, true> { 2193 template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale, 2194 typename TIn, typename TOut> 2195 HWY_ATTR HWY_INLINE static void Transpose(const TIn *HWY_RESTRICT in, 2196 TOut *HWY_RESTRICT out, 2197 size_t instride, size_t outstride) { 2198 constexpr auto int_tag = 2199 hn::CappedTag<TOut, AOMMIN(16, AOMMIN(Width, Height))>(); 2200 constexpr hn::Rebind<TIn, decltype(int_tag)> input_tag; 2201 HWY_ALIGN_MAX hn::TFromD<decltype(int_tag)> p[16 * hn::MaxLanes(int_tag)]; 2202 for (size_t r = 0; r < Height; r += 16) { 2203 for (size_t c = 0; c < Width; c += 16) { 2204 for (size_t i = 0; i < 16; ++i) { 2205 hn::Store( 2206 hn::PromoteTo(int_tag, 2207 
hn::Load(input_tag, &in[(r + i) * instride + c])), 2208 int_tag, &p[i * hn::MaxLanes(int_tag)]); 2209 } 2210 Transpose16<Bit, ApplyRectScale>(p, &out[c * outstride + r], 2211 hn::MaxLanes(int_tag), outstride); 2212 } 2213 } 2214 } 2215 }; 2216 2217 template <> 2218 struct TransposeTraits<16, false> { 2219 template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale, 2220 typename TIn, typename TOut> 2221 HWY_ATTR HWY_INLINE static void Transpose(const TIn *HWY_RESTRICT in, 2222 TOut *HWY_RESTRICT out, 2223 size_t instride, size_t outstride) { 2224 for (size_t r = 0; r < Height; r += 16) { 2225 for (size_t c = 0; c < Width; c += 16) { 2226 Transpose16<Bit, ApplyRectScale>(&in[r * instride + c], 2227 &out[c * outstride + r], instride, 2228 outstride); 2229 } 2230 } 2231 } 2232 }; 2233 2234 template <bool RequiresPromotion> 2235 struct TransposeTraits<8, RequiresPromotion> { 2236 template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale, 2237 typename TIn, typename TOut> 2238 HWY_ATTR HWY_INLINE static void Transpose(const TIn *HWY_RESTRICT in, 2239 TOut *HWY_RESTRICT out, 2240 size_t instride, size_t outstride) { 2241 for (size_t r = 0; r < Height; r += 8) { 2242 for (size_t c = 0; c < Width; c += 8) { 2243 Transpose8<Bit, ApplyRectScale>(&in[r * instride + c], 2244 &out[c * outstride + r], instride, 2245 outstride); 2246 } 2247 } 2248 } 2249 }; 2250 2251 template <bool RequiresPromotion> 2252 struct TransposeTraits<4, RequiresPromotion> { 2253 template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale, 2254 typename TIn, typename TOut> 2255 HWY_ATTR HWY_INLINE static void Transpose(const TIn *HWY_RESTRICT in, 2256 TOut *HWY_RESTRICT out, 2257 size_t instride, size_t outstride) { 2258 for (size_t r = 0; r < Height; r += 4) { 2259 for (size_t c = 0; c < Width; c += 4) { 2260 Transpose4<Bit, ApplyRectScale>(&in[r * instride + c], 2261 &out[c * outstride + r], instride, 2262 outstride); 2263 } 2264 } 2265 } 2266 }; 2267 2268 template 
<size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale, 2269 typename TIn, typename TOut> 2270 HWY_ATTR HWY_INLINE void Transpose(const TIn *HWY_RESTRICT in, 2271 TOut *HWY_RESTRICT out, size_t instride, 2272 size_t outstride) { 2273 constexpr auto int_tag = 2274 hn::CappedTag<TOut, AOMMIN(16, AOMMIN(Width, Height))>(); 2275 TransposeTraits<hn::MaxLanes(int_tag), !std::is_same<TIn, TOut>::value>:: 2276 template Transpose<Width, Height, Bit, ApplyRectScale>(in, out, instride, 2277 outstride); 2278 } 2279 2280 template <size_t Width, size_t Height, int Shift, bool ApplyRectScale, 2281 typename TIn, typename TOut> 2282 HWY_ATTR HWY_INLINE void StoreBlock(const TIn *HWY_RESTRICT in, size_t instride, 2283 TOut *HWY_RESTRICT out, size_t outstride) { 2284 constexpr hn::CappedTag<TIn, Width> load_tag; 2285 for (size_t r = 0; r < Height; ++r) { 2286 for (size_t c = 0; c < Width; c += hn::MaxLanes(load_tag)) { 2287 auto v = RectScale<ApplyRectScale>( 2288 load_tag, RoundShift<Shift>( 2289 load_tag, hn::Load(load_tag, &in[r * instride + c]))); 2290 MaybePromoteTraits<std::is_same<TIn, TOut>::value>::PromoteStore2( 2291 load_tag, v, &out[r * outstride + c]); 2292 } 2293 } 2294 } 2295 2296 template <int8_t Shift, size_t Width, bool FlipLeftRight, typename TInput, 2297 typename TIn> 2298 HWY_ATTR HWY_INLINE void LoadLine(const TInput *HWY_RESTRICT input, 2299 TIn *HWY_RESTRICT in) { 2300 constexpr hn::CappedTag<TIn, Width> store_tag; 2301 constexpr hn::Rebind<TInput, decltype(store_tag)> load_tag; 2302 for (size_t x = 0; x < Width / hn::MaxLanes(load_tag); ++x) { 2303 auto v = hn::LoadU(load_tag, &input[x * hn::MaxLanes(load_tag)]); 2304 if CONSTEXPR_IF (FlipLeftRight) { 2305 v = hn::Reverse(load_tag, v); 2306 } 2307 auto vp = MaybePromoteTo(store_tag, v); 2308 hn::Store( 2309 hn::ShiftLeft<Shift>(vp), store_tag, 2310 &in[(FlipLeftRight ? 
(Width / hn::MaxLanes(store_tag)) - x - 1 : x) * 2311 hn::MaxLanes(store_tag)]); 2312 } 2313 } 2314 2315 template <int8_t Shift, size_t Width, size_t OutStride, size_t Height, 2316 bool FlipUpDown, bool FlipLeftRight, typename TInput, typename TIn> 2317 HWY_ATTR HWY_INLINE void LoadBuffer(const TInput *HWY_RESTRICT input, 2318 TIn *HWY_RESTRICT in, size_t stride) { 2319 for (size_t y = 0; y < Height; ++y) { 2320 LoadLine<Shift, Width, FlipLeftRight>( 2321 input + y * stride, &in[(FlipUpDown ? Height - y - 1 : y) * OutStride]); 2322 } 2323 } 2324 2325 template <size_t TransformWidth, size_t BlockWidth, size_t BlockHeight, 2326 typename T> 2327 HWY_ATTR HWY_FLATTEN HWY_INLINE void Transform4(TX_TYPE_1D tx_type, T *in, 2328 int8_t cos_bit) { 2329 switch (tx_type) { 2330 case DCT_1D: FdctNx4Block<TransformWidth, BlockWidth>(in, cos_bit); break; 2331 case IDTX_1D: 2332 IdtxSqrt2Block<TransformWidth, BlockWidth, BlockHeight, 1>(in, cos_bit); 2333 break; 2334 default: FadstNx4Block<TransformWidth, BlockWidth>(in, cos_bit); break; 2335 } 2336 } 2337 2338 template <size_t TransformWidth, size_t BlockWidth, size_t BlockHeight, 2339 typename T> 2340 HWY_ATTR HWY_FLATTEN HWY_INLINE void Transform8(TX_TYPE_1D tx_type, T *in, 2341 int8_t cos_bit) { 2342 switch (tx_type) { 2343 case DCT_1D: FdctNx8Block<TransformWidth, BlockWidth>(in, cos_bit); break; 2344 case IDTX_1D: 2345 IdtxAdd2Block<TransformWidth, BlockWidth, BlockHeight>(in, cos_bit); 2346 break; 2347 default: FadstNx8Block<TransformWidth, BlockWidth>(in, cos_bit); break; 2348 } 2349 } 2350 2351 template <size_t TransformWidth, size_t BlockWidth, size_t BlockHeight, 2352 typename T> 2353 HWY_ATTR HWY_INLINE void Transform16(TX_TYPE_1D tx_type, T *in, 2354 int8_t cos_bit) { 2355 static const Transform1D<T> kTransform[] = { 2356 FdctNx16Block<TransformWidth, BlockWidth, T>, // DCT_1D 2357 FadstNx16Block<TransformWidth, BlockWidth, T>, // ADST_1D 2358 FadstNx16Block<TransformWidth, BlockWidth, T>, // FLIPADST_1D 2359 
IdtxSqrt2Block<TransformWidth, BlockWidth, BlockHeight, 2, T>, // IDTX_1D 2360 }; 2361 kTransform[tx_type](in, cos_bit); 2362 } 2363 2364 template <size_t TransformWidth, size_t BlockWidth, size_t BlockHeight, 2365 typename T> 2366 HWY_ATTR HWY_INLINE void Transform32(TX_TYPE_1D tx_type, T *in, 2367 int8_t cos_bit) { 2368 static const Transform1D<T> kTransform[] = { 2369 FdctNx32Block<TransformWidth, BlockWidth, T>, // DCT_1D 2370 TransformFail<T>, // ADST_1D 2371 TransformFail<T>, // FLIPADST_1D 2372 IdtxShiftBlock<TransformWidth, BlockWidth, BlockHeight, 2, T>, // IDTX_1D 2373 }; 2374 kTransform[tx_type](in, cos_bit); 2375 } 2376 2377 template <size_t TransformWidth, size_t BlockWidth, typename T> 2378 HWY_ATTR HWY_INLINE void TransformFull64(TX_TYPE_1D tx_type, T *in, 2379 int8_t cos_bit) { 2380 (void)tx_type; 2381 assert(tx_type == DCT_1D); 2382 FdctNx64Block<TransformWidth, BlockWidth, TransformWidth, BlockWidth>( 2383 in, cos_bit); 2384 } 2385 2386 template <size_t TransformWidth, size_t BlockWidth, size_t TransformHeight, 2387 size_t BlockHeight, typename T> 2388 HWY_ATTR HWY_INLINE void TransformBelow32(TX_TYPE_1D tx_type, T *in, 2389 int8_t cos_bit) { 2390 if CONSTEXPR_IF (TransformHeight == 4) { 2391 Transform4<TransformWidth, BlockWidth, BlockHeight>(tx_type, in, cos_bit); 2392 } else if CONSTEXPR_IF (TransformHeight == 8) { 2393 Transform8<TransformWidth, BlockWidth, BlockHeight>(tx_type, in, cos_bit); 2394 } else if CONSTEXPR_IF (TransformHeight == 16) { 2395 Transform16<TransformWidth, BlockWidth, BlockHeight>(tx_type, in, cos_bit); 2396 } else if CONSTEXPR_IF (TransformHeight == 32) { 2397 Transform32<TransformWidth, BlockWidth, BlockHeight>(tx_type, in, cos_bit); 2398 } else { 2399 assert(false && "Unsupported transform size."); 2400 } 2401 } 2402 2403 template <size_t TransformWidth, size_t BlockWidth, size_t TransformHeight, 2404 size_t BlockHeight, typename T> 2405 HWY_ATTR HWY_INLINE void RowTransform(TX_TYPE_1D tx_type, T *in, 2406 int8_t 
cos_bit) { 2407 if CONSTEXPR_IF (TransformWidth == 64 && TransformHeight == 64) { 2408 assert(tx_type == DCT_1D); 2409 // 64x64 only writes 32x32 of coefficients. 2410 FdctNx64Block<TransformWidth, BlockWidth, 32, 32>(in, cos_bit); 2411 } else if CONSTEXPR_IF (TransformHeight == 64) { 2412 TransformFull64<TransformWidth, BlockWidth>(tx_type, in, cos_bit); 2413 } else { 2414 TransformBelow32<TransformWidth, BlockWidth, TransformHeight, BlockHeight>( 2415 tx_type, in, cos_bit); 2416 } 2417 } 2418 2419 template <TX_SIZE TxSize, typename T> 2420 HWY_ATTR HWY_MAYBE_UNUSED void ForwardTransform2D(const int16_t *input, 2421 int32_t *output, 2422 size_t stride, 2423 TX_TYPE tx_type) { 2424 constexpr size_t kWidth = kTxSizeWide[TxSize]; 2425 constexpr size_t kHeight = kTxSizeHigh[TxSize]; 2426 // Ensure the storage is aligned to the architecture's block width. 2427 constexpr size_t kMinVectorSize = 2428 hn::BlockDFromD<hn::ScalableTag<T>>().MaxBytes() / sizeof(uint8_t); 2429 constexpr size_t kBlockWidth = AOMMAX(kMinVectorSize / sizeof(T), kWidth); 2430 constexpr size_t kBlockHeight = AOMMAX(kMinVectorSize / sizeof(T), kHeight); 2431 HWY_ALIGN_MAX T buf0[kBlockWidth * kBlockHeight]; 2432 constexpr bool kBigRectangle = (kBlockWidth == 64 && kBlockHeight >= 32) || 2433 (kBlockWidth >= 32 && kBlockHeight == 64); 2434 using T2 = typename std::conditional<kBigRectangle, int32_t, T>::type; 2435 HWY_ALIGN_MAX T2 buf1[kBlockWidth * kBlockHeight]; 2436 constexpr int8_t kShift[3] = { kForwardTransformShift[TxSize][0], 2437 kForwardTransformShift[TxSize][1], 2438 kForwardTransformShift[TxSize][2] }; 2439 constexpr int kTransformWidthIndex = GetTxwIndex(TxSize); 2440 constexpr int kTransformHeightIndex = GetTxhIndex(TxSize); 2441 constexpr int8_t cos_bit_col = 2442 kForwardCosBitCol[kTransformWidthIndex][kTransformHeightIndex]; 2443 constexpr int8_t cos_bit_row = 2444 kForwardCosBitRow[kTransformWidthIndex][kTransformHeightIndex]; 2445 const TX_TYPE_1D vertical_transform = 
vtx_tab[tx_type]; 2446 const TX_TYPE_1D horizontal_transform = htx_tab[tx_type]; 2447 constexpr bool kApplyRectScale = kApplyRectScaleList[TxSize]; 2448 switch ((vertical_transform == FLIPADST_1D ? 1 : 0) | 2449 (horizontal_transform == FLIPADST_1D ? 2 : 0)) { 2450 case 0: 2451 LoadBuffer<kShift[0], kWidth, kBlockWidth, kHeight, false, false>( 2452 input, buf0, stride); 2453 break; 2454 case 1: 2455 LoadBuffer<kShift[0], kWidth, kBlockWidth, kHeight, true, false>( 2456 input, buf0, stride); 2457 break; 2458 case 2: 2459 LoadBuffer<kShift[0], kWidth, kBlockWidth, kHeight, false, true>( 2460 input, buf0, stride); 2461 break; 2462 case 3: 2463 LoadBuffer<kShift[0], kWidth, kBlockWidth, kHeight, true, true>( 2464 input, buf0, stride); 2465 break; 2466 } 2467 if CONSTEXPR_IF (kHeight == 64) { 2468 TransformFull64<kWidth, kBlockWidth>(vertical_transform, buf0, cos_bit_col); 2469 } else { 2470 TransformBelow32<kWidth, kBlockWidth, kHeight, kBlockHeight>( 2471 vertical_transform, buf0, cos_bit_col); 2472 } 2473 Transpose<kWidth, kHeight, kShift[1], false>(buf0, buf1, kBlockWidth, 2474 kBlockHeight); 2475 if CONSTEXPR_IF (kWidth == 64 && kHeight == 64) { 2476 // 64x64 only writes 32x32 of coefficients. 2477 assert(tx_type == DCT_1D); 2478 FdctNx64Block<kHeight, kBlockHeight, 32, 32>(buf1, cos_bit_row); 2479 StoreBlock<32, 32, kShift[2], kApplyRectScale>(buf1, 32, output, 32); 2480 } else if CONSTEXPR_IF (kHeight == 64 && (kWidth == 16 || kWidth == 32)) { 2481 // 32x64 and 16x64 coefficients are packed into Wx32, discarding the 2482 // right-most results. 
2483 RowTransform<32, kBlockHeight, kWidth, kBlockWidth>(horizontal_transform, 2484 buf1, cos_bit_row); 2485 StoreBlock<kHeight, kWidth, kShift[2], kApplyRectScale>(buf1, kBlockHeight, 2486 output, 32); 2487 } else { 2488 RowTransform<kHeight, kBlockHeight, kWidth, kBlockWidth>( 2489 horizontal_transform, buf1, cos_bit_row); 2490 StoreBlock<kHeight, kWidth, kShift[2], kApplyRectScale>(buf1, kBlockHeight, 2491 output, kHeight); 2492 } 2493 if CONSTEXPR_IF (kHeight <= 16 && kWidth == 64) { 2494 hwy::ZeroBytes<kHeight * 32 * sizeof(*output)>(output + kHeight * 32); 2495 } 2496 } 2497 2498 HWY_MAYBE_UNUSED void LowBitdepthForwardTransform2D(const int16_t *src_diff, 2499 tran_low_t *coeff, 2500 int diff_stride, 2501 TxfmParam *txfm_param) { 2502 if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) { 2503 assert(txfm_param->tx_type == DCT_DCT); 2504 av1_fwht4x4(src_diff, coeff, diff_stride); 2505 return; 2506 } 2507 using TransformFunction = decltype(&ForwardTransform2D<TX_4X4, int16_t>); 2508 constexpr TransformFunction kTable[] = { 2509 #define POINTER(w, h, _) &ForwardTransform2D<TX_##w##X##h, int16_t>, 2510 FOR_EACH_TXFM2D(POINTER, _) 2511 #undef POINTER 2512 }; 2513 kTable[txfm_param->tx_size](src_diff, coeff, diff_stride, 2514 txfm_param->tx_type); 2515 } 2516 2517 } // namespace HWY_NAMESPACE 2518 } // namespace 2519 2520 HWY_AFTER_NAMESPACE(); 2521 2522 #define MAKE_HIGHBD_TXFM2D(w, h, suffix) \ 2523 extern "C" void av1_fwd_txfm2d_##w##x##h##_##suffix( \ 2524 const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, \ 2525 int bd); \ 2526 HWY_ATTR void av1_fwd_txfm2d_##w##x##h##_##suffix( \ 2527 const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, \ 2528 int bd) { \ 2529 (void)bd; \ 2530 HWY_NAMESPACE::ForwardTransform2D<TX_##w##X##h, int32_t>(input, output, \ 2531 stride, tx_type); \ 2532 } 2533 2534 #define MAKE_LOWBD_TXFM2D(w, h, suffix) \ 2535 extern "C" void av1_lowbd_fwd_txfm2d_##w##x##h##_##suffix( \ 2536 const int16_t 
*input, int32_t *output, int stride, TX_TYPE tx_type, \ 2537 int bd); \ 2538 HWY_ATTR void av1_lowbd_fwd_txfm2d_##w##x##h##_##suffix( \ 2539 const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, \ 2540 int bd) { \ 2541 (void)bd; \ 2542 HWY_NAMESPACE::ForwardTransform2D<TX_##w##X##h, int16_t>(input, output, \ 2543 stride, tx_type); \ 2544 } 2545 2546 #define MAKE_LOWBD_TXFM2D_DISPATCH(suffix) \ 2547 extern "C" void av1_lowbd_fwd_txfm_##suffix( \ 2548 const int16_t *src_diff, tran_low_t *coeff, int diff_stride, \ 2549 TxfmParam *txfm_param); \ 2550 HWY_ATTR void av1_lowbd_fwd_txfm_##suffix( \ 2551 const int16_t *src_diff, tran_low_t *coeff, int diff_stride, \ 2552 TxfmParam *txfm_param) { \ 2553 HWY_NAMESPACE::LowBitdepthForwardTransform2D(src_diff, coeff, diff_stride, \ 2554 txfm_param); \ 2555 } 2556 2557 #endif // AOM_AV1_ENCODER_AV1_FWD_TXFM2D_HWY_H_