highbd_inv_txfm_avx2.c (169251B)
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <immintrin.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/idct.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"
#include "aom_dsp/x86/txfm_common_avx2.h"

// Note:
// A 32x32 block of coefficients takes 32x4 = 128 registers in total.
// For high bit depth, each coefficient is 4 bytes, so each __m256i
// register holds 8 coefficients, each "row" needs 4 registers, and
// there are 32 rows.
// Register layout:
// v0,   v1,   v2,   v3,
// v4,   v5,   v6,   v7,
// ... ...
// v124, v125, v126, v127

static inline __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {
  const __m256i zero = _mm256_setzero_si256();
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
  __m256i clamped, mask;

  mask = _mm256_cmpgt_epi16(u, max);
  clamped = _mm256_andnot_si256(mask, u);
  mask = _mm256_and_si256(mask, max);
  clamped = _mm256_or_si256(mask, clamped);
  mask = _mm256_cmpgt_epi16(clamped, zero);
  clamped = _mm256_and_si256(clamped, mask);

  return clamped;
}

static inline void round_shift_4x4_avx2(__m256i *in, int shift) {
  if (shift != 0) {
    __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
    in[0] = _mm256_add_epi32(in[0], rnding);
    in[1] = _mm256_add_epi32(in[1], rnding);
    in[2] = _mm256_add_epi32(in[2], rnding);
    in[3] = _mm256_add_epi32(in[3], rnding);

    in[0] = _mm256_srai_epi32(in[0], shift);
    in[1] = _mm256_srai_epi32(in[1], shift);
    in[2] = _mm256_srai_epi32(in[2], shift);
    in[3] = _mm256_srai_epi32(in[3], shift);
  }
}

static inline void round_shift_8x8_avx2(__m256i *in, int shift) {
  round_shift_4x4_avx2(in, shift);
  round_shift_4x4_avx2(in + 4, shift);
  round_shift_4x4_avx2(in + 8, shift);
  round_shift_4x4_avx2(in + 12, shift);
}

static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out,
                                    const __m256i *clamp_lo,
                                    const __m256i *clamp_hi, int size) {
  __m256i a0, a1;
  for (int i = 0; i < size; i += 4) {
    a0 = _mm256_max_epi32(in[i], *clamp_lo);
    out[i] = _mm256_min_epi32(a0, *clamp_hi);

    a1 = _mm256_max_epi32(in[i + 1], *clamp_lo);
    out[i + 1] = _mm256_min_epi32(a1, *clamp_hi);

    a0 = _mm256_max_epi32(in[i + 2], *clamp_lo);
    out[i + 2] = _mm256_min_epi32(a0, *clamp_hi);

    a1 = _mm256_max_epi32(in[i + 3], *clamp_lo);
    out[i + 3] = _mm256_min_epi32(a1, *clamp_hi);
  }
}
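// Reconstruction helpers: widen the 16-bit prediction to 32 bits, add the
// inverse-transform residual, pack back to 16 bits, and clamp to
// [0, 2^bd - 1]. Note that _mm256_packus_epi32 packs within each 128-bit
// lane, so the _mm256_permute4x64_epi64(x, 0xd8) below is needed afterwards
// to restore the natural element order.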
static inline __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
                                                 __m256i res0, __m256i res1,
                                                 const int bd) {
  __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
  __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));

  x0 = _mm256_add_epi32(res0, x0);
  x1 = _mm256_add_epi32(res1, x1);
  x0 = _mm256_packus_epi32(x0, x1);
  x0 = _mm256_permute4x64_epi64(x0, 0xd8);
  x0 = highbd_clamp_epi16_avx2(x0, bd);
  return x0;
}

static inline void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output,
                                                 int stride, int flipud,
                                                 int height, const int bd) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride));
    __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);

    _mm256_storeu_si256((__m256i *)(output + i * stride), u);
  }
}
static inline __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res,
                                                const int bd) {
  __m256i x0 = pred;
  x0 = _mm256_add_epi32(res, x0);
  x0 = _mm256_packus_epi32(x0, x0);
  x0 = _mm256_permute4x64_epi64(x0, 0xd8);
  x0 = highbd_clamp_epi16_avx2(x0, bd);
  return x0;
}

static inline void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output,
                                                int stride, int flipud,
                                                int height, const int bd) {
  int j = flipud ? (height - 1) : 0;
  __m128i temp;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    temp = _mm_loadu_si128((__m128i const *)(output + i * stride));
    __m256i v = _mm256_cvtepi16_epi32(temp);
    __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd);
    __m128i u1 = _mm256_castsi256_si128(u);
    _mm_storeu_si128((__m128i *)(output + i * stride), u1);
  }
}
static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
                           __m256i *out1, const __m256i *clamp_lo,
                           const __m256i *clamp_hi, int shift) {
  __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
  __m256i a0 = _mm256_add_epi32(offset, in0);
  __m256i a1 = _mm256_sub_epi32(offset, in1);

  a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));

  a0 = _mm256_max_epi32(a0, *clamp_lo);
  a0 = _mm256_min_epi32(a0, *clamp_hi);
  a1 = _mm256_max_epi32(a1, *clamp_lo);
  a1 = _mm256_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}

static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  __m256i x0, x1;

  u0 = _mm256_unpacklo_epi32(in[0], in[1]);
  u1 = _mm256_unpackhi_epi32(in[0], in[1]);

  u2 = _mm256_unpacklo_epi32(in[2], in[3]);
  u3 = _mm256_unpackhi_epi32(in[2], in[3]);

  u4 = _mm256_unpacklo_epi32(in[4], in[5]);
  u5 = _mm256_unpackhi_epi32(in[4], in[5]);

  u6 = _mm256_unpacklo_epi32(in[6], in[7]);
  u7 = _mm256_unpackhi_epi32(in[6], in[7]);

  x0 = _mm256_unpacklo_epi64(u0, u2);
  x1 = _mm256_unpacklo_epi64(u4, u6);
  out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
  out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);

  x0 = _mm256_unpackhi_epi64(u0, u2);
  x1 = _mm256_unpackhi_epi64(u4, u6);
  out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
  out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);

  x0 = _mm256_unpacklo_epi64(u1, u3);
  x1 = _mm256_unpacklo_epi64(u5, u7);
  out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
  out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);

  x0 = _mm256_unpackhi_epi64(u1, u3);
  x1 = _mm256_unpackhi_epi64(u5, u7);
  out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
  out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
}
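// The 8x8 transpose above proceeds in three steps: interleave 32-bit
// elements from adjacent row pairs, interleave those results at 64-bit
// granularity, then swap 128-bit halves with _mm256_permute2f128_si256 so
// each output register holds one full column. transpose_8x8_flip_avx2
// below runs the same ladder with the input rows taken in reverse order,
// transposing and vertically flipping in a single pass.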
static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) {
  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  __m256i x0, x1;

  u0 = _mm256_unpacklo_epi32(in[7], in[6]);
  u1 = _mm256_unpackhi_epi32(in[7], in[6]);

  u2 = _mm256_unpacklo_epi32(in[5], in[4]);
  u3 = _mm256_unpackhi_epi32(in[5], in[4]);

  u4 = _mm256_unpacklo_epi32(in[3], in[2]);
  u5 = _mm256_unpackhi_epi32(in[3], in[2]);

  u6 = _mm256_unpacklo_epi32(in[1], in[0]);
  u7 = _mm256_unpackhi_epi32(in[1], in[0]);

  x0 = _mm256_unpacklo_epi64(u0, u2);
  x1 = _mm256_unpacklo_epi64(u4, u6);
  out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
  out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);

  x0 = _mm256_unpackhi_epi64(u0, u2);
  x1 = _mm256_unpackhi_epi64(u4, u6);
  out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
  out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);

  x0 = _mm256_unpacklo_epi64(u1, u3);
  x1 = _mm256_unpacklo_epi64(u5, u7);
  out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
  out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);

  x0 = _mm256_unpackhi_epi64(u1, u3);
  x1 = _mm256_unpackhi_epi64(u5, u7);
  out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
  out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
}

static inline void load_buffer_32bit_input(const int32_t *in, int stride,
                                           __m256i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride));
  }
}

static inline __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,
                                      const __m256i *rounding, int bit) {
  __m256i x;
  x = _mm256_mullo_epi32(*w0, *n0);
  x = _mm256_add_epi32(x, *rounding);
  x = _mm256_srai_epi32(x, bit);
  return x;
}

static inline __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
                                    const __m256i *w1, const __m256i *n1,
                                    const __m256i *rounding, int bit) {
  __m256i x, y;

  x = _mm256_mullo_epi32(*w0, *n0);
  y = _mm256_mullo_epi32(*w1, *n1);
  x = _mm256_add_epi32(x, y);
  x = _mm256_add_epi32(x, *rounding);
  x = _mm256_srai_epi32(x, bit);
  return x;
}

static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
                        __m256i *out1, const __m256i *clamp_lo,
                        const __m256i *clamp_hi) {
  __m256i a0 = _mm256_add_epi32(in0, in1);
  __m256i a1 = _mm256_sub_epi32(in0, in1);

  a0 = _mm256_max_epi32(a0, *clamp_lo);
  a0 = _mm256_min_epi32(a0, *clamp_hi);
  a1 = _mm256_max_epi32(a1, *clamp_lo);
  a1 = _mm256_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}
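// half_btf_avx2 computes one output of a butterfly rotation,
//   out = (w0 * n0 + w1 * n1 + (1 << (bit - 1))) >> bit,
// and half_btf_0_avx2 is the single-term case used when the other input is
// known to be zero. The cospiN vectors used below hold cospi[N] from
// cospi_arr(bit), with cospimN standing for -cospi[N]; addsub_avx2 is the
// add/subtract butterfly with clamping. The idct32_stageN_avx2 helpers that
// follow factor out the stages shared by the 32-point IDCT variants.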
static inline void idct32_stage4_avx2(
    __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
    const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
    const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
    const __m256i *rounding, int bit) {
  __m256i temp1, temp2;
  temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
  bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
  bf1[17] = temp1;

  temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
  bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
  bf1[18] = temp2;

  temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
  bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
  bf1[21] = temp1;

  temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
  bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
  bf1[22] = temp2;
}

static inline void idct32_stage5_avx2(
    __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
    const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
    const __m256i *clamp_hi, const __m256i *rounding, int bit) {
  __m256i temp1, temp2;
  temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
  bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
  bf1[9] = temp1;

  temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
  bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
  bf1[10] = temp2;

  addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
  addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
  addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
  addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
  addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
  addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
  addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
  addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
}

static inline void idct32_stage6_avx2(
    __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
    const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
    const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
    const __m256i *rounding, int bit) {
  __m256i temp1, temp2;
  temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  bf1[5] = temp1;

  addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
  addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
  addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
  addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);

  temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
  bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
  bf1[18] = temp1;
  temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
  bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
  bf1[19] = temp2;
  temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
  bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
  bf1[20] = temp1;
  temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
  bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
  bf1[21] = temp2;
}

static inline void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
                                      const __m256i *cospi32,
                                      const __m256i *clamp_lo,
                                      const __m256i *clamp_hi,
                                      const __m256i *rounding, int bit) {
  __m256i temp1, temp2;
  addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
  addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
  addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
  addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);

  temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  bf1[10] = temp1;
  temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  bf1[11] = temp2;

  addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
  addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
  addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
  addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
  addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
  addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
  addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
  addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
}

static inline void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
                                      const __m256i *cospi32,
                                      const __m256i *clamp_lo,
                                      const __m256i *clamp_hi,
                                      const __m256i *rounding, int bit) {
  __m256i temp1, temp2;
  addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
  addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
  addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
  addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
  addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
  addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
  addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
  addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);

  temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  bf1[20] = temp1;
  temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  bf1[21] = temp2;
  temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  bf1[22] = temp1;
  temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  bf1[23] = temp2;
}
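// Stage 9 produces the final 32 outputs as out[i] = bf1[i] + bf1[31 - i]
// and out[31 - i] = bf1[i] - bf1[31 - i]. For the row pass (!do_cols) it
// also applies the inter-stage rounding shift and clamps to the bd + 6 bit
// range expected by the column pass.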
static inline void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
                                      const int do_cols, const int bd,
                                      const int out_shift,
                                      const __m256i *clamp_lo,
                                      const __m256i *clamp_hi) {
  addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
  addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
  addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
  addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
  addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
  addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
  addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
  addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
  addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
  addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
  addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
  addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
  addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
  addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
  addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
  addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    const __m256i clamp_hi_out =
        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
    round_shift_8x8_avx2(out, out_shift);
    round_shift_8x8_avx2(out + 16, out_shift);
    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
  }
}

static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                             int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  __m256i x;
  // stage 0
  // stage 1
  // stage 2
  // stage 3
  // stage 4
  // stage 5
  x = _mm256_mullo_epi32(in[0], cospi32);
  x = _mm256_add_epi32(x, rounding);
  x = _mm256_srai_epi32(x, bit);

  // stage 6
  // stage 7
  // stage 8
  // stage 9
  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
    clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
    x = _mm256_add_epi32(offset, x);
    x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
  }
  x = _mm256_max_epi32(x, clamp_lo);
  x = _mm256_min_epi32(x, clamp_hi);
  out[0] = x;
  out[1] = x;
  out[2] = x;
  out[3] = x;
  out[4] = x;
  out[5] = x;
  out[6] = x;
  out[7] = x;
  out[8] = x;
  out[9] = x;
  out[10] = x;
  out[11] = x;
  out[12] = x;
  out[13] = x;
  out[14] = x;
  out[15] = x;
  out[16] = x;
  out[17] = x;
  out[18] = x;
  out[19] = x;
  out[20] = x;
  out[21] = x;
  out[22] = x;
  out[23] = x;
  out[24] = x;
  out[25] = x;
  out[26] = x;
  out[27] = x;
  out[28] = x;
  out[29] = x;
  out[30] = x;
  out[31] = x;
}
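// idct32_low1_avx2 above covers the DC-only case: every output lane is
// round(in[0] * cospi32), with the extra output round/shift for the row
// pass. The low8/low16 variants below assume only the first 8 or 16 input
// rows can be nonzero (presumably dispatched on the eob), so butterflies
// whose second operand is known to be zero collapse to single-multiply
// half_btf_0_avx2 calls in the early stages.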
static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                             int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  __m256i bf1[32];

  {
    // stage 0
    // stage 1
    bf1[0] = in[0];
    bf1[4] = in[4];
    bf1[8] = in[2];
    bf1[12] = in[6];
    bf1[16] = in[1];
    bf1[20] = in[5];
    bf1[24] = in[3];
    bf1[28] = in[7];

    // stage 2
    bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
    bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
    bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
    bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
    bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
    bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
    bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
    bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);

    // stage 3
    bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
    bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);

    bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
    bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
    bf1[17] = bf1[16];
    bf1[18] = bf1[19];
    bf1[21] = bf1[20];
    bf1[22] = bf1[23];
    bf1[25] = bf1[24];
    bf1[26] = bf1[27];
    bf1[29] = bf1[28];
    bf1[30] = bf1[31];

    // stage 4
    bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
    bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);

    bf1[9] = bf1[8];
    bf1[10] = bf1[11];
    bf1[13] = bf1[12];
    bf1[14] = bf1[15];

    idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
                       &cospi24, &cospi40, &cospim24, &rounding, bit);

    // stage 5
    bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
    bf1[1] = bf1[0];
    bf1[5] = bf1[4];
    bf1[6] = bf1[7];

    idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
                       &clamp_hi, &rounding, bit);

    // stage 6
    bf1[3] = bf1[0];
    bf1[2] = bf1[1];

    idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);

    // stage 7
    idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

    // stage 8
    idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

    // stage 9
    idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
  }
}

static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  __m256i bf1[32];

  {
    // stage 0
    // stage 1
    bf1[0] = in[0];
    bf1[2] = in[8];
    bf1[4] = in[4];
    bf1[6] = in[12];
    bf1[8] = in[2];
    bf1[10] = in[10];
    bf1[12] = in[6];
    bf1[14] = in[14];
    bf1[16] = in[1];
    bf1[18] = in[9];
    bf1[20] = in[5];
    bf1[22] = in[13];
    bf1[24] = in[3];
    bf1[26] = in[11];
    bf1[28] = in[7];
    bf1[30] = in[15];

    // stage 2
    bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
    bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
    bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
    bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
    bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
    bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
    bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
    bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
    bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
    bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
    bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
    bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
    bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
    bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
    bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
    bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);

    // stage 3
    bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
    bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
    bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
    bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
    bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
    bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
    bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
    bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);

    addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);

    // stage 4
    bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
    bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
    bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
    bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);

    addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);

    idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
                       &cospi24, &cospi40, &cospim24, &rounding, bit);
    // stage 5
    bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
    bf1[1] = bf1[0];
    bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
    bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);

    addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);

    idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
                       &clamp_hi, &rounding, bit);

    // stage 6
    addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);

    idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);

    // stage 7
    idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

    // stage 8
    idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

    // stage 9
    idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
  }
}
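// The full 32-point IDCT below keeps all nine stages inline, ping-ponging
// between the bf1 and bf0 arrays instead of calling the shared stage
// helpers, since with all 32 inputs potentially nonzero every butterfly
// needs both half_btf_avx2 products.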
static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
                        int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
  const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
  const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
  const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  __m256i bf1[32], bf0[32];

  {
    // stage 0
    // stage 1
    bf1[0] = in[0];
    bf1[1] = in[16];
    bf1[2] = in[8];
    bf1[3] = in[24];
    bf1[4] = in[4];
    bf1[5] = in[20];
    bf1[6] = in[12];
    bf1[7] = in[28];
    bf1[8] = in[2];
    bf1[9] = in[18];
    bf1[10] = in[10];
    bf1[11] = in[26];
    bf1[12] = in[6];
    bf1[13] = in[22];
    bf1[14] = in[14];
    bf1[15] = in[30];
    bf1[16] = in[1];
    bf1[17] = in[17];
    bf1[18] = in[9];
    bf1[19] = in[25];
    bf1[20] = in[5];
    bf1[21] = in[21];
    bf1[22] = in[13];
    bf1[23] = in[29];
    bf1[24] = in[3];
    bf1[25] = in[19];
    bf1[26] = in[11];
    bf1[27] = in[27];
    bf1[28] = in[7];
    bf1[29] = in[23];
    bf1[30] = in[15];
    bf1[31] = in[31];

    // stage 2
    bf0[0] = bf1[0];
    bf0[1] = bf1[1];
    bf0[2] = bf1[2];
    bf0[3] = bf1[3];
    bf0[4] = bf1[4];
    bf0[5] = bf1[5];
    bf0[6] = bf1[6];
    bf0[7] = bf1[7];
    bf0[8] = bf1[8];
    bf0[9] = bf1[9];
    bf0[10] = bf1[10];
    bf0[11] = bf1[11];
    bf0[12] = bf1[12];
    bf0[13] = bf1[13];
    bf0[14] = bf1[14];
    bf0[15] = bf1[15];
    bf0[16] =
        half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
    bf0[17] =
        half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
    bf0[18] =
        half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
    bf0[19] =
        half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
    bf0[20] =
        half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
    bf0[21] =
        half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
    bf0[22] =
        half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
    bf0[23] =
        half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
    bf0[24] =
        half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
    bf0[25] =
        half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
    bf0[26] =
        half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
    bf0[27] =
        half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
    bf0[28] =
        half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
    bf0[29] =
        half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
    bf0[30] =
        half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
    bf0[31] =
        half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);

    // stage 3
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = bf0[4];
    bf1[5] = bf0[5];
    bf1[6] = bf0[6];
    bf1[7] = bf0[7];
    bf1[8] =
        half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
    bf1[9] =
        half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
    bf1[10] =
        half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
    bf1[11] =
        half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
    bf1[12] =
        half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
    bf1[13] =
        half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
    bf1[14] =
        half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
    bf1[15] =
        half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);

    addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);

    // stage 4
    bf0[0] = bf1[0];
    bf0[1] = bf1[1];
    bf0[2] = bf1[2];
    bf0[3] = bf1[3];
    bf0[4] =
        half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
    bf0[5] =
        half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
    bf0[6] =
        half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
    bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);

    addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);

    bf0[16] = bf1[16];
    bf0[17] =
        half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
    bf0[18] =
        half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
    bf0[19] = bf1[19];
    bf0[20] = bf1[20];
    bf0[21] =
        half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
    bf0[22] =
        half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
    bf0[23] = bf1[23];
    bf0[24] = bf1[24];
    bf0[25] =
        half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
    bf0[26] =
        half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
    bf0[27] = bf1[27];
    bf0[28] = bf1[28];
    bf0[29] =
        half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
    bf0[30] =
        half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
    bf0[31] = bf1[31];

    // stage 5
    bf1[0] =
        half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
    bf1[1] =
        half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
    bf1[2] =
        half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
    bf1[3] =
        half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
    addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
    bf1[8] = bf0[8];
    bf1[9] =
        half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
    bf1[10] =
        half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
    bf1[11] = bf0[11];
    bf1[12] = bf0[12];
    bf1[13] =
        half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
    bf1[14] =
        half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
    bf1[15] = bf0[15];
    addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);

    // stage 6
    addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
    bf0[4] = bf1[4];
    bf0[5] =
        half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
    bf0[6] =
        half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
    bf0[7] = bf1[7];
    addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
    bf0[16] = bf1[16];
    bf0[17] = bf1[17];
    bf0[18] =
        half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
    bf0[19] =
        half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
    bf0[20] =
        half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
    bf0[21] =
        half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
    bf0[22] = bf1[22];
    bf0[23] = bf1[23];
    bf0[24] = bf1[24];
    bf0[25] = bf1[25];
    bf0[26] =
        half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
    bf0[27] =
        half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
    bf0[28] =
        half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
    bf0[29] =
        half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
    bf0[30] = bf1[30];
    bf0[31] = bf1[31];

    // stage 7
    addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
    bf1[8] = bf0[8];
    bf1[9] = bf0[9];
    bf1[10] =
        half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
    bf1[11] =
        half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
    bf1[12] =
        half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
    bf1[13] =
        half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
    bf1[14] = bf0[14];
    bf1[15] = bf0[15];
    addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);

    // stage 8
    addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
    addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
    bf0[16] = bf1[16];
    bf0[17] = bf1[17];
    bf0[18] = bf1[18];
    bf0[19] = bf1[19];
    bf0[20] =
        half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
    bf0[21] =
        half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
    bf0[22] =
        half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
    bf0[23] =
        half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
    bf0[24] =
        half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
    bf0[25] =
        half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
    bf0[26] =
        half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
    bf0[27] =
        half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
    bf0[28] = bf1[28];
    bf0[29] = bf1[29];
    bf0[30] = bf1[30];
    bf0[31] = bf1[31];

    // stage 9
    addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
    addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
    if (!do_cols) {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m256i clamp_lo_out =
          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
      const __m256i clamp_hi_out =
          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
      round_shift_8x8_avx2(out, out_shift);
      round_shift_8x8_avx2(out + 16, out_shift);
      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
    }
  }
}
static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                             int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);

  {
    // stage 0
    // stage 1
    // stage 2
    // stage 3
    // stage 4
    in[0] = _mm256_mullo_epi32(in[0], cospi32);
    in[0] = _mm256_add_epi32(in[0], rnding);
    in[0] = _mm256_srai_epi32(in[0], bit);

    // stage 5
    // stage 6
    // stage 7
    if (!do_cols) {
      const int log_range_out = AOMMAX(16, bd + 6);
      clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
      clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
      __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
      in[0] = _mm256_add_epi32(in[0], offset);
      in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
    }
    in[0] = _mm256_max_epi32(in[0], clamp_lo);
    in[0] = _mm256_min_epi32(in[0], clamp_hi);
    out[0] = in[0];
    out[1] = in[0];
    out[2] = in[0];
    out[3] = in[0];
    out[4] = in[0];
    out[5] = in[0];
    out[6] = in[0];
    out[7] = in[0];
    out[8] = in[0];
    out[9] = in[0];
    out[10] = in[0];
    out[11] = in[0];
    out[12] = in[0];
    out[13] = in[0];
    out[14] = in[0];
    out[15] = in[0];
  }
}
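// idct16_low1_avx2 above is the DC-only path. In idct16_low8_avx2 below,
// the cospi32 rotations in stages 5 and 6 are written out with raw
// _mm256_mullo_epi32 adds and subtracts instead of half_btf_avx2: both
// outputs share the same two products x and y, so the rotated pair is
// simply (y - x) and (y + x), each rounded and shifted by bit.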
static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                             int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  __m256i u[16], x, y;

  {
    // stage 0
    // stage 1
    u[0] = in[0];
    u[2] = in[4];
    u[4] = in[2];
    u[6] = in[6];
    u[8] = in[1];
    u[10] = in[5];
    u[12] = in[3];
    u[14] = in[7];

    // stage 2
    u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
    u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);

    u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
    u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);

    u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
    u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);

    u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
    u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);

    // stage 3
    u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
    u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
    u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit);
    u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit);

    addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
    addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
    addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
    addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);

    // stage 4
    x = _mm256_mullo_epi32(u[0], cospi32);
    u[0] = _mm256_add_epi32(x, rnding);
    u[0] = _mm256_srai_epi32(u[0], bit);
    u[1] = u[0];

    u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
    u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);

    addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
    addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);

    x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    u[9] = x;
    y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    u[10] = y;

    // stage 5
    addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
    addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);

    x = _mm256_mullo_epi32(u[5], cospi32);
    y = _mm256_mullo_epi32(u[6], cospi32);
    u[5] = _mm256_sub_epi32(y, x);
    u[5] = _mm256_add_epi32(u[5], rnding);
    u[5] = _mm256_srai_epi32(u[5], bit);

    u[6] = _mm256_add_epi32(y, x);
    u[6] = _mm256_add_epi32(u[6], rnding);
    u[6] = _mm256_srai_epi32(u[6], bit);

    addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    // stage 6
    addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
    addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
    addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
    addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);

    x = _mm256_mullo_epi32(u[10], cospi32);
    y = _mm256_mullo_epi32(u[13], cospi32);
    u[10] = _mm256_sub_epi32(y, x);
    u[10] = _mm256_add_epi32(u[10], rnding);
    u[10] = _mm256_srai_epi32(u[10], bit);

    u[13] = _mm256_add_epi32(x, y);
    u[13] = _mm256_add_epi32(u[13], rnding);
    u[13] = _mm256_srai_epi32(u[13], bit);

    x = _mm256_mullo_epi32(u[11], cospi32);
    y = _mm256_mullo_epi32(u[12], cospi32);
    u[11] = _mm256_sub_epi32(y, x);
    u[11] = _mm256_add_epi32(u[11], rnding);
    u[11] = _mm256_srai_epi32(u[11], bit);

    u[12] = _mm256_add_epi32(x, y);
    u[12] = _mm256_add_epi32(u[12], rnding);
    u[12] = _mm256_srai_epi32(u[12], bit);
    // stage 7
    addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
    addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
    addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
    addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
    addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
    addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
    addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
    addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);

    if (!do_cols) {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m256i clamp_lo_out =
          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
      const __m256i clamp_hi_out =
          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
      round_shift_8x8_avx2(out, out_shift);
      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
    }
  }
}
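// Throughout this file, log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)):
// intermediates are clamped to at least the int16 range, with bd + 8 bits
// allowed during the first (row) pass and bd + 6 bits during the second
// (column) pass; row-pass outputs are then narrowed to the bd + 6 bit range
// the column pass expects, presumably to match the intermediate-precision
// constraints of the AV1 inverse transform. idct16_avx2 below is the full
// 16-point IDCT with all sixteen inputs potentially nonzero.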
static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
                        int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  __m256i u[16], v[16], x, y;

  {
    // stage 0
    // stage 1
    u[0] = in[0];
    u[1] = in[8];
    u[2] = in[4];
    u[3] = in[12];
    u[4] = in[2];
    u[5] = in[10];
    u[6] = in[6];
    u[7] = in[14];
    u[8] = in[1];
    u[9] = in[9];
    u[10] = in[5];
    u[11] = in[13];
    u[12] = in[3];
    u[13] = in[11];
    u[14] = in[7];
    u[15] = in[15];

    // stage 2
    v[0] = u[0];
    v[1] = u[1];
    v[2] = u[2];
    v[3] = u[3];
    v[4] = u[4];
    v[5] = u[5];
    v[6] = u[6];
    v[7] = u[7];

    v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
    v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
    v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
    v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
    v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
    v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
    v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
    v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);

    // stage 3
    u[0] = v[0];
    u[1] = v[1];
    u[2] = v[2];
    u[3] = v[3];
    u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
    u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
    u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
    u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
    addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
    addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
    addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
    addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);

    // stage 4
    x = _mm256_mullo_epi32(u[0], cospi32);
    y = _mm256_mullo_epi32(u[1], cospi32);
    v[0] = _mm256_add_epi32(x, y);
    v[0] = _mm256_add_epi32(v[0], rnding);
    v[0] = _mm256_srai_epi32(v[0], bit);

    v[1] = _mm256_sub_epi32(x, y);
    v[1] = _mm256_add_epi32(v[1], rnding);
    v[1] = _mm256_srai_epi32(v[1], bit);

    v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
    v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
    addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
    addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
    v[8] = u[8];
    v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    v[11] = u[11];
    v[12] = u[12];
    v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    v[15] = u[15];

    // stage 5
    addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
    addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
    u[4] = v[4];

    x = _mm256_mullo_epi32(v[5], cospi32);
    y = _mm256_mullo_epi32(v[6], cospi32);
    u[5] = _mm256_sub_epi32(y, x);
    u[5] = _mm256_add_epi32(u[5], rnding);
    u[5] = _mm256_srai_epi32(u[5], bit);

    u[6] = _mm256_add_epi32(y, x);
    u[6] = _mm256_add_epi32(u[6], rnding);
    u[6] = _mm256_srai_epi32(u[6], bit);

    u[7] = v[7];
    addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    // stage 6
    addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
    addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
    addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
    addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
    v[8] = u[8];
    v[9] = u[9];

    x = _mm256_mullo_epi32(u[10], cospi32);
    y = _mm256_mullo_epi32(u[13], cospi32);
    v[10] = _mm256_sub_epi32(y, x);
    v[10] = _mm256_add_epi32(v[10], rnding);
    v[10] = _mm256_srai_epi32(v[10], bit);

    v[13] = _mm256_add_epi32(x, y);
    v[13] = _mm256_add_epi32(v[13], rnding);
    v[13] = _mm256_srai_epi32(v[13], bit);

    x = _mm256_mullo_epi32(u[11], cospi32);
    y = _mm256_mullo_epi32(u[12], cospi32);
    v[11] = _mm256_sub_epi32(y, x);
    v[11] = _mm256_add_epi32(v[11], rnding);
    v[11] = _mm256_srai_epi32(v[11], bit);

    v[12] = _mm256_add_epi32(x, y);
    v[12] = _mm256_add_epi32(v[12], rnding);
    v[12] = _mm256_srai_epi32(v[12], bit);

    v[14] = u[14];
    v[15] = u[15];

    // stage 7
    addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
    addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
    addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
    addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
    addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
    addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
    addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
    addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);

    if (!do_cols) {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m256i clamp_lo_out =
          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
      const __m256i clamp_hi_out =
          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
      round_shift_8x8_avx2(out, out_shift);
      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
    }
  }
}
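/* Every transform above and below derives its intermediate clamp range the
 * same way: log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)) for in-transform
 * saturation, and AOMMAX(16, bd + 6) for the post-row-pass output clamp. A
 * scalar sketch of that rule (clamp_range_model is a hypothetical name used
 * only for illustration):
 */
#if 0  // illustrative sketch, not compiled
static void clamp_range_model(int bd, int do_cols, int32_t *lo, int32_t *hi) {
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  *lo = -(1 << (log_range - 1));     // e.g. bd = 10, row pass: -(1 << 17)
  *hi = (1 << (log_range - 1)) - 1;  //                         (1 << 17) - 1
}
#endif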
static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const __m256i zero = _mm256_setzero_si256();
  __m256i v[16], x, y, temp1, temp2;

  // Calculate columns 0, 1, 2, 3
  {
    // stage 0
    // stage 1
    // stage 2
    x = _mm256_mullo_epi32(in[0], cospi62);
    v[0] = _mm256_add_epi32(x, rnding);
    v[0] = _mm256_srai_epi32(v[0], bit);

    x = _mm256_mullo_epi32(in[0], cospi2);
    v[1] = _mm256_sub_epi32(zero, x);
    v[1] = _mm256_add_epi32(v[1], rnding);
    v[1] = _mm256_srai_epi32(v[1], bit);

    // stage 3
    v[8] = v[0];
    v[9] = v[1];

    // stage 4
    temp1 = _mm256_mullo_epi32(v[8], cospi8);
    x = _mm256_mullo_epi32(v[9], cospi56);
    temp1 = _mm256_add_epi32(temp1, x);
    temp1 = _mm256_add_epi32(temp1, rnding);
    temp1 = _mm256_srai_epi32(temp1, bit);

    temp2 = _mm256_mullo_epi32(v[8], cospi56);
    x = _mm256_mullo_epi32(v[9], cospi8);
    temp2 = _mm256_sub_epi32(temp2, x);
    temp2 = _mm256_add_epi32(temp2, rnding);
    temp2 = _mm256_srai_epi32(temp2, bit);
    v[8] = temp1;
    v[9] = temp2;

    // stage 5
    v[4] = v[0];
    v[5] = v[1];
    v[12] = v[8];
    v[13] = v[9];

    // stage 6
    temp1 = _mm256_mullo_epi32(v[4], cospi16);
    x = _mm256_mullo_epi32(v[5], cospi48);
    temp1 = _mm256_add_epi32(temp1, x);
    temp1 = _mm256_add_epi32(temp1, rnding);
    temp1 = _mm256_srai_epi32(temp1, bit);

    temp2 = _mm256_mullo_epi32(v[4], cospi48);
    x = _mm256_mullo_epi32(v[5], cospi16);
    temp2 = _mm256_sub_epi32(temp2, x);
    temp2 = _mm256_add_epi32(temp2, rnding);
    temp2 = _mm256_srai_epi32(temp2, bit);
    v[4] = temp1;
    v[5] = temp2;

    temp1 = _mm256_mullo_epi32(v[12], cospi16);
    x = _mm256_mullo_epi32(v[13], cospi48);
    temp1 = _mm256_add_epi32(temp1, x);
    temp1 = _mm256_add_epi32(temp1, rnding);
    temp1 = _mm256_srai_epi32(temp1, bit);

    temp2 = _mm256_mullo_epi32(v[12], cospi48);
    x = _mm256_mullo_epi32(v[13], cospi16);
    temp2 = _mm256_sub_epi32(temp2, x);
    temp2 = _mm256_add_epi32(temp2, rnding);
    temp2 = _mm256_srai_epi32(temp2, bit);
    v[12] = temp1;
    v[13] = temp2;

    // stage 7
    v[2] = v[0];
    v[3] = v[1];
    v[6] = v[4];
    v[7] = v[5];
    v[10] = v[8];
    v[11] = v[9];
    v[14] = v[12];
    v[15] = v[13];

    // stage 8
    y = _mm256_mullo_epi32(v[2], cospi32);
    x = _mm256_mullo_epi32(v[3], cospi32);
    v[2] = _mm256_add_epi32(y, x);
    v[2] = _mm256_add_epi32(v[2], rnding);
    v[2] = _mm256_srai_epi32(v[2], bit);

    v[3] = _mm256_sub_epi32(y, x);
    v[3] = _mm256_add_epi32(v[3], rnding);
    v[3] = _mm256_srai_epi32(v[3], bit);

    y = _mm256_mullo_epi32(v[6], cospi32);
    x = _mm256_mullo_epi32(v[7], cospi32);
    v[6] = _mm256_add_epi32(y, x);
    v[6] = _mm256_add_epi32(v[6], rnding);
    v[6] = _mm256_srai_epi32(v[6], bit);

    v[7] = _mm256_sub_epi32(y, x);
    v[7] = _mm256_add_epi32(v[7], rnding);
    v[7] = _mm256_srai_epi32(v[7], bit);

    y = _mm256_mullo_epi32(v[10], cospi32);
    x = _mm256_mullo_epi32(v[11], cospi32);
    v[10] = _mm256_add_epi32(y, x);
    v[10] = _mm256_add_epi32(v[10], rnding);
    v[10] = _mm256_srai_epi32(v[10], bit);

    v[11] = _mm256_sub_epi32(y, x);
    v[11] = _mm256_add_epi32(v[11], rnding);
    v[11] = _mm256_srai_epi32(v[11], bit);

    y = _mm256_mullo_epi32(v[14], cospi32);
    x = _mm256_mullo_epi32(v[15], cospi32);
    v[14] = _mm256_add_epi32(y, x);
    v[14] = _mm256_add_epi32(v[14], rnding);
    v[14] = _mm256_srai_epi32(v[14], bit);

    v[15] = _mm256_sub_epi32(y, x);
    v[15] = _mm256_add_epi32(v[15], rnding);
    v[15] = _mm256_srai_epi32(v[15], bit);

    // stage 9
    if (do_cols) {
      out[0] = v[0];
      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
      out[2] = v[12];
      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
      out[4] = v[6];
      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
      out[6] = v[10];
      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
      out[8] = v[3];
      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
      out[10] = v[15];
      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
      out[12] = v[5];
      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
      out[14] = v[9];
      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m256i clamp_lo_out =
          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
      const __m256i clamp_hi_out =
          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);

      neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
      neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    }
  }
}
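/* The *_low1/*_low8 variants assume only the first 1 or 8 input coefficients
 * can be nonzero (the caller dispatches on the eob), so the full stage-2
 * butterflies collapse to single multiplies. Illustrative scalar shape for
 * the low1 case of iadst16 stage 2 above, with in[15] == 0
 * (iadst16_low1_stage2_model is a hypothetical name):
 */
#if 0  // illustrative sketch, not compiled
static void iadst16_low1_stage2_model(int32_t in0, int bit, int32_t *v0,
                                      int32_t *v1) {
  const int32_t *cospi = cospi_arr(bit);
  const int64_t rnd = 1LL << (bit - 1);
  *v0 = (int32_t)(((int64_t)in0 * cospi[62] + rnd) >> bit);  // in[15] term gone
  *v1 = (int32_t)((-(int64_t)in0 * cospi[2] + rnd) >> bit);
}
#endif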
static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  __m256i u[16], x, y;

  {
    // stage 0
    // stage 1
    // stage 2
    __m256i zero = _mm256_setzero_si256();
    x = _mm256_mullo_epi32(in[0], cospi62);
    u[0] = _mm256_add_epi32(x, rnding);
    u[0] = _mm256_srai_epi32(u[0], bit);

    x = _mm256_mullo_epi32(in[0], cospi2);
    u[1] = _mm256_sub_epi32(zero, x);
    u[1] = _mm256_add_epi32(u[1], rnding);
    u[1] = _mm256_srai_epi32(u[1], bit);

    x = _mm256_mullo_epi32(in[2], cospi54);
    u[2] = _mm256_add_epi32(x, rnding);
    u[2] = _mm256_srai_epi32(u[2], bit);

    x = _mm256_mullo_epi32(in[2], cospi10);
    u[3] = _mm256_sub_epi32(zero, x);
    u[3] = _mm256_add_epi32(u[3], rnding);
    u[3] = _mm256_srai_epi32(u[3], bit);

    x = _mm256_mullo_epi32(in[4], cospi46);
    u[4] = _mm256_add_epi32(x, rnding);
    u[4] = _mm256_srai_epi32(u[4], bit);

    x = _mm256_mullo_epi32(in[4], cospi18);
    u[5] = _mm256_sub_epi32(zero, x);
    u[5] = _mm256_add_epi32(u[5], rnding);
    u[5] = _mm256_srai_epi32(u[5], bit);

    x = _mm256_mullo_epi32(in[6], cospi38);
    u[6] = _mm256_add_epi32(x, rnding);
    u[6] = _mm256_srai_epi32(u[6], bit);

    x = _mm256_mullo_epi32(in[6], cospi26);
    u[7] = _mm256_sub_epi32(zero, x);
    u[7] = _mm256_add_epi32(u[7], rnding);
    u[7] = _mm256_srai_epi32(u[7], bit);

    u[8] = _mm256_mullo_epi32(in[7], cospi34);
    u[8] = _mm256_add_epi32(u[8], rnding);
    u[8] = _mm256_srai_epi32(u[8], bit);

    u[9] = _mm256_mullo_epi32(in[7], cospi30);
    u[9] = _mm256_add_epi32(u[9], rnding);
    u[9] = _mm256_srai_epi32(u[9], bit);

    u[10] = _mm256_mullo_epi32(in[5], cospi42);
    u[10] = _mm256_add_epi32(u[10], rnding);
    u[10] = _mm256_srai_epi32(u[10], bit);

    u[11] = _mm256_mullo_epi32(in[5], cospi22);
    u[11] = _mm256_add_epi32(u[11], rnding);
    u[11] = _mm256_srai_epi32(u[11], bit);

    u[12] = _mm256_mullo_epi32(in[3], cospi50);
    u[12] = _mm256_add_epi32(u[12], rnding);
    u[12] = _mm256_srai_epi32(u[12], bit);

    u[13] = _mm256_mullo_epi32(in[3], cospi14);
    u[13] = _mm256_add_epi32(u[13], rnding);
    u[13] = _mm256_srai_epi32(u[13], bit);

    u[14] = _mm256_mullo_epi32(in[1], cospi58);
    u[14] = _mm256_add_epi32(u[14], rnding);
    u[14] = _mm256_srai_epi32(u[14], bit);

    u[15] = _mm256_mullo_epi32(in[1], cospi6);
    u[15] = _mm256_add_epi32(u[15], rnding);
    u[15] = _mm256_srai_epi32(u[15], bit);

    // stage 3
    addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
    addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
    addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
    addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
    addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
    addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
    addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
    addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);

    // stage 4
    y = _mm256_mullo_epi32(u[8], cospi56);
    x = _mm256_mullo_epi32(u[9], cospi56);
    u[8] = _mm256_mullo_epi32(u[8], cospi8);
    u[8] = _mm256_add_epi32(u[8], x);
    u[8] = _mm256_add_epi32(u[8], rnding);
    u[8] = _mm256_srai_epi32(u[8], bit);

    x = _mm256_mullo_epi32(u[9], cospi8);
    u[9] = _mm256_sub_epi32(y, x);
    u[9] = _mm256_add_epi32(u[9], rnding);
    u[9] = _mm256_srai_epi32(u[9], bit);

    x = _mm256_mullo_epi32(u[11], cospi24);
    y = _mm256_mullo_epi32(u[10], cospi24);
    u[10] = _mm256_mullo_epi32(u[10], cospi40);
    u[10] = _mm256_add_epi32(u[10], x);
    u[10] = _mm256_add_epi32(u[10], rnding);
    u[10] = _mm256_srai_epi32(u[10], bit);

    x = _mm256_mullo_epi32(u[11], cospi40);
    u[11] = _mm256_sub_epi32(y, x);
    u[11] = _mm256_add_epi32(u[11], rnding);
    u[11] = _mm256_srai_epi32(u[11], bit);

    x = _mm256_mullo_epi32(u[13], cospi8);
    y = _mm256_mullo_epi32(u[12], cospi8);
    u[12] = _mm256_mullo_epi32(u[12], cospim56);
    u[12] = _mm256_add_epi32(u[12], x);
    u[12] = _mm256_add_epi32(u[12], rnding);
    u[12] = _mm256_srai_epi32(u[12], bit);

    x = _mm256_mullo_epi32(u[13], cospim56);
    u[13] = _mm256_sub_epi32(y, x);
    u[13] = _mm256_add_epi32(u[13], rnding);
    u[13] = _mm256_srai_epi32(u[13], bit);

    x = _mm256_mullo_epi32(u[15], cospi40);
    y = _mm256_mullo_epi32(u[14], cospi40);
    u[14] = _mm256_mullo_epi32(u[14], cospim24);
    u[14] = _mm256_add_epi32(u[14], x);
    u[14] = _mm256_add_epi32(u[14], rnding);
    u[14] = _mm256_srai_epi32(u[14], bit);

    x = _mm256_mullo_epi32(u[15], cospim24);
    u[15] = _mm256_sub_epi32(y, x);
    u[15] = _mm256_add_epi32(u[15], rnding);
    u[15] = _mm256_srai_epi32(u[15], bit);

    // stage 5
    addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
    addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
    addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
    addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
    addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
    addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
    addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
    addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);

    // stage 6
    x = _mm256_mullo_epi32(u[5], cospi48);
    y = _mm256_mullo_epi32(u[4], cospi48);
    u[4] = _mm256_mullo_epi32(u[4], cospi16);
    u[4] = _mm256_add_epi32(u[4], x);
    u[4] = _mm256_add_epi32(u[4], rnding);
    u[4] = _mm256_srai_epi32(u[4], bit);

    x = _mm256_mullo_epi32(u[5], cospi16);
    u[5] = _mm256_sub_epi32(y, x);
    u[5] = _mm256_add_epi32(u[5], rnding);
    u[5] = _mm256_srai_epi32(u[5], bit);

    x = _mm256_mullo_epi32(u[7], cospi16);
    y = _mm256_mullo_epi32(u[6], cospi16);
    u[6] = _mm256_mullo_epi32(u[6], cospim48);
    u[6] = _mm256_add_epi32(u[6], x);
    u[6] = _mm256_add_epi32(u[6], rnding);
    u[6] = _mm256_srai_epi32(u[6], bit);

    x = _mm256_mullo_epi32(u[7], cospim48);
    u[7] = _mm256_sub_epi32(y, x);
    u[7] = _mm256_add_epi32(u[7], rnding);
    u[7] = _mm256_srai_epi32(u[7], bit);

    x = _mm256_mullo_epi32(u[13], cospi48);
    y = _mm256_mullo_epi32(u[12], cospi48);
    u[12] = _mm256_mullo_epi32(u[12], cospi16);
    u[12] = _mm256_add_epi32(u[12], x);
    u[12] = _mm256_add_epi32(u[12], rnding);
    u[12] = _mm256_srai_epi32(u[12], bit);

    x = _mm256_mullo_epi32(u[13], cospi16);
    u[13] = _mm256_sub_epi32(y, x);
    u[13] = _mm256_add_epi32(u[13], rnding);
    u[13] = _mm256_srai_epi32(u[13], bit);

    x = _mm256_mullo_epi32(u[15], cospi16);
    y = _mm256_mullo_epi32(u[14], cospi16);
    u[14] = _mm256_mullo_epi32(u[14], cospim48);
    u[14] = _mm256_add_epi32(u[14], x);
    u[14] = _mm256_add_epi32(u[14], rnding);
    u[14] = _mm256_srai_epi32(u[14], bit);

    x = _mm256_mullo_epi32(u[15], cospim48);
    u[15] = _mm256_sub_epi32(y, x);
    u[15] = _mm256_add_epi32(u[15], rnding);
    u[15] = _mm256_srai_epi32(u[15], bit);

    // stage 7
    addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
    addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
    addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
    addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
    addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
    addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
    addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
    addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);

    // stage 8
    y = _mm256_mullo_epi32(u[2], cospi32);
    x = _mm256_mullo_epi32(u[3], cospi32);
    u[2] = _mm256_add_epi32(y, x);
    u[2] = _mm256_add_epi32(u[2], rnding);
    u[2] = _mm256_srai_epi32(u[2], bit);

    u[3] = _mm256_sub_epi32(y, x);
    u[3] = _mm256_add_epi32(u[3], rnding);
    u[3] = _mm256_srai_epi32(u[3], bit);
    y = _mm256_mullo_epi32(u[6], cospi32);
    x = _mm256_mullo_epi32(u[7], cospi32);
    u[6] = _mm256_add_epi32(y, x);
    u[6] = _mm256_add_epi32(u[6], rnding);
    u[6] = _mm256_srai_epi32(u[6], bit);

    u[7] = _mm256_sub_epi32(y, x);
    u[7] = _mm256_add_epi32(u[7], rnding);
    u[7] = _mm256_srai_epi32(u[7], bit);

    y = _mm256_mullo_epi32(u[10], cospi32);
    x = _mm256_mullo_epi32(u[11], cospi32);
    u[10] = _mm256_add_epi32(y, x);
    u[10] = _mm256_add_epi32(u[10], rnding);
    u[10] = _mm256_srai_epi32(u[10], bit);

    u[11] = _mm256_sub_epi32(y, x);
    u[11] = _mm256_add_epi32(u[11], rnding);
    u[11] = _mm256_srai_epi32(u[11], bit);

    y = _mm256_mullo_epi32(u[14], cospi32);
    x = _mm256_mullo_epi32(u[15], cospi32);
    u[14] = _mm256_add_epi32(y, x);
    u[14] = _mm256_add_epi32(u[14], rnding);
    u[14] = _mm256_srai_epi32(u[14], bit);

    u[15] = _mm256_sub_epi32(y, x);
    u[15] = _mm256_add_epi32(u[15], rnding);
    u[15] = _mm256_srai_epi32(u[15], bit);

    // stage 9
    if (do_cols) {
      out[0] = u[0];
      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]);
      out[2] = u[12];
      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]);
      out[4] = u[6];
      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]);
      out[6] = u[10];
      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]);
      out[8] = u[3];
      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]);
      out[10] = u[15];
      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]);
      out[12] = u[5];
      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]);
      out[14] = u[9];
      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]);
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m256i clamp_lo_out =
          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
      const __m256i clamp_hi_out =
          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);

      neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
      neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    }
  }
}
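/* Scalar model of addsub_avx2(), the clamped butterfly used at every
 * add/subtract stage above: both the sum and the difference are saturated to
 * the current [lo, hi] range (addsub_model is a hypothetical name; the real
 * helper works on eight lanes via _mm256_max_epi32/_mm256_min_epi32):
 */
#if 0  // illustrative sketch, not compiled
static void addsub_model(int32_t a, int32_t b, int32_t *out0, int32_t *out1,
                         int32_t lo, int32_t hi) {
  const int64_t sum = (int64_t)a + b;
  const int64_t dif = (int64_t)a - b;
  *out0 = (int32_t)(sum < lo ? lo : (sum > hi ? hi : sum));
  *out1 = (int32_t)(dif < lo ? lo : (dif > hi ? hi : dif));
}
#endif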
static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                         int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  __m256i u[16], v[16], x, y;

  {
    // stage 0
    // stage 1
    // stage 2
    v[0] = _mm256_mullo_epi32(in[15], cospi2);
    x = _mm256_mullo_epi32(in[0], cospi62);
    v[0] = _mm256_add_epi32(v[0], x);
    v[0] = _mm256_add_epi32(v[0], rnding);
    v[0] = _mm256_srai_epi32(v[0], bit);

    v[1] = _mm256_mullo_epi32(in[15], cospi62);
    x = _mm256_mullo_epi32(in[0], cospi2);
    v[1] = _mm256_sub_epi32(v[1], x);
    v[1] = _mm256_add_epi32(v[1], rnding);
    v[1] = _mm256_srai_epi32(v[1], bit);

    v[2] = _mm256_mullo_epi32(in[13], cospi10);
    x = _mm256_mullo_epi32(in[2], cospi54);
    v[2] = _mm256_add_epi32(v[2], x);
    v[2] = _mm256_add_epi32(v[2], rnding);
    v[2] = _mm256_srai_epi32(v[2], bit);

    v[3] = _mm256_mullo_epi32(in[13], cospi54);
    x = _mm256_mullo_epi32(in[2], cospi10);
    v[3] = _mm256_sub_epi32(v[3], x);
    v[3] = _mm256_add_epi32(v[3], rnding);
    v[3] = _mm256_srai_epi32(v[3], bit);

    v[4] = _mm256_mullo_epi32(in[11], cospi18);
    x = _mm256_mullo_epi32(in[4], cospi46);
    v[4] = _mm256_add_epi32(v[4], x);
    v[4] = _mm256_add_epi32(v[4], rnding);
    v[4] = _mm256_srai_epi32(v[4], bit);

    v[5] = _mm256_mullo_epi32(in[11], cospi46);
    x = _mm256_mullo_epi32(in[4], cospi18);
    v[5] = _mm256_sub_epi32(v[5], x);
    v[5] = _mm256_add_epi32(v[5], rnding);
    v[5] = _mm256_srai_epi32(v[5], bit);

    v[6] = _mm256_mullo_epi32(in[9], cospi26);
    x = _mm256_mullo_epi32(in[6], cospi38);
    v[6] = _mm256_add_epi32(v[6], x);
    v[6] = _mm256_add_epi32(v[6], rnding);
    v[6] = _mm256_srai_epi32(v[6], bit);

    v[7] = _mm256_mullo_epi32(in[9], cospi38);
    x = _mm256_mullo_epi32(in[6], cospi26);
    v[7] = _mm256_sub_epi32(v[7], x);
    v[7] = _mm256_add_epi32(v[7], rnding);
    v[7] = _mm256_srai_epi32(v[7], bit);

    v[8] = _mm256_mullo_epi32(in[7], cospi34);
    x = _mm256_mullo_epi32(in[8], cospi30);
    v[8] = _mm256_add_epi32(v[8], x);
    v[8] = _mm256_add_epi32(v[8], rnding);
    v[8] = _mm256_srai_epi32(v[8], bit);

    v[9] = _mm256_mullo_epi32(in[7], cospi30);
    x = _mm256_mullo_epi32(in[8], cospi34);
    v[9] = _mm256_sub_epi32(v[9], x);
    v[9] = _mm256_add_epi32(v[9], rnding);
    v[9] = _mm256_srai_epi32(v[9], bit);

    v[10] = _mm256_mullo_epi32(in[5], cospi42);
    x = _mm256_mullo_epi32(in[10], cospi22);
    v[10] = _mm256_add_epi32(v[10], x);
    v[10] = _mm256_add_epi32(v[10], rnding);
    v[10] = _mm256_srai_epi32(v[10], bit);

    v[11] = _mm256_mullo_epi32(in[5], cospi22);
    x = _mm256_mullo_epi32(in[10], cospi42);
    v[11] = _mm256_sub_epi32(v[11], x);
    v[11] = _mm256_add_epi32(v[11], rnding);
    v[11] = _mm256_srai_epi32(v[11], bit);

    v[12] = _mm256_mullo_epi32(in[3], cospi50);
    x = _mm256_mullo_epi32(in[12], cospi14);
    v[12] = _mm256_add_epi32(v[12], x);
    v[12] = _mm256_add_epi32(v[12], rnding);
    v[12] = _mm256_srai_epi32(v[12], bit);

    v[13] = _mm256_mullo_epi32(in[3], cospi14);
    x = _mm256_mullo_epi32(in[12], cospi50);
    v[13] = _mm256_sub_epi32(v[13], x);
    v[13] = _mm256_add_epi32(v[13], rnding);
    v[13] = _mm256_srai_epi32(v[13], bit);

    v[14] = _mm256_mullo_epi32(in[1], cospi58);
    x = _mm256_mullo_epi32(in[14], cospi6);
    v[14] = _mm256_add_epi32(v[14], x);
    v[14] = _mm256_add_epi32(v[14], rnding);
    v[14] = _mm256_srai_epi32(v[14], bit);

    v[15] = _mm256_mullo_epi32(in[1], cospi6);
    x = _mm256_mullo_epi32(in[14], cospi58);
    v[15] = _mm256_sub_epi32(v[15], x);
    v[15] = _mm256_add_epi32(v[15], rnding);
    v[15] = _mm256_srai_epi32(v[15], bit);

    // stage 3
    addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
    addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
    addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
    addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
    addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
    addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
    addsub_avx2(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
    addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);

    // stage 4
    v[0] = u[0];
    v[1] = u[1];
    v[2] = u[2];
    v[3] = u[3];
    v[4] = u[4];
    v[5] = u[5];
    v[6] = u[6];
    v[7] = u[7];

    v[8] = _mm256_mullo_epi32(u[8], cospi8);
    x = _mm256_mullo_epi32(u[9], cospi56);
    v[8] = _mm256_add_epi32(v[8], x);
    v[8] = _mm256_add_epi32(v[8], rnding);
    v[8] = _mm256_srai_epi32(v[8], bit);

    v[9] = _mm256_mullo_epi32(u[8], cospi56);
    x = _mm256_mullo_epi32(u[9], cospi8);
    v[9] = _mm256_sub_epi32(v[9], x);
    v[9] = _mm256_add_epi32(v[9], rnding);
    v[9] = _mm256_srai_epi32(v[9], bit);

    v[10] = _mm256_mullo_epi32(u[10], cospi40);
    x = _mm256_mullo_epi32(u[11], cospi24);
    v[10] = _mm256_add_epi32(v[10], x);
    v[10] = _mm256_add_epi32(v[10], rnding);
    v[10] = _mm256_srai_epi32(v[10], bit);

    v[11] = _mm256_mullo_epi32(u[10], cospi24);
    x = _mm256_mullo_epi32(u[11], cospi40);
    v[11] = _mm256_sub_epi32(v[11], x);
    v[11] = _mm256_add_epi32(v[11], rnding);
    v[11] = _mm256_srai_epi32(v[11], bit);

    v[12] = _mm256_mullo_epi32(u[12], cospim56);
    x = _mm256_mullo_epi32(u[13], cospi8);
    v[12] = _mm256_add_epi32(v[12], x);
    v[12] = _mm256_add_epi32(v[12], rnding);
    v[12] = _mm256_srai_epi32(v[12], bit);

    v[13] = _mm256_mullo_epi32(u[12], cospi8);
    x = _mm256_mullo_epi32(u[13], cospim56);
    v[13] = _mm256_sub_epi32(v[13], x);
    v[13] = _mm256_add_epi32(v[13], rnding);
    v[13] = _mm256_srai_epi32(v[13], bit);

    v[14] = _mm256_mullo_epi32(u[14], cospim24);
    x = _mm256_mullo_epi32(u[15], cospi40);
    v[14] = _mm256_add_epi32(v[14], x);
    v[14] = _mm256_add_epi32(v[14], rnding);
    v[14] = _mm256_srai_epi32(v[14], bit);

    v[15] = _mm256_mullo_epi32(u[14], cospi40);
    x = _mm256_mullo_epi32(u[15], cospim24);
    v[15] = _mm256_sub_epi32(v[15], x);
    v[15] = _mm256_add_epi32(v[15], rnding);
    v[15] = _mm256_srai_epi32(v[15], bit);

    // stage 5
    addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
    addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
    addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
    addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
    addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
    addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
    addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
    addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);

    // stage 6
    v[0] = u[0];
    v[1] = u[1];
    v[2] = u[2];
    v[3] = u[3];
    v[4] = _mm256_mullo_epi32(u[4], cospi16);
    x = _mm256_mullo_epi32(u[5], cospi48);
    v[4] = _mm256_add_epi32(v[4], x);
    v[4] = _mm256_add_epi32(v[4], rnding);
    v[4] = _mm256_srai_epi32(v[4], bit);

    v[5] = _mm256_mullo_epi32(u[4], cospi48);
    x = _mm256_mullo_epi32(u[5], cospi16);
    v[5] = _mm256_sub_epi32(v[5], x);
    v[5] = _mm256_add_epi32(v[5], rnding);
    v[5] = _mm256_srai_epi32(v[5], bit);

    v[6] = _mm256_mullo_epi32(u[6], cospim48);
    x = _mm256_mullo_epi32(u[7], cospi16);
    v[6] = _mm256_add_epi32(v[6], x);
    v[6] = _mm256_add_epi32(v[6], rnding);
    v[6] = _mm256_srai_epi32(v[6], bit);

    v[7] = _mm256_mullo_epi32(u[6], cospi16);
    x = _mm256_mullo_epi32(u[7], cospim48);
    v[7] = _mm256_sub_epi32(v[7], x);
    v[7] = _mm256_add_epi32(v[7], rnding);
    v[7] = _mm256_srai_epi32(v[7], bit);

    v[8] = u[8];
    v[9] = u[9];
    v[10] = u[10];
    v[11] = u[11];

    v[12] = _mm256_mullo_epi32(u[12], cospi16);
    x = _mm256_mullo_epi32(u[13], cospi48);
    v[12] = _mm256_add_epi32(v[12], x);
    v[12] = _mm256_add_epi32(v[12], rnding);
    v[12] = _mm256_srai_epi32(v[12], bit);

    v[13] = _mm256_mullo_epi32(u[12], cospi48);
    x = _mm256_mullo_epi32(u[13], cospi16);
    v[13] = _mm256_sub_epi32(v[13], x);
    v[13] = _mm256_add_epi32(v[13], rnding);
    v[13] = _mm256_srai_epi32(v[13], bit);

    v[14] = _mm256_mullo_epi32(u[14], cospim48);
    x = _mm256_mullo_epi32(u[15], cospi16);
    v[14] = _mm256_add_epi32(v[14], x);
    v[14] = _mm256_add_epi32(v[14], rnding);
    v[14] = _mm256_srai_epi32(v[14], bit);

    v[15] = _mm256_mullo_epi32(u[14], cospi16);
    x = _mm256_mullo_epi32(u[15], cospim48);
    v[15] = _mm256_sub_epi32(v[15], x);
    v[15] = _mm256_add_epi32(v[15], rnding);
    v[15] = _mm256_srai_epi32(v[15], bit);

    // stage 7
    addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
    addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
    addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
    addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
    addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
    addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
    addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
    addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);

    // stage 8
    v[0] = u[0];
    v[1] = u[1];

    y = _mm256_mullo_epi32(u[2], cospi32);
    x = _mm256_mullo_epi32(u[3], cospi32);
    v[2] = _mm256_add_epi32(y, x);
    v[2] = _mm256_add_epi32(v[2], rnding);
    v[2] = _mm256_srai_epi32(v[2], bit);

    v[3] = _mm256_sub_epi32(y, x);
    v[3] = _mm256_add_epi32(v[3], rnding);
    v[3] = _mm256_srai_epi32(v[3], bit);

    v[4] = u[4];
    v[5] = u[5];

    y = _mm256_mullo_epi32(u[6], cospi32);
    x = _mm256_mullo_epi32(u[7], cospi32);
    v[6] = _mm256_add_epi32(y, x);
    v[6] = _mm256_add_epi32(v[6], rnding);
    v[6] = _mm256_srai_epi32(v[6], bit);

    v[7] = _mm256_sub_epi32(y, x);
    v[7] = _mm256_add_epi32(v[7], rnding);
    v[7] = _mm256_srai_epi32(v[7], bit);

    v[8] = u[8];
    v[9] = u[9];

    y = _mm256_mullo_epi32(u[10], cospi32);
    x = _mm256_mullo_epi32(u[11], cospi32);
    v[10] = _mm256_add_epi32(y, x);
    v[10] = _mm256_add_epi32(v[10], rnding);
    v[10] = _mm256_srai_epi32(v[10], bit);

    v[11] = _mm256_sub_epi32(y, x);
    v[11] = _mm256_add_epi32(v[11], rnding);
    v[11] = _mm256_srai_epi32(v[11], bit);

    v[12] = u[12];
    v[13] = u[13];

    y = _mm256_mullo_epi32(u[14], cospi32);
    x = _mm256_mullo_epi32(u[15], cospi32);
    v[14] = _mm256_add_epi32(y, x);
    v[14] = _mm256_add_epi32(v[14], rnding);
    v[14] = _mm256_srai_epi32(v[14], bit);

    v[15] = _mm256_sub_epi32(y, x);
    v[15] = _mm256_add_epi32(v[15], rnding);
    v[15] = _mm256_srai_epi32(v[15], bit);

    // stage 9
    if (do_cols) {
      out[0] = v[0];
      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
      out[2] = v[12];
      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
      out[4] = v[6];
      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
      out[6] = v[10];
      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
      out[8] = v[3];
      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
      out[10] = v[15];
      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
      out[12] = v[5];
      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
      out[14] = v[9];
      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m256i clamp_lo_out =
          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
      const __m256i clamp_hi_out =
          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);

      neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
      neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    }
  }
}
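/* Stage 9 of the iadst16 paths writes the results in a sign-alternating
 * permutation: even outputs are copied, odd outputs are negated, matching the
 * AV1 iadst output convention. For the row pass, neg_shift_avx2() fuses that
 * negation with the rounding shift and output clamp; a per-lane model
 * (neg_shift_model is a hypothetical name):
 */
#if 0  // illustrative sketch, not compiled
static void neg_shift_model(int32_t in0, int32_t in1, int32_t *out0,
                            int32_t *out1, int32_t lo, int32_t hi, int shift) {
  const int32_t offset = (1 << shift) >> 1;
  const int32_t a0 = (in0 + offset) >> shift;  // round(in0)
  const int32_t a1 = (offset - in1) >> shift;  // round(-in1): negation fused in
  *out0 = a0 < lo ? lo : (a0 > hi ? hi : a0);
  *out1 = a1 < lo ? lo : (a1 > hi ? hi : a1);
}
#endif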
static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  __m256i x;

  // stage 0
  // stage 1
  // stage 2
  // stage 3
  x = _mm256_mullo_epi32(in[0], cospi32);
  x = _mm256_add_epi32(x, rnding);
  x = _mm256_srai_epi32(x, bit);

  // stage 4
  // stage 5
  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
    clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
    x = _mm256_add_epi32(x, offset);
    x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
  }
  x = _mm256_max_epi32(x, clamp_lo);
  x = _mm256_min_epi32(x, clamp_hi);
  out[0] = x;
  out[1] = x;
  out[2] = x;
  out[3] = x;
  out[4] = x;
  out[5] = x;
  out[6] = x;
  out[7] = x;
}
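/* When only the DC coefficient is nonzero, the whole 8-point inverse DCT
 * above reduces to one scaling by cospi[32] (i.e. cos(pi/4) in fixed point),
 * whose result is broadcast to every output. Scalar model of the broadcast
 * value (idct8_dc_model is a hypothetical name):
 */
#if 0  // illustrative sketch, not compiled
static int32_t idct8_dc_model(int32_t in0, int bit) {
  const int32_t *cospi = cospi_arr(bit);
  return (int32_t)(((int64_t)in0 * cospi[32] + (1LL << (bit - 1))) >> bit);
}
#endif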
static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                         int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
  __m256i x, y;

  // stage 0
  // stage 1
  // stage 2
  u0 = in[0];
  u1 = in[4];
  u2 = in[2];
  u3 = in[6];

  x = _mm256_mullo_epi32(in[1], cospi56);
  y = _mm256_mullo_epi32(in[7], cospim8);
  u4 = _mm256_add_epi32(x, y);
  u4 = _mm256_add_epi32(u4, rnding);
  u4 = _mm256_srai_epi32(u4, bit);

  x = _mm256_mullo_epi32(in[1], cospi8);
  y = _mm256_mullo_epi32(in[7], cospi56);
  u7 = _mm256_add_epi32(x, y);
  u7 = _mm256_add_epi32(u7, rnding);
  u7 = _mm256_srai_epi32(u7, bit);

  x = _mm256_mullo_epi32(in[5], cospi24);
  y = _mm256_mullo_epi32(in[3], cospim40);
  u5 = _mm256_add_epi32(x, y);
  u5 = _mm256_add_epi32(u5, rnding);
  u5 = _mm256_srai_epi32(u5, bit);

  x = _mm256_mullo_epi32(in[5], cospi40);
  y = _mm256_mullo_epi32(in[3], cospi24);
  u6 = _mm256_add_epi32(x, y);
  u6 = _mm256_add_epi32(u6, rnding);
  u6 = _mm256_srai_epi32(u6, bit);

  // stage 3
  x = _mm256_mullo_epi32(u0, cospi32);
  y = _mm256_mullo_epi32(u1, cospi32);
  v0 = _mm256_add_epi32(x, y);
  v0 = _mm256_add_epi32(v0, rnding);
  v0 = _mm256_srai_epi32(v0, bit);

  v1 = _mm256_sub_epi32(x, y);
  v1 = _mm256_add_epi32(v1, rnding);
  v1 = _mm256_srai_epi32(v1, bit);

  x = _mm256_mullo_epi32(u2, cospi48);
  y = _mm256_mullo_epi32(u3, cospim16);
  v2 = _mm256_add_epi32(x, y);
  v2 = _mm256_add_epi32(v2, rnding);
  v2 = _mm256_srai_epi32(v2, bit);

  x = _mm256_mullo_epi32(u2, cospi16);
  y = _mm256_mullo_epi32(u3, cospi48);
  v3 = _mm256_add_epi32(x, y);
  v3 = _mm256_add_epi32(v3, rnding);
  v3 = _mm256_srai_epi32(v3, bit);

  addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
  addsub_avx2(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

  // stage 4
  addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
  addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
  u4 = v4;
  u7 = v7;

  x = _mm256_mullo_epi32(v5, cospi32);
  y = _mm256_mullo_epi32(v6, cospi32);
  u6 = _mm256_add_epi32(y, x);
  u6 = _mm256_add_epi32(u6, rnding);
  u6 = _mm256_srai_epi32(u6, bit);

  u5 = _mm256_sub_epi32(y, x);
  u5 = _mm256_add_epi32(u5, rnding);
  u5 = _mm256_srai_epi32(u5, bit);

  addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
  addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
  addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
  addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
  // stage 5
  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    const __m256i clamp_hi_out =
        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);

    round_shift_4x4_avx2(out, out_shift);
    round_shift_4x4_avx2(out + 4, out_shift);
    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8);
  }
}
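/* The cospi32 pairs above (stage 3 and stage 4) exploit the fact that a
 * rotation by 45 degrees uses the same weight for both inputs, so two
 * multiplies serve both outputs: out0 = round((x + y) * cospi[32]) and
 * out1 = round((x - y) * cospi[32]). Scalar model (cospi32_butterfly_model is
 * a hypothetical name):
 */
#if 0  // illustrative sketch, not compiled
static void cospi32_butterfly_model(int32_t x, int32_t y, int bit,
                                    int32_t *out0, int32_t *out1) {
  const int32_t *cospi = cospi_arr(bit);
  const int64_t px = (int64_t)x * cospi[32];
  const int64_t py = (int64_t)y * cospi[32];
  const int64_t rnd = 1LL << (bit - 1);
  *out0 = (int32_t)((px + py + rnd) >> bit);
  *out1 = (int32_t)((px - py + rnd) >> bit);
}
#endif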
static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                               int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const __m256i kZero = _mm256_setzero_si256();
  __m256i u[8], x;

  // stage 0
  // stage 1
  // stage 2

  x = _mm256_mullo_epi32(in[0], cospi60);
  u[0] = _mm256_add_epi32(x, rnding);
  u[0] = _mm256_srai_epi32(u[0], bit);

  x = _mm256_mullo_epi32(in[0], cospi4);
  u[1] = _mm256_sub_epi32(kZero, x);
  u[1] = _mm256_add_epi32(u[1], rnding);
  u[1] = _mm256_srai_epi32(u[1], bit);

  // stage 3
  // stage 4
  __m256i temp1, temp2;
  temp1 = _mm256_mullo_epi32(u[0], cospi16);
  x = _mm256_mullo_epi32(u[1], cospi48);
  temp1 = _mm256_add_epi32(temp1, x);
  temp1 = _mm256_add_epi32(temp1, rnding);
  temp1 = _mm256_srai_epi32(temp1, bit);
  u[4] = temp1;

  temp2 = _mm256_mullo_epi32(u[0], cospi48);
  x = _mm256_mullo_epi32(u[1], cospi16);
  u[5] = _mm256_sub_epi32(temp2, x);
  u[5] = _mm256_add_epi32(u[5], rnding);
  u[5] = _mm256_srai_epi32(u[5], bit);

  // stage 5
  // stage 6
  temp1 = _mm256_mullo_epi32(u[0], cospi32);
  x = _mm256_mullo_epi32(u[1], cospi32);
  u[2] = _mm256_add_epi32(temp1, x);
  u[2] = _mm256_add_epi32(u[2], rnding);
  u[2] = _mm256_srai_epi32(u[2], bit);

  u[3] = _mm256_sub_epi32(temp1, x);
  u[3] = _mm256_add_epi32(u[3], rnding);
  u[3] = _mm256_srai_epi32(u[3], bit);

  temp1 = _mm256_mullo_epi32(u[4], cospi32);
  x = _mm256_mullo_epi32(u[5], cospi32);
  u[6] = _mm256_add_epi32(temp1, x);
  u[6] = _mm256_add_epi32(u[6], rnding);
  u[6] = _mm256_srai_epi32(u[6], bit);

  u[7] = _mm256_sub_epi32(temp1, x);
  u[7] = _mm256_add_epi32(u[7], rnding);
  u[7] = _mm256_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[1] = _mm256_sub_epi32(kZero, u[4]);
    out[2] = u[6];
    out[3] = _mm256_sub_epi32(kZero, u[2]);
    out[4] = u[3];
    out[5] = _mm256_sub_epi32(kZero, u[7]);
    out[6] = u[5];
    out[7] = _mm256_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    const __m256i clamp_hi_out =
        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                   out_shift);
    neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
                   out_shift);
    neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
                   out_shift);
    neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
                   out_shift);
  }
}
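/* Both iadst8x8 paths share the same stage-7 output mapping: source indices
 * (0, 4, 6, 2, 3, 7, 5, 1) with every odd output negated. A table sketch of
 * that mapping (iadst8_out_map is a hypothetical name, written out only to
 * make the out[] assignments above and below easier to follow):
 */
#if 0  // illustrative sketch, not compiled
static const struct {
  int src;     // index into u[]
  int negate;  // 1: output is -u[src]
} iadst8_out_map[8] = {
  { 0, 0 }, { 4, 1 }, { 6, 0 }, { 2, 1 },
  { 3, 0 }, { 7, 1 }, { 5, 0 }, { 1, 1 },
};
#endif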
static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                          int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const __m256i kZero = _mm256_setzero_si256();
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  __m256i u[8], v[8], x;

  // stage 0
  // stage 1
  // stage 2

  u[0] = _mm256_mullo_epi32(in[7], cospi4);
  x = _mm256_mullo_epi32(in[0], cospi60);
  u[0] = _mm256_add_epi32(u[0], x);
  u[0] = _mm256_add_epi32(u[0], rnding);
  u[0] = _mm256_srai_epi32(u[0], bit);

  u[1] = _mm256_mullo_epi32(in[7], cospi60);
  x = _mm256_mullo_epi32(in[0], cospi4);
  u[1] = _mm256_sub_epi32(u[1], x);
  u[1] = _mm256_add_epi32(u[1], rnding);
  u[1] = _mm256_srai_epi32(u[1], bit);

  u[2] = _mm256_mullo_epi32(in[5], cospi20);
  x = _mm256_mullo_epi32(in[2], cospi44);
  u[2] = _mm256_add_epi32(u[2], x);
  u[2] = _mm256_add_epi32(u[2], rnding);
  u[2] = _mm256_srai_epi32(u[2], bit);

  u[3] = _mm256_mullo_epi32(in[5], cospi44);
  x = _mm256_mullo_epi32(in[2], cospi20);
  u[3] = _mm256_sub_epi32(u[3], x);
  u[3] = _mm256_add_epi32(u[3], rnding);
  u[3] = _mm256_srai_epi32(u[3], bit);

  u[4] = _mm256_mullo_epi32(in[3], cospi36);
  x = _mm256_mullo_epi32(in[4], cospi28);
  u[4] = _mm256_add_epi32(u[4], x);
  u[4] = _mm256_add_epi32(u[4], rnding);
  u[4] = _mm256_srai_epi32(u[4], bit);

  u[5] = _mm256_mullo_epi32(in[3], cospi28);
  x = _mm256_mullo_epi32(in[4], cospi36);
  u[5] = _mm256_sub_epi32(u[5], x);
  u[5] = _mm256_add_epi32(u[5], rnding);
  u[5] = _mm256_srai_epi32(u[5], bit);

  u[6] = _mm256_mullo_epi32(in[1], cospi52);
  x = _mm256_mullo_epi32(in[6], cospi12);
  u[6] = _mm256_add_epi32(u[6], x);
  u[6] = _mm256_add_epi32(u[6], rnding);
  u[6] = _mm256_srai_epi32(u[6], bit);

  u[7] = _mm256_mullo_epi32(in[1], cospi12);
  x = _mm256_mullo_epi32(in[6], cospi52);
  u[7] = _mm256_sub_epi32(u[7], x);
  u[7] = _mm256_add_epi32(u[7], rnding);
  u[7] = _mm256_srai_epi32(u[7], bit);

  // stage 3
  addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_avx2(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm256_mullo_epi32(v[4], cospi16);
  x = _mm256_mullo_epi32(v[5], cospi48);
  u[4] = _mm256_add_epi32(u[4], x);
  u[4] = _mm256_add_epi32(u[4], rnding);
  u[4] = _mm256_srai_epi32(u[4], bit);

  u[5] = _mm256_mullo_epi32(v[4], cospi48);
  x = _mm256_mullo_epi32(v[5], cospi16);
  u[5] = _mm256_sub_epi32(u[5], x);
  u[5] = _mm256_add_epi32(u[5], rnding);
  u[5] = _mm256_srai_epi32(u[5], bit);

  u[6] = _mm256_mullo_epi32(v[6], cospim48);
  x = _mm256_mullo_epi32(v[7], cospi16);
  u[6] = _mm256_add_epi32(u[6], x);
  u[6] = _mm256_add_epi32(u[6], rnding);
  u[6] = _mm256_srai_epi32(u[6], bit);

  u[7] = _mm256_mullo_epi32(v[6], cospi16);
  x = _mm256_mullo_epi32(v[7], cospim48);
  u[7] = _mm256_sub_epi32(u[7], x);
  u[7] = _mm256_add_epi32(u[7], rnding);
  u[7] = _mm256_srai_epi32(u[7], bit);

  // stage 5
  addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm256_mullo_epi32(v[2], cospi32);
  x = _mm256_mullo_epi32(v[3], cospi32);
  u[2] = _mm256_add_epi32(v[0], x);
  u[2] = _mm256_add_epi32(u[2], rnding);
  u[2] = _mm256_srai_epi32(u[2], bit);

  u[3] = _mm256_sub_epi32(v[0], x);
  u[3] = _mm256_add_epi32(u[3], rnding);
  u[3] = _mm256_srai_epi32(u[3], bit);

  v[0] = _mm256_mullo_epi32(v[6], cospi32);
  x = _mm256_mullo_epi32(v[7], cospi32);
  u[6] = _mm256_add_epi32(v[0], x);
  u[6] = _mm256_add_epi32(u[6], rnding);
  u[6] = _mm256_srai_epi32(u[6], bit);

  u[7] = _mm256_sub_epi32(v[0], x);
  u[7] = _mm256_add_epi32(u[7], rnding);
  u[7] = _mm256_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[1] = _mm256_sub_epi32(kZero, u[4]);
    out[2] = u[6];
    out[3] = _mm256_sub_epi32(kZero, u[2]);
    out[4] = u[3];
    out[5] = _mm256_sub_epi32(kZero, u[7]);
    out[6] = u[5];
    out[7] = _mm256_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    const __m256i clamp_hi_out =
        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                   out_shift);
    neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
                   out_shift);
    neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
                   out_shift);
    neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
                   out_shift);
  }
}
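/* The idct64 stage helpers below pair mirrored butterflies with XOR index
 * arithmetic instead of a second loop variable: for i in [16, 20), i ^ 7
 * walks 23..20, i ^ 8 walks 24..27, and i ^ 15 walks 31..28. A small check of
 * those identities (xor_pairing_check is a hypothetical name):
 */
#if 0  // illustrative sketch, not compiled
static void xor_pairing_check(void) {
  for (int i = 16; i < 20; ++i) {
    assert((i ^ 7) == 39 - i);   // 23, 22, 21, 20
    assert((i ^ 8) == i + 8);    // 24, 25, 26, 27
    assert((i ^ 15) == 47 - i);  // 31, 30, 29, 28
  }
}
#endif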
static inline void idct64_stage8_avx2(
    __m256i *u, const __m256i *cospim32, const __m256i *cospi32,
    const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
    const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
    const __m256i *rnding, int bit) {
  int i;
  __m256i temp1, temp2, temp3, temp4;
  temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit);
  u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit);
  u[10] = temp1;
  temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit);
  u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit);
  u[11] = temp2;

  for (i = 16; i < 20; ++i) {
    addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
    addsub_avx2(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
  }

  temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit);
  temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit);
  temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit);
  temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit);
  u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit);
  u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit);
  u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit);
  u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit);
  u[36] = temp1;
  u[37] = temp2;
  u[38] = temp3;
  u[39] = temp4;

  temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit);
  temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit);
  temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit);
  temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit);
  u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit);
  u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit);
  u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit);
  u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit);
  u[40] = temp1;
  u[41] = temp2;
  u[42] = temp3;
  u[43] = temp4;
}

static inline void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32,
                                      const __m256i *cospi32,
                                      const __m256i *clamp_lo,
                                      const __m256i *clamp_hi,
                                      const __m256i *rnding, int bit) {
  int i;
  __m256i temp1, temp2, temp3, temp4;
  for (i = 0; i < 8; ++i) {
    addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
  }

  temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit);
  temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit);
  temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit);
  temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit);
  u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit);
  u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit);
  u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit);
  u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit);
  u[20] = temp1;
  u[21] = temp2;
  u[22] = temp3;
  u[23] = temp4;
  for (i = 32; i < 40; i++) {
    addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
  }

  for (i = 48; i < 56; i++) {
    addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
  }
}
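/* Stages 9 through 11 all follow the same mirrored pattern: element i is
 * butterflied against its reflection (15 - i, then 31 - i, then 63 - i as the
 * halves merge), with saturation at each step. Scalar shape of one such pass
 * (mirror_butterfly_model is a hypothetical name; clamping omitted):
 */
#if 0  // illustrative sketch, not compiled
static void mirror_butterfly_model(int32_t *u, int n) {
  for (int i = 0; i < n / 2; ++i) {
    const int32_t a = u[i], b = u[n - 1 - i];
    u[i] = a + b;  // the real code saturates both results via addsub_avx2()
    u[n - 1 - i] = a - b;
  }
}
#endif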
static inline void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32,
                                       const __m256i *cospi32,
                                       const __m256i *clamp_lo,
                                       const __m256i *clamp_hi,
                                       const __m256i *rnding, int bit) {
  __m256i temp1, temp2, temp3, temp4;
  for (int i = 0; i < 16; i++) {
    addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
  }

  temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit);
  temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit);
  temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit);
  temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit);
  u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit);
  u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit);
  u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit);
  u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit);
  u[40] = temp1;
  u[41] = temp2;
  u[42] = temp3;
  u[43] = temp4;

  temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit);
  temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit);
  temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit);
  temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit);
  u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit);
  u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit);
  u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit);
  u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit);
  u[44] = temp1;
  u[45] = temp2;
  u[46] = temp3;
  u[47] = temp4;
}

static inline void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols,
                                       int bd, int out_shift,
                                       const __m256i *clamp_lo,
                                       const __m256i *clamp_hi) {
  for (int i = 0; i < 32; i++) {
    addsub_avx2(u[i], u[63 - i], &out[i], &out[63 - i], clamp_lo, clamp_hi);
  }

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m256i clamp_lo_out =
        _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    const __m256i clamp_hi_out =
        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);

    round_shift_8x8_avx2(out, out_shift);
    round_shift_8x8_avx2(out + 16, out_shift);
    round_shift_8x8_avx2(out + 32, out_shift);
    round_shift_8x8_avx2(out + 48, out_shift);
    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
  }
}
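
// Reduced-workload idct64 kernels. The dispatch table further down
// (highbd_txfm_all_1d_zeros_w8_arr, indexed via
// lowbd_txfm_all_1d_zeros_idx[eob]) picks one of four variants: the
// _low1/_low8/_low16 versions assume only the first 1/8/16 inputs can be
// nonzero and skip every butterfly whose operands are known to be zero.
// With just the DC coefficient the whole transform collapses to a single
// multiply by cospi[32], broadcast to all 64 outputs.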
static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                             int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);

  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);

  {
    __m256i x;

    // stage 1
    // stage 2
    // stage 3
    // stage 4
    // stage 5
    // stage 6
    x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit);

    // stage 7
    // stage 8
    // stage 9
    // stage 10
    // stage 11
    if (!do_cols) {
      const int log_range_out = AOMMAX(16, bd + 6);
      clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
      clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
      if (out_shift != 0) {
        __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
        x = _mm256_add_epi32(x, offset);
        x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
      }
    }
    x = _mm256_max_epi32(x, clamp_lo);
    x = _mm256_min_epi32(x, clamp_hi);
    // Only the DC term is nonzero, so every output carries the same value.
    for (int i = 0; i < 64; ++i) out[i] = x;
  }
}

static void idct64_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                             int bd, int out_shift) {
  int i, j;
  const int32_t *cospi = cospi_arr(bit);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);

  const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
  const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
  const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
  const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
  const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
  const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);

  {
    __m256i u[64];

    // stage 1
    u[0] = in[0];
    u[8] = in[4];
    u[16] = in[2];
    u[24] = in[6];
    u[32] = in[1];
    u[40] = in[5];
    u[48] = in[3];
    u[56] = in[7];

    // stage 2
    u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
    u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
    u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
    u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
    u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
    u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
    u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
    u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);

    // stage 3
    u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
    u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
    u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
    u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
    u[33] = u[32];
    u[38] = u[39];
    u[41] = u[40];
    u[46] = u[47];
    u[49] = u[48];
    u[54] = u[55];
    u[57] = u[56];
    u[62] = u[63];

    // stage 4
    __m256i temp1, temp2;
    u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
    u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
    u[17] = u[16];
    u[22] = u[23];
    u[25] = u[24];
    u[30] = u[31];

    temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
    u[33] = temp1;

    temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    u[57] = temp2;

    temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    u[41] = temp1;

    temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    u[46] = temp2;

    // stage 5
    u[9] = u[8];
    u[14] = u[15];

    temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
    u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
    u[17] = temp1;

    temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
    u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
    u[22] = temp2;

    u[35] = u[32];
    u[34] = u[33];
    u[36] = u[39];
    u[37] = u[38];
    u[43] = u[40];
    u[42] = u[41];
    u[44] = u[47];
    u[45] = u[46];
    u[51] = u[48];
    u[50] = u[49];
    u[52] = u[55];
    u[53] = u[54];
    u[59] = u[56];
    u[58] = u[57];
    u[60] = u[63];
    u[61] = u[62];

    // stage 6
    temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
    u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
    u[0] = temp1;

    temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    u[9] = temp2;
    u[19] = u[16];
    u[18] = u[17];
    u[20] = u[23];
    u[21] = u[22];
    u[27] = u[24];
    u[26] = u[25];
    u[28] = u[31];
    u[29] = u[30];

    temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
    u[34] = temp1;
    temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    u[35] = temp2;
    temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    u[36] = temp1;
    temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    u[37] = temp2;
    temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    u[42] = temp1;
    temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    u[43] = temp2;
    temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    u[44] = temp1;
    temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    u[45] = temp2;
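
    // The XOR addressing in the stage-7 loop below pairs mirrored
    // butterflies: for j = 32..35, j ^ 7 yields 39..36 and (j ^ 15, j ^ 8)
    // yield 47..44 and 40..43, so e.g. j = 32 combines u[32]<->u[39] and
    // u[47]<->u[40]. The outer loop repeats the pattern for the 48..63
    // group.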
    // stage 7
    u[3] = u[0];
    u[2] = u[1];
    u[11] = u[8];
    u[10] = u[9];
    u[12] = u[15];
    u[13] = u[14];

    temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
    u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
    u[18] = temp1;
    temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
    u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
    u[19] = temp2;
    temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
    u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
    u[20] = temp1;
    temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
    u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
    u[21] = temp2;
    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                    &clamp_hi);
      }
    }

    // stage 8
    u[7] = u[0];
    u[6] = u[1];
    u[5] = u[2];
    u[4] = u[3];

    idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                       &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);

    // stage 9
    idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                       bit);

    // stage 10
    idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                        bit);

    // stage 11
    idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
  }
}
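
// As idct64_low8_avx2, but assuming nonzero coefficients may extend over the
// first 16 inputs, so stage 1 loads 16 values instead of 8 and the early
// stages keep correspondingly more butterflies.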
static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  int i, j;
  const int32_t *cospi = cospi_arr(bit);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);

  const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);

  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
  const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);

  {
    __m256i u[64];
    __m256i tmp1, tmp2, tmp3, tmp4;
    // stage 1
    u[0] = in[0];
    u[32] = in[1];
    u[36] = in[9];
    u[40] = in[5];
    u[44] = in[13];
    u[48] = in[3];
    u[52] = in[11];
    u[56] = in[7];
    u[60] = in[15];
    u[16] = in[2];
    u[20] = in[10];
    u[24] = in[6];
    u[28] = in[14];
    u[4] = in[8];
    u[8] = in[4];
    u[12] = in[12];

    // stage 2
    u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
    u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
    u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
    u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
    u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
    u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
    u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
    u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
    u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
    u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
    u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
    u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
    u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
    u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
    u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
    u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);

    // stage 3
    u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
    u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
    u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit);
    u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit);
    u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit);
    u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit);
    u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
    u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
    u[33] = u[32];
    u[34] = u[35];
    u[37] = u[36];
    u[38] = u[39];
    u[41] = u[40];
    u[42] = u[43];
    u[45] = u[44];
    u[46] = u[47];
    u[49] = u[48];
    u[50] = u[51];
    u[53] = u[52];
    u[54] = u[55];
    u[57] = u[56];
    u[58] = u[59];
    u[61] = u[60];
    u[62] = u[63];

    // stage 4
    u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
    u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
    u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
    u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);

    u[17] = u[16];
    u[18] = u[19];
    u[21] = u[20];
    u[22] = u[23];
    u[25] = u[24];
    u[26] = u[27];
    u[29] = u[28];
    u[30] = u[31];

    tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
    tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
    tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
    u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
    u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
    u[33] = tmp1;
    u[34] = tmp2;
    u[37] = tmp3;
    u[38] = tmp4;

    tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
    tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
    tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
    u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
    u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    u[41] = tmp1;
    u[42] = tmp2;
    u[45] = tmp3;
    u[46] = tmp4;

    // stage 5
    u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
    u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);

    u[9] = u[8];
    u[10] = u[11];
    u[13] = u[12];
    u[14] = u[15];

    tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
    tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
    tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
    tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
    u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
    u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
    u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
    u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
    u[17] = tmp1;
    u[18] = tmp2;
    u[21] = tmp3;
    u[22] = tmp4;

    for (i = 32; i < 64; i += 8) {
      addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                  &clamp_hi);
      addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                  &clamp_hi);

      addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                  &clamp_hi);
      addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                  &clamp_hi);
    }

    // stage 6
    tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
    u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
    u[0] = tmp1;
    u[5] = u[4];
    u[6] = u[7];

    tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    u[9] = tmp1;
    tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    u[10] = tmp2;

    for (i = 16; i < 32; i += 8) {
      addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                  &clamp_hi);
      addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                  &clamp_hi);

      addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                  &clamp_hi);
      addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                  &clamp_hi);
    }

    tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
    u[34] = tmp1;
    u[35] = tmp2;
    u[36] = tmp3;
    u[37] = tmp4;

    tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    u[42] = tmp1;
    u[43] = tmp2;
    u[44] = tmp3;
    u[45] = tmp4;

    // stage 7
    u[3] = u[0];
    u[2] = u[1];
    tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
    u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
    u[5] = tmp1;
    addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
    tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
    tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
    tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
    u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
    u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
    u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
    u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
    u[18] = tmp1;
    u[19] = tmp2;
    u[20] = tmp3;
    u[21] = tmp4;

    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                    &clamp_hi);
      }
    }

    // stage 8
    for (i = 0; i < 4; ++i) {
      addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
    }

    idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                       &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);

    // stage 9
    idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                       bit);

    // stage 10
    idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                        bit);

    // stage 11
    idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
  }
}
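
// The full 64-point kernel with no zero-coefficient assumptions. Each
// __m256i carries eight 32-bit coefficients, so a single call transforms
// eight columns (or eight rows, before the transpose in the 2-D wrapper
// further down) in parallel.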
static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
                        int out_shift) {
  int i, j;
  const int32_t *cospi = cospi_arr(bit);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);

  const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospi17 = _mm256_set1_epi32(cospi[17]);
  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
  const __m256i cospi19 = _mm256_set1_epi32(cospi[19]);
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  const __m256i cospi21 = _mm256_set1_epi32(cospi[21]);
  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
  const __m256i cospi23 = _mm256_set1_epi32(cospi[23]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospi25 = _mm256_set1_epi32(cospi[25]);
  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
  const __m256i cospi27 = _mm256_set1_epi32(cospi[27]);
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  const __m256i cospi29 = _mm256_set1_epi32(cospi[29]);
  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
  const __m256i cospi31 = _mm256_set1_epi32(cospi[31]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospi35 = _mm256_set1_epi32(cospi[35]);
  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
  const __m256i cospi39 = _mm256_set1_epi32(cospi[39]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi43 = _mm256_set1_epi32(cospi[43]);
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
  const __m256i cospi47 = _mm256_set1_epi32(cospi[47]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);

  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]);
  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
  const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]);
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]);
  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
  const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
  const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
  const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
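
  // The stages below ping-pong between the u[] and v[] scratch arrays so a
  // stage never overwrites an operand it still needs; stage 1 scatters the
  // input across both arrays because each sub-band first gets processed at a
  // different stage.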
  {
    __m256i u[64], v[64];

    // stage 1
    u[32] = in[1];
    u[34] = in[17];
    u[36] = in[9];
    u[38] = in[25];
    u[40] = in[5];
    u[42] = in[21];
    u[44] = in[13];
    u[46] = in[29];
    u[48] = in[3];
    u[50] = in[19];
    u[52] = in[11];
    u[54] = in[27];
    u[56] = in[7];
    u[58] = in[23];
    u[60] = in[15];
    u[62] = in[31];

    v[16] = in[2];
    v[18] = in[18];
    v[20] = in[10];
    v[22] = in[26];
    v[24] = in[6];
    v[26] = in[22];
    v[28] = in[14];
    v[30] = in[30];

    u[8] = in[4];
    u[10] = in[20];
    u[12] = in[12];
    u[14] = in[28];

    v[4] = in[8];
    v[6] = in[24];

    u[0] = in[0];
    u[2] = in[16];

    // stage 2
    v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
    v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit);
    v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit);
    v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
    v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
    v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit);
    v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit);
    v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
    v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
    v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit);
    v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit);
    v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
    v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
    v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit);
    v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit);
    v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
    v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
    v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit);
    v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit);
    v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
    v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
    v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit);
    v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit);
    v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
    v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
    v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit);
    v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit);
    v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
    v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
    v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit);
    v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit);
    v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);

    // stage 3
    u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit);
    u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit);
    u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit);
    u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit);
    u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit);
    u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit);
    u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit);
    u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit);
    u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit);
    u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit);
    u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit);
    u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit);
    u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit);
    u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit);
    u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit);
    u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit);

    for (i = 32; i < 64; i += 4) {
      addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
                  &clamp_hi);
      addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
                  &clamp_hi);
    }

    // stage 4
    v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
    v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
    v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
    v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
    v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
    v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
    v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
    v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);

    for (i = 16; i < 32; i += 4) {
      addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
                  &clamp_hi);
      addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
                  &clamp_hi);
    }

    for (i = 32; i < 64; i += 4) {
      v[i + 0] = u[i + 0];
      v[i + 3] = u[i + 3];
    }

    v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
    v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
    v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
    v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
    v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
    v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
    v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
    v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
    v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);

    // stage 5
    u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit);
    u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit);
    u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit);
    u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit);

    for (i = 8; i < 16; i += 4) {
      addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
                  &clamp_hi);
      addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
                  &clamp_hi);
    }

    for (i = 16; i < 32; i += 4) {
      u[i + 0] = v[i + 0];
      u[i + 3] = v[i + 3];
    }

    u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
    u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
    u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
    u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
    u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
    u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
    u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
    u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);

    for (i = 32; i < 64; i += 8) {
      addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                  &clamp_hi);
      addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                  &clamp_hi);

      addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                  &clamp_hi);
      addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                  &clamp_hi);
    }

    // stage 6
    v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
    v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
    v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
    v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);

    addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
    addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);

    for (i = 8; i < 16; i += 4) {
      v[i + 0] = u[i + 0];
      v[i + 3] = u[i + 3];
    }

    v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);

    for (i = 16; i < 32; i += 8) {
      addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
                  &clamp_hi);
      addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
                  &clamp_hi);

      addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
                  &clamp_hi);
      addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
                  &clamp_hi);
    }

    for (i = 32; i < 64; i += 8) {
      v[i + 0] = u[i + 0];
      v[i + 1] = u[i + 1];
      v[i + 6] = u[i + 6];
      v[i + 7] = u[i + 7];
    }

    v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);

    // stage 7
    addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
    addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);

    u[4] = v[4];
    u[7] = v[7];
    u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
    u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);

    addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    for (i = 16; i < 32; i += 8) {
      u[i + 0] = v[i + 0];
      u[i + 1] = v[i + 1];
      u[i + 6] = v[i + 6];
      u[i + 7] = v[i + 7];
    }

    u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
    u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
    u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
    u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
    u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
    u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
    u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
    u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);

    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                    &clamp_hi);
      }
    }

    // stage 8
    for (i = 0; i < 4; ++i) {
      addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
    }

    v[8] = u[8];
    v[9] = u[9];
    v[14] = u[14];
    v[15] = u[15];

    v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
    v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
    v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
    v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);

    for (i = 16; i < 20; ++i) {
      addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
      addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
                  &clamp_hi);
    }

    for (i = 32; i < 36; ++i) {
      v[i] = u[i];
      v[i + 12] = u[i + 12];
      v[i + 16] = u[i + 16];
      v[i + 28] = u[i + 28];
    }

    v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
    v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
    v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
    v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
    v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
    v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
    v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
    v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
    v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
    v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
    v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
    v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
    v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
    v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
    v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
    v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);

    // stage 9
    for (i = 0; i < 8; ++i) {
      addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
    }

    for (i = 16; i < 20; ++i) {
      u[i] = v[i];
      u[i + 12] = v[i + 12];
    }

    u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
    u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
    u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
    u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
    u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
    u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
    u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
    u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);

    for (i = 32; i < 40; i++) {
      addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
    }

    for (i = 48; i < 56; i++) {
      addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
    }

    // stage 10
    for (i = 0; i < 16; i++) {
      addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
    }

    for (i = 32; i < 40; i++) v[i] = u[i];

    v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
    v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
    v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
    v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
    v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
    v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
    v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
    v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
    v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
    v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
    v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
    v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
    v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
    v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
    v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
    v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);

    for (i = 56; i < 64; i++) v[i] = u[i];

    // stage 11
    for (i = 0; i < 32; i++) {
      addsub_avx2(v[i], v[63 - i], &out[i], &out[63 - i], &clamp_lo,
                  &clamp_hi);
    }
    if (!do_cols) {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m256i clamp_lo_out =
          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
      const __m256i clamp_hi_out =
          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);

      round_shift_8x8_avx2(out, out_shift);
      round_shift_8x8_avx2(out + 16, out_shift);
      round_shift_8x8_avx2(out + 32, out_shift);
      round_shift_8x8_avx2(out + 48, out_shift);
      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
    }
  }
}

typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit,
                                  int do_cols, int bd, int out_shift);

// 1-D kernels indexed by [tx size][1-D tx type][zero-range index]; the last
// index picks the low1/low8/low16/full variant from the eob position. NULL
// entries are combinations not implemented in this file.
static const transform_1d_avx2
    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
      },
      {
          { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL },
          { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL },
          { NULL, NULL, NULL, NULL },
      },
      {
          { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
          { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },

      { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };
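
// 2-D inverse transform plus reconstruction for the non-identity tx types.
// In outline (a sketch of the control flow below, not a separate
// implementation):
//
//   for each 8-row band up to eoby:    // 1st pass, row transform
//     load coefficients; rescale by 1/sqrt(2) for rectangular sizes;
//     row_txfm(); transpose 8x8 tiles into buf1 (mirrored when lr_flip)
//   for each 8-column band:            // 2nd pass, column transform
//     col_txfm(); round-shift by shift[1]
//   add the residual to the prediction and clamp to the bit depth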
static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
                                                   uint16_t *output, int stride,
                                                   TX_TYPE tx_type,
                                                   TX_SIZE tx_size, int eob,
                                                   const int bd) {
  __m256i buf1[64 * 8];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_avx2 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_avx2 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    __m256i buf0[64];
    load_buffer_32bit_input(input + i * 8, input_stride, buf0,
                            buf_size_nonzero_w);
    if (rect_type == 1 || rect_type == -1) {
      round_shift_rect_array_32_avx2(buf0, buf0, buf_size_nonzero_w, 0,
                                     NewInvSqrt2);
    }
    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

    __m256i *_buf1 = buf1 + i * 8;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_8x8_flip_avx2(
            &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]);
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
      }
    }
  }
  // 2nd stage: column transform
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
             bd, 0);

    round_shift_array_32_avx2(buf1 + i * txfm_size_row,
                              buf1 + i * txfm_size_row, txfm_size_row,
                              -shift[1]);
  }

  // write to buffer
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
                                    output + 16 * i, stride, ud_flip,
                                    txfm_size_row, bd);
    }
  } else if (txfm_size_col == 8) {
    highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row,
                                 bd);
  }
}

static void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
                                                    uint8_t *output, int stride,
                                                    TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
    case FLIPADST_DCT:
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
      highbd_inv_txfm2d_add_no_identity_avx2(input,
                                             CONVERT_TO_SHORTPTR(output),
                                             stride, tx_type, tx_size, eob, bd);
      break;
    case IDTX:
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
    case V_DCT:
    case V_ADST:
    case V_FLIPADST:
      // Identity-based tx types have no AVX2 path here; use the SSE4.1 one.
      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type,
                                                tx_size, eob, bd);
      break;
    default: assert(0); break;
  }
}

void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
                                  int stride, const TxfmParam *txfm_param) {
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
  const TX_SIZE tx_size = txfm_param->tx_size;
  switch (tx_size) {
    case TX_4X8:
    case TX_8X4:
    case TX_4X4:
    case TX_16X4:
    case TX_4X16:
      // Narrow sizes (width or height 4) keep the SSE4.1 implementation.
      av1_highbd_inv_txfm_add_sse4_1(input, dest, stride, txfm_param);
      break;
    default:
      av1_highbd_inv_txfm2d_add_universe_avx2(
          input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
          txfm_param->eob, txfm_param->bd);
      break;
  }
}
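
// Typical call site (illustrative only; the fields shown are the subset this
// file reads, and the values are examples):
//
//   TxfmParam param;
//   param.tx_type = DCT_DCT;
//   param.tx_size = TX_32X32;
//   param.eob = eob;                        // from the entropy decoder
//   param.bd = 10;                          // e.g. a 10-bit stream
//   param.tx_set_type = EXT_TX_SET_DCTONLY;
//   av1_highbd_inv_txfm_add_avx2(dqcoeff, dst, dst_stride, &param);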