highbd_fwd_txfm_avx2.c (127814B)
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <immintrin.h> /*AVX2*/

#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "av1/common/av1_txfm.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "aom_dsp/txfm_common.h"
#include "aom_ports/mem.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_dsp/x86/txfm_common_avx2.h"

// Load an 8x8 block of int16 residuals, optionally flipped vertically
// (flipud) and/or horizontally (fliplr), sign-extend each lane to 32 bits and
// pre-scale by 2^shift.  On return out[i] holds row i (8 x int32 lanes).
// NOTE(review): uses aligned _mm_load_si128 — input rows are presumably
// 16-byte aligned; confirm against callers.
static inline void load_buffer_8x8_avx2(const int16_t *input, __m256i *out,
                                        int stride, int flipud, int fliplr,
                                        int shift) {
  __m128i out1[8];
  if (!flipud) {
    // Straight row order.
    out1[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
    out1[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
    out1[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
    out1[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
    out1[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
    out1[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
    out1[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
    out1[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));

  } else {
    // Vertical flip: reverse the row order while loading.
    out1[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
    out1[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
    out1[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
    out1[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
    out1[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
    out1[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
    out1[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
    out1[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
  }
  if (!fliplr) {
    // Widen 8 x int16 -> 8 x int32 per row.
    out[0] = _mm256_cvtepi16_epi32(out1[0]);
    out[1] = _mm256_cvtepi16_epi32(out1[1]);
    out[2] = _mm256_cvtepi16_epi32(out1[2]);
    out[3] = _mm256_cvtepi16_epi32(out1[3]);
    out[4] = _mm256_cvtepi16_epi32(out1[4]);
    out[5] = _mm256_cvtepi16_epi32(out1[5]);
    out[6] = _mm256_cvtepi16_epi32(out1[6]);
    out[7] = _mm256_cvtepi16_epi32(out1[7]);

  } else {
    // Horizontal flip: reverse the 8 lanes of each row before widening.
    out[0] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[0]));
    out[1] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[1]));
    out[2] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[2]));
    out[3] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[3]));
    out[4] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[4]));
    out[5] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[5]));
    out[6] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[6]));
    out[7] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[7]));
  }
  // Pre-transform up-shift (shift[0] of the transform's shift triple).
  out[0] = _mm256_slli_epi32(out[0], shift);
  out[1] = _mm256_slli_epi32(out[1], shift);
  out[2] = _mm256_slli_epi32(out[2], shift);
  out[3] = _mm256_slli_epi32(out[3], shift);
  out[4] = _mm256_slli_epi32(out[4], shift);
  out[5] = _mm256_slli_epi32(out[5], shift);
  out[6] = _mm256_slli_epi32(out[6], shift);
  out[7] = _mm256_slli_epi32(out[7], shift);
}
// Rounded right shift of 8 registers: in[i] = (in[i] + 2^(shift-1)) >> shift.
// Callers pass -shift[1], i.e. `shift` here must be positive.
static inline void col_txfm_8x8_rounding(__m256i *in, int shift) {
  const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));

  in[0] = _mm256_add_epi32(in[0], rounding);
  in[1] = _mm256_add_epi32(in[1], rounding);
  in[2] = _mm256_add_epi32(in[2], rounding);
  in[3] = _mm256_add_epi32(in[3], rounding);
  in[4] = _mm256_add_epi32(in[4], rounding);
  in[5] = _mm256_add_epi32(in[5], rounding);
  in[6] = _mm256_add_epi32(in[6], rounding);
  in[7] = _mm256_add_epi32(in[7], rounding);

  in[0] = _mm256_srai_epi32(in[0], shift);
  in[1] = _mm256_srai_epi32(in[1], shift);
  in[2] = _mm256_srai_epi32(in[2], shift);
  in[3] = _mm256_srai_epi32(in[3], shift);
  in[4] = _mm256_srai_epi32(in[4], shift);
  in[5] = _mm256_srai_epi32(in[5], shift);
  in[6] = _mm256_srai_epi32(in[6], shift);
  in[7] = _mm256_srai_epi32(in[7], shift);
}
// Load an 8x16 block as two stacked 8x8 loads.  A vertical flip swaps the
// top/bottom halves here; the per-half row reversal is done by
// load_buffer_8x8_avx2 via the same flipud flag.
static inline void load_buffer_8x16_avx2(const int16_t *input, __m256i *out,
                                         int stride, int flipud, int fliplr,
                                         int shift) {
  const int16_t *topL = input;
  const int16_t *botL = input + 8 * stride;

  const int16_t *tmp;

  if (flipud) {
    tmp = topL;
    topL = botL;
    botL = tmp;
  }
  load_buffer_8x8_avx2(topL, out, stride, flipud, fliplr, shift);
  load_buffer_8x8_avx2(botL, out + 8, stride, flipud, fliplr, shift);
}
// Load `height` rows of 16 int16 samples, widen to int32 and store two
// registers per row: out[i*outstride] = lanes 0..7, out[i*outstride+1] =
// lanes 8..15.  flipud reverses the row order; fliplr reverses the 16 lanes
// (swap halves + reverse within each half).  No pre-shift is applied here.
static inline void load_buffer_16xn_avx2(const int16_t *input, __m256i *out,
                                         int stride, int height, int outstride,
                                         int flipud, int fliplr) {
  __m256i out1[64];
  if (!flipud) {
    for (int i = 0; i < height; i++) {
      out1[i] = _mm256_loadu_si256((const __m256i *)(input + i * stride));
    }
  } else {
    for (int i = 0; i < height; i++) {
      out1[(height - 1) - i] =
          _mm256_loadu_si256((const __m256i *)(input + i * stride));
    }
  }
  if (!fliplr) {
    for (int i = 0; i < height; i++) {
      out[i * outstride] =
          _mm256_cvtepi16_epi32(_mm256_castsi256_si128(out1[i]));
      out[i * outstride + 1] =
          _mm256_cvtepi16_epi32(_mm256_extractf128_si256(out1[i], 1));
    }
  } else {
    for (int i = 0; i < height; i++) {
      // Reversed row: low half goes (reversed) into the second register and
      // the high half (reversed) into the first.
      out[i * outstride + 1] = _mm256_cvtepi16_epi32(
          mm_reverse_epi16(_mm256_castsi256_si128(out1[i])));
      out[i * outstride + 0] = _mm256_cvtepi16_epi32(
          mm_reverse_epi16(_mm256_extractf128_si256(out1[i], 1)));
    }
  }
}

// Transpose an 8x8 tile of 32-bit lanes.  `instride`/`outstride` are in units
// of __m256i registers, which lets this routine address one 8x8 quadrant of a
// wider (e.g. 16-wide, stride 2) layout.
static void fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out,
                                        const int instride,
                                        const int outstride) {
  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  __m256i x0, x1;

  // 32-bit interleave of row pairs.
  u0 = _mm256_unpacklo_epi32(in[0 * instride], in[1 * instride]);
  u1 = _mm256_unpackhi_epi32(in[0 * instride], in[1 * instride]);

  u2 = _mm256_unpacklo_epi32(in[2 * instride], in[3 * instride]);
  u3 = _mm256_unpackhi_epi32(in[2 * instride], in[3 * instride]);

  u4 = _mm256_unpacklo_epi32(in[4 * instride], in[5 * instride]);
  u5 = _mm256_unpackhi_epi32(in[4 * instride], in[5 * instride]);

  u6 = _mm256_unpacklo_epi32(in[6 * instride], in[7 * instride]);
  u7 = _mm256_unpackhi_epi32(in[6 * instride], in[7 * instride]);

  // 64-bit interleave + 128-bit lane permute finish the transpose.
  x0 = _mm256_unpacklo_epi64(u0, u2);
  x1 = _mm256_unpacklo_epi64(u4, u6);
  out[0 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
  out[4 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);

  x0 = _mm256_unpackhi_epi64(u0, u2);
  x1 = _mm256_unpackhi_epi64(u4, u6);
  out[1 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
  out[5 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);

  x0 = _mm256_unpacklo_epi64(u1, u3);
  x1 = _mm256_unpacklo_epi64(u5, u7);
  out[2 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
  out[6 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);

  x0 = _mm256_unpackhi_epi64(u1, u3);
  x1 = _mm256_unpackhi_epi64(u5, u7);
  out[3 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
  out[7 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
}
// Apply the per-stage shift to `size` registers spaced `stride` apart:
// negative bit = rounded right shift by -bit, positive bit = left shift,
// zero = no-op.  Matches av1 txfm shift-list conventions.
static inline void round_shift_32_8xn_avx2(__m256i *in, int size, int bit,
                                           int stride) {
  if (bit < 0) {
    bit = -bit;
    __m256i round = _mm256_set1_epi32(1 << (bit - 1));
    for (int i = 0; i < size; ++i) {
      in[stride * i] = _mm256_add_epi32(in[stride * i], round);
      in[stride * i] = _mm256_srai_epi32(in[stride * i], bit);
    }
  } else if (bit > 0) {
    for (int i = 0; i < size; ++i) {
      in[stride * i] = _mm256_slli_epi32(in[stride * i], bit);
    }
  }
}
// Store `out_size` registers (8 x int32 coefficients each) to `out`,
// advancing by `stride` int32s between registers.
static inline void store_buffer_avx2(const __m256i *const in, int32_t *out,
const int stride, const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    _mm256_store_si256((__m256i *)(out), in[i]);
    out += stride;
  }
}
// Transpose a 16x16 block laid out two registers per row (register stride 2)
// by transposing the four 8x8 quadrants and swapping the off-diagonal pair.
static inline void fwd_txfm_transpose_16x16_avx2(const __m256i *in,
                                                 __m256i *out) {
  fwd_txfm_transpose_8x8_avx2(&in[0], &out[0], 2, 2);
  fwd_txfm_transpose_8x8_avx2(&in[1], &out[16], 2, 2);
  fwd_txfm_transpose_8x8_avx2(&in[16], &out[1], 2, 2);
  fwd_txfm_transpose_8x8_avx2(&in[17], &out[17], 2, 2);
}

// Half butterfly: lane-wise (w0*n0 + w1*n1 + rounding) >> bit.
static inline __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0,
                                        const __m256i *w1, const __m256i *n1,
                                        const __m256i *rounding, int bit) {
  __m256i x, y;

  x = _mm256_mullo_epi32(*w0, *n0);
  y = _mm256_mullo_epi32(*w1, *n1);
  x = _mm256_add_epi32(x, y);
  x = _mm256_add_epi32(x, *rounding);
  x = _mm256_srai_epi32(x, bit);
  return x;
}
// Type-0 butterfly with scalar weights:
//   out0 = round((in0*w0 + in1*w1) >> bit)
//   out1 = round((in0*w1 - in1*w0) >> bit)
// Multi-evaluation hazard: in0/in1 are each used twice — pass plain registers.
#define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) \
  do {                                                       \
    const __m256i ww0 = _mm256_set1_epi32(w0);               \
    const __m256i ww1 = _mm256_set1_epi32(w1);               \
    const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0);     \
    const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1);     \
    out0 = _mm256_add_epi32(in0_w0, in1_w1);                 \
    round_shift_32_8xn_avx2(&out0, 1, -bit, 1);              \
    const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1);     \
    const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0);     \
    out1 = _mm256_sub_epi32(in0_w1, in1_w0);                 \
    round_shift_32_8xn_avx2(&out1, 1, -bit, 1);              \
  } while (0)

// Same butterfly, but with pre-splatted weight registers and an explicit
// rounding register `r` (avoids re-broadcasting the weights per call).
#define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
  do {                                                                \
    const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0);              \
    const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1);              \
    out0 = _mm256_add_epi32(in0_w0, in1_w1);                          \
    out0 = _mm256_add_epi32(out0, r);                                 \
    out0 = _mm256_srai_epi32(out0, bit);                              \
    const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1);              \
    const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0);              \
    out1 = _mm256_sub_epi32(in0_w1, in1_w0);                          \
    out1 = _mm256_add_epi32(out1, r);                                 \
    out1 = _mm256_srai_epi32(out1, bit);                              \
  } while (0)

// 1-D column/row transform working on __m256i registers; strides are in
// units of registers.
typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out,
                                  const int8_t cos_bit, int instride,
                                  int outstride);
// 8-point forward DCT over `col_num` register columns.  Outputs are written
// in the AV1 bit-reversed order (see the buf0[] comments below).
static void fdct8_avx2(__m256i *in, __m256i *out, const int8_t bit,
                       const int col_num, const int outstride) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  __m256i u[8], v[8];
  for (int col = 0; col < col_num; ++col) {
    // stage 1: add/sub butterflies of mirrored inputs.
    u[0] = _mm256_add_epi32(in[0 * col_num + col], in[7 * col_num + col]);
    v[7] = _mm256_sub_epi32(in[0 * col_num + col], in[7 * col_num + col]);
    u[1] = _mm256_add_epi32(in[1 * col_num + col], in[6 * col_num + col]);
    u[6] = _mm256_sub_epi32(in[1 * col_num + col], in[6 * col_num + col]);
    u[2] = _mm256_add_epi32(in[2 * col_num + col], in[5 * col_num + col]);
    u[5] = _mm256_sub_epi32(in[2 * col_num + col], in[5 * col_num + col]);
    u[3] = _mm256_add_epi32(in[3 * col_num + col], in[4 * col_num + col]);
    v[4] = _mm256_sub_epi32(in[3 * col_num + col], in[4 * col_num + col]);
    // stage 2
    v[0] = _mm256_add_epi32(u[0], u[3]);
    v[3] = _mm256_sub_epi32(u[0], u[3]);
    v[1] = _mm256_add_epi32(u[1], u[2]);
    v[2] = _mm256_sub_epi32(u[1], u[2]);

    v[5] = _mm256_mullo_epi32(u[5], cospim32);
    v[6] = _mm256_mullo_epi32(u[6], cospi32);
    v[5] = _mm256_add_epi32(v[5], v[6]);
    v[5] = _mm256_add_epi32(v[5], rnding);
    v[5] = _mm256_srai_epi32(v[5], bit);

    // u[0] is reused here as a scratch register before being overwritten in
    // stage 3.
    u[0] = _mm256_mullo_epi32(u[5], cospi32);
    v[6] = _mm256_mullo_epi32(u[6], cospim32);
    v[6] = _mm256_sub_epi32(u[0], v[6]);
    v[6] = _mm256_add_epi32(v[6], rnding);
    v[6] = _mm256_srai_epi32(v[6], bit);

    // stage 3
    // type 0
    v[0] = _mm256_mullo_epi32(v[0], cospi32);
    v[1] = _mm256_mullo_epi32(v[1], cospi32);
    u[0] = _mm256_add_epi32(v[0], v[1]);
    u[0] = _mm256_add_epi32(u[0], rnding);
    u[0] = _mm256_srai_epi32(u[0], bit);

    u[1] = _mm256_sub_epi32(v[0], v[1]);
    u[1] = _mm256_add_epi32(u[1], rnding);
    u[1] = _mm256_srai_epi32(u[1], bit);

    // type 1
    v[0] = _mm256_mullo_epi32(v[2], cospi48);
    v[1] = _mm256_mullo_epi32(v[3], cospi16);
    u[2] = _mm256_add_epi32(v[0], v[1]);
    u[2] = _mm256_add_epi32(u[2], rnding);
    u[2] = _mm256_srai_epi32(u[2], bit);

    v[0] = _mm256_mullo_epi32(v[2], cospi16);
    v[1] = _mm256_mullo_epi32(v[3], cospi48);
    u[3] = _mm256_sub_epi32(v[1], v[0]);
    u[3] = _mm256_add_epi32(u[3], rnding);
    u[3] = _mm256_srai_epi32(u[3], bit);

    u[4] = _mm256_add_epi32(v[4], v[5]);
    u[5] = _mm256_sub_epi32(v[4], v[5]);
    u[6] = _mm256_sub_epi32(v[7], v[6]);
    u[7] = _mm256_add_epi32(v[7], v[6]);

    // stage 4
    // stage 5
    v[0] = _mm256_mullo_epi32(u[4], cospi56);
    v[1] = _mm256_mullo_epi32(u[7], cospi8);
    v[0] = _mm256_add_epi32(v[0], v[1]);
    v[0] = _mm256_add_epi32(v[0], rnding);
    out[1 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[4]

    v[0] = _mm256_mullo_epi32(u[4], cospi8);
    v[1] = _mm256_mullo_epi32(u[7], cospi56);
    v[0] = _mm256_sub_epi32(v[1], v[0]);
    v[0] = _mm256_add_epi32(v[0], rnding);
    out[7 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[7]

    v[0] = _mm256_mullo_epi32(u[5], cospi24);
    v[1] = _mm256_mullo_epi32(u[6], cospi40);
    v[0] = _mm256_add_epi32(v[0], v[1]);
    v[0] = _mm256_add_epi32(v[0], rnding);
    out[5 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[5]

    v[0] = _mm256_mullo_epi32(u[5], cospi40);
    v[1] = _mm256_mullo_epi32(u[6], cospi24);
    v[0] = _mm256_sub_epi32(v[1], v[0]);
    v[0] = _mm256_add_epi32(v[0], rnding);
    out[3 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[6]

    out[0 * outstride + col] = u[0];  // buf0[0]
    out[4 * outstride + col] = u[1];  // buf0[1]
    out[2 * outstride + col] = u[2];  // buf0[2]
    out[6 * outstride + col] = u[3];  // buf0[3]
  }
}
// 8-point forward ADST over `col_num` register columns.
// NOTE(review): col_num is voided as "unused" yet the loop below iterates on
// it; the (void) cast appears vestigial.  "outstirde" is a historical typo
// for "outstride" kept as-is.
static void fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit,
                        const int col_num, const int outstirde) {
  (void)col_num;
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const __m256i zero = _mm256_setzero_si256();
  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
  __m256i x, y;
  for (int col = 0; col < col_num; ++col) {
    // stage 1: permute + negate inputs per the ADST flow graph.
    u0 = in[0 * col_num + col];
    u1 = _mm256_sub_epi32(zero, in[7 * col_num + col]);
    u2 =
_mm256_sub_epi32(zero, in[3 * col_num + col]);
    u3 = in[4 * col_num + col];
    u4 = _mm256_sub_epi32(zero, in[1 * col_num + col]);
    u5 = in[6 * col_num + col];
    u6 = in[2 * col_num + col];
    u7 = _mm256_sub_epi32(zero, in[5 * col_num + col]);

    // stage 2
    v0 = u0;
    v1 = u1;

    x = _mm256_mullo_epi32(u2, cospi32);
    y = _mm256_mullo_epi32(u3, cospi32);
    v2 = _mm256_add_epi32(x, y);
    v2 = _mm256_add_epi32(v2, rnding);
    v2 = _mm256_srai_epi32(v2, bit);

    v3 = _mm256_sub_epi32(x, y);
    v3 = _mm256_add_epi32(v3, rnding);
    v3 = _mm256_srai_epi32(v3, bit);

    v4 = u4;
    v5 = u5;

    x = _mm256_mullo_epi32(u6, cospi32);
    y = _mm256_mullo_epi32(u7, cospi32);
    v6 = _mm256_add_epi32(x, y);
    v6 = _mm256_add_epi32(v6, rnding);
    v6 = _mm256_srai_epi32(v6, bit);

    v7 = _mm256_sub_epi32(x, y);
    v7 = _mm256_add_epi32(v7, rnding);
    v7 = _mm256_srai_epi32(v7, bit);

    // stage 3
    u0 = _mm256_add_epi32(v0, v2);
    u1 = _mm256_add_epi32(v1, v3);
    u2 = _mm256_sub_epi32(v0, v2);
    u3 = _mm256_sub_epi32(v1, v3);
    u4 = _mm256_add_epi32(v4, v6);
    u5 = _mm256_add_epi32(v5, v7);
    u6 = _mm256_sub_epi32(v4, v6);
    u7 = _mm256_sub_epi32(v5, v7);

    // stage 4
    v0 = u0;
    v1 = u1;
    v2 = u2;
    v3 = u3;

    x = _mm256_mullo_epi32(u4, cospi16);
    y = _mm256_mullo_epi32(u5, cospi48);
    v4 = _mm256_add_epi32(x, y);
    v4 = _mm256_add_epi32(v4, rnding);
    v4 = _mm256_srai_epi32(v4, bit);

    x = _mm256_mullo_epi32(u4, cospi48);
    y = _mm256_mullo_epi32(u5, cospim16);
    v5 = _mm256_add_epi32(x, y);
    v5 = _mm256_add_epi32(v5, rnding);
    v5 = _mm256_srai_epi32(v5, bit);

    x = _mm256_mullo_epi32(u6, cospim48);
    y = _mm256_mullo_epi32(u7, cospi16);
    v6 = _mm256_add_epi32(x, y);
    v6 = _mm256_add_epi32(v6, rnding);
    v6 = _mm256_srai_epi32(v6, bit);

    x = _mm256_mullo_epi32(u6, cospi16);
    y = _mm256_mullo_epi32(u7, cospi48);
    v7 = _mm256_add_epi32(x, y);
    v7 = _mm256_add_epi32(v7, rnding);
    v7 = _mm256_srai_epi32(v7, bit);

    // stage 5
    u0 = _mm256_add_epi32(v0, v4);
    u1 = _mm256_add_epi32(v1, v5);
    u2 = _mm256_add_epi32(v2, v6);
    u3 = _mm256_add_epi32(v3, v7);
    u4 = _mm256_sub_epi32(v0, v4);
    u5 = _mm256_sub_epi32(v1, v5);
    u6 = _mm256_sub_epi32(v2, v6);
    u7 = _mm256_sub_epi32(v3, v7);

    // stage 6: final odd-cosine butterflies.
    x = _mm256_mullo_epi32(u0, cospi4);
    y = _mm256_mullo_epi32(u1, cospi60);
    v0 = _mm256_add_epi32(x, y);
    v0 = _mm256_add_epi32(v0, rnding);
    v0 = _mm256_srai_epi32(v0, bit);

    x = _mm256_mullo_epi32(u0, cospi60);
    y = _mm256_mullo_epi32(u1, cospim4);
    v1 = _mm256_add_epi32(x, y);
    v1 = _mm256_add_epi32(v1, rnding);
    v1 = _mm256_srai_epi32(v1, bit);

    x = _mm256_mullo_epi32(u2, cospi20);
    y = _mm256_mullo_epi32(u3, cospi44);
    v2 = _mm256_add_epi32(x, y);
    v2 = _mm256_add_epi32(v2, rnding);
    v2 = _mm256_srai_epi32(v2, bit);

    x = _mm256_mullo_epi32(u2, cospi44);
    y = _mm256_mullo_epi32(u3, cospim20);
    v3 = _mm256_add_epi32(x, y);
    v3 = _mm256_add_epi32(v3, rnding);
    v3 = _mm256_srai_epi32(v3, bit);

    x = _mm256_mullo_epi32(u4, cospi36);
    y = _mm256_mullo_epi32(u5, cospi28);
    v4 = _mm256_add_epi32(x, y);
    v4 = _mm256_add_epi32(v4, rnding);
    v4 = _mm256_srai_epi32(v4, bit);

    x = _mm256_mullo_epi32(u4, cospi28);
    y = _mm256_mullo_epi32(u5, cospim36);
    v5 = _mm256_add_epi32(x, y);
    v5 = _mm256_add_epi32(v5, rnding);
    v5 = _mm256_srai_epi32(v5, bit);

    x = _mm256_mullo_epi32(u6, cospi52);
    y = _mm256_mullo_epi32(u7, cospi12);
    v6 = _mm256_add_epi32(x, y);
    v6 = _mm256_add_epi32(v6, rnding);
    v6 = _mm256_srai_epi32(v6, bit);

    x = _mm256_mullo_epi32(u6, cospi12);
    y = _mm256_mullo_epi32(u7, cospim52);
    v7 = _mm256_add_epi32(x, y);
    v7 = _mm256_add_epi32(v7, rnding);
    v7 = _mm256_srai_epi32(v7, bit);

    // stage 7: output permutation.
    out[0 * outstirde + col] = v1;
    out[1 * outstirde + col] = v6;
    out[2 * outstirde + col] = v3;
    out[3 * outstirde + col] = v4;
    out[4 * outstirde + col] = v5;
    out[5 * outstirde + col] = v2;
    out[6 * outstirde + col] = v7;
    out[7 * outstirde + col] = v0;
  }
}
// Identity transform for 8 rows x col_num register columns: scale by 2
// (x + x).  `bit` and `outstride` are unused — output layout equals input.
static void idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit, int col_num,
                       int outstride) {
  (void)bit;
  (void)outstride;
  int num_iters = 8 * col_num;
  for (int i = 0; i < num_iters; i += 8) {
    out[i] = _mm256_add_epi32(in[i], in[i]);
    out[i + 1] = _mm256_add_epi32(in[i + 1], in[i + 1]);
    out[i + 2] = _mm256_add_epi32(in[i + 2], in[i + 2]);
    out[i + 3] = _mm256_add_epi32(in[i + 3], in[i + 3]);
    out[i + 4] = _mm256_add_epi32(in[i + 4], in[i + 4]);
    out[i + 5] = _mm256_add_epi32(in[i + 5], in[i + 5]);
    out[i + 6] = _mm256_add_epi32(in[i + 6], in[i + 6]);
    out[i + 7] = _mm256_add_epi32(in[i + 7], in[i + 7]);
  }
}
// 8x8 forward 2-D transform for high bitdepth input.  Pipeline per tx_type:
// load (+flip), column 1-D transform, mid rounding (-shift[1]), transpose,
// row 1-D transform, store 64 coefficients.
// NOTE(review): several row passes index av1_fwd_cos_bit_col rather than
// av1_fwd_cos_bit_row — presumably both tables agree for 8x8; confirm.
void av1_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *coeff, int stride,
                             TX_TYPE tx_type, int bd) {
  __m256i in[8], out[8];
  const TX_SIZE tx_size = TX_8X8;
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int width = tx_size_wide[tx_size];
  const int width_div8 = (width >> 3);

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                 width_div8);
      col_txfm_8x8_rounding(out, -shift[1]);
      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
      fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
                 width_div8);
      store_buffer_avx2(out, coeff, 8, 8);
      break;
    case ADST_DCT:
      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
width_div8);
      col_txfm_8x8_rounding(out, -shift[1]);
      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
      fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
                 width_div8);
      store_buffer_avx2(out, coeff, 8, 8);
      break;
    case DCT_ADST:
      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                 width_div8);
      col_txfm_8x8_rounding(out, -shift[1]);
      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
                  width_div8);
      store_buffer_avx2(out, coeff, 8, 8);
      break;
    case ADST_ADST:
      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                  width_div8);
      col_txfm_8x8_rounding(out, -shift[1]);
      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
                  width_div8);
      store_buffer_avx2(out, coeff, 8, 8);
      break;
    case FLIPADST_DCT:
      // Vertical flip handled at load time (flipud = 1).
      load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                  width_div8);
      col_txfm_8x8_rounding(out, -shift[1]);
      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
      fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
                 width_div8);
      store_buffer_avx2(out, coeff, 8, 8);
      break;
    case DCT_FLIPADST:
      // Horizontal flip handled at load time (fliplr = 1).
      load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                 width_div8);
      col_txfm_8x8_rounding(out, -shift[1]);
      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
                  width_div8);
      store_buffer_avx2(out, coeff, 8, 8);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x8_avx2(input, in, stride, 1, 1, shift[0]);
      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                  width_div8);
      col_txfm_8x8_rounding(out, -shift[1]);
      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
                  width_div8);
      store_buffer_avx2(out, coeff, 8, 8);
      break;
    case ADST_FLIPADST:
      load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                  width_div8);
      col_txfm_8x8_rounding(out, -shift[1]);
      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
                  width_div8);
      store_buffer_avx2(out, coeff, 8, 8);
      break;
    case FLIPADST_ADST:
      load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                  width_div8);
      col_txfm_8x8_rounding(out, -shift[1]);
      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
                  width_div8);
      store_buffer_avx2(out, coeff, 8, 8);
      break;
    case IDTX:
      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                 width_div8);
      col_txfm_8x8_rounding(out, -shift[1]);
      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                 width_div8);
      store_buffer_avx2(out, coeff, 8, 8);
      break;
    case V_DCT:
      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                 width_div8);
      col_txfm_8x8_rounding(out, -shift[1]);
      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                 width_div8);
      store_buffer_avx2(out, coeff, 8, 8);
      break;
    case H_DCT:
      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                 width_div8);
      col_txfm_8x8_rounding(out, -shift[1]);
      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                 width_div8);
      store_buffer_avx2(out, coeff, 8, 8);
      break;
    case V_ADST:
      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                  width_div8);
      col_txfm_8x8_rounding(out, -shift[1]);
      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                 width_div8);
      store_buffer_avx2(out, coeff, 8, 8);
      break;
    case H_ADST:
      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                 width_div8);
      col_txfm_8x8_rounding(out, -shift[1]);
      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                  width_div8);
      store_buffer_avx2(out, coeff, 8, 8);
      break;
    case V_FLIPADST:
      load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                  width_div8);
      col_txfm_8x8_rounding(out, -shift[1]);
      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                 width_div8);
      store_buffer_avx2(out, coeff, 8, 8);
      break;
    case H_FLIPADST:
      load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                 width_div8);
      col_txfm_8x8_rounding(out, -shift[1]);
      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                  width_div8);
      store_buffer_avx2(out, coeff, 8, 8);
      break;
    default: assert(0);
  }
  (void)bd;
}

// 16-point forward DCT over `col_num` register columns (continues below).
static void fdct16_avx2(__m256i *in, __m256i *out, const int8_t bit,
                        const int col_num, const int outstride) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  __m256i u[16], v[16], x;
  int col;

  // Calculate the column 0, 1, 2, 3
  for (col = 0; col < col_num; ++col) {
    // stage 0
    // stage 1
    u[0] = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
    u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
    u[1] = _mm256_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
    u[14] =
_mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]); 754 u[2] = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]); 755 u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]); 756 u[3] = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]); 757 u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]); 758 u[4] = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]); 759 u[11] = _mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]); 760 u[5] = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]); 761 u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]); 762 u[6] = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]); 763 u[9] = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]); 764 u[7] = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]); 765 u[8] = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]); 766 767 // stage 2 768 v[0] = _mm256_add_epi32(u[0], u[7]); 769 v[7] = _mm256_sub_epi32(u[0], u[7]); 770 v[1] = _mm256_add_epi32(u[1], u[6]); 771 v[6] = _mm256_sub_epi32(u[1], u[6]); 772 v[2] = _mm256_add_epi32(u[2], u[5]); 773 v[5] = _mm256_sub_epi32(u[2], u[5]); 774 v[3] = _mm256_add_epi32(u[3], u[4]); 775 v[4] = _mm256_sub_epi32(u[3], u[4]); 776 v[8] = u[8]; 777 v[9] = u[9]; 778 779 v[10] = _mm256_mullo_epi32(u[10], cospim32); 780 x = _mm256_mullo_epi32(u[13], cospi32); 781 v[10] = _mm256_add_epi32(v[10], x); 782 v[10] = _mm256_add_epi32(v[10], rnding); 783 v[10] = _mm256_srai_epi32(v[10], bit); 784 785 v[13] = _mm256_mullo_epi32(u[10], cospi32); 786 x = _mm256_mullo_epi32(u[13], cospim32); 787 v[13] = _mm256_sub_epi32(v[13], x); 788 v[13] = _mm256_add_epi32(v[13], rnding); 789 v[13] = _mm256_srai_epi32(v[13], bit); 790 791 v[11] = _mm256_mullo_epi32(u[11], cospim32); 792 x = _mm256_mullo_epi32(u[12], cospi32); 793 v[11] = _mm256_add_epi32(v[11], x); 794 v[11] = _mm256_add_epi32(v[11], rnding); 795 v[11] 
= _mm256_srai_epi32(v[11], bit); 796 797 v[12] = _mm256_mullo_epi32(u[11], cospi32); 798 x = _mm256_mullo_epi32(u[12], cospim32); 799 v[12] = _mm256_sub_epi32(v[12], x); 800 v[12] = _mm256_add_epi32(v[12], rnding); 801 v[12] = _mm256_srai_epi32(v[12], bit); 802 v[14] = u[14]; 803 v[15] = u[15]; 804 805 // stage 3 806 u[0] = _mm256_add_epi32(v[0], v[3]); 807 u[3] = _mm256_sub_epi32(v[0], v[3]); 808 u[1] = _mm256_add_epi32(v[1], v[2]); 809 u[2] = _mm256_sub_epi32(v[1], v[2]); 810 u[4] = v[4]; 811 812 u[5] = _mm256_mullo_epi32(v[5], cospim32); 813 x = _mm256_mullo_epi32(v[6], cospi32); 814 u[5] = _mm256_add_epi32(u[5], x); 815 u[5] = _mm256_add_epi32(u[5], rnding); 816 u[5] = _mm256_srai_epi32(u[5], bit); 817 818 u[6] = _mm256_mullo_epi32(v[5], cospi32); 819 x = _mm256_mullo_epi32(v[6], cospim32); 820 u[6] = _mm256_sub_epi32(u[6], x); 821 u[6] = _mm256_add_epi32(u[6], rnding); 822 u[6] = _mm256_srai_epi32(u[6], bit); 823 824 u[7] = v[7]; 825 u[8] = _mm256_add_epi32(v[8], v[11]); 826 u[11] = _mm256_sub_epi32(v[8], v[11]); 827 u[9] = _mm256_add_epi32(v[9], v[10]); 828 u[10] = _mm256_sub_epi32(v[9], v[10]); 829 u[12] = _mm256_sub_epi32(v[15], v[12]); 830 u[15] = _mm256_add_epi32(v[15], v[12]); 831 u[13] = _mm256_sub_epi32(v[14], v[13]); 832 u[14] = _mm256_add_epi32(v[14], v[13]); 833 834 // stage 4 835 u[0] = _mm256_mullo_epi32(u[0], cospi32); 836 u[1] = _mm256_mullo_epi32(u[1], cospi32); 837 v[0] = _mm256_add_epi32(u[0], u[1]); 838 v[0] = _mm256_add_epi32(v[0], rnding); 839 v[0] = _mm256_srai_epi32(v[0], bit); 840 841 v[1] = _mm256_sub_epi32(u[0], u[1]); 842 v[1] = _mm256_add_epi32(v[1], rnding); 843 v[1] = _mm256_srai_epi32(v[1], bit); 844 845 v[2] = _mm256_mullo_epi32(u[2], cospi48); 846 x = _mm256_mullo_epi32(u[3], cospi16); 847 v[2] = _mm256_add_epi32(v[2], x); 848 v[2] = _mm256_add_epi32(v[2], rnding); 849 v[2] = _mm256_srai_epi32(v[2], bit); 850 851 v[3] = _mm256_mullo_epi32(u[2], cospi16); 852 x = _mm256_mullo_epi32(u[3], cospi48); 853 v[3] = _mm256_sub_epi32(x, 
v[3]); 854 v[3] = _mm256_add_epi32(v[3], rnding); 855 v[3] = _mm256_srai_epi32(v[3], bit); 856 857 v[4] = _mm256_add_epi32(u[4], u[5]); 858 v[5] = _mm256_sub_epi32(u[4], u[5]); 859 v[6] = _mm256_sub_epi32(u[7], u[6]); 860 v[7] = _mm256_add_epi32(u[7], u[6]); 861 v[8] = u[8]; 862 863 v[9] = _mm256_mullo_epi32(u[9], cospim16); 864 x = _mm256_mullo_epi32(u[14], cospi48); 865 v[9] = _mm256_add_epi32(v[9], x); 866 v[9] = _mm256_add_epi32(v[9], rnding); 867 v[9] = _mm256_srai_epi32(v[9], bit); 868 869 v[14] = _mm256_mullo_epi32(u[9], cospi48); 870 x = _mm256_mullo_epi32(u[14], cospim16); 871 v[14] = _mm256_sub_epi32(v[14], x); 872 v[14] = _mm256_add_epi32(v[14], rnding); 873 v[14] = _mm256_srai_epi32(v[14], bit); 874 875 v[10] = _mm256_mullo_epi32(u[10], cospim48); 876 x = _mm256_mullo_epi32(u[13], cospim16); 877 v[10] = _mm256_add_epi32(v[10], x); 878 v[10] = _mm256_add_epi32(v[10], rnding); 879 v[10] = _mm256_srai_epi32(v[10], bit); 880 881 v[13] = _mm256_mullo_epi32(u[10], cospim16); 882 x = _mm256_mullo_epi32(u[13], cospim48); 883 v[13] = _mm256_sub_epi32(v[13], x); 884 v[13] = _mm256_add_epi32(v[13], rnding); 885 v[13] = _mm256_srai_epi32(v[13], bit); 886 887 v[11] = u[11]; 888 v[12] = u[12]; 889 v[15] = u[15]; 890 891 // stage 5 892 u[0] = v[0]; 893 u[1] = v[1]; 894 u[2] = v[2]; 895 u[3] = v[3]; 896 897 u[4] = _mm256_mullo_epi32(v[4], cospi56); 898 x = _mm256_mullo_epi32(v[7], cospi8); 899 u[4] = _mm256_add_epi32(u[4], x); 900 u[4] = _mm256_add_epi32(u[4], rnding); 901 u[4] = _mm256_srai_epi32(u[4], bit); 902 903 u[7] = _mm256_mullo_epi32(v[4], cospi8); 904 x = _mm256_mullo_epi32(v[7], cospi56); 905 u[7] = _mm256_sub_epi32(x, u[7]); 906 u[7] = _mm256_add_epi32(u[7], rnding); 907 u[7] = _mm256_srai_epi32(u[7], bit); 908 909 u[5] = _mm256_mullo_epi32(v[5], cospi24); 910 x = _mm256_mullo_epi32(v[6], cospi40); 911 u[5] = _mm256_add_epi32(u[5], x); 912 u[5] = _mm256_add_epi32(u[5], rnding); 913 u[5] = _mm256_srai_epi32(u[5], bit); 914 915 u[6] = _mm256_mullo_epi32(v[5], 
cospi40); 916 x = _mm256_mullo_epi32(v[6], cospi24); 917 u[6] = _mm256_sub_epi32(x, u[6]); 918 u[6] = _mm256_add_epi32(u[6], rnding); 919 u[6] = _mm256_srai_epi32(u[6], bit); 920 921 u[8] = _mm256_add_epi32(v[8], v[9]); 922 u[9] = _mm256_sub_epi32(v[8], v[9]); 923 u[10] = _mm256_sub_epi32(v[11], v[10]); 924 u[11] = _mm256_add_epi32(v[11], v[10]); 925 u[12] = _mm256_add_epi32(v[12], v[13]); 926 u[13] = _mm256_sub_epi32(v[12], v[13]); 927 u[14] = _mm256_sub_epi32(v[15], v[14]); 928 u[15] = _mm256_add_epi32(v[15], v[14]); 929 930 // stage 6 931 v[0] = u[0]; 932 v[1] = u[1]; 933 v[2] = u[2]; 934 v[3] = u[3]; 935 v[4] = u[4]; 936 v[5] = u[5]; 937 v[6] = u[6]; 938 v[7] = u[7]; 939 940 v[8] = _mm256_mullo_epi32(u[8], cospi60); 941 x = _mm256_mullo_epi32(u[15], cospi4); 942 v[8] = _mm256_add_epi32(v[8], x); 943 v[8] = _mm256_add_epi32(v[8], rnding); 944 v[8] = _mm256_srai_epi32(v[8], bit); 945 946 v[15] = _mm256_mullo_epi32(u[8], cospi4); 947 x = _mm256_mullo_epi32(u[15], cospi60); 948 v[15] = _mm256_sub_epi32(x, v[15]); 949 v[15] = _mm256_add_epi32(v[15], rnding); 950 v[15] = _mm256_srai_epi32(v[15], bit); 951 952 v[9] = _mm256_mullo_epi32(u[9], cospi28); 953 x = _mm256_mullo_epi32(u[14], cospi36); 954 v[9] = _mm256_add_epi32(v[9], x); 955 v[9] = _mm256_add_epi32(v[9], rnding); 956 v[9] = _mm256_srai_epi32(v[9], bit); 957 958 v[14] = _mm256_mullo_epi32(u[9], cospi36); 959 x = _mm256_mullo_epi32(u[14], cospi28); 960 v[14] = _mm256_sub_epi32(x, v[14]); 961 v[14] = _mm256_add_epi32(v[14], rnding); 962 v[14] = _mm256_srai_epi32(v[14], bit); 963 964 v[10] = _mm256_mullo_epi32(u[10], cospi44); 965 x = _mm256_mullo_epi32(u[13], cospi20); 966 v[10] = _mm256_add_epi32(v[10], x); 967 v[10] = _mm256_add_epi32(v[10], rnding); 968 v[10] = _mm256_srai_epi32(v[10], bit); 969 970 v[13] = _mm256_mullo_epi32(u[10], cospi20); 971 x = _mm256_mullo_epi32(u[13], cospi44); 972 v[13] = _mm256_sub_epi32(x, v[13]); 973 v[13] = _mm256_add_epi32(v[13], rnding); 974 v[13] = _mm256_srai_epi32(v[13], 
bit);

    // stage 6 (cont.): last pair of odd-path rotations (cospi12/cospi52).
    v[11] = _mm256_mullo_epi32(u[11], cospi12);
    x = _mm256_mullo_epi32(u[12], cospi52);
    v[11] = _mm256_add_epi32(v[11], x);
    v[11] = _mm256_add_epi32(v[11], rnding);
    v[11] = _mm256_srai_epi32(v[11], bit);

    v[12] = _mm256_mullo_epi32(u[11], cospi52);
    x = _mm256_mullo_epi32(u[12], cospi12);
    v[12] = _mm256_sub_epi32(x, v[12]);
    v[12] = _mm256_add_epi32(v[12], rnding);
    v[12] = _mm256_srai_epi32(v[12], bit);

    // Write the 16 outputs for this column; the index pattern interleaves
    // even- and odd-path results into natural frequency order.
    out[0 * outstride + col] = v[0];
    out[1 * outstride + col] = v[8];
    out[2 * outstride + col] = v[4];
    out[3 * outstride + col] = v[12];
    out[4 * outstride + col] = v[2];
    out[5 * outstride + col] = v[10];
    out[6 * outstride + col] = v[6];
    out[7 * outstride + col] = v[14];
    out[8 * outstride + col] = v[1];
    out[9 * outstride + col] = v[9];
    out[10 * outstride + col] = v[5];
    out[11 * outstride + col] = v[13];
    out[12 * outstride + col] = v[3];
    out[13 * outstride + col] = v[11];
    out[14 * outstride + col] = v[7];
    out[15 * outstride + col] = v[15];
  }
}
// 16-point forward ADST over 32-bit lanes (8 pixels per __m256i vector).
// Processes `num_cols` interleaved columns: input row i of column c is read
// from in[i * num_cols + col] and output row i is written to
// out[i * outstride + col]. `bit` selects the cospi table and the rounding
// shift applied after every multiply stage.
static void fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit,
                         const int num_cols, const int outstride) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
  const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
  const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));  // round-to-nearest
  const __m256i zero = _mm256_setzero_si256();

  __m256i u[16], v[16], x, y;
  int col;

  for (col = 0; col < num_cols; ++col) {
    // stage 0
    // stage 1: permute/negate the inputs into ADST butterfly order.
    u[0] = in[0 * num_cols + col];
    u[1] = _mm256_sub_epi32(zero, in[15 * num_cols + col]);
    u[2] = _mm256_sub_epi32(zero, in[7 * num_cols + col]);
    u[3] = in[8 * num_cols + col];
    u[4] = _mm256_sub_epi32(zero, in[3 * num_cols + col]);
    u[5] = in[12 * num_cols + col];
    u[6] = in[4 * num_cols + col];
    u[7] = _mm256_sub_epi32(zero, in[11 * num_cols + col]);
    u[8] = _mm256_sub_epi32(zero, in[1 * num_cols + col]);
    u[9] = in[14 * num_cols + col];
    u[10] = in[6 * num_cols + col];
    u[11] = _mm256_sub_epi32(zero, in[9 * num_cols + col]);
    u[12] = in[2 * num_cols + col];
    u[13] = _mm256_sub_epi32(zero, in[13 * num_cols + col]);
    u[14] = _mm256_sub_epi32(zero, in[5 * num_cols + col]);
    u[15] = in[10 * num_cols + col];

    // stage 2: four 45-degree (cospi32) rotations on pairs (2,3) (6,7)
    // (10,11) (14,15); the other lanes pass through.
    v[0] = u[0];
    v[1] = u[1];

    x = _mm256_mullo_epi32(u[2], cospi32);
    y = _mm256_mullo_epi32(u[3], cospi32);
    v[2] = _mm256_add_epi32(x, y);
    v[2] = _mm256_add_epi32(v[2], rnding);
    v[2] = _mm256_srai_epi32(v[2], bit);

    v[3] = _mm256_sub_epi32(x, y);
    v[3] = _mm256_add_epi32(v[3], rnding);
    v[3] = _mm256_srai_epi32(v[3], bit);

    v[4] = u[4];
    v[5] = u[5];

    x = _mm256_mullo_epi32(u[6], cospi32);
    y = _mm256_mullo_epi32(u[7], cospi32);
    v[6] = _mm256_add_epi32(x, y);
    v[6] = _mm256_add_epi32(v[6], rnding);
    v[6] = _mm256_srai_epi32(v[6], bit);

    v[7] = _mm256_sub_epi32(x, y);
    v[7] = _mm256_add_epi32(v[7], rnding);
    v[7] = _mm256_srai_epi32(v[7], bit);

    v[8] = u[8];
    v[9] = u[9];

    x = _mm256_mullo_epi32(u[10], cospi32);
    y = _mm256_mullo_epi32(u[11], cospi32);
    v[10] = _mm256_add_epi32(x, y);
    v[10] = _mm256_add_epi32(v[10], rnding);
    v[10] = _mm256_srai_epi32(v[10], bit);

    v[11] = _mm256_sub_epi32(x, y);
    v[11] = _mm256_add_epi32(v[11], rnding);
    v[11] = _mm256_srai_epi32(v[11], bit);

    v[12] = u[12];
    v[13] = u[13];

    x = _mm256_mullo_epi32(u[14], cospi32);
    y = _mm256_mullo_epi32(u[15], cospi32);
    v[14] = _mm256_add_epi32(x, y);
    v[14] = _mm256_add_epi32(v[14], rnding);
    v[14] = _mm256_srai_epi32(v[14], bit);

    v[15] = _mm256_sub_epi32(x, y);
    v[15] = _mm256_add_epi32(v[15], rnding);
    v[15] = _mm256_srai_epi32(v[15], bit);

    // stage 3: add/sub butterflies within each group of four.
    u[0] = _mm256_add_epi32(v[0], v[2]);
    u[1] = _mm256_add_epi32(v[1], v[3]);
    u[2] = _mm256_sub_epi32(v[0], v[2]);
    u[3] = _mm256_sub_epi32(v[1], v[3]);
    u[4] = _mm256_add_epi32(v[4], v[6]);
    u[5] = _mm256_add_epi32(v[5], v[7]);
    u[6] = _mm256_sub_epi32(v[4], v[6]);
    u[7] = _mm256_sub_epi32(v[5], v[7]);
    u[8] = _mm256_add_epi32(v[8], v[10]);
    u[9] = _mm256_add_epi32(v[9], v[11]);
    u[10] = _mm256_sub_epi32(v[8], v[10]);
    u[11] = _mm256_sub_epi32(v[9], v[11]);
    u[12] = _mm256_add_epi32(v[12], v[14]);
    u[13] = _mm256_add_epi32(v[13], v[15]);
    u[14] = _mm256_sub_epi32(v[12], v[14]);
    u[15] = _mm256_sub_epi32(v[13], v[15]);

    // stage 4: cospi16/cospi48 rotations on lanes 4-7 and 12-15.
    v[0] = u[0];
    v[1] = u[1];
    v[2] = u[2];
    v[3] = u[3];
    v[4] = av1_half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
    v[5] = av1_half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
    v[6] = av1_half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
    v[7] = av1_half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
    v[8] = u[8];
    v[9] = u[9];
    v[10] = u[10];
    v[11] = u[11];
    v[12] = av1_half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
    v[13] =
        av1_half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
    v[14] =
        av1_half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
    v[15] = av1_half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);

    // stage 5: add/sub butterflies across half-width groups.
    u[0] = _mm256_add_epi32(v[0], v[4]);
    u[1] = _mm256_add_epi32(v[1], v[5]);
    u[2] = _mm256_add_epi32(v[2], v[6]);
    u[3] = _mm256_add_epi32(v[3], v[7]);
    u[4] = _mm256_sub_epi32(v[0], v[4]);
    u[5] = _mm256_sub_epi32(v[1], v[5]);
    u[6] = _mm256_sub_epi32(v[2], v[6]);
    u[7] = _mm256_sub_epi32(v[3], v[7]);
    u[8] = _mm256_add_epi32(v[8], v[12]);
    u[9] = _mm256_add_epi32(v[9], v[13]);
    u[10] = _mm256_add_epi32(v[10], v[14]);
    u[11] = _mm256_add_epi32(v[11], v[15]);
    u[12] = _mm256_sub_epi32(v[8], v[12]);
    u[13] = _mm256_sub_epi32(v[9], v[13]);
    u[14] = _mm256_sub_epi32(v[10], v[14]);
    u[15] = _mm256_sub_epi32(v[11], v[15]);

    // stage 6: cospi8/cospi56 and cospi40/cospi24 rotations on lanes 8-15.
    v[0] = u[0];
    v[1] = u[1];
    v[2] = u[2];
    v[3] = u[3];
    v[4] = u[4];
    v[5] = u[5];
    v[6] = u[6];
    v[7] = u[7];
    v[8] = av1_half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
    v[9] = av1_half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
    v[10] = av1_half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
    v[11] =
        av1_half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
    v[12] = av1_half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
    v[13] = av1_half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
    v[14] =
        av1_half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
    v[15] = av1_half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);

    // stage 7: full-width add/sub butterflies.
    u[0] = _mm256_add_epi32(v[0], v[8]);
    u[1] = _mm256_add_epi32(v[1], v[9]);
    u[2] = _mm256_add_epi32(v[2], v[10]);
    u[3] = _mm256_add_epi32(v[3], v[11]);
    u[4] = _mm256_add_epi32(v[4], v[12]);
    u[5] = _mm256_add_epi32(v[5], v[13]);
    u[6] = _mm256_add_epi32(v[6], v[14]);
    u[7] = _mm256_add_epi32(v[7], v[15]);
    u[8] = _mm256_sub_epi32(v[0], v[8]);
    u[9] = _mm256_sub_epi32(v[1], v[9]);
    u[10] = _mm256_sub_epi32(v[2], v[10]);
    u[11] = _mm256_sub_epi32(v[3], v[11]);
    u[12] = _mm256_sub_epi32(v[4], v[12]);
    u[13] = _mm256_sub_epi32(v[5], v[13]);
    u[14] = _mm256_sub_epi32(v[6], v[14]);
    u[15] = _mm256_sub_epi32(v[7], v[15]);

    // stage 8: final odd-angle rotations (cospi2..cospi62) per output pair.
    v[0] = av1_half_btf_avx2(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
    v[1] = av1_half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
    v[2] = av1_half_btf_avx2(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
    v[3] = av1_half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
    v[4] = av1_half_btf_avx2(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
    v[5] = av1_half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
    v[6] = av1_half_btf_avx2(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
    v[7] = av1_half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
    v[8] = av1_half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
    v[9] = av1_half_btf_avx2(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
    v[10] = av1_half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
    v[11] =
        av1_half_btf_avx2(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
    v[12] = av1_half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
    v[13] =
        av1_half_btf_avx2(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
    v[14] = av1_half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
    v[15] = av1_half_btf_avx2(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);

    // stage 9: reorder into output frequency order.
    out[0 * outstride + col] = v[1];
    out[1 * outstride + col] = v[14];
    out[2 * outstride + col] = v[3];
    out[3 * outstride + col] = v[12];
    out[4 * outstride + col] = v[5];
    out[5 * outstride + col] = v[10];
    out[6 * outstride + col] = v[7];
    out[7 * outstride + col] = v[8];
    out[8 * outstride + col] = v[9];
    out[9 * outstride + col] = v[6];
    out[10 * outstride + col] = v[11];
    out[11 * outstride + col] = v[4];
    out[12 * outstride + col] = v[13];
    out[13 * outstride + col] = v[2];
    out[14 * outstride + col] = v[15];
    out[15 * outstride + col] = v[0];
  }
}
// 16-point identity "transform": scales every lane by 2*NewSqrt2 with
// rounding (no mixing across rows). `bit` is unused.
static void idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit,
                        int col_num, const int outstride) {
  (void)bit;
  (void)outstride;
  // Scale factor 2*sqrt(2) in NewSqrt2Bits fixed point; offset rounds to
  // nearest on the subsequent arithmetic right shift.
  __m256i fact = _mm256_set1_epi32(2 * NewSqrt2);
  __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1));
  __m256i a_low;

  // 16 rows times col_num column groups, laid out contiguously.
  int num_iters = 16 * col_num;
  for (int i = 0; i < num_iters; i++) {
    a_low = _mm256_mullo_epi32(in[i], fact);
    a_low = _mm256_add_epi32(a_low, offset);
    out[i] = _mm256_srai_epi32(a_low, NewSqrt2Bits);
  }
}
// Column (vertical) 16-point kernel per tx_type for the 8x16 path.
static const transform_1d_avx2 col_highbd_txfm8x16_arr[TX_TYPES] = {
  fdct16_avx2,   // DCT_DCT
  fadst16_avx2,  // ADST_DCT
  fdct16_avx2,   // DCT_ADST
  fadst16_avx2,  // ADST_ADST
  fadst16_avx2,  // FLIPADST_DCT
  fdct16_avx2,   // DCT_FLIPADST
  fadst16_avx2,  // FLIPADST_FLIPADST
  fadst16_avx2,  // ADST_FLIPADST
  fadst16_avx2,  // FLIPADST_ADST
  idtx16_avx2,   // IDTX
  fdct16_avx2,   // V_DCT
  idtx16_avx2,   // H_DCT
  fadst16_avx2,  // V_ADST
  idtx16_avx2,   // H_ADST
  fadst16_avx2,  // V_FLIPADST
  idtx16_avx2    // H_FLIPADST
};
// Row (horizontal) 8-point kernel per tx_type for the 8x16 path.
static const transform_1d_avx2 row_highbd_txfm8x8_arr[TX_TYPES] = {
  fdct8_avx2,   // DCT_DCT
  fdct8_avx2,   // ADST_DCT
  fadst8_avx2,  // DCT_ADST
  fadst8_avx2,  // ADST_ADST
  fdct8_avx2,   // FLIPADST_DCT
  fadst8_avx2,  // DCT_FLIPADST
  fadst8_avx2,  // FLIPADST_FLIPADST
  fadst8_avx2,  // ADST_FLIPADST
  fadst8_avx2,  // FLIPADST_ADST
  idtx8_avx2,   // IDTX
  idtx8_avx2,   // V_DCT
  fdct8_avx2,   // H_DCT
  idtx8_avx2,   // V_ADST
  fadst8_avx2,  // H_ADST
  idtx8_avx2,   // V_FLIPADST
  fadst8_avx2   // H_FLIPADST
};
// 8x16 high-bitdepth 2-D forward transform: column pass, rounding,
// transpose, row pass, then a rectangular-size rescale by NewSqrt2.
void av1_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  __m256i in[16], out[16];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
  const int txw_idx = get_txw_idx(TX_8X16);
  const int txh_idx = get_txh_idx(TX_8X16);
  const transform_1d_avx2 col_txfm = col_highbd_txfm8x16_arr[tx_type];
  const transform_1d_avx2 row_txfm =
      row_highbd_txfm8x8_arr[tx_type];
  const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Load (with optional up/down and left/right flips) and pre-shift.
  load_buffer_8x16_avx2(input, in, stride, ud_flip, lr_flip, shift[0]);
  col_txfm(in, out, bit, 1, 1);
  // Round both 8x8 halves of the 8x16 column-pass result.
  col_txfm_8x8_rounding(out, -shift[1]);
  col_txfm_8x8_rounding(&out[8], -shift[1]);
  fwd_txfm_transpose_8x8_avx2(out, in, 1, 2);
  fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2);
  row_txfm(in, out, bit, 2, 2);
  // Rectangular block: rescale by sqrt(2) (NewSqrt2 fixed point).
  round_shift_rect_array_32_avx2(out, in, 16, -shift[2], NewSqrt2);
  store_buffer_avx2(in, coeff, 8, 16);
  (void)bd;
}
// Column (vertical) 8-point kernel per tx_type for the 16x8 path.
static const transform_1d_avx2 col_highbd_txfm8x8_arr[TX_TYPES] = {
  fdct8_avx2,   // DCT_DCT
  fadst8_avx2,  // ADST_DCT
  fdct8_avx2,   // DCT_ADST
  fadst8_avx2,  // ADST_ADST
  fadst8_avx2,  // FLIPADST_DCT
  fdct8_avx2,   // DCT_FLIPADST
  fadst8_avx2,  // FLIPADST_FLIPADST
  fadst8_avx2,  // ADST_FLIPADST
  fadst8_avx2,  // FLIPADST_ADST
  idtx8_avx2,   // IDTX
  fdct8_avx2,   // V_DCT
  idtx8_avx2,   // H_DCT
  fadst8_avx2,  // V_ADST
  idtx8_avx2,   // H_ADST
  fadst8_avx2,  // V_FLIPADST
  idtx8_avx2    // H_FLIPADST
};
// Row (horizontal) 16-point kernel per tx_type for the 16x8 path.
static const transform_1d_avx2 row_highbd_txfm8x16_arr[TX_TYPES] = {
  fdct16_avx2,   // DCT_DCT
  fdct16_avx2,   // ADST_DCT
  fadst16_avx2,  // DCT_ADST
  fadst16_avx2,  // ADST_ADST
  fdct16_avx2,   // FLIPADST_DCT
  fadst16_avx2,  // DCT_FLIPADST
  fadst16_avx2,  // FLIPADST_FLIPADST
  fadst16_avx2,  // ADST_FLIPADST
  fadst16_avx2,  // FLIPADST_ADST
  idtx16_avx2,   // IDTX
  idtx16_avx2,   // V_DCT
  fdct16_avx2,   // H_DCT
  idtx16_avx2,   // V_ADST
  fadst16_avx2,  // H_ADST
  idtx16_avx2,   // V_FLIPADST
  fadst16_avx2   // H_FLIPADST
};
// 16x8 high-bitdepth 2-D forward transform: column pass, rounding,
// transpose, row pass, then a rectangular-size rescale by NewSqrt2.
void av1_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  __m256i in[16], out[16];
  const int8_t *shift =
av1_fwd_txfm_shift_ls[TX_16X8]; 1372 const int txw_idx = get_txw_idx(TX_16X8); 1373 const int txh_idx = get_txh_idx(TX_16X8); 1374 const transform_1d_avx2 col_txfm = col_highbd_txfm8x8_arr[tx_type]; 1375 const transform_1d_avx2 row_txfm = row_highbd_txfm8x16_arr[tx_type]; 1376 const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 1377 int ud_flip, lr_flip; 1378 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 1379 1380 load_buffer_16xn_avx2(input, in, stride, 8, 2, ud_flip, lr_flip); 1381 round_shift_32_8xn_avx2(in, 16, shift[0], 1); 1382 col_txfm(in, out, bit, 2, 2); 1383 round_shift_32_8xn_avx2(out, 16, shift[1], 1); 1384 fwd_txfm_transpose_8x8_avx2(out, in, 2, 1); 1385 fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1); 1386 row_txfm(in, out, bit, 1, 1); 1387 round_shift_rect_array_32_avx2(out, out, 16, -shift[2], NewSqrt2); 1388 store_buffer_avx2(out, coeff, 8, 16); 1389 (void)bd; 1390 } 1391 void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride, 1392 TX_TYPE tx_type, int bd) { 1393 __m256i in[32], out[32]; 1394 const TX_SIZE tx_size = TX_16X16; 1395 const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; 1396 const int txw_idx = get_txw_idx(tx_size); 1397 const int txh_idx = get_txh_idx(tx_size); 1398 const int width = tx_size_wide[tx_size]; 1399 const int height = tx_size_high[tx_size]; 1400 const int width_div8 = (width >> 3); 1401 const int width_div16 = (width >> 4); 1402 const int size = (height << 1); 1403 switch (tx_type) { 1404 case DCT_DCT: 1405 load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); 1406 round_shift_32_8xn_avx2(in, size, shift[0], width_div16); 1407 fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, 1408 width_div8); 1409 round_shift_32_8xn_avx2(out, size, shift[1], width_div16); 1410 fwd_txfm_transpose_16x16_avx2(out, in); 1411 fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, 1412 width_div8); 1413 store_buffer_avx2(out, coeff, 8, 32); 1414 break; 1415 
case ADST_DCT: 1416 load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); 1417 round_shift_32_8xn_avx2(in, size, shift[0], width_div16); 1418 fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, 1419 width_div8); 1420 round_shift_32_8xn_avx2(out, size, shift[1], width_div16); 1421 fwd_txfm_transpose_16x16_avx2(out, in); 1422 fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, 1423 width_div8); 1424 store_buffer_avx2(out, coeff, 8, 32); 1425 break; 1426 case DCT_ADST: 1427 load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); 1428 round_shift_32_8xn_avx2(in, size, shift[0], width_div16); 1429 fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, 1430 width_div8); 1431 round_shift_32_8xn_avx2(out, size, shift[1], width_div16); 1432 fwd_txfm_transpose_16x16_avx2(out, in); 1433 fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, 1434 width_div8); 1435 store_buffer_avx2(out, coeff, 8, 32); 1436 break; 1437 case ADST_ADST: 1438 load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); 1439 round_shift_32_8xn_avx2(in, size, shift[0], width_div16); 1440 fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, 1441 width_div8); 1442 round_shift_32_8xn_avx2(out, size, shift[1], width_div16); 1443 fwd_txfm_transpose_16x16_avx2(out, in); 1444 fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, 1445 width_div8); 1446 store_buffer_avx2(out, coeff, 8, 32); 1447 break; 1448 case FLIPADST_DCT: 1449 load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); 1450 round_shift_32_8xn_avx2(in, size, shift[0], width_div16); 1451 fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, 1452 width_div8); 1453 round_shift_32_8xn_avx2(out, size, shift[1], width_div16); 1454 fwd_txfm_transpose_16x16_avx2(out, in); 1455 fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, 1456 width_div8); 1457 
store_buffer_avx2(out, coeff, 8, 32); 1458 break; 1459 case DCT_FLIPADST: 1460 load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); 1461 round_shift_32_8xn_avx2(in, size, shift[0], width_div16); 1462 fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, 1463 width_div8); 1464 round_shift_32_8xn_avx2(out, size, shift[1], width_div16); 1465 fwd_txfm_transpose_16x16_avx2(out, in); 1466 fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, 1467 width_div8); 1468 store_buffer_avx2(out, coeff, 8, 32); 1469 break; 1470 case FLIPADST_FLIPADST: 1471 load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 1); 1472 round_shift_32_8xn_avx2(in, size, shift[0], width_div16); 1473 fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, 1474 width_div8); 1475 round_shift_32_8xn_avx2(out, size, shift[1], width_div16); 1476 fwd_txfm_transpose_16x16_avx2(out, in); 1477 fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, 1478 width_div8); 1479 store_buffer_avx2(out, coeff, 8, 32); 1480 break; 1481 case ADST_FLIPADST: 1482 load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); 1483 round_shift_32_8xn_avx2(in, size, shift[0], width_div16); 1484 fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, 1485 width_div8); 1486 round_shift_32_8xn_avx2(out, size, shift[1], width_div16); 1487 fwd_txfm_transpose_16x16_avx2(out, in); 1488 fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, 1489 width_div8); 1490 store_buffer_avx2(out, coeff, 8, 32); 1491 break; 1492 case FLIPADST_ADST: 1493 load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); 1494 round_shift_32_8xn_avx2(in, size, shift[0], width_div16); 1495 fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, 1496 width_div8); 1497 round_shift_32_8xn_avx2(out, size, shift[1], width_div16); 1498 fwd_txfm_transpose_16x16_avx2(out, in); 1499 fadst16_avx2(in, out, 
av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, 1500 width_div8); 1501 store_buffer_avx2(out, coeff, 8, 32); 1502 break; 1503 case IDTX: 1504 load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); 1505 round_shift_32_8xn_avx2(in, size, shift[0], width_div16); 1506 idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, 1507 width_div8); 1508 round_shift_32_8xn_avx2(out, size, shift[1], width_div16); 1509 fwd_txfm_transpose_16x16_avx2(out, in); 1510 idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, 1511 width_div8); 1512 store_buffer_avx2(out, coeff, 8, 32); 1513 break; 1514 case V_DCT: 1515 load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); 1516 round_shift_32_8xn_avx2(in, size, shift[0], width_div16); 1517 fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, 1518 width_div8); 1519 round_shift_32_8xn_avx2(out, size, shift[1], width_div16); 1520 fwd_txfm_transpose_16x16_avx2(out, in); 1521 idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, 1522 width_div8); 1523 store_buffer_avx2(out, coeff, 8, 32); 1524 break; 1525 case H_DCT: 1526 load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); 1527 round_shift_32_8xn_avx2(in, size, shift[0], width_div16); 1528 idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, 1529 width_div8); 1530 round_shift_32_8xn_avx2(out, size, shift[1], width_div16); 1531 fwd_txfm_transpose_16x16_avx2(out, in); 1532 fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, 1533 width_div8); 1534 store_buffer_avx2(out, coeff, 8, 32); 1535 break; 1536 case V_ADST: 1537 load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); 1538 round_shift_32_8xn_avx2(in, size, shift[0], width_div16); 1539 fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, 1540 width_div8); 1541 round_shift_32_8xn_avx2(out, size, shift[1], width_div16); 1542 fwd_txfm_transpose_16x16_avx2(out, 
          in);
      // Row pass: identity transform, then write coefficients out.
      idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
                  width_div8);
      store_buffer_avx2(out, coeff, 8, 32);
      break;
    case H_ADST:
      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
      idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                  width_div8);
      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
      fwd_txfm_transpose_16x16_avx2(out, in);
      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
                   width_div8);
      store_buffer_avx2(out, coeff, 8, 32);
      break;
    case V_FLIPADST:
      // flipud = 1: rows are loaded in reverse order before the vertical ADST.
      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                   width_div8);
      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
      fwd_txfm_transpose_16x16_avx2(out, in);
      idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
                  width_div8);
      store_buffer_avx2(out, coeff, 8, 32);
      break;
    case H_FLIPADST:
      // fliplr = 1: columns are mirrored before the horizontal ADST.
      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
      idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
                  width_div8);
      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
      fwd_txfm_transpose_16x16_avx2(out, in);
      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
                   width_div8);
      store_buffer_avx2(out, coeff, 8, 32);
      break;
    default: assert(0);
  }
  (void)bd;
}
// Forward 32-point DCT over 8 parallel 32-bit lanes (one __m256i per
// element).  `instride`/`outstride` select the element spacing in the
// input/output vector arrays so the same kernel can walk rows or columns
// of a larger buffer.  Implements the standard 9-stage fdct32 butterfly
// network; `btf_32_avx2_type0` performs the paired cos/sin rotation with
// rounding at `cos_bit` precision.
static inline void fdct32_avx2(__m256i *input, __m256i *output,
                               const int8_t cos_bit, const int instride,
                               const int outstride) {
  __m256i buf0[32];
  __m256i buf1[32];
  const int32_t *cospi;
  int startidx = 0 * instride;
  int endidx = 31 * instride;
  // stage 0
  // stage 1
  // Butterfly: element i pairs with element (31 - i).
  buf1[0] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[31] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[1] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[30] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[2] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[29] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[3] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[28] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[4] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[27] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[5] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[26] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[6] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[25] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[7] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[24] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[8] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[23] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[9] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[22] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[10] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[21] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[11] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[20] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[12] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[19] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[13] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[18] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[14] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[17] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  buf1[15] = _mm256_add_epi32(input[startidx], input[endidx]);
  buf1[16] = _mm256_sub_epi32(input[startidx], input[endidx]);

  // stage 2
  cospi = cospi_arr(cos_bit);
  buf0[0] = _mm256_add_epi32(buf1[0], buf1[15]);
  buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]);
  buf0[1] = _mm256_add_epi32(buf1[1], buf1[14]);
  buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]);
  buf0[2] = _mm256_add_epi32(buf1[2], buf1[13]);
  buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]);
  buf0[3] = _mm256_add_epi32(buf1[3], buf1[12]);
  buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]);
  buf0[4] = _mm256_add_epi32(buf1[4], buf1[11]);
  buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]);
  buf0[5] = _mm256_add_epi32(buf1[5], buf1[10]);
  buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]);
  buf0[6] = _mm256_add_epi32(buf1[6], buf1[9]);
  buf0[9] = _mm256_sub_epi32(buf1[6], buf1[9]);
  buf0[7] = _mm256_add_epi32(buf1[7], buf1[8]);
  buf0[8] = _mm256_sub_epi32(buf1[7], buf1[8]);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  buf0[18] = buf1[18];
  buf0[19] = buf1[19];
  btf_32_avx2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
                    buf0[27], cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
                    buf0[26], cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
                    buf0[25], cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
                    buf0[24], cos_bit);
  buf0[28] = buf1[28];
  buf0[29] = buf1[29];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 3
  cospi = cospi_arr(cos_bit);
  buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]);
  buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]);
  buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]);
  buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]);
  buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]);
  buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]);
  buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]);
  buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]);
  buf1[8] = buf0[8];
  buf1[9] = buf0[9];
  btf_32_avx2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
                    buf1[13], cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
                    buf1[12], cos_bit);
  buf1[14] = buf0[14];
  buf1[15] = buf0[15];
  buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]);
  buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]);
  buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]);
  buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]);
  buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]);
  buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]);
  buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]);
  buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]);
  buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]);
  buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]);
  buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]);
  buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]);
  buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]);
  buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]);
  buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]);
  buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]);

  // stage 4
  cospi = cospi_arr(cos_bit);
  buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]);
  buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]);
  buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]);
  buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]);
  buf0[4] = buf1[4];
  btf_32_avx2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
                    cos_bit);
  buf0[7] = buf1[7];
  buf0[8] = _mm256_add_epi32(buf1[8], buf1[11]);
  buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]);
  buf0[9] = _mm256_add_epi32(buf1[9], buf1[10]);
  buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]);
  buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]);
  buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]);
  buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]);
  buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  btf_32_avx2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
                    buf0[29], cos_bit);
  btf_32_avx2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
                    buf0[28], cos_bit);
  btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
                    buf0[27], cos_bit);
  btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
                    buf0[26], cos_bit);
  buf0[22] = buf1[22];
  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[25] = buf1[25];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 5
  cospi = cospi_arr(cos_bit);
  btf_32_avx2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
                    cos_bit);
  btf_32_avx2_type0(cospi[16], cospi[48], buf0[3], buf0[2], buf1[2], buf1[3],
                    cos_bit);
  buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]);
  buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]);
  buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]);
  buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]);
  buf1[8] = buf0[8];
  btf_32_avx2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
                    cos_bit);
  btf_32_avx2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
                    buf1[13], cos_bit);
  buf1[11] = buf0[11];
  buf1[12] = buf0[12];
  buf1[15] = buf0[15];
  buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]);
  buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]);
  buf1[17] = _mm256_add_epi32(buf0[17], buf0[18]);
  buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]);
  buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]);
  buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]);
  buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]);
  buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]);
  buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]);
  buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]);
  buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]);
  buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]);
  buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]);
  buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]);
  buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]);
  buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]);

  // stage 6
  cospi = cospi_arr(cos_bit);
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];
  btf_32_avx2_type0(cospi[8], cospi[56], buf1[7], buf1[4], buf0[4], buf0[7],
                    cos_bit);
  btf_32_avx2_type0(cospi[40], cospi[24], buf1[6], buf1[5], buf0[5], buf0[6],
                    cos_bit);
  buf0[8] = _mm256_add_epi32(buf1[8], buf1[9]);
  buf0[9] = _mm256_sub_epi32(buf1[8], buf1[9]);
  buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]);
  buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]);
  buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]);
  buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]);
  buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]);
  buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]);
  buf0[16] = buf1[16];
  btf_32_avx2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
                    buf0[30], cos_bit);
  btf_32_avx2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
                    buf0[29], cos_bit);
  buf0[19] = buf1[19];
  buf0[20] = buf1[20];
  btf_32_avx2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
                    buf0[26], cos_bit);
  btf_32_avx2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
                    buf0[25], cos_bit);
  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[27] = buf1[27];
  buf0[28] = buf1[28];
  buf0[31] = buf1[31];

  // stage 7
  cospi = cospi_arr(cos_bit);
  buf1[0] = buf0[0];
  buf1[1] = buf0[1];
  buf1[2] = buf0[2];
  buf1[3] = buf0[3];
  buf1[4] = buf0[4];
  buf1[5] = buf0[5];
  buf1[6] = buf0[6];
  buf1[7] = buf0[7];
  btf_32_avx2_type0(cospi[4], cospi[60], buf0[15], buf0[8], buf1[8], buf1[15],
                    cos_bit);
  btf_32_avx2_type0(cospi[36], cospi[28], buf0[14], buf0[9], buf1[9], buf1[14],
                    cos_bit);
  btf_32_avx2_type0(cospi[20], cospi[44], buf0[13], buf0[10], buf1[10],
                    buf1[13], cos_bit);
  btf_32_avx2_type0(cospi[52], cospi[12], buf0[12], buf0[11], buf1[11],
                    buf1[12], cos_bit);
  buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]);
  buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]);
  buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]);
  buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]);
  buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]);
  buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]);
  buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]);
  buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]);
  buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]);
  buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]);
  buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]);
  buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]);
  buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]);
  buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]);
  buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]);
  buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]);

  // stage 8
  cospi = cospi_arr(cos_bit);
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];
  buf0[4] = buf1[4];
  buf0[5] = buf1[5];
  buf0[6] = buf1[6];
  buf0[7] = buf1[7];
  buf0[8] = buf1[8];
  buf0[9] = buf1[9];
  buf0[10] = buf1[10];
  buf0[11] = buf1[11];
  buf0[12] = buf1[12];
  buf0[13] = buf1[13];
  buf0[14] = buf1[14];
  buf0[15] = buf1[15];
  btf_32_avx2_type0(cospi[2], cospi[62], buf1[31], buf1[16], buf0[16], buf0[31],
                    cos_bit);
  btf_32_avx2_type0(cospi[34], cospi[30], buf1[30], buf1[17], buf0[17],
                    buf0[30], cos_bit);
  btf_32_avx2_type0(cospi[18], cospi[46], buf1[29], buf1[18], buf0[18],
                    buf0[29], cos_bit);
  btf_32_avx2_type0(cospi[50], cospi[14], buf1[28], buf1[19], buf0[19],
                    buf0[28], cos_bit);
  btf_32_avx2_type0(cospi[10], cospi[54], buf1[27], buf1[20], buf0[20],
                    buf0[27], cos_bit);
  btf_32_avx2_type0(cospi[42], cospi[22], buf1[26], buf1[21], buf0[21],
                    buf0[26], cos_bit);
  btf_32_avx2_type0(cospi[26], cospi[38], buf1[25], buf1[22], buf0[22],
                    buf0[25], cos_bit);
  btf_32_avx2_type0(cospi[58], cospi[6], buf1[24], buf1[23], buf0[23], buf0[24],
                    cos_bit);

  startidx = 0 * outstride;
  endidx = 31 * outstride;
  // stage 9
  // Bit-reversal-style output reordering of the 32 results.
  output[startidx] = buf0[0];
  output[endidx] = buf0[31];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[16];
  output[endidx] = buf0[15];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[8];
  output[endidx] = buf0[23];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[24];
  output[endidx] = buf0[7];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[4];
  output[endidx] = buf0[27];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[20];
  output[endidx] = buf0[11];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[12];
  output[endidx] = buf0[19];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[28];
  output[endidx] = buf0[3];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[2];
  output[endidx] = buf0[29];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[18];
  output[endidx] = buf0[13];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[10];
  output[endidx] = buf0[21];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[26];
  output[endidx] = buf0[5];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[6];
  output[endidx] = buf0[25];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[22];
  output[endidx] = buf0[9];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[14];
  output[endidx] = buf0[17];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = buf0[30];
  output[endidx] = buf0[1];
}
// 32-point identity transform: every coefficient is scaled by 4 (<< 2).
// cos_bit is unused; the parameter list matches the transform_1d_avx2
// signature so it can live in the dispatch tables below.  The loop is
// unrolled 8-wide.
static inline void idtx32x32_avx2(__m256i *input, __m256i *output,
                                  const int8_t cos_bit, int instride,
                                  int outstride) {
  (void)cos_bit;
  for (int i = 0; i < 32; i += 8) {
    output[i * outstride] = _mm256_slli_epi32(input[i * instride], 2);
    output[(i + 1) * outstride] =
        _mm256_slli_epi32(input[(i + 1) * instride], 2);
    output[(i + 2) * outstride] =
        _mm256_slli_epi32(input[(i + 2) * instride], 2);
    output[(i + 3) * outstride] =
        _mm256_slli_epi32(input[(i + 3) * instride], 2);
    output[(i + 4) * outstride] =
        _mm256_slli_epi32(input[(i + 4) * instride], 2);
    output[(i + 5) * outstride] =
        _mm256_slli_epi32(input[(i + 5) * instride], 2);
    output[(i + 6) *
               outstride] =
        _mm256_slli_epi32(input[(i + 6) * instride], 2);
    output[(i + 7) * outstride] =
        _mm256_slli_epi32(input[(i + 7) * instride], 2);
  }
}
// 1-D column-transform dispatch for 32-point transforms.  For 32x32 only
// DCT_DCT and IDTX are legal tx types; every other slot is NULL and must
// never be selected by the caller.
static const transform_1d_avx2 col_txfm8x32_arr[TX_TYPES] = {
  fdct32_avx2,     // DCT_DCT
  NULL,            // ADST_DCT
  NULL,            // DCT_ADST
  NULL,            // ADST_ADST
  NULL,            // FLIPADST_DCT
  NULL,            // DCT_FLIPADST
  NULL,            // FLIPADST_FLIPADST
  NULL,            // ADST_FLIPADST
  NULL,            // FLIPADST_ADST
  idtx32x32_avx2,  // IDTX
  NULL,            // V_DCT
  NULL,            // H_DCT
  NULL,            // V_ADST
  NULL,            // H_ADST
  NULL,            // V_FLIPADST
  NULL             // H_FLIPADST
};
// 1-D row-transform dispatch for 32-point transforms (same legal subset
// as the column table above).
static const transform_1d_avx2 row_txfm8x32_arr[TX_TYPES] = {
  fdct32_avx2,     // DCT_DCT
  NULL,            // ADST_DCT
  NULL,            // DCT_ADST
  NULL,            // ADST_ADST
  NULL,            // FLIPADST_DCT
  NULL,            // DCT_FLIPADST
  NULL,            // FLIPADST_FLIPADST
  NULL,            // ADST_FLIPADST
  NULL,            // FLIPADST_ADST
  idtx32x32_avx2,  // IDTX
  NULL,            // V_DCT
  NULL,            // H_DCT
  NULL,            // V_ADST
  NULL,            // H_ADST
  NULL,            // V_FLIPADST
  NULL             // H_FLIPADST
};
// 32x32 high-bitdepth forward transform: column pass (+round shifts),
// 8x8 block transposes buf0 -> buf1, row pass (+round shift), then store
// the 128 __m256i of coefficients.
void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
                               int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  __m256i buf0[128], buf1[128];
  const int tx_size = TX_32X32;
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = col_txfm8x32_arr[tx_type];
  const transform_1d_avx2 row_txfm = row_txfm8x32_arr[tx_type];
  int r, c;
  const int width_div16 = (width >> 4);
  const int width_div8 = (width >> 3);

  // Column pass: process the 32 columns in two 16-wide batches.
  for (int i = 0; i < width_div16; i++) {
    load_buffer_16xn_avx2(input + (i << 4), &buf0[(i << 1)], stride, height,
                          width_div8, 0, 0);
    round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[0], width_div8);
    round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8);
    col_txfm(&buf0[(i << 1)], &buf0[(i << 1)], cos_bit_col, width_div8,
             width_div8);
    col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8,
             width_div8);
    round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[1], width_div8);
    round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8);
  }

  // Transpose 8x8 sub-blocks so rows become contiguous for the row pass.
  for (r = 0; r < height; r += 8) {
    for (c = 0; c < width_div8; c++) {
      fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
                                  &buf1[c * 8 * width_div8 + (r >> 3)],
                                  width_div8, width_div8);
    }
  }

  // Row pass over the transposed data, then the final round shift.
  for (int i = 0; i < width_div16; i++) {
    row_txfm(&buf1[(i << 1)], &buf1[(i << 1)], cos_bit_row, width_div8,
             width_div8);
    row_txfm(&buf1[(i << 1) + 1], &buf1[(i << 1) + 1], cos_bit_row, width_div8,
             width_div8);
    round_shift_32_8xn_avx2(&buf1[(i << 1)], height, shift[2], width_div8);
    round_shift_32_8xn_avx2(&buf1[(i << 1) + 1], height, shift[2], width_div8);
  }

  store_buffer_avx2(buf1, output, 8, 128);
}
// fdct64 stage 2: butterflies pairing element i with (31 - i) in the even
// half; in the odd half (32..63) elements 40..55 get the +/-cos(pi/4)
// rotation while the rest pass through unchanged.
static inline void fdct64_stage2_avx2(__m256i *x1, __m256i *x2,
                                      __m256i *cospi_m32, __m256i *cospi_p32,
                                      const __m256i *__rounding,
                                      int8_t cos_bit) {
  x2[0] = _mm256_add_epi32(x1[0], x1[31]);
  x2[31] = _mm256_sub_epi32(x1[0], x1[31]);
  x2[1] = _mm256_add_epi32(x1[1], x1[30]);
  x2[30] = _mm256_sub_epi32(x1[1], x1[30]);
  x2[2] = _mm256_add_epi32(x1[2], x1[29]);
  x2[29] = _mm256_sub_epi32(x1[2], x1[29]);
  x2[3] = _mm256_add_epi32(x1[3], x1[28]);
  x2[28] = _mm256_sub_epi32(x1[3], x1[28]);
  x2[4] = _mm256_add_epi32(x1[4], x1[27]);
  x2[27] = _mm256_sub_epi32(x1[4], x1[27]);
  x2[5] = _mm256_add_epi32(x1[5], x1[26]);
  x2[26] = _mm256_sub_epi32(x1[5], x1[26]);
  x2[6] = _mm256_add_epi32(x1[6], x1[25]);
  x2[25] = _mm256_sub_epi32(x1[6], x1[25]);
  x2[7] = _mm256_add_epi32(x1[7], x1[24]);
  x2[24] = _mm256_sub_epi32(x1[7], x1[24]);
  x2[8] = _mm256_add_epi32(x1[8], x1[23]);
  x2[23] = _mm256_sub_epi32(x1[8], x1[23]);
  x2[9] = _mm256_add_epi32(x1[9], x1[22]);
  x2[22] = _mm256_sub_epi32(x1[9], x1[22]);
  x2[10] = _mm256_add_epi32(x1[10], x1[21]);
  x2[21] = _mm256_sub_epi32(x1[10], x1[21]);
  x2[11] = _mm256_add_epi32(x1[11], x1[20]);
  x2[20] = _mm256_sub_epi32(x1[11], x1[20]);
  x2[12] = _mm256_add_epi32(x1[12], x1[19]);
  x2[19] = _mm256_sub_epi32(x1[12], x1[19]);
  x2[13] = _mm256_add_epi32(x1[13], x1[18]);
  x2[18] = _mm256_sub_epi32(x1[13], x1[18]);
  x2[14] = _mm256_add_epi32(x1[14], x1[17]);
  x2[17] = _mm256_sub_epi32(x1[14], x1[17]);
  x2[15] = _mm256_add_epi32(x1[15], x1[16]);
  x2[16] = _mm256_sub_epi32(x1[15], x1[16]);
  x2[32] = x1[32];
  x2[33] = x1[33];
  x2[34] = x1[34];
  x2[35] = x1[35];
  x2[36] = x1[36];
  x2[37] = x1[37];
  x2[38] = x1[38];
  x2[39] = x1[39];
  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[40], x1[55], x2[40], x2[55],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[41], x1[54], x2[41], x2[54],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[42], x1[53], x2[42], x2[53],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[43], x1[52], x2[43], x2[52],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[44], x1[51], x2[44], x2[51],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[45], x1[50], x2[45], x2[50],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[46], x1[49], x2[46], x2[49],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[47], x1[48], x2[47], x2[48],
                        *__rounding, cos_bit);
  x2[56] = x1[56];
  x2[57] = x1[57];
  x2[58] = x1[58];
  x2[59] = x1[59];
  x2[60] = x1[60];
  x2[61] = x1[61];
  x2[62] = x1[62];
  x2[63] = x1[63];
}
// fdct64 stage 3: 16-point butterflies in the even half (with cos(pi/4)
// rotations on 20..27); add/sub butterflies across the odd half.
static inline void fdct64_stage3_avx2(__m256i *x2, __m256i *x3,
                                      __m256i *cospi_m32, __m256i *cospi_p32,
                                      const __m256i *__rounding,
                                      int8_t cos_bit) {
  x3[0] = _mm256_add_epi32(x2[0], x2[15]);
  x3[15] = _mm256_sub_epi32(x2[0], x2[15]);
  x3[1] = _mm256_add_epi32(x2[1], x2[14]);
  x3[14] = _mm256_sub_epi32(x2[1], x2[14]);
  x3[2] = _mm256_add_epi32(x2[2], x2[13]);
  x3[13] = _mm256_sub_epi32(x2[2], x2[13]);
  x3[3] = _mm256_add_epi32(x2[3], x2[12]);
  x3[12] = _mm256_sub_epi32(x2[3], x2[12]);
  x3[4] = _mm256_add_epi32(x2[4], x2[11]);
  x3[11] = _mm256_sub_epi32(x2[4], x2[11]);
  x3[5] = _mm256_add_epi32(x2[5], x2[10]);
  x3[10] = _mm256_sub_epi32(x2[5], x2[10]);
  x3[6] = _mm256_add_epi32(x2[6], x2[9]);
  x3[9] = _mm256_sub_epi32(x2[6], x2[9]);
  x3[7] = _mm256_add_epi32(x2[7], x2[8]);
  x3[8] = _mm256_sub_epi32(x2[7], x2[8]);
  x3[16] = x2[16];
  x3[17] = x2[17];
  x3[18] = x2[18];
  x3[19] = x2[19];
  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[20], x2[27], x3[20], x3[27],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[21], x2[26], x3[21], x3[26],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[22], x2[25], x3[22], x3[25],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[23], x2[24], x3[23], x3[24],
                        *__rounding, cos_bit);
  x3[28] = x2[28];
  x3[29] = x2[29];
  x3[30] = x2[30];
  x3[31] = x2[31];
  x3[32] = _mm256_add_epi32(x2[32], x2[47]);
  x3[47] = _mm256_sub_epi32(x2[32], x2[47]);
  x3[33] = _mm256_add_epi32(x2[33], x2[46]);
  x3[46] = _mm256_sub_epi32(x2[33], x2[46]);
  x3[34] = _mm256_add_epi32(x2[34],
                            x2[45]);
  x3[45] = _mm256_sub_epi32(x2[34], x2[45]);
  x3[35] = _mm256_add_epi32(x2[35], x2[44]);
  x3[44] = _mm256_sub_epi32(x2[35], x2[44]);
  x3[36] = _mm256_add_epi32(x2[36], x2[43]);
  x3[43] = _mm256_sub_epi32(x2[36], x2[43]);
  x3[37] = _mm256_add_epi32(x2[37], x2[42]);
  x3[42] = _mm256_sub_epi32(x2[37], x2[42]);
  x3[38] = _mm256_add_epi32(x2[38], x2[41]);
  x3[41] = _mm256_sub_epi32(x2[38], x2[41]);
  x3[39] = _mm256_add_epi32(x2[39], x2[40]);
  x3[40] = _mm256_sub_epi32(x2[39], x2[40]);
  x3[48] = _mm256_sub_epi32(x2[63], x2[48]);
  x3[63] = _mm256_add_epi32(x2[63], x2[48]);
  x3[49] = _mm256_sub_epi32(x2[62], x2[49]);
  x3[62] = _mm256_add_epi32(x2[62], x2[49]);
  x3[50] = _mm256_sub_epi32(x2[61], x2[50]);
  x3[61] = _mm256_add_epi32(x2[61], x2[50]);
  x3[51] = _mm256_sub_epi32(x2[60], x2[51]);
  x3[60] = _mm256_add_epi32(x2[60], x2[51]);
  x3[52] = _mm256_sub_epi32(x2[59], x2[52]);
  x3[59] = _mm256_add_epi32(x2[59], x2[52]);
  x3[53] = _mm256_sub_epi32(x2[58], x2[53]);
  x3[58] = _mm256_add_epi32(x2[58], x2[53]);
  x3[54] = _mm256_sub_epi32(x2[57], x2[54]);
  x3[57] = _mm256_add_epi32(x2[57], x2[54]);
  x3[55] = _mm256_sub_epi32(x2[56], x2[55]);
  x3[56] = _mm256_add_epi32(x2[56], x2[55]);
}
// fdct64 stage 4: 8-point butterflies on 0..15, add/sub butterflies on
// 16..31, and +/-cos(pi/8 family) rotations on 36..43 / 52..59 of the
// odd half.
static inline void fdct64_stage4_avx2(__m256i *x3, __m256i *x4,
                                      __m256i *cospi_m32, __m256i *cospi_p32,
                                      __m256i *cospi_m16, __m256i *cospi_p48,
                                      __m256i *cospi_m48,
                                      const __m256i *__rounding,
                                      int8_t cos_bit) {
  x4[0] = _mm256_add_epi32(x3[0], x3[7]);
  x4[7] = _mm256_sub_epi32(x3[0], x3[7]);
  x4[1] = _mm256_add_epi32(x3[1], x3[6]);
  x4[6] = _mm256_sub_epi32(x3[1], x3[6]);
  x4[2] = _mm256_add_epi32(x3[2], x3[5]);
  x4[5] = _mm256_sub_epi32(x3[2], x3[5]);
  x4[3] = _mm256_add_epi32(x3[3], x3[4]);
  x4[4] = _mm256_sub_epi32(x3[3], x3[4]);
  x4[8] = x3[8];
  x4[9] = x3[9];
  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[10], x3[13], x4[10], x4[13],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[11], x3[12], x4[11], x4[12],
                        *__rounding, cos_bit);
  x4[14] = x3[14];
  x4[15] = x3[15];
  x4[16] = _mm256_add_epi32(x3[16], x3[23]);
  x4[23] = _mm256_sub_epi32(x3[16], x3[23]);
  x4[17] = _mm256_add_epi32(x3[17], x3[22]);
  x4[22] = _mm256_sub_epi32(x3[17], x3[22]);
  x4[18] = _mm256_add_epi32(x3[18], x3[21]);
  x4[21] = _mm256_sub_epi32(x3[18], x3[21]);
  x4[19] = _mm256_add_epi32(x3[19], x3[20]);
  x4[20] = _mm256_sub_epi32(x3[19], x3[20]);
  x4[24] = _mm256_sub_epi32(x3[31], x3[24]);
  x4[31] = _mm256_add_epi32(x3[31], x3[24]);
  x4[25] = _mm256_sub_epi32(x3[30], x3[25]);
  x4[30] = _mm256_add_epi32(x3[30], x3[25]);
  x4[26] = _mm256_sub_epi32(x3[29], x3[26]);
  x4[29] = _mm256_add_epi32(x3[29], x3[26]);
  x4[27] = _mm256_sub_epi32(x3[28], x3[27]);
  x4[28] = _mm256_add_epi32(x3[28], x3[27]);
  x4[32] = x3[32];
  x4[33] = x3[33];
  x4[34] = x3[34];
  x4[35] = x3[35];
  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[36], x3[59], x4[36], x4[59],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[37], x3[58], x4[37], x4[58],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[38], x3[57], x4[38], x4[57],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[39], x3[56], x4[39], x4[56],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[40], x3[55], x4[40], x4[55],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[41], x3[54], x4[41], x4[54],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[42], x3[53], x4[42], x4[53],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[43], x3[52], x4[43], x4[52],
                        *__rounding, cos_bit);
  x4[44] = x3[44];
  x4[45] = x3[45];
  x4[46] = x3[46];
  x4[47] = x3[47];
  x4[48] = x3[48];
  x4[49] = x3[49];
  x4[50] = x3[50];
  x4[51] = x3[51];
  x4[60] = x3[60];
  x4[61] = x3[61];
  x4[62] = x3[62];
  x4[63] = x3[63];
}
// fdct64 stage 5: 4-point butterflies on 0..7, rotations on 5/6 and
// 18..21/26..29, add/sub butterflies across 32..63.
static inline void fdct64_stage5_avx2(__m256i *x4, __m256i *x5,
                                      __m256i *cospi_m32, __m256i *cospi_p32,
                                      __m256i *cospi_m16, __m256i *cospi_p48,
                                      __m256i *cospi_m48,
                                      const __m256i *__rounding,
                                      int8_t cos_bit) {
  x5[0] = _mm256_add_epi32(x4[0], x4[3]);
  x5[3] = _mm256_sub_epi32(x4[0], x4[3]);
  x5[1] = _mm256_add_epi32(x4[1], x4[2]);
  x5[2] = _mm256_sub_epi32(x4[1], x4[2]);
  x5[4] = x4[4];
  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x4[5], x4[6], x5[5], x5[6],
                        *__rounding, cos_bit);
  x5[7] = x4[7];
  x5[8] = _mm256_add_epi32(x4[8], x4[11]);
  x5[11] = _mm256_sub_epi32(x4[8], x4[11]);
  x5[9] = _mm256_add_epi32(x4[9], x4[10]);
  x5[10] = _mm256_sub_epi32(x4[9], x4[10]);
  x5[12] = _mm256_sub_epi32(x4[15], x4[12]);
  x5[15] = _mm256_add_epi32(x4[15], x4[12]);
  x5[13] = _mm256_sub_epi32(x4[14], x4[13]);
  x5[14] = _mm256_add_epi32(x4[14], x4[13]);
  x5[16] = x4[16];
  x5[17] = x4[17];
  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[18], x4[29], x5[18], x5[29],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[19], x4[28], x5[19], x5[28],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[20], x4[27], x5[20], x5[27],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[21], x4[26], x5[21], x5[26],
                        *__rounding, cos_bit);
  x5[22] = x4[22];
  x5[23] = x4[23];
  x5[24] = x4[24];
  x5[25] = x4[25];
  x5[30] = x4[30];
  x5[31] = x4[31];
  x5[32] = _mm256_add_epi32(x4[32], x4[39]);
  x5[39] = _mm256_sub_epi32(x4[32], x4[39]);
  x5[33] = _mm256_add_epi32(x4[33], x4[38]);
  x5[38] = _mm256_sub_epi32(x4[33], x4[38]);
  x5[34] = _mm256_add_epi32(x4[34], x4[37]);
  x5[37] = _mm256_sub_epi32(x4[34], x4[37]);
  x5[35] = _mm256_add_epi32(x4[35], x4[36]);
  x5[36] = _mm256_sub_epi32(x4[35], x4[36]);
  x5[40] = _mm256_sub_epi32(x4[47], x4[40]);
  x5[47] = _mm256_add_epi32(x4[47], x4[40]);
  x5[41] = _mm256_sub_epi32(x4[46], x4[41]);
  x5[46] = _mm256_add_epi32(x4[46], x4[41]);
  x5[42] = _mm256_sub_epi32(x4[45], x4[42]);
  x5[45] = _mm256_add_epi32(x4[45], x4[42]);
  x5[43] = _mm256_sub_epi32(x4[44], x4[43]);
  x5[44] = _mm256_add_epi32(x4[44], x4[43]);
  x5[48] = _mm256_add_epi32(x4[48], x4[55]);
  x5[55] = _mm256_sub_epi32(x4[48], x4[55]);
  x5[49] = _mm256_add_epi32(x4[49], x4[54]);
  x5[54] = _mm256_sub_epi32(x4[49], x4[54]);
  x5[50] = _mm256_add_epi32(x4[50], x4[53]);
  x5[53] = _mm256_sub_epi32(x4[50], x4[53]);
  x5[51] = _mm256_add_epi32(x4[51], x4[52]);
  x5[52] = _mm256_sub_epi32(x4[51], x4[52]);
  x5[56] = _mm256_sub_epi32(x4[63], x4[56]);
  x5[63] = _mm256_add_epi32(x4[63], x4[56]);
  x5[57] = _mm256_sub_epi32(x4[62], x4[57]);
  x5[62] = _mm256_add_epi32(x4[62], x4[57]);
  x5[58] = _mm256_sub_epi32(x4[61], x4[58]);
  x5[61] = _mm256_add_epi32(x4[61], x4[58]);
  x5[59] = _mm256_sub_epi32(x4[60], x4[59]);
  x5[60] = _mm256_add_epi32(x4[60], x4[59]);
}
// fdct64 stage 6: final rotations for outputs 0..3, butterflies on the
// 4..15 group, add/sub butterflies on 16..31, and the pi/16-family
// rotations in the odd half.
static inline void fdct64_stage6_avx2(
    __m256i *x5, __m256i *x6, __m256i *cospi_p16, __m256i *cospi_p32,
    __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48,
    __m256i *cospi_m08, __m256i *cospi_p56, __m256i *cospi_m56,
    __m256i *cospi_m40, __m256i *cospi_p24, __m256i *cospi_m24,
    const __m256i *__rounding, int8_t cos_bit) {
  btf_32_type0_avx2_new(*cospi_p32, *cospi_p32, x5[0], x5[1], x6[0], x6[1],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_p16, *cospi_p48, x5[3], x5[2], x6[2], x6[3],
                        *__rounding, cos_bit);
  x6[4] = _mm256_add_epi32(x5[4], x5[5]);
  x6[5] = _mm256_sub_epi32(x5[4], x5[5]);
  x6[6] = _mm256_sub_epi32(x5[7], x5[6]);
  x6[7] = _mm256_add_epi32(x5[7], x5[6]);
  x6[8] = x5[8];
  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x5[9], x5[14], x6[9], x6[14],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x5[10], x5[13], x6[10], x6[13],
                        *__rounding, cos_bit);
  x6[11] = x5[11];
  x6[12] = x5[12];
  x6[15] = x5[15];
  x6[16] = _mm256_add_epi32(x5[16], x5[19]);
  x6[19] = _mm256_sub_epi32(x5[16], x5[19]);
  x6[17] = _mm256_add_epi32(x5[17], x5[18]);
  x6[18] = _mm256_sub_epi32(x5[17], x5[18]);
  x6[20] = _mm256_sub_epi32(x5[23], x5[20]);
  x6[23] = _mm256_add_epi32(x5[23], x5[20]);
  x6[21] = _mm256_sub_epi32(x5[22], x5[21]);
  x6[22] = _mm256_add_epi32(x5[22], x5[21]);
  x6[24] = _mm256_add_epi32(x5[24], x5[27]);
  x6[27] = _mm256_sub_epi32(x5[24], x5[27]);
  x6[25] = _mm256_add_epi32(x5[25], x5[26]);
  x6[26] = _mm256_sub_epi32(x5[25], x5[26]);
  x6[28] = _mm256_sub_epi32(x5[31], x5[28]);
  x6[31] = _mm256_add_epi32(x5[31], x5[28]);
  x6[29] = _mm256_sub_epi32(x5[30], x5[29]);
  x6[30] = _mm256_add_epi32(x5[30], x5[29]);
  x6[32] = x5[32];
  x6[33] = x5[33];
  btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[34], x5[61], x6[34], x6[61],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[35], x5[60], x6[35], x6[60],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[36], x5[59], x6[36], x6[59],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[37], x5[58], x6[37], x6[58],
                        *__rounding, cos_bit);
  x6[38] = x5[38];
  x6[39] = x5[39];
  x6[40] = x5[40];
  x6[41] = x5[41];
  btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[42], x5[53], x6[42], x6[53],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[43], x5[52], x6[43], x6[52],
                        *__rounding, cos_bit);
btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[44], x5[51], x6[44], x6[51], 2413 *__rounding, cos_bit); 2414 btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[45], x5[50], x6[45], x6[50], 2415 *__rounding, cos_bit); 2416 x6[46] = x5[46]; 2417 x6[47] = x5[47]; 2418 x6[48] = x5[48]; 2419 x6[49] = x5[49]; 2420 x6[54] = x5[54]; 2421 x6[55] = x5[55]; 2422 x6[56] = x5[56]; 2423 x6[57] = x5[57]; 2424 x6[62] = x5[62]; 2425 x6[63] = x5[63]; 2426 } 2427 static inline void fdct64_stage7_avx2(__m256i *x6, __m256i *x7, 2428 __m256i *cospi_p08, __m256i *cospi_p56, 2429 __m256i *cospi_p40, __m256i *cospi_p24, 2430 __m256i *cospi_m08, __m256i *cospi_m56, 2431 __m256i *cospi_m40, __m256i *cospi_m24, 2432 const __m256i *__rounding, 2433 int8_t cos_bit) { 2434 x7[0] = x6[0]; 2435 x7[1] = x6[1]; 2436 x7[2] = x6[2]; 2437 x7[3] = x6[3]; 2438 btf_32_type0_avx2_new(*cospi_p08, *cospi_p56, x6[7], x6[4], x7[4], x7[7], 2439 *__rounding, cos_bit); 2440 btf_32_type0_avx2_new(*cospi_p40, *cospi_p24, x6[6], x6[5], x7[5], x7[6], 2441 *__rounding, cos_bit); 2442 x7[8] = _mm256_add_epi32(x6[8], x6[9]); 2443 x7[9] = _mm256_sub_epi32(x6[8], x6[9]); 2444 x7[10] = _mm256_sub_epi32(x6[11], x6[10]); 2445 x7[11] = _mm256_add_epi32(x6[11], x6[10]); 2446 x7[12] = _mm256_add_epi32(x6[12], x6[13]); 2447 x7[13] = _mm256_sub_epi32(x6[12], x6[13]); 2448 x7[14] = _mm256_sub_epi32(x6[15], x6[14]); 2449 x7[15] = _mm256_add_epi32(x6[15], x6[14]); 2450 x7[16] = x6[16]; 2451 btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x6[17], x6[30], x7[17], x7[30], 2452 *__rounding, cos_bit); 2453 btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x6[18], x6[29], x7[18], x7[29], 2454 *__rounding, cos_bit); 2455 x7[19] = x6[19]; 2456 x7[20] = x6[20]; 2457 btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x6[21], x6[26], x7[21], x7[26], 2458 *__rounding, cos_bit); 2459 btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x6[22], x6[25], x7[22], x7[25], 2460 *__rounding, cos_bit); 2461 x7[23] = x6[23]; 2462 x7[24] = x6[24]; 2463 x7[27] = x6[27]; 
2464 x7[28] = x6[28]; 2465 x7[31] = x6[31]; 2466 x7[32] = _mm256_add_epi32(x6[32], x6[35]); 2467 x7[35] = _mm256_sub_epi32(x6[32], x6[35]); 2468 x7[33] = _mm256_add_epi32(x6[33], x6[34]); 2469 x7[34] = _mm256_sub_epi32(x6[33], x6[34]); 2470 x7[36] = _mm256_sub_epi32(x6[39], x6[36]); 2471 x7[39] = _mm256_add_epi32(x6[39], x6[36]); 2472 x7[37] = _mm256_sub_epi32(x6[38], x6[37]); 2473 x7[38] = _mm256_add_epi32(x6[38], x6[37]); 2474 x7[40] = _mm256_add_epi32(x6[40], x6[43]); 2475 x7[43] = _mm256_sub_epi32(x6[40], x6[43]); 2476 x7[41] = _mm256_add_epi32(x6[41], x6[42]); 2477 x7[42] = _mm256_sub_epi32(x6[41], x6[42]); 2478 x7[44] = _mm256_sub_epi32(x6[47], x6[44]); 2479 x7[47] = _mm256_add_epi32(x6[47], x6[44]); 2480 x7[45] = _mm256_sub_epi32(x6[46], x6[45]); 2481 x7[46] = _mm256_add_epi32(x6[46], x6[45]); 2482 x7[48] = _mm256_add_epi32(x6[48], x6[51]); 2483 x7[51] = _mm256_sub_epi32(x6[48], x6[51]); 2484 x7[49] = _mm256_add_epi32(x6[49], x6[50]); 2485 x7[50] = _mm256_sub_epi32(x6[49], x6[50]); 2486 x7[52] = _mm256_sub_epi32(x6[55], x6[52]); 2487 x7[55] = _mm256_add_epi32(x6[55], x6[52]); 2488 x7[53] = _mm256_sub_epi32(x6[54], x6[53]); 2489 x7[54] = _mm256_add_epi32(x6[54], x6[53]); 2490 x7[56] = _mm256_add_epi32(x6[56], x6[59]); 2491 x7[59] = _mm256_sub_epi32(x6[56], x6[59]); 2492 x7[57] = _mm256_add_epi32(x6[57], x6[58]); 2493 x7[58] = _mm256_sub_epi32(x6[57], x6[58]); 2494 x7[60] = _mm256_sub_epi32(x6[63], x6[60]); 2495 x7[63] = _mm256_add_epi32(x6[63], x6[60]); 2496 x7[61] = _mm256_sub_epi32(x6[62], x6[61]); 2497 x7[62] = _mm256_add_epi32(x6[62], x6[61]); 2498 } 2499 static inline void fdct64_stage8_avx2(__m256i *x7, __m256i *x8, 2500 const int32_t *cospi, 2501 const __m256i *__rounding, 2502 int8_t cos_bit) { 2503 __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]); 2504 __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]); 2505 __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]); 2506 __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]); 2507 __m256i cospi_p44 = 
// NOTE(review): completes the "__m256i cospi_p44 = ..." declaration started
// on the preceding line (interior of fdct64_stage8_avx2).
_mm256_set1_epi32(cospi[44]);
  __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
  __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
  __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
  __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
  __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
  __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
  __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
  __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
  __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
  __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
  __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);

  // Lanes 0..7 are already final at this point.
  x8[0] = x7[0];
  x8[1] = x7[1];
  x8[2] = x7[2];
  x8[3] = x7[3];
  x8[4] = x7[4];
  x8[5] = x7[5];
  x8[6] = x7[6];
  x8[7] = x7[7];

  // Lanes 8..15 become final outputs after these four rotations.
  btf_32_type0_avx2_new(cospi_p04, cospi_p60, x7[15], x7[8], x8[8], x8[15],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p36, cospi_p28, x7[14], x7[9], x8[9], x8[14],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p20, cospi_p44, x7[13], x7[10], x8[10], x8[13],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p52, cospi_p12, x7[12], x7[11], x8[11], x8[12],
                        *__rounding, cos_bit);
  // Lanes 16..31: add/sub butterflies in groups of two.
  x8[16] = _mm256_add_epi32(x7[16], x7[17]);
  x8[17] = _mm256_sub_epi32(x7[16], x7[17]);
  x8[18] = _mm256_sub_epi32(x7[19], x7[18]);
  x8[19] = _mm256_add_epi32(x7[19], x7[18]);
  x8[20] = _mm256_add_epi32(x7[20], x7[21]);
  x8[21] = _mm256_sub_epi32(x7[20], x7[21]);
  x8[22] = _mm256_sub_epi32(x7[23], x7[22]);
  x8[23] = _mm256_add_epi32(x7[23], x7[22]);
  x8[24] = _mm256_add_epi32(x7[24], x7[25]);
  x8[25] = _mm256_sub_epi32(x7[24], x7[25]);
  x8[26] = _mm256_sub_epi32(x7[27], x7[26]);
  x8[27] = _mm256_add_epi32(x7[27], x7[26]);
  x8[28] = _mm256_add_epi32(x7[28], x7[29]);
  x8[29] = _mm256_sub_epi32(x7[28], x7[29]);
  x8[30] = _mm256_sub_epi32(x7[31], x7[30]);
  x8[31] = _mm256_add_epi32(x7[31], x7[30]);
  // Lanes 32..63: pass-through except for the eight mirrored rotations below.
  x8[32] = x7[32];
  btf_32_type0_avx2_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61],
                        *__rounding, cos_bit);
  x8[35] = x7[35];
  x8[36] = x7[36];
  btf_32_type0_avx2_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57],
                        *__rounding, cos_bit);
  x8[39] = x7[39];
  x8[40] = x7[40];
  btf_32_type0_avx2_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53],
                        *__rounding, cos_bit);
  x8[43] = x7[43];
  x8[44] = x7[44];
  btf_32_type0_avx2_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49],
                        *__rounding, cos_bit);
  x8[47] = x7[47];
  x8[48] = x7[48];
  x8[51] = x7[51];
  x8[52] = x7[52];
  x8[55] = x7[55];
  x8[56] = x7[56];
  x8[59] = x7[59];
  x8[60] = x7[60];
  x8[63] = x7[63];
}
// Stage 9 of the 64-point forward DCT: finalizes lanes 16..31 with eight
// odd-index rotations and butterflies lanes 32..63 in groups of two.
static inline void fdct64_stage9_avx2(__m256i *x8, __m256i *x9,
                                      const int32_t *cospi,
                                      const __m256i *__rounding,
                                      int8_t cos_bit) {
  __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
  __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
  __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
  __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
  __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
  __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
  __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
  __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
  __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
  __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
  // NOTE(review): statement continues on the next line of the file.
  __m256i cospi_p22 =
// NOTE(review): completes the "__m256i cospi_p22 = ..." declaration started
// on the preceding line (interior of fdct64_stage9_avx2).
_mm256_set1_epi32(cospi[22]);
  __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
  __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
  __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
  __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
  __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);

  // Lanes 0..15 are already final at this point.
  x9[0] = x8[0];
  x9[1] = x8[1];
  x9[2] = x8[2];
  x9[3] = x8[3];
  x9[4] = x8[4];
  x9[5] = x8[5];
  x9[6] = x8[6];
  x9[7] = x8[7];
  x9[8] = x8[8];
  x9[9] = x8[9];
  x9[10] = x8[10];
  x9[11] = x8[11];
  x9[12] = x8[12];
  x9[13] = x8[13];
  x9[14] = x8[14];
  x9[15] = x8[15];
  // Lanes 16..31 become final outputs after these eight rotations.
  btf_32_type0_avx2_new(cospi_p02, cospi_p62, x8[31], x8[16], x9[16], x9[31],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p34, cospi_p30, x8[30], x8[17], x9[17], x9[30],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p18, cospi_p46, x8[29], x8[18], x9[18], x9[29],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p50, cospi_p14, x8[28], x8[19], x9[19], x9[28],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p10, cospi_p54, x8[27], x8[20], x9[20], x9[27],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p42, cospi_p22, x8[26], x8[21], x9[21], x9[26],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p26, cospi_p38, x8[25], x8[22], x9[22], x9[25],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p58, cospi_p06, x8[24], x8[23], x9[23], x9[24],
                        *__rounding, cos_bit);
  // Lanes 32..63: add/sub butterflies in groups of two.
  x9[32] = _mm256_add_epi32(x8[32], x8[33]);
  x9[33] = _mm256_sub_epi32(x8[32], x8[33]);
  x9[34] = _mm256_sub_epi32(x8[35], x8[34]);
  x9[35] = _mm256_add_epi32(x8[35], x8[34]);
  x9[36] = _mm256_add_epi32(x8[36], x8[37]);
  x9[37] = _mm256_sub_epi32(x8[36], x8[37]);
  x9[38] = _mm256_sub_epi32(x8[39], x8[38]);
  x9[39] = _mm256_add_epi32(x8[39], x8[38]);
  x9[40] = _mm256_add_epi32(x8[40], x8[41]);
  x9[41] = _mm256_sub_epi32(x8[40], x8[41]);
  x9[42] = _mm256_sub_epi32(x8[43], x8[42]);
  x9[43] = _mm256_add_epi32(x8[43], x8[42]);
  x9[44] = _mm256_add_epi32(x8[44], x8[45]);
  x9[45] = _mm256_sub_epi32(x8[44], x8[45]);
  x9[46] = _mm256_sub_epi32(x8[47], x8[46]);
  x9[47] = _mm256_add_epi32(x8[47], x8[46]);
  x9[48] = _mm256_add_epi32(x8[48], x8[49]);
  x9[49] = _mm256_sub_epi32(x8[48], x8[49]);
  x9[50] = _mm256_sub_epi32(x8[51], x8[50]);
  x9[51] = _mm256_add_epi32(x8[51], x8[50]);
  x9[52] = _mm256_add_epi32(x8[52], x8[53]);
  x9[53] = _mm256_sub_epi32(x8[52], x8[53]);
  x9[54] = _mm256_sub_epi32(x8[55], x8[54]);
  x9[55] = _mm256_add_epi32(x8[55], x8[54]);
  x9[56] = _mm256_add_epi32(x8[56], x8[57]);
  x9[57] = _mm256_sub_epi32(x8[56], x8[57]);
  x9[58] = _mm256_sub_epi32(x8[59], x8[58]);
  x9[59] = _mm256_add_epi32(x8[59], x8[58]);
  x9[60] = _mm256_add_epi32(x8[60], x8[61]);
  x9[61] = _mm256_sub_epi32(x8[60], x8[61]);
  x9[62] = _mm256_sub_epi32(x8[63], x8[62]);
  x9[63] = _mm256_add_epi32(x8[63], x8[62]);
}
// Stage 10 (final butterfly stage) of the 64-point forward DCT: finalizes
// lanes 32..63 with sixteen rotations using all the odd cospi constants.
static inline void fdct64_stage10_avx2(__m256i *x9, __m256i *x10,
                                       const int32_t *cospi,
                                       const __m256i *__rounding,
                                       int8_t cos_bit) {
  __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
  __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
  __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
  __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
  __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
  __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
  __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
  __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
  __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
  __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
  __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
  __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
  __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
  __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
  // NOTE(review): declaration continues on the next line of the file.
  __m256i
// NOTE(review): completes the "__m256i cospi_p07" declaration started on the
// preceding line (interior of fdct64_stage10_avx2).
cospi_p07 = _mm256_set1_epi32(cospi[7]);
  __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
  __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
  __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
  __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
  __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
  __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
  __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
  __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
  __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
  __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
  __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
  __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
  __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
  __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
  __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
  __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
  __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);

  // Lanes 0..31 are already final at this point.
  x10[0] = x9[0];
  x10[1] = x9[1];
  x10[2] = x9[2];
  x10[3] = x9[3];
  x10[4] = x9[4];
  x10[5] = x9[5];
  x10[6] = x9[6];
  x10[7] = x9[7];
  x10[8] = x9[8];
  x10[9] = x9[9];
  x10[10] = x9[10];
  x10[11] = x9[11];
  x10[12] = x9[12];
  x10[13] = x9[13];
  x10[14] = x9[14];
  x10[15] = x9[15];
  x10[16] = x9[16];
  x10[17] = x9[17];
  x10[18] = x9[18];
  x10[19] = x9[19];
  x10[20] = x9[20];
  x10[21] = x9[21];
  x10[22] = x9[22];
  x10[23] = x9[23];
  x10[24] = x9[24];
  x10[25] = x9[25];
  x10[26] = x9[26];
  x10[27] = x9[27];
  x10[28] = x9[28];
  x10[29] = x9[29];
  x10[30] = x9[30];
  x10[31] = x9[31];
  // Lanes 32..63 become final outputs after these sixteen rotations; each
  // pairs lane 32+k with its mirror 63-k.
  btf_32_type0_avx2_new(cospi_p01, cospi_p63, x9[63], x9[32], x10[32], x10[63],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p33, cospi_p31, x9[62], x9[33], x10[33], x10[62],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p17, cospi_p47, x9[61], x9[34], x10[34], x10[61],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p49, cospi_p15, x9[60], x9[35], x10[35], x10[60],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p09, cospi_p55, x9[59], x9[36], x10[36], x10[59],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p41, cospi_p23, x9[58], x9[37], x10[37], x10[58],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p25, cospi_p39, x9[57], x9[38], x10[38], x10[57],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p57, cospi_p07, x9[56], x9[39], x10[39], x10[56],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p05, cospi_p59, x9[55], x9[40], x10[40], x10[55],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p37, cospi_p27, x9[54], x9[41], x10[41], x10[54],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p21, cospi_p43, x9[53], x9[42], x10[42], x10[53],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p53, cospi_p11, x9[52], x9[43], x10[43], x10[52],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p13, cospi_p51, x9[51], x9[44], x10[44], x10[51],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p45, cospi_p19, x9[50], x9[45], x10[45], x10[50],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p29, cospi_p35, x9[49], x9[46], x10[46], x10[49],
                        *__rounding, cos_bit);
  btf_32_type0_avx2_new(cospi_p61, cospi_p03, x9[48], x9[47], x10[47], x10[48],
                        *__rounding, cos_bit);
}
// 1-D 64-point forward DCT over columns of __m256i (8 x 32-bit lanes each).
// input/output are strided arrays of vectors; instride/outstride are the
// element strides so the same routine serves both column and row passes.
// Stages 2..10 ping-pong between the two scratch arrays x1 and x2; the final
// write-out applies the DCT's bit-reversed-style output permutation.
static void fdct64_avx2(__m256i *input, __m256i *output, int8_t cos_bit,
                        const int instride, const int outstride) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
  __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
  __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
  __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
  __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
  // NOTE(review): declaration continues on the next line of the file.
  __m256i cospi_m48 =
// NOTE(review): completes the "__m256i cospi_m48 = ..." declaration started
// on the preceding line (interior of fdct64_avx2).
_mm256_set1_epi32(-cospi[48]);
  __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
  __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
  __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
  __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
  __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
  __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
  __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
  __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
  __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);

  int startidx = 0 * instride;
  int endidx = 63 * instride;
  // stage 1: 32 mirrored add/sub butterflies pairing input k with input 63-k;
  // startidx walks forward and endidx walks backward by instride.
  __m256i x1[64];
  x1[0] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[63] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[1] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[62] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[2] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[61] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[3] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[60] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[4] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[59] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[5] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[58] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[6] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[57] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[7] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[56] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[8] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[55] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[9] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[54] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[10] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[53] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[11] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[52] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[12] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[51] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[13] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[50] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[14] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[49] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[15] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[48] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[16] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[47] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[17] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[46] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[18] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[45] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[19] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[44] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[20] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[43] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[21] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[42] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[22] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[41] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[23] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[40] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[24] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[39] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[25] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[38] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[26] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[37] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[27] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[36] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[28] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[35] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[29] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[34] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[30] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[33] = _mm256_sub_epi32(input[startidx], input[endidx]);
  startidx += instride;
  endidx -= instride;
  x1[31] = _mm256_add_epi32(input[startidx], input[endidx]);
  x1[32] = _mm256_sub_epi32(input[startidx], input[endidx]);

  // Stages 2..10 alternate direction between x1 and x2 so only two scratch
  // arrays are needed.
  // stage 2
  __m256i x2[64];
  fdct64_stage2_avx2(x1, x2, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
  // stage 3
  fdct64_stage3_avx2(x2, x1, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
  // stage 4
  fdct64_stage4_avx2(x1, x2, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
                     &cospi_m48, &__rounding, cos_bit);
  // stage 5
  fdct64_stage5_avx2(x2, x1, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
                     &cospi_m48, &__rounding, cos_bit);
  // stage 6
  fdct64_stage6_avx2(x1, x2, &cospi_p16, &cospi_p32, &cospi_m16, &cospi_p48,
                     &cospi_m48, &cospi_m08, &cospi_p56, &cospi_m56, &cospi_m40,
                     &cospi_p24, &cospi_m24, &__rounding, cos_bit);
  // stage 7
  fdct64_stage7_avx2(x2, x1, &cospi_p08, &cospi_p56, &cospi_p40, &cospi_p24,
                     &cospi_m08, &cospi_m56, &cospi_m40, &cospi_m24,
                     &__rounding, cos_bit);
  // stage 8
  fdct64_stage8_avx2(x1, x2, cospi, &__rounding, cos_bit);
  // stage 9
  fdct64_stage9_avx2(x2, x1, cospi, &__rounding, cos_bit);
  // stage 10
  fdct64_stage10_avx2(x1, x2, cospi, &__rounding, cos_bit);

  startidx = 0 * outstride;
  endidx = 63 * outstride;

  // stage 11: output permutation.  Output slot k receives the x2 lane whose
  // index has k's 6 bits reversed (0,32,16,48,8,...), written from both ends
  // of the output toward the middle.
  output[startidx] = x2[0];
  output[endidx] = x2[63];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[32];
  output[endidx] = x2[31];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[16];
  output[endidx] = x2[47];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[48];
  output[endidx] = x2[15];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[8];
  output[endidx] = x2[55];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[40];
  output[endidx] = x2[23];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[24];
  output[endidx] = x2[39];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[56];
  output[endidx] = x2[7];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[4];
  output[endidx] = x2[59];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[36];
  output[endidx] = x2[27];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[20];
  output[endidx] = x2[43];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[52];
  output[endidx] = x2[11];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[12];
  output[endidx] = x2[51];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[44];
  output[endidx] = x2[19];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[28];
  output[endidx] = x2[35];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[60];
  output[endidx] = x2[3];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[2];
  output[endidx] = x2[61];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[34];
  output[endidx] = x2[29];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[18];
  output[endidx] = x2[45];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[50];
  output[endidx] = x2[13];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[10];
  output[endidx] = x2[53];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[42];
  output[endidx] = x2[21];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[26];
  output[endidx] = x2[37];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[58];
  output[endidx] = x2[5];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[6];
  output[endidx] = x2[57];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[38];
  output[endidx] = x2[25];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[22];
  output[endidx] = x2[41];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[54];
  output[endidx] = x2[9];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[14];
  output[endidx] = x2[49];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[46];
  output[endidx] = x2[17];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[30];
  output[endidx] = x2[33];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x2[62];
  output[endidx] = x2[1];
}
// 2-D 64x64 high-bitdepth forward transform entry point.  Only DCT_DCT is
// supported (asserted); column pass, 8x8 vector transpose, then row pass.
void av1_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
                               int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_64X64;
  __m256i buf0[512], buf1[512];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = fdct64_avx2;
  // NOTE(review): declaration continues on the next line of the file.
  const transform_1d_avx2
row_txfm = fdct64_avx2; 3097 const int width_div16 = (width >> 4); 3098 const int width_div8 = (width >> 3); 3099 int r, c; 3100 for (int i = 0; i < width_div16; i++) { 3101 load_buffer_16xn_avx2(input + (i << 4), &buf0[i << 1], stride, height, 3102 width_div8, 0, 0); 3103 round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[0], width_div8); 3104 round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8); 3105 col_txfm(&buf0[i << 1], &buf0[i << 1], cos_bit_col, width_div8, width_div8); 3106 col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8, 3107 width_div8); 3108 round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[1], width_div8); 3109 round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8); 3110 } 3111 3112 for (r = 0; r < height; r += 8) { 3113 for (c = 0; c < width_div8; c++) { 3114 fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c], 3115 &buf1[c * 8 * width_div8 + (r >> 3)], 3116 width_div8, width_div8); 3117 } 3118 } 3119 3120 for (int i = 0; i < 2; i++) { 3121 row_txfm(&buf1[i << 1], &buf0[i << 1], cos_bit_row, width_div8, 3122 width_div16); 3123 row_txfm(&buf1[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_row, width_div8, 3124 width_div16); 3125 round_shift_32_8xn_avx2(&buf0[i << 1], (height >> 1), shift[2], 3126 width_div16); 3127 round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], (height >> 1), shift[2], 3128 width_div16); 3129 } 3130 3131 store_buffer_avx2(buf0, output, 8, 128); 3132 }