av1_inv_txfm_avx2.c (89957B)
1 /* 2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include "config/aom_config.h" 13 14 #include "config/av1_rtcd.h" 15 16 #include "av1/common/av1_inv_txfm1d_cfg.h" 17 #include "av1/common/x86/av1_txfm_sse2.h" 18 #include "av1/common/x86/av1_inv_txfm_avx2.h" 19 #include "av1/common/x86/av1_inv_txfm_ssse3.h" 20 21 // TODO(venkatsanampudi@ittiam.com): move this to header file 22 23 // Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 24 static const int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 25 4 * 4096, 4 * 5793 }; 26 27 static inline void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi, 28 const __m256i _r, int8_t cos_bit) { 29 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); 30 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); 31 btf_16_adds_subs_avx2(&x1[0], &x1[3]); 32 btf_16_adds_subs_avx2(&x1[1], &x1[2]); 33 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); 34 35 btf_16_adds_subs_avx2(&x1[8], &x1[11]); 36 btf_16_adds_subs_avx2(&x1[9], &x1[10]); 37 btf_16_adds_subs_avx2(&x1[15], &x1[12]); 38 btf_16_adds_subs_avx2(&x1[14], &x1[13]); 39 } 40 41 static inline void idct16_stage6_avx2(__m256i *x, const int32_t *cospi, 42 const __m256i _r, int8_t cos_bit) { 43 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); 44 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); 45 btf_16_adds_subs_avx2(&x[0], &x[7]); 46 btf_16_adds_subs_avx2(&x[1], &x[6]); 47 
btf_16_adds_subs_avx2(&x[2], &x[5]); 48 btf_16_adds_subs_avx2(&x[3], &x[4]); 49 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); 50 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); 51 } 52 53 static inline void idct16_stage7_avx2(__m256i *output, __m256i *x1) { 54 btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]); 55 btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]); 56 btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]); 57 btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]); 58 btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]); 59 btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]); 60 btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]); 61 btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]); 62 } 63 64 static void idct16_avx2(const __m256i *input, __m256i *output) { 65 const int32_t *cospi = cospi_arr(INV_COS_BIT); 66 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); 67 68 __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); 69 __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); 70 __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); 71 __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); 72 __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); 73 __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); 74 __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); 75 __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); 76 __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); 77 __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); 78 __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); 79 __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); 80 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); 81 __m256i cospi_p32_m32 = 
pair_set_w16_epi16(cospi[32], -cospi[32]); 82 __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); 83 __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); 84 __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); 85 __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); 86 __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); 87 88 // stage 1 89 __m256i x1[16]; 90 x1[0] = input[0]; 91 x1[1] = input[8]; 92 x1[2] = input[4]; 93 x1[3] = input[12]; 94 x1[4] = input[2]; 95 x1[5] = input[10]; 96 x1[6] = input[6]; 97 x1[7] = input[14]; 98 x1[8] = input[1]; 99 x1[9] = input[9]; 100 x1[10] = input[5]; 101 x1[11] = input[13]; 102 x1[12] = input[3]; 103 x1[13] = input[11]; 104 x1[14] = input[7]; 105 x1[15] = input[15]; 106 107 // stage 2 108 btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, 109 INV_COS_BIT); 110 btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, 111 INV_COS_BIT); 112 btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, 113 INV_COS_BIT); 114 btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, 115 INV_COS_BIT); 116 117 // stage 3 118 btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, 119 INV_COS_BIT); 120 btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, 121 INV_COS_BIT); 122 btf_16_adds_subs_avx2(&x1[8], &x1[9]); 123 btf_16_adds_subs_avx2(&x1[11], &x1[10]); 124 btf_16_adds_subs_avx2(&x1[12], &x1[13]); 125 btf_16_adds_subs_avx2(&x1[15], &x1[14]); 126 127 // stage 4 128 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, 129 INV_COS_BIT); 130 btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, 131 INV_COS_BIT); 132 btf_16_adds_subs_avx2(&x1[4], &x1[5]); 133 btf_16_adds_subs_avx2(&x1[7], &x1[6]); 134 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, 135 INV_COS_BIT); 136 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, 137 INV_COS_BIT); 138 139 
idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT); 140 idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT); 141 idct16_stage7_avx2(output, x1); 142 } 143 144 static void idct16_low8_avx2(const __m256i *input, __m256i *output) { 145 const int32_t *cospi = cospi_arr(INV_COS_BIT); 146 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); 147 148 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); 149 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); 150 const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); 151 152 // stage 1 153 __m256i x1[16]; 154 x1[0] = input[0]; 155 x1[2] = input[4]; 156 x1[4] = input[2]; 157 x1[6] = input[6]; 158 x1[8] = input[1]; 159 x1[10] = input[5]; 160 x1[12] = input[3]; 161 x1[14] = input[7]; 162 163 // stage 2 164 btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]); 165 btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]); 166 btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]); 167 btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]); 168 169 // stage 3 170 btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]); 171 btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]); 172 btf_16_adds_subs_avx2(&x1[8], &x1[9]); 173 btf_16_adds_subs_avx2(&x1[11], &x1[10]); 174 btf_16_adds_subs_avx2(&x1[12], &x1[13]); 175 btf_16_adds_subs_avx2(&x1[15], &x1[14]); 176 177 // stage 4 178 btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); 179 btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]); 180 btf_16_adds_subs_avx2(&x1[4], &x1[5]); 181 btf_16_adds_subs_avx2(&x1[7], &x1[6]); 182 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, 183 INV_COS_BIT); 184 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, 185 INV_COS_BIT); 186 187 idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT); 188 idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT); 189 idct16_stage7_avx2(output, x1); 190 } 191 192 static void 
idct16_low1_avx2(const __m256i *input, __m256i *output) { 193 const int32_t *cospi = cospi_arr(INV_COS_BIT); 194 195 // stage 1 196 __m256i x1[2]; 197 x1[0] = input[0]; 198 199 // stage 2 200 // stage 3 201 // stage 4 202 btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); 203 204 // stage 5 205 // stage 6 206 output[0] = x1[0]; 207 output[1] = x1[1]; 208 output[2] = x1[1]; 209 output[3] = x1[0]; 210 output[4] = x1[0]; 211 output[5] = x1[1]; 212 output[6] = x1[1]; 213 output[7] = x1[0]; 214 output[8] = x1[0]; 215 output[9] = x1[1]; 216 output[10] = x1[1]; 217 output[11] = x1[0]; 218 output[12] = x1[0]; 219 output[13] = x1[1]; 220 output[14] = x1[1]; 221 output[15] = x1[0]; 222 } 223 224 static inline void iadst16_stage3_avx2(__m256i *x) { 225 btf_16_adds_subs_avx2(&x[0], &x[8]); 226 btf_16_adds_subs_avx2(&x[1], &x[9]); 227 btf_16_adds_subs_avx2(&x[2], &x[10]); 228 btf_16_adds_subs_avx2(&x[3], &x[11]); 229 btf_16_adds_subs_avx2(&x[4], &x[12]); 230 btf_16_adds_subs_avx2(&x[5], &x[13]); 231 btf_16_adds_subs_avx2(&x[6], &x[14]); 232 btf_16_adds_subs_avx2(&x[7], &x[15]); 233 } 234 235 static inline void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi, 236 const __m256i _r, int8_t cos_bit) { 237 const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); 238 const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); 239 const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); 240 const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); 241 const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); 242 const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); 243 btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit); 244 btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit); 245 btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit); 246 btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit); 247 
} 248 249 static inline void iadst16_stage5_avx2(__m256i *x) { 250 btf_16_adds_subs_avx2(&x[0], &x[4]); 251 btf_16_adds_subs_avx2(&x[1], &x[5]); 252 btf_16_adds_subs_avx2(&x[2], &x[6]); 253 btf_16_adds_subs_avx2(&x[3], &x[7]); 254 btf_16_adds_subs_avx2(&x[8], &x[12]); 255 btf_16_adds_subs_avx2(&x[9], &x[13]); 256 btf_16_adds_subs_avx2(&x[10], &x[14]); 257 btf_16_adds_subs_avx2(&x[11], &x[15]); 258 } 259 260 static inline void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi, 261 const __m256i _r, int8_t cos_bit) { 262 const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); 263 const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); 264 const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); 265 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit); 266 btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit); 267 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit); 268 btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit); 269 } 270 271 static inline void iadst16_stage7_avx2(__m256i *x) { 272 btf_16_adds_subs_avx2(&x[0], &x[2]); 273 btf_16_adds_subs_avx2(&x[1], &x[3]); 274 btf_16_adds_subs_avx2(&x[4], &x[6]); 275 btf_16_adds_subs_avx2(&x[5], &x[7]); 276 btf_16_adds_subs_avx2(&x[8], &x[10]); 277 btf_16_adds_subs_avx2(&x[9], &x[11]); 278 btf_16_adds_subs_avx2(&x[12], &x[14]); 279 btf_16_adds_subs_avx2(&x[13], &x[15]); 280 } 281 282 static inline void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi, 283 const __m256i _r, int8_t cos_bit) { 284 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); 285 const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); 286 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit); 287 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit); 288 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit); 289 
btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); 290 } 291 292 static inline void iadst16_stage9_avx2(__m256i *output, __m256i *x1) { 293 const __m256i __zero = _mm256_setzero_si256(); 294 output[0] = x1[0]; 295 output[1] = _mm256_subs_epi16(__zero, x1[8]); 296 output[2] = x1[12]; 297 output[3] = _mm256_subs_epi16(__zero, x1[4]); 298 output[4] = x1[6]; 299 output[5] = _mm256_subs_epi16(__zero, x1[14]); 300 output[6] = x1[10]; 301 output[7] = _mm256_subs_epi16(__zero, x1[2]); 302 output[8] = x1[3]; 303 output[9] = _mm256_subs_epi16(__zero, x1[11]); 304 output[10] = x1[15]; 305 output[11] = _mm256_subs_epi16(__zero, x1[7]); 306 output[12] = x1[5]; 307 output[13] = _mm256_subs_epi16(__zero, x1[13]); 308 output[14] = x1[9]; 309 output[15] = _mm256_subs_epi16(__zero, x1[1]); 310 } 311 312 static void iadst16_avx2(const __m256i *input, __m256i *output) { 313 const int32_t *cospi = cospi_arr(INV_COS_BIT); 314 315 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); 316 317 __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); 318 __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); 319 __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); 320 __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); 321 __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); 322 __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); 323 __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); 324 __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); 325 __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); 326 __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); 327 __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); 328 __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); 329 __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); 330 __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); 
331 __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); 332 __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); 333 334 // stage 1 335 __m256i x1[16]; 336 x1[0] = input[15]; 337 x1[1] = input[0]; 338 x1[2] = input[13]; 339 x1[3] = input[2]; 340 x1[4] = input[11]; 341 x1[5] = input[4]; 342 x1[6] = input[9]; 343 x1[7] = input[6]; 344 x1[8] = input[7]; 345 x1[9] = input[8]; 346 x1[10] = input[5]; 347 x1[11] = input[10]; 348 x1[12] = input[3]; 349 x1[13] = input[12]; 350 x1[14] = input[1]; 351 x1[15] = input[14]; 352 353 // stage 2 354 btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, 355 INV_COS_BIT); 356 btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, 357 INV_COS_BIT); 358 btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, 359 INV_COS_BIT); 360 btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, 361 INV_COS_BIT); 362 btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, 363 INV_COS_BIT); 364 btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, 365 INV_COS_BIT); 366 btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, 367 INV_COS_BIT); 368 btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, 369 INV_COS_BIT); 370 371 iadst16_stage3_avx2(x1); 372 iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT); 373 iadst16_stage5_avx2(x1); 374 iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT); 375 iadst16_stage7_avx2(x1); 376 iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT); 377 iadst16_stage9_avx2(output, x1); 378 } 379 380 static void iadst16_low8_avx2(const __m256i *input, __m256i *output) { 381 const int32_t *cospi = cospi_arr(INV_COS_BIT); 382 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); 383 384 // stage 1 385 __m256i x1[16]; 386 x1[1] = input[0]; 387 x1[3] = input[2]; 388 x1[5] = input[4]; 389 x1[7] = input[6]; 390 x1[8] = input[7]; 391 x1[10] = input[5]; 392 x1[12] = input[3]; 393 x1[14] = input[1]; 394 395 // stage 2 
396 btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); 397 btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]); 398 btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]); 399 btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]); 400 btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]); 401 btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]); 402 btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]); 403 btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]); 404 405 iadst16_stage3_avx2(x1); 406 iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT); 407 iadst16_stage5_avx2(x1); 408 iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT); 409 iadst16_stage7_avx2(x1); 410 iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT); 411 iadst16_stage9_avx2(output, x1); 412 } 413 414 static void iadst16_low1_avx2(const __m256i *input, __m256i *output) { 415 const int32_t *cospi = cospi_arr(INV_COS_BIT); 416 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); 417 418 const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); 419 const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); 420 const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); 421 const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); 422 423 // stage 1 424 __m256i x1[16]; 425 x1[1] = input[0]; 426 427 // stage 2 428 btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); 429 430 // stage 3 431 x1[8] = x1[0]; 432 x1[9] = x1[1]; 433 434 // stage 4 435 btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, 436 INV_COS_BIT); 437 438 // stage 5 439 x1[4] = x1[0]; 440 x1[5] = x1[1]; 441 442 x1[12] = x1[8]; 443 x1[13] = x1[9]; 444 445 // stage 6 446 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, 447 INV_COS_BIT); 448 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, 449 INV_COS_BIT); 450 451 // stage 7 452 x1[2] = x1[0]; 453 x1[3] = 
x1[1]; 454 x1[6] = x1[4]; 455 x1[7] = x1[5]; 456 x1[10] = x1[8]; 457 x1[11] = x1[9]; 458 x1[14] = x1[12]; 459 x1[15] = x1[13]; 460 461 iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT); 462 iadst16_stage9_avx2(output, x1); 463 } 464 465 static inline void idct32_high16_stage3_avx2(__m256i *x) { 466 btf_16_adds_subs_avx2(&x[16], &x[17]); 467 btf_16_adds_subs_avx2(&x[19], &x[18]); 468 btf_16_adds_subs_avx2(&x[20], &x[21]); 469 btf_16_adds_subs_avx2(&x[23], &x[22]); 470 btf_16_adds_subs_avx2(&x[24], &x[25]); 471 btf_16_adds_subs_avx2(&x[27], &x[26]); 472 btf_16_adds_subs_avx2(&x[28], &x[29]); 473 btf_16_adds_subs_avx2(&x[31], &x[30]); 474 } 475 476 static inline void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi, 477 const __m256i _r, int8_t cos_bit) { 478 const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); 479 const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); 480 const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); 481 const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); 482 const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); 483 const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); 484 btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); 485 btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); 486 btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); 487 btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); 488 } 489 490 static inline void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi, 491 const __m256i _r, int8_t cos_bit) { 492 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); 493 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); 494 const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); 495 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); 496 
btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); 497 btf_16_adds_subs_avx2(&x[16], &x[19]); 498 btf_16_adds_subs_avx2(&x[17], &x[18]); 499 btf_16_adds_subs_avx2(&x[23], &x[20]); 500 btf_16_adds_subs_avx2(&x[22], &x[21]); 501 btf_16_adds_subs_avx2(&x[24], &x[27]); 502 btf_16_adds_subs_avx2(&x[25], &x[26]); 503 btf_16_adds_subs_avx2(&x[31], &x[28]); 504 btf_16_adds_subs_avx2(&x[30], &x[29]); 505 } 506 507 static inline void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi, 508 const __m256i _r, int8_t cos_bit) { 509 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); 510 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); 511 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); 512 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); 513 const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); 514 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); 515 btf_16_adds_subs_avx2(&x[8], &x[11]); 516 btf_16_adds_subs_avx2(&x[9], &x[10]); 517 btf_16_adds_subs_avx2(&x[15], &x[12]); 518 btf_16_adds_subs_avx2(&x[14], &x[13]); 519 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); 520 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); 521 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); 522 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); 523 } 524 525 static inline void idct32_stage7_avx2(__m256i *x, const int32_t *cospi, 526 const __m256i _r, int8_t cos_bit) { 527 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); 528 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); 529 btf_16_adds_subs_avx2(&x[0], &x[7]); 530 btf_16_adds_subs_avx2(&x[1], &x[6]); 531 btf_16_adds_subs_avx2(&x[2], &x[5]); 532 btf_16_adds_subs_avx2(&x[3], &x[4]); 533 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, 
&x[10], &x[13], _r, cos_bit); 534 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); 535 btf_16_adds_subs_avx2(&x[16], &x[23]); 536 btf_16_adds_subs_avx2(&x[17], &x[22]); 537 btf_16_adds_subs_avx2(&x[18], &x[21]); 538 btf_16_adds_subs_avx2(&x[19], &x[20]); 539 btf_16_adds_subs_avx2(&x[31], &x[24]); 540 btf_16_adds_subs_avx2(&x[30], &x[25]); 541 btf_16_adds_subs_avx2(&x[29], &x[26]); 542 btf_16_adds_subs_avx2(&x[28], &x[27]); 543 } 544 545 static inline void idct32_stage8_avx2(__m256i *x, const int32_t *cospi, 546 const __m256i _r, int8_t cos_bit) { 547 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); 548 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); 549 btf_16_adds_subs_avx2(&x[0], &x[15]); 550 btf_16_adds_subs_avx2(&x[1], &x[14]); 551 btf_16_adds_subs_avx2(&x[2], &x[13]); 552 btf_16_adds_subs_avx2(&x[3], &x[12]); 553 btf_16_adds_subs_avx2(&x[4], &x[11]); 554 btf_16_adds_subs_avx2(&x[5], &x[10]); 555 btf_16_adds_subs_avx2(&x[6], &x[9]); 556 btf_16_adds_subs_avx2(&x[7], &x[8]); 557 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); 558 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); 559 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); 560 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); 561 } 562 563 static inline void idct32_stage9_avx2(__m256i *output, __m256i *x) { 564 btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]); 565 btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]); 566 btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]); 567 btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]); 568 btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]); 569 btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]); 570 btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]); 571 btf_16_adds_subs_out_avx2(&output[7], 
&output[24], x[7], x[24]); 572 btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]); 573 btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]); 574 btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]); 575 btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]); 576 btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]); 577 btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]); 578 btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]); 579 btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]); 580 } 581 582 static void idct32_low1_avx2(const __m256i *input, __m256i *output) { 583 const int32_t *cospi = cospi_arr(INV_COS_BIT); 584 585 // stage 1 586 __m256i x[2]; 587 x[0] = input[0]; 588 589 // stage 2 590 // stage 3 591 // stage 4 592 // stage 5 593 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); 594 595 // stage 6 596 // stage 7 597 // stage 8 598 // stage 9 599 output[0] = x[0]; 600 output[31] = x[0]; 601 output[1] = x[1]; 602 output[30] = x[1]; 603 output[2] = x[1]; 604 output[29] = x[1]; 605 output[3] = x[0]; 606 output[28] = x[0]; 607 output[4] = x[0]; 608 output[27] = x[0]; 609 output[5] = x[1]; 610 output[26] = x[1]; 611 output[6] = x[1]; 612 output[25] = x[1]; 613 output[7] = x[0]; 614 output[24] = x[0]; 615 output[8] = x[0]; 616 output[23] = x[0]; 617 output[9] = x[1]; 618 output[22] = x[1]; 619 output[10] = x[1]; 620 output[21] = x[1]; 621 output[11] = x[0]; 622 output[20] = x[0]; 623 output[12] = x[0]; 624 output[19] = x[0]; 625 output[13] = x[1]; 626 output[18] = x[1]; 627 output[14] = x[1]; 628 output[17] = x[1]; 629 output[15] = x[0]; 630 output[16] = x[0]; 631 } 632 633 static void idct32_low8_avx2(const __m256i *input, __m256i *output) { 634 const int32_t *cospi = cospi_arr(INV_COS_BIT); 635 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); 636 637 // stage 1 638 __m256i x[32]; 639 x[0] = input[0]; 640 x[4] = input[4]; 641 x[8] = 
input[2]; 642 x[12] = input[6]; 643 x[16] = input[1]; 644 x[20] = input[5]; 645 x[24] = input[3]; 646 x[28] = input[7]; 647 648 // stage 2 649 btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); 650 btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); 651 btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); 652 btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); 653 654 // stage 3 655 btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); 656 btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); 657 x[17] = x[16]; 658 x[18] = x[19]; 659 x[21] = x[20]; 660 x[22] = x[23]; 661 x[25] = x[24]; 662 x[26] = x[27]; 663 x[29] = x[28]; 664 x[30] = x[31]; 665 666 // stage 4 667 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); 668 x[9] = x[8]; 669 x[10] = x[11]; 670 x[13] = x[12]; 671 x[14] = x[15]; 672 idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT); 673 674 // stage 5 675 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); 676 x[5] = x[4]; 677 x[6] = x[7]; 678 idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT); 679 // stage 6 680 x[3] = x[0]; 681 x[2] = x[1]; 682 idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT); 683 684 idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT); 685 idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT); 686 idct32_stage9_avx2(output, x); 687 } 688 689 static void idct32_low16_avx2(const __m256i *input, __m256i *output) { 690 const int32_t *cospi = cospi_arr(INV_COS_BIT); 691 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); 692 693 // stage 1 694 __m256i x[32]; 695 x[0] = input[0]; 696 x[2] = input[8]; 697 x[4] = input[4]; 698 x[6] = input[12]; 699 x[8] = input[2]; 700 x[10] = input[10]; 701 x[12] = input[6]; 702 x[14] = input[14]; 703 x[16] = input[1]; 704 x[18] = input[9]; 705 x[20] = input[5]; 706 x[22] = input[13]; 707 x[24] = input[3]; 708 x[26] = input[11]; 709 x[28] = input[7]; 710 x[30] = input[15]; 711 712 // stage 2 713 btf_16_w16_0_avx2(cospi[62], cospi[2], 
x[16], x[16], x[31]); 714 btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]); 715 btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]); 716 btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); 717 btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); 718 btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); 719 btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); 720 btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); 721 722 // stage 3 723 btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); 724 btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); 725 btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); 726 btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); 727 idct32_high16_stage3_avx2(x); 728 729 // stage 4 730 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); 731 btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); 732 btf_16_adds_subs_avx2(&x[8], &x[9]); 733 btf_16_adds_subs_avx2(&x[11], &x[10]); 734 btf_16_adds_subs_avx2(&x[12], &x[13]); 735 btf_16_adds_subs_avx2(&x[15], &x[14]); 736 idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT); 737 738 // stage 5 739 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); 740 btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); 741 btf_16_adds_subs_avx2(&x[4], &x[5]); 742 btf_16_adds_subs_avx2(&x[7], &x[6]); 743 idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT); 744 745 btf_16_adds_subs_avx2(&x[0], &x[3]); 746 btf_16_adds_subs_avx2(&x[1], &x[2]); 747 idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT); 748 749 idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT); 750 idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT); 751 idct32_stage9_avx2(output, x); 752 } 753 754 static void idct32_avx2(const __m256i *input, __m256i *output) { 755 const int32_t *cospi = cospi_arr(INV_COS_BIT); 756 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); 757 758 __m256i cospi_p62_m02 = 
pair_set_w16_epi16(cospi[62], -cospi[2]); 759 __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); 760 __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); 761 __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); 762 __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); 763 __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); 764 __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); 765 __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); 766 __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); 767 __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); 768 __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); 769 __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); 770 __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); 771 __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); 772 __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); 773 __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); 774 __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); 775 __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); 776 __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); 777 __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); 778 __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); 779 __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); 780 __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); 781 __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); 782 __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); 783 __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); 784 __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); 785 __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); 786 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], 
cospi[32]); 787 __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); 788 __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); 789 __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); 790 791 // stage 1 792 __m256i x1[32]; 793 x1[0] = input[0]; 794 x1[1] = input[16]; 795 x1[2] = input[8]; 796 x1[3] = input[24]; 797 x1[4] = input[4]; 798 x1[5] = input[20]; 799 x1[6] = input[12]; 800 x1[7] = input[28]; 801 x1[8] = input[2]; 802 x1[9] = input[18]; 803 x1[10] = input[10]; 804 x1[11] = input[26]; 805 x1[12] = input[6]; 806 x1[13] = input[22]; 807 x1[14] = input[14]; 808 x1[15] = input[30]; 809 x1[16] = input[1]; 810 x1[17] = input[17]; 811 x1[18] = input[9]; 812 x1[19] = input[25]; 813 x1[20] = input[5]; 814 x1[21] = input[21]; 815 x1[22] = input[13]; 816 x1[23] = input[29]; 817 x1[24] = input[3]; 818 x1[25] = input[19]; 819 x1[26] = input[11]; 820 x1[27] = input[27]; 821 x1[28] = input[7]; 822 x1[29] = input[23]; 823 x1[30] = input[15]; 824 x1[31] = input[31]; 825 826 // stage 2 827 btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, 828 INV_COS_BIT); 829 btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, 830 INV_COS_BIT); 831 btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, 832 INV_COS_BIT); 833 btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, 834 INV_COS_BIT); 835 btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, 836 INV_COS_BIT); 837 btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, 838 INV_COS_BIT); 839 btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, 840 INV_COS_BIT); 841 btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, 842 INV_COS_BIT); 843 844 // stage 3 845 btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, 846 INV_COS_BIT); 847 btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, 848 INV_COS_BIT); 849 btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, 
&x1[10], &x1[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
                  INV_COS_BIT);
  idct32_high16_stage3_avx2(x1);

  // stage 4
  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
                  INV_COS_BIT);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
  idct32_high16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);

  // stage 5
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
                  INV_COS_BIT);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  idct32_high24_stage5_avx2(x1, cospi, _r, INV_COS_BIT);

  // stage 6
  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  idct32_high28_stage6_avx2(x1, cospi, _r, INV_COS_BIT);

  // stages 7-9: remaining butterflies, then write the 32 output rows.
  idct32_stage7_avx2(x1, cospi, _r, INV_COS_BIT);
  idct32_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
  idct32_stage9_avx2(output, x1);
}

// Stage 4 of the 64-point inverse DCT applied to the high half of the
// coefficient array (x[32]..x[63]): each btf_16_w16_avx2 call performs one
// rotation butterfly on a (low, high) index pair using paired cosine weights.
static inline void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;  // guards against unused-parameter warnings if the btf
                  // macro expansion drops cos_bit
  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
}

// Stage 5 of the 64-point inverse DCT for the upper 48 coefficients:
// rotation butterflies on x[16..31] pairs, then add/sub butterflies on
// x[32..63].
static inline void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;  // guards against unused-parameter warnings if the btf
                  // macro expansion drops cos_bit
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[32], &x[35]);
  btf_16_adds_subs_avx2(&x[33], &x[34]);
btf_16_adds_subs_avx2(&x[39], &x[36]);
  btf_16_adds_subs_avx2(&x[38], &x[37]);
  btf_16_adds_subs_avx2(&x[40], &x[43]);
  btf_16_adds_subs_avx2(&x[41], &x[42]);
  btf_16_adds_subs_avx2(&x[47], &x[44]);
  btf_16_adds_subs_avx2(&x[46], &x[45]);
  btf_16_adds_subs_avx2(&x[48], &x[51]);
  btf_16_adds_subs_avx2(&x[49], &x[50]);
  btf_16_adds_subs_avx2(&x[55], &x[52]);
  btf_16_adds_subs_avx2(&x[54], &x[53]);
  btf_16_adds_subs_avx2(&x[56], &x[59]);
  btf_16_adds_subs_avx2(&x[57], &x[58]);
  btf_16_adds_subs_avx2(&x[63], &x[60]);
  btf_16_adds_subs_avx2(&x[62], &x[61]);
}

// Stage 6 rotation butterflies for the high 32 coefficients (x[32..63]) of
// the 64-point inverse DCT; each weight pair is applied to two adjacent
// index pairs.
static inline void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;  // guards against unused-parameter warnings if the btf
                  // macro expansion drops cos_bit
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit);
}

// Stage 6 for the upper 48 coefficients: add/sub butterflies on x[16..31],
// then the high-32 rotations above.
static inline void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  btf_16_adds_subs_avx2(&x[16], &x[19]);
  btf_16_adds_subs_avx2(&x[17], &x[18]);
  btf_16_adds_subs_avx2(&x[23], &x[20]);
  btf_16_adds_subs_avx2(&x[22], &x[21]);
  btf_16_adds_subs_avx2(&x[24], &x[27]);
  btf_16_adds_subs_avx2(&x[25], &x[26]);
  btf_16_adds_subs_avx2(&x[31], &x[28]);
  btf_16_adds_subs_avx2(&x[30], &x[29]);
  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
}

// Stage 7 for the upper 48 coefficients: rotations on x[18..21]/x[26..29]
// pairs, then add/sub butterflies across the whole x[32..63] half.
static inline void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;  // guards against unused-parameter warnings if the btf
                  // macro expansion drops cos_bit
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[32], &x[39]);
  btf_16_adds_subs_avx2(&x[33], &x[38]);
  btf_16_adds_subs_avx2(&x[34], &x[37]);
  btf_16_adds_subs_avx2(&x[35], &x[36]);
  btf_16_adds_subs_avx2(&x[47], &x[40]);
  btf_16_adds_subs_avx2(&x[46], &x[41]);
  btf_16_adds_subs_avx2(&x[45], &x[42]);
  btf_16_adds_subs_avx2(&x[44], &x[43]);
  btf_16_adds_subs_avx2(&x[48], &x[55]);
  btf_16_adds_subs_avx2(&x[49], &x[54]);
  btf_16_adds_subs_avx2(&x[50], &x[53]);
  btf_16_adds_subs_avx2(&x[51], &x[52]);
  btf_16_adds_subs_avx2(&x[63], &x[56]);
  btf_16_adds_subs_avx2(&x[62], &x[57]);
  btf_16_adds_subs_avx2(&x[61], &x[58]);
  btf_16_adds_subs_avx2(&x[60], &x[59]);
}

// Stage 8 for the upper 48 coefficients: add/sub butterflies on x[16..31],
// then rotations on eight pairs in x[36..59].
static inline void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;  // guards against unused-parameter warnings if the btf
                  // macro expansion drops cos_bit
  const __m256i cospi_m16_p48 =
pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_adds_subs_avx2(&x[16], &x[23]);
  btf_16_adds_subs_avx2(&x[17], &x[22]);
  btf_16_adds_subs_avx2(&x[18], &x[21]);
  btf_16_adds_subs_avx2(&x[19], &x[20]);
  btf_16_adds_subs_avx2(&x[31], &x[24]);
  btf_16_adds_subs_avx2(&x[30], &x[25]);
  btf_16_adds_subs_avx2(&x[29], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[27]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit);
}

// Stage 9 of the 64-point inverse DCT: add/sub butterflies on x[0..15],
// +-cospi[32] rotations on x[20..27] pairs, and add/sub butterflies across
// the whole high half x[32..63].
static inline void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;  // guards against unused-parameter warnings if the btf
                  // macro expansion drops cos_bit
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[15]);
  btf_16_adds_subs_avx2(&x[1], &x[14]);
  btf_16_adds_subs_avx2(&x[2], &x[13]);
  btf_16_adds_subs_avx2(&x[3], &x[12]);
  btf_16_adds_subs_avx2(&x[4], &x[11]);
  btf_16_adds_subs_avx2(&x[5], &x[10]);
  btf_16_adds_subs_avx2(&x[6], &x[9]);
  btf_16_adds_subs_avx2(&x[7], &x[8]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[32], &x[47]);
  btf_16_adds_subs_avx2(&x[33], &x[46]);
  btf_16_adds_subs_avx2(&x[34], &x[45]);
  btf_16_adds_subs_avx2(&x[35], &x[44]);
  btf_16_adds_subs_avx2(&x[36], &x[43]);
  btf_16_adds_subs_avx2(&x[37], &x[42]);
  btf_16_adds_subs_avx2(&x[38], &x[41]);
  btf_16_adds_subs_avx2(&x[39], &x[40]);
  btf_16_adds_subs_avx2(&x[63], &x[48]);
  btf_16_adds_subs_avx2(&x[62], &x[49]);
  btf_16_adds_subs_avx2(&x[61], &x[50]);
  btf_16_adds_subs_avx2(&x[60], &x[51]);
  btf_16_adds_subs_avx2(&x[59], &x[52]);
  btf_16_adds_subs_avx2(&x[58], &x[53]);
  btf_16_adds_subs_avx2(&x[57], &x[54]);
  btf_16_adds_subs_avx2(&x[56], &x[55]);
}

// Stage 10 of the 64-point inverse DCT: add/sub butterflies on x[0..31]
// pairs and +-cospi[32] rotations on x[40..55] pairs.
static inline void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;  // guards against unused-parameter warnings if the btf
                  // macro expansion drops cos_bit
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[31]);
  btf_16_adds_subs_avx2(&x[1], &x[30]);
  btf_16_adds_subs_avx2(&x[2], &x[29]);
  btf_16_adds_subs_avx2(&x[3], &x[28]);
  btf_16_adds_subs_avx2(&x[4], &x[27]);
  btf_16_adds_subs_avx2(&x[5], &x[26]);
  btf_16_adds_subs_avx2(&x[6], &x[25]);
  btf_16_adds_subs_avx2(&x[7], &x[24]);
  btf_16_adds_subs_avx2(&x[8], &x[23]);
  btf_16_adds_subs_avx2(&x[9], &x[22]);
  btf_16_adds_subs_avx2(&x[10], &x[21]);
  btf_16_adds_subs_avx2(&x[11], &x[20]);
  btf_16_adds_subs_avx2(&x[12], &x[19]);
  btf_16_adds_subs_avx2(&x[13], &x[18]);
  btf_16_adds_subs_avx2(&x[14], &x[17]);
  btf_16_adds_subs_avx2(&x[15], &x[16]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32,
cospi_p32_p32, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit);
}

// Stage 11 (final stage) of the 64-point inverse DCT: one add/sub butterfly
// per mirrored pair (k, 63-k), written directly to the output rows.
static inline void idct64_stage11_avx2(__m256i *output, __m256i *x) {
  btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]);
  btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]);
  btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]);
  btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]);
  btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]);
  btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]);
  btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]);
  btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]);
  btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]);
  btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]);
  btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]);
  btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]);
  btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]);
  btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]);
  btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]);
  btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]);
  btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]);
  btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]);
  btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]);
  btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]);
  btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]);
  btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]);
  btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]);
  btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]);
  btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]);
  btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]);
  btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]);
  btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]);
  btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]);
  btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]);
  btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]);
  btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
}

// DC-only path of the 64-point inverse DCT: only input[0] is nonzero, so
// every intermediate stage collapses and only the stage-6 rotation of the DC
// term is computed. Note both stage-6 weights are cospi[32], so x[0] and
// x[1] carry the same value and the alternating output pattern below is
// equivalent to broadcasting the scaled DC term to all 64 rows.
static void idct64_low1_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  __m256i x[32];
  x[0] = input[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);

  // stage 7
  // stage 8
  // stage 9
  // stage 10
  // stage 11
  output[0] = x[0];
  output[63] = x[0];
  output[1] = x[1];
  output[62] = x[1];
  output[2] = x[1];
  output[61] = x[1];
  output[3] = x[0];
  output[60] = x[0];
  output[4] = x[0];
  output[59] = x[0];
  output[5] = x[1];
  output[58] = x[1];
  output[6] = x[1];
  output[57] = x[1];
  output[7] = x[0];
  output[56] = x[0];
  output[8] = x[0];
  output[55] = x[0];
  output[9] = x[1];
  output[54] = x[1];
  output[10] = x[1];
  output[53] = x[1];
  output[11] = x[0];
  output[52] = x[0];
  output[12] = x[0];
  output[51] = x[0];
  output[13]
= x[1];
  output[50] = x[1];
  output[14] = x[1];
  output[49] = x[1];
  output[15] = x[0];
  output[48] = x[0];
  output[16] = x[0];
  output[47] = x[0];
  output[17] = x[1];
  output[46] = x[1];
  output[18] = x[1];
  output[45] = x[1];
  output[19] = x[0];
  output[44] = x[0];
  output[20] = x[0];
  output[43] = x[0];
  output[21] = x[1];
  output[42] = x[1];
  output[22] = x[1];
  output[41] = x[1];
  output[23] = x[0];
  output[40] = x[0];
  output[24] = x[0];
  output[39] = x[0];
  output[25] = x[1];
  output[38] = x[1];
  output[26] = x[1];
  output[37] = x[1];
  output[27] = x[0];
  output[36] = x[0];
  output[28] = x[0];
  output[35] = x[0];
  output[29] = x[1];
  output[34] = x[1];
  output[30] = x[1];
  output[33] = x[1];
  output[31] = x[0];
  output[32] = x[0];
}

// 64-point inverse DCT specialized for inputs whose nonzero coefficients are
// confined to the first 8 rows (input[0..7]); stages that would only touch
// zero coefficients are reduced to simple copies of the surviving terms.
static void idct64_low8_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1: scatter the 8 nonzero inputs into bit-reversed positions.
  __m256i x[64];
  x[0] = input[0];
  x[8] = input[4];
  x[16] = input[2];
  x[24] = input[6];
  x[32] = input[1];
  x[40] = input[5];
  x[48] = input[3];
  x[56] = input[7];

  // stage 2
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[38] = x[39];
  x[41] = x[40];
  x[46] = x[47];
  x[49] = x[48];
  x[54] = x[55];
  x[57] = x[56];
  x[62] = x[63];

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  x[17] = x[16];
  x[22] = x[23];
  x[25] = x[24];
  x[30] = x[31];
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r,
                  INV_COS_BIT);

  // stage 5
  x[9] = x[8];
  x[14] = x[15];
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r,
                  INV_COS_BIT);
  x[35] = x[32];
  x[34] = x[33];
  x[36] = x[39];
  x[37] = x[38];
  x[43] = x[40];
  x[42] = x[41];
  x[44] = x[47];
  x[45] = x[46];
  x[51] = x[48];
  x[50] = x[49];
  x[52] = x[55];
  x[53] = x[54];
  x[59] = x[56];
  x[58] = x[57];
  x[60] = x[63];
  x[61] = x[62];

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
  x[19] = x[16];
  x[18] = x[17];
  x[20] = x[23];
  x[21] = x[22];
  x[27] = x[24];
  x[26] = x[25];
  x[28] = x[31];
  x[29] = x[30];
  idct64_stage6_high32_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  x[11] = x[8];
  x[10] = x[9];
  x[12] = x[15];
  x[13] = x[14];
  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 8
  x[7] = x[0];
  x[6] = x[1];
  x[5] = x[2];
  x[4] = x[3];
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
                  INV_COS_BIT);
  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stages 9-11: shared full-width tail.
  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage11_avx2(output, x);
}

// 64-point inverse DCT specialized for inputs whose nonzero coefficients are
// confined to the first 16 rows (input[0..15]).
static void idct64_low16_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1: scatter the 16 nonzero inputs into bit-reversed positions.
  __m256i x[64];
  x[0] = input[0];
x[4] = input[8];
  x[8] = input[4];
  x[12] = input[12];
  x[16] = input[2];
  x[20] = input[10];
  x[24] = input[6];
  x[28] = input[14];
  x[32] = input[1];
  x[36] = input[9];
  x[40] = input[5];
  x[44] = input[13];
  x[48] = input[3];
  x[52] = input[11];
  x[56] = input[7];
  x[60] = input[15];

  // stage 2
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[34] = x[35];
  x[37] = x[36];
  x[38] = x[39];
  x[41] = x[40];
  x[42] = x[43];
  x[45] = x[44];
  x[46] = x[47];
  x[49] = x[48];
  x[50] = x[51];
  x[53] = x[52];
  x[54] = x[55];
  x[57] = x[56];
  x[58] = x[59];
  x[61] = x[60];
  x[62] = x[63];

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];
  idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 5
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
                  INV_COS_BIT);
  idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
  btf_16_adds_subs_avx2(&x[8], &x[11]);
  btf_16_adds_subs_avx2(&x[9], &x[10]);
  btf_16_adds_subs_avx2(&x[15], &x[12]);
  btf_16_adds_subs_avx2(&x[14], &x[13]);
  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 8
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
                  INV_COS_BIT);
  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stages 9-11: shared full-width tail.
  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage11_avx2(output, x);
}

// 64-point inverse DCT for inputs whose nonzero coefficients are confined to
// the first 32 rows (input[0..31]); this is the widest variant in this file.
static void idct64_low32_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32],
cospi[32]);

  // stage 1: scatter the 32 nonzero inputs into bit-reversed positions.
  __m256i x[64];
  x[0] = input[0];
  x[2] = input[16];
  x[4] = input[8];
  x[6] = input[24];
  x[8] = input[4];
  x[10] = input[20];
  x[12] = input[12];
  x[14] = input[28];
  x[16] = input[2];
  x[18] = input[18];
  x[20] = input[10];
  x[22] = input[26];
  x[24] = input[6];
  x[26] = input[22];
  x[28] = input[14];
  x[30] = input[30];
  x[32] = input[1];
  x[34] = input[17];
  x[36] = input[9];
  x[38] = input[25];
  x[40] = input[5];
  x[42] = input[21];
  x[44] = input[13];
  x[46] = input[29];
  x[48] = input[3];
  x[50] = input[19];
  x[52] = input[11];
  x[54] = input[27];
  x[56] = input[7];
  x[58] = input[23];
  x[60] = input[15];
  x[62] = input[31];

  // stage 2
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
  btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
  btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
  btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
  btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  btf_16_adds_subs_avx2(&x[32], &x[33]);
  btf_16_adds_subs_avx2(&x[35], &x[34]);
  btf_16_adds_subs_avx2(&x[36], &x[37]);
  btf_16_adds_subs_avx2(&x[39], &x[38]);
  btf_16_adds_subs_avx2(&x[40], &x[41]);
  btf_16_adds_subs_avx2(&x[43], &x[42]);
  btf_16_adds_subs_avx2(&x[44], &x[45]);
  btf_16_adds_subs_avx2(&x[47], &x[46]);
  btf_16_adds_subs_avx2(&x[48], &x[49]);
  btf_16_adds_subs_avx2(&x[51], &x[50]);
  btf_16_adds_subs_avx2(&x[52], &x[53]);
  btf_16_adds_subs_avx2(&x[55], &x[54]);
  btf_16_adds_subs_avx2(&x[56], &x[57]);
  btf_16_adds_subs_avx2(&x[59], &x[58]);
  btf_16_adds_subs_avx2(&x[60], &x[61]);
  btf_16_adds_subs_avx2(&x[63], &x[62]);

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  btf_16_adds_subs_avx2(&x[16], &x[17]);
  btf_16_adds_subs_avx2(&x[19], &x[18]);
  btf_16_adds_subs_avx2(&x[20], &x[21]);
  btf_16_adds_subs_avx2(&x[23], &x[22]);
  btf_16_adds_subs_avx2(&x[24], &x[25]);
  btf_16_adds_subs_avx2(&x[27], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[29]);
  btf_16_adds_subs_avx2(&x[31], &x[30]);
  idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 5
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_avx2(&x[8], &x[9]);
  btf_16_adds_subs_avx2(&x[11], &x[10]);
  btf_16_adds_subs_avx2(&x[12], &x[13]);
  btf_16_adds_subs_avx2(&x[15], &x[14]);
  idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_avx2(&x[4], &x[5]);
  btf_16_adds_subs_avx2(&x[7], &x[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
                  INV_COS_BIT);
  idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 7
  btf_16_adds_subs_avx2(&x[0], &x[3]);
  btf_16_adds_subs_avx2(&x[1], &x[2]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
  btf_16_adds_subs_avx2(&x[8], &x[11]);
  btf_16_adds_subs_avx2(&x[9], &x[10]);
  btf_16_adds_subs_avx2(&x[15], &x[12]);
  btf_16_adds_subs_avx2(&x[14], &x[13]);
  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 8
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
                  INV_COS_BIT);
  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 9~11
  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage11_avx2(output, x);
}

// Signature of a 16-lane-wide 1D inverse transform kernel: reads the input
// row vectors and writes the transformed rows.
typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output);

// 1D functions process 16 pixels at one time.
// Kernel lookup table indexed by [tx size][1D transform type][eob bucket].
// NULL entries are size/type combinations this AVX2 path never selects
// (the w >= 16, h >= 16 restriction below means the two smallest TX_SIZES
// rows are entirely NULL).
static const transform_1d_avx2
    lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      {
          { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
          { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2,
          idct64_low32_avx2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };

// only process w >= 16 h >= 16
static inline void lowbd_inv_txfm2d_add_no_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m256i buf1[64 * 16];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div16 = txfm_size_col >> 4;
  // Round the nonzero region up to whole 16-lane groups.
  const int buf_size_nonzero_w = ((eobx + 16) >> 4) << 4;
  const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
  const int input_stride = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Pick the cheapest kernel variant that still covers all nonzero coeffs.
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_avx2 row_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_avx2 col_txfm =
lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; 1655 1656 assert(col_txfm != NULL); 1657 assert(row_txfm != NULL); 1658 int ud_flip, lr_flip; 1659 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 1660 const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0])); 1661 for (int i = 0; i < buf_size_nonzero_h_div16; i++) { 1662 __m256i buf0[64]; 1663 load_buffer_32bit_to_16bit_w16_avx2(input + 16 * i, input_stride, buf0, 1664 buf_size_nonzero_w); 1665 if (rect_type == 1 || rect_type == -1) { 1666 round_shift_avx2(buf0, buf0, buf_size_nonzero_w); // rect special code 1667 } 1668 row_txfm(buf0, buf0); 1669 for (int j = 0; j < txfm_size_col; ++j) { 1670 buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0); 1671 } 1672 1673 __m256i *buf1_cur = buf1 + (i << 4); 1674 if (lr_flip) { 1675 for (int j = 0; j < buf_size_w_div16; ++j) { 1676 __m256i temp[16]; 1677 flip_buf_avx2(buf0 + 16 * j, temp, 16); 1678 int offset = txfm_size_row * (buf_size_w_div16 - 1 - j); 1679 transpose_16bit_16x16_avx2(temp, buf1_cur + offset); 1680 } 1681 } else { 1682 for (int j = 0; j < buf_size_w_div16; ++j) { 1683 transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j); 1684 } 1685 } 1686 } 1687 const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1])); 1688 for (int i = 0; i < buf_size_w_div16; i++) { 1689 __m256i *buf1_cur = buf1 + i * txfm_size_row; 1690 col_txfm(buf1_cur, buf1_cur); 1691 for (int j = 0; j < txfm_size_row; ++j) { 1692 buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1); 1693 } 1694 } 1695 for (int i = 0; i < buf_size_w_div16; i++) { 1696 lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i, 1697 stride, ud_flip, txfm_size_row); 1698 } 1699 } 1700 1701 static inline void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input, 1702 int stride, int shift, int height, 1703 int txw_idx, int rect_type) { 1704 const int32_t *input_row = input; 1705 const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]); 1706 const 
__m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) + 1707 (1 << (NewSqrt2Bits - shift - 1))); 1708 const __m256i one = _mm256_set1_epi16(1); 1709 const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r); 1710 if (rect_type != 1 && rect_type != -1) { 1711 for (int i = 0; i < height; ++i) { 1712 const __m256i src = load_32bit_to_16bit_w16_avx2(input_row); 1713 input_row += stride; 1714 __m256i lo = _mm256_unpacklo_epi16(src, one); 1715 __m256i hi = _mm256_unpackhi_epi16(src, one); 1716 lo = _mm256_madd_epi16(lo, scale__r); 1717 hi = _mm256_madd_epi16(hi, scale__r); 1718 lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); 1719 hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); 1720 out[i] = _mm256_packs_epi32(lo, hi); 1721 } 1722 } else { 1723 const __m256i rect_scale = 1724 _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits)); 1725 for (int i = 0; i < height; ++i) { 1726 __m256i src = load_32bit_to_16bit_w16_avx2(input_row); 1727 src = _mm256_mulhrs_epi16(src, rect_scale); 1728 input_row += stride; 1729 __m256i lo = _mm256_unpacklo_epi16(src, one); 1730 __m256i hi = _mm256_unpackhi_epi16(src, one); 1731 lo = _mm256_madd_epi16(lo, scale__r); 1732 hi = _mm256_madd_epi16(hi, scale__r); 1733 lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); 1734 hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); 1735 out[i] = _mm256_packs_epi32(lo, hi); 1736 } 1737 } 1738 } 1739 1740 static inline void iidentity_col_16xn_avx2(uint8_t *output, int stride, 1741 __m256i *buf, int shift, int height, 1742 int txh_idx) { 1743 const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]); 1744 const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1)); 1745 const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1)); 1746 const __m256i one = _mm256_set1_epi16(1); 1747 const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r); 1748 for (int h = 0; h < height; ++h) { 1749 __m256i lo = _mm256_unpacklo_epi16(buf[h], one); 1750 __m256i hi = _mm256_unpackhi_epi16(buf[h], 
one); 1751 lo = _mm256_madd_epi16(lo, scale_coeff); 1752 hi = _mm256_madd_epi16(hi, scale_coeff); 1753 lo = _mm256_srai_epi32(lo, NewSqrt2Bits); 1754 hi = _mm256_srai_epi32(hi, NewSqrt2Bits); 1755 lo = _mm256_add_epi32(lo, shift__r); 1756 hi = _mm256_add_epi32(hi, shift__r); 1757 lo = _mm256_srai_epi32(lo, -shift); 1758 hi = _mm256_srai_epi32(hi, -shift); 1759 const __m256i x = _mm256_packs_epi32(lo, hi); 1760 write_recon_w16_avx2(x, output); 1761 output += stride; 1762 } 1763 } 1764 1765 static inline void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input, 1766 uint8_t *output, int stride, 1767 TX_SIZE tx_size, 1768 int32_t eob) { 1769 (void)eob; 1770 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; 1771 const int txw_idx = get_txw_idx(tx_size); 1772 const int txh_idx = get_txh_idx(tx_size); 1773 const int txfm_size_col = tx_size_wide[tx_size]; 1774 const int txfm_size_row = tx_size_high[tx_size]; 1775 const int col_max = AOMMIN(32, txfm_size_col); 1776 const int row_max = AOMMIN(32, txfm_size_row); 1777 const int input_stride = row_max; 1778 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); 1779 __m256i buf[32]; 1780 1781 for (int i = 0; i < (col_max >> 4); ++i) { 1782 for (int j = 0; j < (row_max >> 4); j++) { 1783 iidentity_row_16xn_avx2(buf, input + j * 16 + i * 16 * input_stride, 1784 row_max, shift[0], 16, txw_idx, rect_type); 1785 transpose_16bit_16x16_avx2(buf, buf); 1786 iidentity_col_16xn_avx2(output + i * 16 + j * 16 * stride, stride, buf, 1787 shift[1], 16, txh_idx); 1788 } 1789 } 1790 } 1791 1792 static inline void lowbd_inv_txfm2d_add_h_identity_avx2( 1793 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, 1794 TX_SIZE tx_size, int eob) { 1795 int eobx, eoby; 1796 get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); 1797 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; 1798 const int txw_idx = get_txw_idx(tx_size); 1799 const int txh_idx = get_txh_idx(tx_size); 1800 const int txfm_size_col = 
tx_size_wide[tx_size]; 1801 const int txfm_size_row = tx_size_high[tx_size]; 1802 const int txfm_size_row_notzero = AOMMIN(32, txfm_size_row); 1803 const int input_stride = txfm_size_row_notzero; 1804 const int buf_size_w_div16 = (eobx + 16) >> 4; 1805 const int buf_size_h_div16 = (eoby + 16) >> 4; 1806 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); 1807 1808 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; 1809 const transform_1d_avx2 col_txfm = 1810 lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; 1811 1812 assert(col_txfm != NULL); 1813 1814 int ud_flip, lr_flip; 1815 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 1816 for (int i = 0; i < buf_size_w_div16; i++) { 1817 __m256i buf0[64]; 1818 for (int j = 0; j < buf_size_h_div16; j++) { 1819 __m256i *buf0_cur = buf0 + j * 16; 1820 const int32_t *input_cur = input + i * 16 * input_stride + j * 16; 1821 iidentity_row_16xn_avx2(buf0_cur, input_cur, input_stride, shift[0], 16, 1822 txw_idx, rect_type); 1823 transpose_16bit_16x16_avx2(buf0_cur, buf0_cur); 1824 } 1825 col_txfm(buf0, buf0); 1826 __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1])); 1827 int k = ud_flip ? (txfm_size_row - 1) : 0; 1828 const int step = ud_flip ? 
-1 : 1; 1829 for (int j = 0; j < txfm_size_row; ++j, k += step) { 1830 __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift); 1831 write_recon_w16_avx2(res, output + (i << 4) + j * stride); 1832 } 1833 } 1834 } 1835 1836 static inline void lowbd_inv_txfm2d_add_v_identity_avx2( 1837 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, 1838 TX_SIZE tx_size, int eob) { 1839 __m256i buf1[64]; 1840 int eobx, eoby; 1841 get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); 1842 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; 1843 const int txw_idx = get_txw_idx(tx_size); 1844 const int txh_idx = get_txh_idx(tx_size); 1845 const int txfm_size_col = tx_size_wide[tx_size]; 1846 const int txfm_size_row = tx_size_high[tx_size]; 1847 const int buf_size_w_div16 = txfm_size_col >> 4; 1848 const int buf_size_h_div16 = (eoby + 16) >> 4; 1849 const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3; 1850 const int input_stride = AOMMIN(32, txfm_size_row); 1851 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); 1852 1853 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; 1854 const transform_1d_avx2 row_txfm = 1855 lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; 1856 1857 assert(row_txfm != NULL); 1858 1859 int ud_flip, lr_flip; 1860 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 1861 for (int i = 0; i < buf_size_h_div16; i++) { 1862 __m256i buf0[64]; 1863 load_buffer_32bit_to_16bit_w16_avx2(input + i * 16, input_stride, buf0, 1864 buf_size_nonzero_w); 1865 if (rect_type == 1 || rect_type == -1) { 1866 round_shift_avx2(buf0, buf0, buf_size_nonzero_w); // rect special code 1867 } 1868 row_txfm(buf0, buf0); 1869 round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]); 1870 __m256i *_buf1 = buf1; 1871 if (lr_flip) { 1872 for (int j = 0; j < buf_size_w_div16; ++j) { 1873 __m256i temp[16]; 1874 flip_buf_avx2(buf0 + 16 * j, temp, 16); 1875 transpose_16bit_16x16_avx2(temp, 1876 _buf1 + 16 * (buf_size_w_div16 - 1 - 
j)); 1877 } 1878 } else { 1879 for (int j = 0; j < buf_size_w_div16; ++j) { 1880 transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j); 1881 } 1882 } 1883 for (int j = 0; j < buf_size_w_div16; ++j) { 1884 iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride, 1885 buf1 + j * 16, shift[1], 16, txh_idx); 1886 } 1887 } 1888 } 1889 1890 static const transform_1d_ssse3 lowbd_txfm_all_1d_zeros_8x8_arr[2][2] = { 1891 { av1_idct8_low1_ssse3, av1_idct8_sse2 }, 1892 { av1_iadst8_low1_ssse3, av1_iadst8_sse2 } 1893 }; 1894 1895 static inline void load_buffer_avx2(const int32_t *in, int stride, 1896 __m128i *out) { 1897 const __m256i a = _mm256_load_si256((const __m256i *)in); 1898 const __m256i b = _mm256_load_si256((const __m256i *)(in + stride * 1)); 1899 const __m256i c = _mm256_load_si256((const __m256i *)(in + stride * 2)); 1900 const __m256i d = _mm256_load_si256((const __m256i *)(in + stride * 3)); 1901 const __m256i e = _mm256_load_si256((const __m256i *)(in + stride * 4)); 1902 const __m256i f = _mm256_load_si256((const __m256i *)(in + stride * 5)); 1903 const __m256i g = _mm256_load_si256((const __m256i *)(in + stride * 6)); 1904 const __m256i h = _mm256_load_si256((const __m256i *)(in + stride * 7)); 1905 1906 // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7 1907 const __m256i ab_16bit = _mm256_packs_epi32(a, b); 1908 // c0 c1 c2 c3 d0 d1 d2 d3 c4 c5 c6 c7 d4 d5 d6 d7 1909 const __m256i cd_16bit = _mm256_packs_epi32(c, d); 1910 // e0 e1 e2 e3 f0 f1 f2 f3 e4 e5 e6 e7 f4 f5 f6 f7 1911 const __m256i ef_16bit = _mm256_packs_epi32(e, f); 1912 // g0 g1 g2 g3 h0 h1 h2 h3 g4 g5 g6 g7 h4 h5 h6 h7 1913 const __m256i gh_16bit = _mm256_packs_epi32(g, h); 1914 1915 // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7 1916 const __m256i ab = _mm256_permute4x64_epi64(ab_16bit, 0xd8); 1917 // c0 c1 c2 c3 c4 c5 c6 c7 d0 d1 d2 d3 d4 d5 d6 d7 1918 const __m256i cd = _mm256_permute4x64_epi64(cd_16bit, 0xd8); 1919 // e0 e1 e2 e3 e4 e5 e6 e7 f0 f1 f2 f3 f4 f5 f6 f7 1920 
const __m256i ef = _mm256_permute4x64_epi64(ef_16bit, 0xd8); 1921 // g0 g1 g2 g3 g4 g5 g6 g7 h0 h1 h2 h3 h4 h5 h6 h7 1922 const __m256i gh = _mm256_permute4x64_epi64(gh_16bit, 0xd8); 1923 1924 out[0] = _mm256_castsi256_si128(ab); 1925 out[1] = _mm256_extractf128_si256(ab, 1); 1926 out[2] = _mm256_castsi256_si128(cd); 1927 out[3] = _mm256_extractf128_si256(cd, 1); 1928 out[4] = _mm256_castsi256_si128(ef); 1929 out[5] = _mm256_extractf128_si256(ef, 1); 1930 out[6] = _mm256_castsi256_si128(gh); 1931 out[7] = _mm256_extractf128_si256(gh, 1); 1932 } 1933 1934 static inline void round_and_transpose_avx2(const __m128i *const in, 1935 __m128i *const out, int bit, 1936 int *lr_flip) { 1937 __m256i buf_temp[4]; 1938 const __m256i scale = _mm256_set1_epi16(1 << (15 + bit)); 1939 int j = *lr_flip ? 7 : 0; 1940 const int step = *lr_flip ? -1 : 1; 1941 1942 // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37 1943 buf_temp[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), 1944 in[j + 4 * step], 1); 1945 j += step; 1946 // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27 1947 buf_temp[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), 1948 in[j + 4 * step], 1); 1949 j += step; 1950 // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17 1951 buf_temp[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), 1952 in[j + 4 * step], 1); 1953 j += step; 1954 // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07 1955 buf_temp[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), 1956 in[j + 4 * step], 1); 1957 1958 // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37 1959 buf_temp[0] = _mm256_mulhrs_epi16(buf_temp[0], scale); 1960 // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27 1961 buf_temp[1] = _mm256_mulhrs_epi16(buf_temp[1], scale); 1962 // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17 1963 buf_temp[2] = _mm256_mulhrs_epi16(buf_temp[2], scale); 1964 // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07 1965 buf_temp[3] = 
_mm256_mulhrs_epi16(buf_temp[3], scale); 1966 1967 // 70 60 71 61 72 62 73 63 | 30 20 31 21 32 22 33 23 1968 const __m256i unpcklo0 = _mm256_unpacklo_epi16(buf_temp[0], buf_temp[1]); 1969 // 74 64 75 65 76 66 77 67 | 34 24 35 25 36 26 37 27 1970 const __m256i unpckhi0 = _mm256_unpackhi_epi16(buf_temp[0], buf_temp[1]); 1971 // 50 40 51 41 52 42 53 43 | 10 00 11 01 12 02 13 03 1972 const __m256i unpcklo1 = _mm256_unpacklo_epi16(buf_temp[2], buf_temp[3]); 1973 // 54 44 55 45 56 46 57 47 | 14 04 15 05 16 06 17 07 1974 const __m256i unpckhi1 = _mm256_unpackhi_epi16(buf_temp[2], buf_temp[3]); 1975 1976 // 70 60 50 40 71 61 51 41 | 30 20 10 00 31 21 11 01 1977 const __m256i unpcklo00 = _mm256_unpacklo_epi32(unpcklo0, unpcklo1); 1978 // 72 62 52 42 73 63 53 43 | 32 22 12 02 33 23 13 03 1979 const __m256i unpckhi00 = _mm256_unpackhi_epi32(unpcklo0, unpcklo1); 1980 // 74 64 54 44 75 65 55 45 | 34 24 14 04 35 25 15 05 1981 const __m256i unpcklo01 = _mm256_unpacklo_epi32(unpckhi0, unpckhi1); 1982 // 76 66 56 46 77 67 57 47 | 36 26 16 06 37 27 17 07 1983 const __m256i unpckhi01 = _mm256_unpackhi_epi32(unpckhi0, unpckhi1); 1984 1985 // 70 60 50 40 30 20 10 00 | 71 61 51 41 31 21 11 01 1986 const __m256i reg_00 = _mm256_permute4x64_epi64(unpcklo00, 0xd8); 1987 // 72 62 52 42 32 22 12 02 | 73 63 53 43 33 23 13 03 1988 const __m256i reg_01 = _mm256_permute4x64_epi64(unpckhi00, 0xd8); 1989 // 74 64 54 44 34 24 14 04 | 75 65 55 45 35 25 15 05 1990 const __m256i reg_10 = _mm256_permute4x64_epi64(unpcklo01, 0xd8); 1991 // 76 66 56 46 36 26 16 06 | 77 67 57 47 37 27 17 07 1992 const __m256i reg_11 = _mm256_permute4x64_epi64(unpckhi01, 0xd8); 1993 1994 // 70 60 50 40 30 20 10 00 1995 out[0] = _mm256_castsi256_si128(reg_00); 1996 // 71 61 51 41 31 21 11 01 1997 out[1] = _mm256_extracti128_si256(reg_00, 1); 1998 // 72 62 52 42 32 22 12 02 1999 out[2] = _mm256_castsi256_si128(reg_01); 2000 // 73 63 53 43 33 23 13 03 2001 out[3] = _mm256_extracti128_si256(reg_01, 1); 2002 // 74 64 54 44 34 
24 14 04 2003 out[4] = _mm256_castsi256_si128(reg_10); 2004 // 75 65 55 45 35 25 15 05 2005 out[5] = _mm256_extracti128_si256(reg_10, 1); 2006 // 76 66 56 46 36 26 16 06 2007 out[6] = _mm256_castsi256_si128(reg_11); 2008 // 77 67 57 47 37 27 17 07 2009 out[7] = _mm256_extracti128_si256(reg_11, 1); 2010 } 2011 2012 static inline void round_shift_lowbd_write_buffer_avx2(__m128i *in, int bit, 2013 uint8_t *output, 2014 int stride, int flipud) { 2015 __m256i in_256[4], v_256[4]; 2016 int j = flipud ? 7 : 0; 2017 const int step = flipud ? -1 : 1; 2018 const __m256i scale = _mm256_set1_epi16(1 << (15 + bit)); 2019 const __m256i zero = _mm256_setzero_si256(); 2020 // in[0], in[1] 2021 in_256[0] = 2022 _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1); 2023 j += 2 * step; 2024 // in[2], in[3] 2025 in_256[1] = 2026 _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1); 2027 j += 2 * step; 2028 // in[4], in[5] 2029 in_256[2] = 2030 _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1); 2031 j += 2 * step; 2032 // in[6], in[7] 2033 in_256[3] = 2034 _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1); 2035 2036 // i00 i01 i02 i03 i04 i05 i06 i07 i10 i11 i12 i13 i14 i15 i16 i17 2037 in_256[0] = _mm256_mulhrs_epi16(in_256[0], scale); 2038 // i20 i21 i22 i23 i24 i25 i26 i27 i30 i31 i32 i33 i34 i35 i36 i37 2039 in_256[1] = _mm256_mulhrs_epi16(in_256[1], scale); 2040 // i40 i41 i42 i43 i44 i45 i46 i47 i50 i51 i52 i53 i54 i55 i56 i57 2041 in_256[2] = _mm256_mulhrs_epi16(in_256[2], scale); 2042 // i60 i61 i62 i63 i64 i65 i66 i67 i70 i71 i72 i73 i74 i75 i76 i77 2043 in_256[3] = _mm256_mulhrs_epi16(in_256[3], scale); 2044 2045 const __m128i v0 = _mm_loadl_epi64((__m128i const *)(output)); 2046 const __m128i v1 = _mm_loadl_epi64((__m128i const *)(output + stride)); 2047 const __m128i v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride)); 2048 const __m128i v3 = _mm_loadl_epi64((__m128i const *)(output + 
3 * stride)); 2049 const __m128i v4 = _mm_loadl_epi64((__m128i const *)(output + 4 * stride)); 2050 const __m128i v5 = _mm_loadl_epi64((__m128i const *)(output + 5 * stride)); 2051 const __m128i v6 = _mm_loadl_epi64((__m128i const *)(output + 6 * stride)); 2052 const __m128i v7 = _mm_loadl_epi64((__m128i const *)(output + 7 * stride)); 2053 2054 v_256[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(v0), v1, 1); 2055 v_256[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(v2), v3, 1); 2056 v_256[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(v4), v5, 1); 2057 v_256[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(v6), v7, 1); 2058 2059 const __m256i unpcklo0 = _mm256_unpacklo_epi8(v_256[0], zero); 2060 const __m256i unpcklo1 = _mm256_unpacklo_epi8(v_256[1], zero); 2061 const __m256i unpcklo2 = _mm256_unpacklo_epi8(v_256[2], zero); 2062 const __m256i unpcklo3 = _mm256_unpacklo_epi8(v_256[3], zero); 2063 // 00 01 10 11 2064 const __m256i x0 = _mm256_adds_epi16(in_256[0], unpcklo0); 2065 // 20 21 30 31 2066 const __m256i x1 = _mm256_adds_epi16(in_256[1], unpcklo1); 2067 // 40 41 50 51 2068 const __m256i x2 = _mm256_adds_epi16(in_256[2], unpcklo2); 2069 // 60 61 70 71 2070 const __m256i x3 = _mm256_adds_epi16(in_256[3], unpcklo3); 2071 2072 // 00 01 20 21 10 11 30 31 2073 const __m256i res_0123 = _mm256_packus_epi16(x0, x1); 2074 // 40 41 60 61 50 51 70 71 2075 const __m256i res_4567 = _mm256_packus_epi16(x2, x3); 2076 2077 // 00 01 20 21 2078 const __m128i res_02 = _mm256_castsi256_si128(res_0123); 2079 // 10 11 30 31 2080 const __m128i res_13 = _mm256_extracti128_si256(res_0123, 1); 2081 // 40 41 60 61 2082 const __m128i res_46 = _mm256_castsi256_si128(res_4567); 2083 // 50 51 70 71 2084 const __m128i res_57 = _mm256_extracti128_si256(res_4567, 1); 2085 2086 // 00 01 2087 _mm_storel_epi64((__m128i *)(output), res_02); 2088 // 10 11 2089 _mm_storel_epi64((__m128i *)(output + stride), res_13); 2090 // 20 21 2091 _mm_storel_epi64((__m128i *)(output + 2 * 
stride), 2092 _mm_unpackhi_epi64(res_02, res_02)); 2093 // 30 31 2094 _mm_storel_epi64((__m128i *)(output + 3 * stride), 2095 _mm_unpackhi_epi64(res_13, res_13)); 2096 // 40 41 2097 _mm_storel_epi64((__m128i *)(output + 4 * stride), res_46); 2098 // 50 51 2099 _mm_storel_epi64((__m128i *)(output + 5 * stride), res_57); 2100 // 60 61 2101 _mm_storel_epi64((__m128i *)(output + 6 * stride), 2102 _mm_unpackhi_epi64(res_46, res_46)); 2103 // 70 71 2104 _mm_storel_epi64((__m128i *)(output + 7 * stride), 2105 _mm_unpackhi_epi64(res_57, res_57)); 2106 } 2107 2108 // AVX2 implementation has the advantage when combined multiple operations 2109 // together. 2110 static inline void lowbd_inv_txfm2d_8x8_no_identity_avx2( 2111 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, 2112 TX_SIZE tx_size, int eob) { 2113 __m128i buf1[8]; 2114 const int input_stride = 8; 2115 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; 2116 assert(hitx_1d_tab[tx_type] < 2); 2117 assert(vitx_1d_tab[tx_type] < 2); 2118 const transform_1d_ssse3 row_txfm = 2119 lowbd_txfm_all_1d_zeros_8x8_arr[hitx_1d_tab[tx_type]][eob != 1]; 2120 const transform_1d_ssse3 col_txfm = 2121 lowbd_txfm_all_1d_zeros_8x8_arr[vitx_1d_tab[tx_type]][eob != 1]; 2122 2123 assert(col_txfm != NULL); 2124 assert(row_txfm != NULL); 2125 int ud_flip, lr_flip; 2126 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 2127 2128 __m128i buf0[8]; 2129 __m128i *buf0_cur = buf0; 2130 load_buffer_avx2(input, input_stride, buf0_cur); 2131 row_txfm(buf0, buf0); 2132 2133 assert(shift[0] < 0); 2134 __m128i *_buf1 = buf1; 2135 round_and_transpose_avx2(buf0, _buf1, shift[0], &lr_flip); 2136 assert(shift[1] < 0); 2137 col_txfm(buf1, buf1); 2138 round_shift_lowbd_write_buffer_avx2(buf1, shift[1], output, stride, ud_flip); 2139 } 2140 2141 // AVX2 implementation of 8x8 inverse transform. Observed that coding AVX2 for 2142 // tx_type with identity in either of the direction has no advantage. 
2143 static void lowbd_inv_txfm2d_add_8x8_avx2(const int32_t *input, uint8_t *output, 2144 int stride, TX_TYPE tx_type, 2145 TX_SIZE tx_size, int eob) { 2146 switch (tx_type) { 2147 case IDTX: 2148 av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size); 2149 2150 break; 2151 case V_DCT: 2152 case V_ADST: 2153 case V_FLIPADST: 2154 av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type, 2155 tx_size, eob); 2156 break; 2157 case H_DCT: 2158 case H_ADST: 2159 case H_FLIPADST: 2160 av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type, 2161 tx_size, eob); 2162 break; 2163 default: 2164 lowbd_inv_txfm2d_8x8_no_identity_avx2(input, output, stride, tx_type, 2165 tx_size, eob); 2166 } 2167 } 2168 2169 // for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64 2170 static inline void lowbd_inv_txfm2d_add_universe_avx2( 2171 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, 2172 TX_SIZE tx_size, int eob) { 2173 (void)eob; 2174 switch (tx_type) { 2175 case DCT_DCT: 2176 case ADST_DCT: // ADST in vertical, DCT in horizontal 2177 case DCT_ADST: // DCT in vertical, ADST in horizontal 2178 case ADST_ADST: // ADST in both directions 2179 case FLIPADST_DCT: 2180 case DCT_FLIPADST: 2181 case FLIPADST_FLIPADST: 2182 case ADST_FLIPADST: 2183 case FLIPADST_ADST: 2184 lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type, 2185 tx_size, eob); 2186 break; 2187 case IDTX: 2188 lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob); 2189 break; 2190 case V_DCT: 2191 case V_ADST: 2192 case V_FLIPADST: 2193 lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type, 2194 tx_size, eob); 2195 break; 2196 case H_DCT: 2197 case H_ADST: 2198 case H_FLIPADST: 2199 lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type, 2200 tx_size, eob); 2201 break; 2202 default: 2203 av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, 2204 eob); 2205 break; 2206 } 2207 } 2208 2209 
void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, 2210 int stride, TX_TYPE tx_type, TX_SIZE tx_size, 2211 int eob) { 2212 switch (tx_size) { 2213 case TX_4X4: 2214 case TX_4X8: 2215 case TX_8X4: 2216 case TX_8X16: 2217 case TX_16X8: 2218 case TX_4X16: 2219 case TX_16X4: 2220 case TX_8X32: 2221 case TX_32X8: 2222 av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, 2223 eob); 2224 break; 2225 case TX_8X8: 2226 lowbd_inv_txfm2d_add_8x8_avx2(input, output, stride, tx_type, tx_size, 2227 eob); 2228 break; 2229 case TX_16X16: 2230 case TX_32X32: 2231 case TX_64X64: 2232 case TX_16X32: 2233 case TX_32X16: 2234 case TX_32X64: 2235 case TX_64X32: 2236 case TX_16X64: 2237 case TX_64X16: 2238 default: 2239 lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type, 2240 tx_size, eob); 2241 break; 2242 } 2243 } 2244 2245 void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, 2246 const TxfmParam *txfm_param) { 2247 const TX_TYPE tx_type = txfm_param->tx_type; 2248 if (!txfm_param->lossless) { 2249 av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type, 2250 txfm_param->tx_size, txfm_param->eob); 2251 } else { 2252 av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); 2253 } 2254 }