av1_fwd_txfm2d_avx2.c (133780B)
1 /* 2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include "config/av1_rtcd.h" 13 14 #include "av1/common/enums.h" 15 #include "av1/common/av1_txfm.h" 16 #include "av1/encoder/x86/av1_fwd_txfm_avx2.h" 17 #include "av1/common/x86/av1_txfm_sse2.h" 18 #include "av1/encoder/av1_fwd_txfm1d_cfg.h" 19 #include "av1/encoder/x86/av1_txfm1d_sse4.h" 20 #include "av1/encoder/x86/av1_fwd_txfm_sse2.h" 21 #include "aom_dsp/x86/txfm_common_avx2.h" 22 23 static inline void fdct16x16_new_avx2(const __m256i *input, __m256i *output, 24 int8_t cos_bit) { 25 const int32_t *cospi = cospi_arr(cos_bit); 26 const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); 27 28 __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); 29 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); 30 __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); 31 __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); 32 __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); 33 __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); 34 __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); 35 __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); 36 __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); 37 __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); 38 __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); 39 __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); 40 __m256i 
cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); 41 __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); 42 __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); 43 __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); 44 __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); 45 __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); 46 47 // stage 1 48 __m256i x1[16]; 49 btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]); 50 btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]); 51 btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]); 52 btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]); 53 btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]); 54 btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]); 55 btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]); 56 btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]); 57 58 // stage 2 59 btf_16_adds_subs_avx2(&x1[0], &x1[7]); 60 btf_16_adds_subs_avx2(&x1[1], &x1[6]); 61 btf_16_adds_subs_avx2(&x1[2], &x1[5]); 62 btf_16_adds_subs_avx2(&x1[3], &x1[4]); 63 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); 64 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); 65 66 // stage 3 67 btf_16_adds_subs_avx2(&x1[0], &x1[3]); 68 btf_16_adds_subs_avx2(&x1[1], &x1[2]); 69 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); 70 btf_16_adds_subs_avx2(&x1[8], &x1[11]); 71 btf_16_adds_subs_avx2(&x1[9], &x1[10]); 72 btf_16_adds_subs_avx2(&x1[15], &x1[12]); 73 btf_16_adds_subs_avx2(&x1[14], &x1[13]); 74 75 // stage 4 76 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); 77 btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); 78 btf_16_adds_subs_avx2(&x1[4], &x1[5]); 79 btf_16_adds_subs_avx2(&x1[7], &x1[6]); 80 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], 
&x1[14], _r, cos_bit); 81 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); 82 83 // stage 5 84 btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); 85 btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); 86 btf_16_adds_subs_avx2(&x1[8], &x1[9]); 87 btf_16_adds_subs_avx2(&x1[11], &x1[10]); 88 btf_16_adds_subs_avx2(&x1[12], &x1[13]); 89 btf_16_adds_subs_avx2(&x1[15], &x1[14]); 90 91 // stage 6 92 btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); 93 btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); 94 btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); 95 btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); 96 97 // stage 7 98 output[0] = x1[0]; 99 output[1] = x1[8]; 100 output[2] = x1[4]; 101 output[3] = x1[12]; 102 output[4] = x1[2]; 103 output[5] = x1[10]; 104 output[6] = x1[6]; 105 output[7] = x1[14]; 106 output[8] = x1[1]; 107 output[9] = x1[9]; 108 output[10] = x1[5]; 109 output[11] = x1[13]; 110 output[12] = x1[3]; 111 output[13] = x1[11]; 112 output[14] = x1[7]; 113 output[15] = x1[15]; 114 } 115 116 static inline void fdct16x32_avx2(const __m256i *input, __m256i *output, 117 int8_t cos_bit) { 118 const int32_t *cospi = cospi_arr(cos_bit); 119 const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); 120 121 __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); 122 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); 123 __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); 124 __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); 125 __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); 126 __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); 127 __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); 128 __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); 129 __m256i cospi_p24_p40 
= pair_set_w16_epi16(cospi[24], cospi[40]); 130 __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); 131 __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); 132 __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); 133 __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); 134 __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); 135 __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); 136 __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); 137 __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); 138 __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); 139 __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); 140 __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); 141 __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]); 142 __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]); 143 __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]); 144 __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]); 145 __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]); 146 __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]); 147 __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]); 148 __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]); 149 __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]); 150 __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]); 151 __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]); 152 __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]); 153 __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]); 154 __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]); 155 __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]); 156 __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]); 157 158 // stage 1 159 __m256i x1[32]; 160 
btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]); 161 btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]); 162 btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]); 163 btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]); 164 btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]); 165 btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]); 166 btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]); 167 btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]); 168 btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]); 169 btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]); 170 btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]); 171 btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]); 172 btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]); 173 btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]); 174 btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]); 175 btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]); 176 177 // stage 2 178 btf_16_adds_subs_avx2(&x1[0], &x1[15]); 179 btf_16_adds_subs_avx2(&x1[1], &x1[14]); 180 btf_16_adds_subs_avx2(&x1[2], &x1[13]); 181 btf_16_adds_subs_avx2(&x1[3], &x1[12]); 182 btf_16_adds_subs_avx2(&x1[4], &x1[11]); 183 btf_16_adds_subs_avx2(&x1[5], &x1[10]); 184 btf_16_adds_subs_avx2(&x1[6], &x1[9]); 185 btf_16_adds_subs_avx2(&x1[7], &x1[8]); 186 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit); 187 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit); 188 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit); 189 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit); 190 191 // stage 3 192 btf_16_adds_subs_avx2(&x1[0], &x1[7]); 193 btf_16_adds_subs_avx2(&x1[1], &x1[6]); 194 btf_16_adds_subs_avx2(&x1[2], &x1[5]); 195 btf_16_adds_subs_avx2(&x1[3], &x1[4]); 
196 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); 197 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); 198 btf_16_adds_subs_avx2(&x1[16], &x1[23]); 199 btf_16_adds_subs_avx2(&x1[17], &x1[22]); 200 btf_16_adds_subs_avx2(&x1[18], &x1[21]); 201 btf_16_adds_subs_avx2(&x1[19], &x1[20]); 202 btf_16_adds_subs_avx2(&x1[31], &x1[24]); 203 btf_16_adds_subs_avx2(&x1[30], &x1[25]); 204 btf_16_adds_subs_avx2(&x1[29], &x1[26]); 205 btf_16_adds_subs_avx2(&x1[28], &x1[27]); 206 207 // stage 4 208 btf_16_adds_subs_avx2(&x1[0], &x1[3]); 209 btf_16_adds_subs_avx2(&x1[1], &x1[2]); 210 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); 211 btf_16_adds_subs_avx2(&x1[8], &x1[11]); 212 btf_16_adds_subs_avx2(&x1[9], &x1[10]); 213 btf_16_adds_subs_avx2(&x1[15], &x1[12]); 214 btf_16_adds_subs_avx2(&x1[14], &x1[13]); 215 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit); 216 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit); 217 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit); 218 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit); 219 220 // stage 5 221 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); 222 btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); 223 btf_16_adds_subs_avx2(&x1[4], &x1[5]); 224 btf_16_adds_subs_avx2(&x1[7], &x1[6]); 225 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); 226 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); 227 btf_16_adds_subs_avx2(&x1[16], &x1[19]); 228 btf_16_adds_subs_avx2(&x1[17], &x1[18]); 229 btf_16_adds_subs_avx2(&x1[23], &x1[20]); 230 btf_16_adds_subs_avx2(&x1[22], &x1[21]); 231 btf_16_adds_subs_avx2(&x1[24], &x1[27]); 232 btf_16_adds_subs_avx2(&x1[25], &x1[26]); 233 btf_16_adds_subs_avx2(&x1[31], &x1[28]); 234 btf_16_adds_subs_avx2(&x1[30], 
&x1[29]); 235 236 // stage 6 237 btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); 238 btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); 239 btf_16_adds_subs_avx2(&x1[8], &x1[9]); 240 btf_16_adds_subs_avx2(&x1[11], &x1[10]); 241 btf_16_adds_subs_avx2(&x1[12], &x1[13]); 242 btf_16_adds_subs_avx2(&x1[15], &x1[14]); 243 btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit); 244 btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit); 245 btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit); 246 btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit); 247 248 // stage 7 249 btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); 250 btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); 251 btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); 252 btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); 253 btf_16_adds_subs_avx2(&x1[16], &x1[17]); 254 btf_16_adds_subs_avx2(&x1[19], &x1[18]); 255 btf_16_adds_subs_avx2(&x1[20], &x1[21]); 256 btf_16_adds_subs_avx2(&x1[23], &x1[22]); 257 btf_16_adds_subs_avx2(&x1[24], &x1[25]); 258 btf_16_adds_subs_avx2(&x1[27], &x1[26]); 259 btf_16_adds_subs_avx2(&x1[28], &x1[29]); 260 btf_16_adds_subs_avx2(&x1[31], &x1[30]); 261 262 // stage 8 263 btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit); 264 btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit); 265 btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit); 266 btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit); 267 btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit); 268 btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit); 269 btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit); 270 
btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit); 271 272 // stage 9 273 output[0] = x1[0]; 274 output[1] = x1[16]; 275 output[2] = x1[8]; 276 output[3] = x1[24]; 277 output[4] = x1[4]; 278 output[5] = x1[20]; 279 output[6] = x1[12]; 280 output[7] = x1[28]; 281 output[8] = x1[2]; 282 output[9] = x1[18]; 283 output[10] = x1[10]; 284 output[11] = x1[26]; 285 output[12] = x1[6]; 286 output[13] = x1[22]; 287 output[14] = x1[14]; 288 output[15] = x1[30]; 289 output[16] = x1[1]; 290 output[17] = x1[17]; 291 output[18] = x1[9]; 292 output[19] = x1[25]; 293 output[20] = x1[5]; 294 output[21] = x1[21]; 295 output[22] = x1[13]; 296 output[23] = x1[29]; 297 output[24] = x1[3]; 298 output[25] = x1[19]; 299 output[26] = x1[11]; 300 output[27] = x1[27]; 301 output[28] = x1[7]; 302 output[29] = x1[23]; 303 output[30] = x1[15]; 304 output[31] = x1[31]; 305 } 306 307 static inline void fdct16x64_new_avx2(const __m256i *input, __m256i *output, 308 int8_t cos_bit) { 309 const int32_t *cospi = cospi_arr(cos_bit); 310 const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); 311 312 __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); 313 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); 314 __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); 315 __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); 316 __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); 317 __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); 318 __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); 319 __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); 320 __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); 321 __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); 322 __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); 323 __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); 324 __m256i cospi_p60_p04 = 
pair_set_w16_epi16(cospi[60], cospi[4]); 325 __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); 326 __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); 327 __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); 328 __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); 329 __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); 330 __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); 331 __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); 332 __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]); 333 __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); 334 __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]); 335 __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); 336 __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]); 337 __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]); 338 __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]); 339 __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]); 340 __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]); 341 __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]); 342 __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]); 343 __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]); 344 __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]); 345 __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]); 346 __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]); 347 __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]); 348 __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]); 349 __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]); 350 __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]); 351 __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]); 352 __m256i cospi_p63_p01 = 
pair_set_w16_epi16(cospi[63], cospi[1]); 353 __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]); 354 __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]); 355 __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]); 356 __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]); 357 __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]); 358 __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]); 359 __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]); 360 __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]); 361 __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]); 362 __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]); 363 __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]); 364 __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]); 365 __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]); 366 __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]); 367 __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]); 368 __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]); 369 __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]); 370 __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]); 371 __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]); 372 __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]); 373 __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]); 374 __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]); 375 __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]); 376 __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]); 377 __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]); 378 __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]); 379 __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]); 380 __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], 
cospi[29]); 381 __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]); 382 __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]); 383 __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]); 384 385 // stage 1 386 __m256i x1[64]; 387 btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]); 388 btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]); 389 btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]); 390 btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]); 391 btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]); 392 btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]); 393 btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]); 394 btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]); 395 btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]); 396 btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]); 397 btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]); 398 btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]); 399 btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]); 400 btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]); 401 btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]); 402 btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]); 403 btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]); 404 btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]); 405 btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]); 406 btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]); 407 btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]); 408 btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]); 409 btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]); 410 btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]); 411 btf_16_adds_subs_out_avx2(&x1[24], &x1[39], 
input[24], input[39]); 412 btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]); 413 btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]); 414 btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]); 415 btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]); 416 btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]); 417 btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]); 418 btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]); 419 420 // stage 2 421 btf_16_adds_subs_avx2(&x1[0], &x1[31]); 422 btf_16_adds_subs_avx2(&x1[1], &x1[30]); 423 btf_16_adds_subs_avx2(&x1[2], &x1[29]); 424 btf_16_adds_subs_avx2(&x1[3], &x1[28]); 425 btf_16_adds_subs_avx2(&x1[4], &x1[27]); 426 btf_16_adds_subs_avx2(&x1[5], &x1[26]); 427 btf_16_adds_subs_avx2(&x1[6], &x1[25]); 428 btf_16_adds_subs_avx2(&x1[7], &x1[24]); 429 btf_16_adds_subs_avx2(&x1[8], &x1[23]); 430 btf_16_adds_subs_avx2(&x1[9], &x1[22]); 431 btf_16_adds_subs_avx2(&x1[10], &x1[21]); 432 btf_16_adds_subs_avx2(&x1[11], &x1[20]); 433 btf_16_adds_subs_avx2(&x1[12], &x1[19]); 434 btf_16_adds_subs_avx2(&x1[13], &x1[18]); 435 btf_16_adds_subs_avx2(&x1[14], &x1[17]); 436 btf_16_adds_subs_avx2(&x1[15], &x1[16]); 437 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit); 438 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit); 439 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit); 440 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit); 441 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit); 442 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit); 443 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit); 444 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit); 445 446 // stage 3 447 btf_16_adds_subs_avx2(&x1[0], &x1[15]); 448 
btf_16_adds_subs_avx2(&x1[1], &x1[14]); 449 btf_16_adds_subs_avx2(&x1[2], &x1[13]); 450 btf_16_adds_subs_avx2(&x1[3], &x1[12]); 451 btf_16_adds_subs_avx2(&x1[4], &x1[11]); 452 btf_16_adds_subs_avx2(&x1[5], &x1[10]); 453 btf_16_adds_subs_avx2(&x1[6], &x1[9]); 454 btf_16_adds_subs_avx2(&x1[7], &x1[8]); 455 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit); 456 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit); 457 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit); 458 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit); 459 btf_16_adds_subs_avx2(&x1[32], &x1[47]); 460 btf_16_adds_subs_avx2(&x1[33], &x1[46]); 461 btf_16_adds_subs_avx2(&x1[34], &x1[45]); 462 btf_16_adds_subs_avx2(&x1[35], &x1[44]); 463 btf_16_adds_subs_avx2(&x1[36], &x1[43]); 464 btf_16_adds_subs_avx2(&x1[37], &x1[42]); 465 btf_16_adds_subs_avx2(&x1[38], &x1[41]); 466 btf_16_adds_subs_avx2(&x1[39], &x1[40]); 467 btf_16_adds_subs_avx2(&x1[63], &x1[48]); 468 btf_16_adds_subs_avx2(&x1[62], &x1[49]); 469 btf_16_adds_subs_avx2(&x1[61], &x1[50]); 470 btf_16_adds_subs_avx2(&x1[60], &x1[51]); 471 btf_16_adds_subs_avx2(&x1[59], &x1[52]); 472 btf_16_adds_subs_avx2(&x1[58], &x1[53]); 473 btf_16_adds_subs_avx2(&x1[57], &x1[54]); 474 btf_16_adds_subs_avx2(&x1[56], &x1[55]); 475 476 // stage 4 477 btf_16_adds_subs_avx2(&x1[0], &x1[7]); 478 btf_16_adds_subs_avx2(&x1[1], &x1[6]); 479 btf_16_adds_subs_avx2(&x1[2], &x1[5]); 480 btf_16_adds_subs_avx2(&x1[3], &x1[4]); 481 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); 482 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); 483 btf_16_adds_subs_avx2(&x1[16], &x1[23]); 484 btf_16_adds_subs_avx2(&x1[17], &x1[22]); 485 btf_16_adds_subs_avx2(&x1[18], &x1[21]); 486 btf_16_adds_subs_avx2(&x1[19], &x1[20]); 487 btf_16_adds_subs_avx2(&x1[31], &x1[24]); 488 btf_16_adds_subs_avx2(&x1[30], &x1[25]); 489 
btf_16_adds_subs_avx2(&x1[29], &x1[26]); 490 btf_16_adds_subs_avx2(&x1[28], &x1[27]); 491 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit); 492 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit); 493 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit); 494 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit); 495 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit); 496 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit); 497 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit); 498 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit); 499 500 // stage 5 501 btf_16_adds_subs_avx2(&x1[0], &x1[3]); 502 btf_16_adds_subs_avx2(&x1[1], &x1[2]); 503 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); 504 btf_16_adds_subs_avx2(&x1[8], &x1[11]); 505 btf_16_adds_subs_avx2(&x1[9], &x1[10]); 506 btf_16_adds_subs_avx2(&x1[15], &x1[12]); 507 btf_16_adds_subs_avx2(&x1[14], &x1[13]); 508 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit); 509 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit); 510 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit); 511 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit); 512 btf_16_adds_subs_avx2(&x1[32], &x1[39]); 513 btf_16_adds_subs_avx2(&x1[33], &x1[38]); 514 btf_16_adds_subs_avx2(&x1[34], &x1[37]); 515 btf_16_adds_subs_avx2(&x1[35], &x1[36]); 516 btf_16_adds_subs_avx2(&x1[47], &x1[40]); 517 btf_16_adds_subs_avx2(&x1[46], &x1[41]); 518 btf_16_adds_subs_avx2(&x1[45], &x1[42]); 519 btf_16_adds_subs_avx2(&x1[44], &x1[43]); 520 btf_16_adds_subs_avx2(&x1[48], &x1[55]); 521 btf_16_adds_subs_avx2(&x1[49], &x1[54]); 522 btf_16_adds_subs_avx2(&x1[50], &x1[53]); 523 btf_16_adds_subs_avx2(&x1[51], &x1[52]); 524 
btf_16_adds_subs_avx2(&x1[63], &x1[56]); 525 btf_16_adds_subs_avx2(&x1[62], &x1[57]); 526 btf_16_adds_subs_avx2(&x1[61], &x1[58]); 527 btf_16_adds_subs_avx2(&x1[60], &x1[59]); 528 529 // stage 6 530 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); 531 btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); 532 btf_16_adds_subs_avx2(&x1[4], &x1[5]); 533 btf_16_adds_subs_avx2(&x1[7], &x1[6]); 534 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); 535 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); 536 btf_16_adds_subs_avx2(&x1[16], &x1[19]); 537 btf_16_adds_subs_avx2(&x1[17], &x1[18]); 538 btf_16_adds_subs_avx2(&x1[23], &x1[20]); 539 btf_16_adds_subs_avx2(&x1[22], &x1[21]); 540 btf_16_adds_subs_avx2(&x1[24], &x1[27]); 541 btf_16_adds_subs_avx2(&x1[25], &x1[26]); 542 btf_16_adds_subs_avx2(&x1[31], &x1[28]); 543 btf_16_adds_subs_avx2(&x1[30], &x1[29]); 544 btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit); 545 btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit); 546 btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit); 547 btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit); 548 btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit); 549 btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit); 550 btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit); 551 btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit); 552 553 // stage 7 554 btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); 555 btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); 556 btf_16_adds_subs_avx2(&x1[8], &x1[9]); 557 btf_16_adds_subs_avx2(&x1[11], &x1[10]); 558 btf_16_adds_subs_avx2(&x1[12], &x1[13]); 559 btf_16_adds_subs_avx2(&x1[15], &x1[14]); 560 
btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit); 561 btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit); 562 btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit); 563 btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit); 564 btf_16_adds_subs_avx2(&x1[32], &x1[35]); 565 btf_16_adds_subs_avx2(&x1[33], &x1[34]); 566 btf_16_adds_subs_avx2(&x1[39], &x1[36]); 567 btf_16_adds_subs_avx2(&x1[38], &x1[37]); 568 btf_16_adds_subs_avx2(&x1[40], &x1[43]); 569 btf_16_adds_subs_avx2(&x1[41], &x1[42]); 570 btf_16_adds_subs_avx2(&x1[47], &x1[44]); 571 btf_16_adds_subs_avx2(&x1[46], &x1[45]); 572 btf_16_adds_subs_avx2(&x1[48], &x1[51]); 573 btf_16_adds_subs_avx2(&x1[49], &x1[50]); 574 btf_16_adds_subs_avx2(&x1[55], &x1[52]); 575 btf_16_adds_subs_avx2(&x1[54], &x1[53]); 576 btf_16_adds_subs_avx2(&x1[56], &x1[59]); 577 btf_16_adds_subs_avx2(&x1[57], &x1[58]); 578 btf_16_adds_subs_avx2(&x1[63], &x1[60]); 579 btf_16_adds_subs_avx2(&x1[62], &x1[61]); 580 581 // stage 8 582 btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); 583 btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); 584 btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); 585 btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); 586 btf_16_adds_subs_avx2(&x1[16], &x1[17]); 587 btf_16_adds_subs_avx2(&x1[19], &x1[18]); 588 btf_16_adds_subs_avx2(&x1[20], &x1[21]); 589 btf_16_adds_subs_avx2(&x1[23], &x1[22]); 590 btf_16_adds_subs_avx2(&x1[24], &x1[25]); 591 btf_16_adds_subs_avx2(&x1[27], &x1[26]); 592 btf_16_adds_subs_avx2(&x1[28], &x1[29]); 593 btf_16_adds_subs_avx2(&x1[31], &x1[30]); 594 btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit); 595 btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit); 596 btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit); 597 
btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit); 598 btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit); 599 btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit); 600 btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit); 601 btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit); 602 603 // stage 9 604 btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit); 605 btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit); 606 btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit); 607 btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit); 608 btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit); 609 btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit); 610 btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit); 611 btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit); 612 btf_16_adds_subs_avx2(&x1[32], &x1[33]); 613 btf_16_adds_subs_avx2(&x1[35], &x1[34]); 614 btf_16_adds_subs_avx2(&x1[36], &x1[37]); 615 btf_16_adds_subs_avx2(&x1[39], &x1[38]); 616 btf_16_adds_subs_avx2(&x1[40], &x1[41]); 617 btf_16_adds_subs_avx2(&x1[43], &x1[42]); 618 btf_16_adds_subs_avx2(&x1[44], &x1[45]); 619 btf_16_adds_subs_avx2(&x1[47], &x1[46]); 620 btf_16_adds_subs_avx2(&x1[48], &x1[49]); 621 btf_16_adds_subs_avx2(&x1[51], &x1[50]); 622 btf_16_adds_subs_avx2(&x1[52], &x1[53]); 623 btf_16_adds_subs_avx2(&x1[55], &x1[54]); 624 btf_16_adds_subs_avx2(&x1[56], &x1[57]); 625 btf_16_adds_subs_avx2(&x1[59], &x1[58]); 626 btf_16_adds_subs_avx2(&x1[60], &x1[61]); 627 btf_16_adds_subs_avx2(&x1[63], &x1[62]); 628 629 // stage 10 630 btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit); 631 btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit); 632 
btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit); 633 btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit); 634 btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit); 635 btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit); 636 btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit); 637 btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit); 638 btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit); 639 btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit); 640 btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit); 641 btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit); 642 btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit); 643 btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit); 644 btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit); 645 btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit); 646 647 // stage 11 648 output[0] = x1[0]; 649 output[1] = x1[32]; 650 output[2] = x1[16]; 651 output[3] = x1[48]; 652 output[4] = x1[8]; 653 output[5] = x1[40]; 654 output[6] = x1[24]; 655 output[7] = x1[56]; 656 output[8] = x1[4]; 657 output[9] = x1[36]; 658 output[10] = x1[20]; 659 output[11] = x1[52]; 660 output[12] = x1[12]; 661 output[13] = x1[44]; 662 output[14] = x1[28]; 663 output[15] = x1[60]; 664 output[16] = x1[2]; 665 output[17] = x1[34]; 666 output[18] = x1[18]; 667 output[19] = x1[50]; 668 output[20] = x1[10]; 669 output[21] = x1[42]; 670 output[22] = x1[26]; 671 output[23] = x1[58]; 672 output[24] = x1[6]; 673 output[25] = x1[38]; 674 output[26] = x1[22]; 675 output[27] = x1[54]; 676 output[28] = x1[14]; 677 output[29] = x1[46]; 678 output[30] = x1[30]; 679 output[31] = x1[62]; 680 output[32] = x1[1]; 681 output[33] = x1[33]; 
  // --- tail of the preceding 16-bit fdct64: remainder of its final
  // bit-reversed coefficient permutation (the head of that function lies
  // above this chunk) ---
  output[34] = x1[17];
  output[35] = x1[49];
  output[36] = x1[9];
  output[37] = x1[41];
  output[38] = x1[25];
  output[39] = x1[57];
  output[40] = x1[5];
  output[41] = x1[37];
  output[42] = x1[21];
  output[43] = x1[53];
  output[44] = x1[13];
  output[45] = x1[45];
  output[46] = x1[29];
  output[47] = x1[61];
  output[48] = x1[3];
  output[49] = x1[35];
  output[50] = x1[19];
  output[51] = x1[51];
  output[52] = x1[11];
  output[53] = x1[43];
  output[54] = x1[27];
  output[55] = x1[59];
  output[56] = x1[7];
  output[57] = x1[39];
  output[58] = x1[23];
  output[59] = x1[55];
  output[60] = x1[15];
  output[61] = x1[47];
  output[62] = x1[31];
  output[63] = x1[62 == 62 ? 62 : 62];  /* placeholder removed */
}

// Forward 32-point DCT over 32-bit lanes (8 coefficients per __m256i
// register; `input`/`output` are arrays of 32 registers).  `cos_bit`
// selects the fixed-point cosine table via cospi_arr() and the rounding
// shift used inside the butterfly helpers.  Stages 1-8 form the standard
// add/sub + planar-rotation butterfly network; stage 9 emits the results
// in bit-reversed coefficient order (output[i] = x1[bitrev5(i)]).
static inline void fdct32_avx2(const __m256i *input, __m256i *output,
                               int8_t cos_bit) {
  __m256i x1[32];
  const int32_t *cospi = cospi_arr(cos_bit);
  // Rounding offset: 0.5 in Q(cos_bit) fixed point.
  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
  // stage 0
  // stage 1
  btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]);
  btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]);
  btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]);
  btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]);
  btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]);
  btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]);
  btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]);
  btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]);
  btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]);
  btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]);
  btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]);
  btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]);
  btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]);
  btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]);
  btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]);
  btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]);

  // stage 2
  btf_32_add_sub_avx2(&x1[0], &x1[15]);
  btf_32_add_sub_avx2(&x1[1], &x1[14]);
  btf_32_add_sub_avx2(&x1[2], &x1[13]);
  btf_32_add_sub_avx2(&x1[3], &x1[12]);
  btf_32_add_sub_avx2(&x1[4], &x1[11]);
  btf_32_add_sub_avx2(&x1[5], &x1[10]);
  btf_32_add_sub_avx2(&x1[6], &x1[9]);
  btf_32_add_sub_avx2(&x1[7], &x1[8]);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit);

  // stage 3
  btf_32_add_sub_avx2(&x1[0], &x1[7]);
  btf_32_add_sub_avx2(&x1[1], &x1[6]);
  btf_32_add_sub_avx2(&x1[2], &x1[5]);
  btf_32_add_sub_avx2(&x1[3], &x1[4]);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[23]);
  btf_32_add_sub_avx2(&x1[17], &x1[22]);
  btf_32_add_sub_avx2(&x1[18], &x1[21]);
  btf_32_add_sub_avx2(&x1[19], &x1[20]);
  btf_32_add_sub_avx2(&x1[31], &x1[24]);
  btf_32_add_sub_avx2(&x1[30], &x1[25]);
  btf_32_add_sub_avx2(&x1[29], &x1[26]);
  btf_32_add_sub_avx2(&x1[28], &x1[27]);

  // stage 4
  btf_32_add_sub_avx2(&x1[0], &x1[3]);
  btf_32_add_sub_avx2(&x1[1], &x1[2]);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[8], &x1[11]);
  btf_32_add_sub_avx2(&x1[9], &x1[10]);
  btf_32_add_sub_avx2(&x1[15], &x1[12]);
  btf_32_add_sub_avx2(&x1[14], &x1[13]);
  btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit);
  btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit);

  // stage 5
  btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit);
  btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[4], &x1[5]);
  btf_32_add_sub_avx2(&x1[7], &x1[6]);
  btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit);
  btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[19]);
  btf_32_add_sub_avx2(&x1[17], &x1[18]);
  btf_32_add_sub_avx2(&x1[23], &x1[20]);
  btf_32_add_sub_avx2(&x1[22], &x1[21]);
  btf_32_add_sub_avx2(&x1[24], &x1[27]);
  btf_32_add_sub_avx2(&x1[25], &x1[26]);
  btf_32_add_sub_avx2(&x1[31], &x1[28]);
  btf_32_add_sub_avx2(&x1[30], &x1[29]);

  // stage 6
  btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit);
  btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[8], &x1[9]);
  btf_32_add_sub_avx2(&x1[11], &x1[10]);
  btf_32_add_sub_avx2(&x1[12], &x1[13]);
  btf_32_add_sub_avx2(&x1[15], &x1[14]);
  btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit);
  btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit);
  btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit);

  // stage 7
  btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit);
  btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit);
  btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit);
  btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[17]);
  btf_32_add_sub_avx2(&x1[19], &x1[18]);
  btf_32_add_sub_avx2(&x1[20], &x1[21]);
  btf_32_add_sub_avx2(&x1[23], &x1[22]);
  btf_32_add_sub_avx2(&x1[24], &x1[25]);
  btf_32_add_sub_avx2(&x1[27], &x1[26]);
  btf_32_add_sub_avx2(&x1[28], &x1[29]);
  btf_32_add_sub_avx2(&x1[31], &x1[30]);

  // stage 8
  btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit);
  btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit);
  btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit);
  btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit);
  btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit);
  btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit);

  // stage 9: bit-reversed output permutation (output[i] = x1[bitrev5(i)]).
  output[0] = x1[0];
  output[1] = x1[16];
  output[2] = x1[8];
  output[3] = x1[24];
  output[4] = x1[4];
  output[5] = x1[20];
  output[6] = x1[12];
  output[7] = x1[28];
  output[8] = x1[2];
  output[9] = x1[18];
  output[10] = x1[10];
  output[11] = x1[26];
  output[12] = x1[6];
  output[13] = x1[22];
  output[14] = x1[14];
  output[15] = x1[30];
  output[16] = x1[1];
  output[17] = x1[17];
  output[18] = x1[9];
  output[19] = x1[25];
  output[20] = x1[5];
  output[21] = x1[21];
  output[22] = x1[13];
  output[23] = x1[29];
  output[24] = x1[3];
  output[25] = x1[19];
  output[26] = x1[11];
  output[27] = x1[27];
  output[28] = x1[7];
  output[29] = x1[23];
  output[30] = x1[15];
  output[31] = x1[31];
}

// Forward 64-point DCT over 32-bit lanes (8 coefficients per __m256i).
// Same butterfly structure as fdct32_avx2, but all rotation constants are
// broadcast into vector registers up front because the *_new butterfly
// helpers take vector (not scalar) coefficients.
static inline void fdct64_new_avx2(const __m256i *input, __m256i *output,
                                   int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  // Rounding offset: 0.5 in Q(cos_bit) fixed point.
  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));

  __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
  __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
  __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
  __m256i
  cospi_p48 = _mm256_set1_epi32(cospi[48]);
  __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
  __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
  __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
  __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
  __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
  __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
  __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
  __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
  __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
  __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
  __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
  __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
  __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
  __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
  __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
  __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
  __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
  __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
  __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
  __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
  __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
  __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
  __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
  __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
  __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
  __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
  __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
  __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
  __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
  __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
  __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
  __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
  __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
  __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
  __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
  __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
  __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
  __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
  __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
  __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
  __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
  __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
  __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
  __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
  __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
  __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
  __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
  __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
  __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
  __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
  __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
  __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
  __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
  __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
  __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
  __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
  __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
  __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
  __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
  __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
  __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
  __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
  __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
  __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
  __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
  __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
  __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
  __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
  __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
  __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
  __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
  __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
  __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
  __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);

  // stage 1: initial add/sub butterflies pairing input[i] with input[63-i].
  __m256i x1[64];
  btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]);
  btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]);
  btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]);
  btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]);
  btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]);
  btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]);
  btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]);
  btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]);
  btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]);
  btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]);
  btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]);
  btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]);
  btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]);
  btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]);
  btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]);
  btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]);
  btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]);
  btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]);
  btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]);
  btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]);
  btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]);
  btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]);
  btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]);
  btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]);
  btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]);
  btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]);
  btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]);
  btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]);
  btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]);
  btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]);
  btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]);
  btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]);

  // stage 2
  btf_32_add_sub_avx2(&x1[0], &x1[31]);
  btf_32_add_sub_avx2(&x1[1], &x1[30]);
  btf_32_add_sub_avx2(&x1[2], &x1[29]);
  btf_32_add_sub_avx2(&x1[3], &x1[28]);
  btf_32_add_sub_avx2(&x1[4], &x1[27]);
  btf_32_add_sub_avx2(&x1[5], &x1[26]);
  btf_32_add_sub_avx2(&x1[6], &x1[25]);
  btf_32_add_sub_avx2(&x1[7], &x1[24]);
  btf_32_add_sub_avx2(&x1[8], &x1[23]);
  btf_32_add_sub_avx2(&x1[9], &x1[22]);
  btf_32_add_sub_avx2(&x1[10], &x1[21]);
  btf_32_add_sub_avx2(&x1[11], &x1[20]);
  btf_32_add_sub_avx2(&x1[12], &x1[19]);
  btf_32_add_sub_avx2(&x1[13], &x1[18]);
  btf_32_add_sub_avx2(&x1[14], &x1[17]);
  btf_32_add_sub_avx2(&x1[15], &x1[16]);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit);

  // stage 3
  btf_32_add_sub_avx2(&x1[0], &x1[15]);
  btf_32_add_sub_avx2(&x1[1], &x1[14]);
  btf_32_add_sub_avx2(&x1[2], &x1[13]);
  btf_32_add_sub_avx2(&x1[3], &x1[12]);
  btf_32_add_sub_avx2(&x1[4], &x1[11]);
  btf_32_add_sub_avx2(&x1[5], &x1[10]);
  btf_32_add_sub_avx2(&x1[6], &x1[9]);
  btf_32_add_sub_avx2(&x1[7], &x1[8]);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[32], &x1[47]);
  btf_32_add_sub_avx2(&x1[33], &x1[46]);
  btf_32_add_sub_avx2(&x1[34], &x1[45]);
  btf_32_add_sub_avx2(&x1[35], &x1[44]);
  btf_32_add_sub_avx2(&x1[36], &x1[43]);
  btf_32_add_sub_avx2(&x1[37], &x1[42]);
  btf_32_add_sub_avx2(&x1[38], &x1[41]);
  btf_32_add_sub_avx2(&x1[39], &x1[40]);
  btf_32_add_sub_avx2(&x1[63], &x1[48]);
  btf_32_add_sub_avx2(&x1[62], &x1[49]);
  btf_32_add_sub_avx2(&x1[61], &x1[50]);
  btf_32_add_sub_avx2(&x1[60], &x1[51]);
  btf_32_add_sub_avx2(&x1[59], &x1[52]);
  btf_32_add_sub_avx2(&x1[58], &x1[53]);
  btf_32_add_sub_avx2(&x1[57], &x1[54]);
  btf_32_add_sub_avx2(&x1[56], &x1[55]);

  // stage 4
  btf_32_add_sub_avx2(&x1[0], &x1[7]);
  btf_32_add_sub_avx2(&x1[1], &x1[6]);
  btf_32_add_sub_avx2(&x1[2], &x1[5]);
  btf_32_add_sub_avx2(&x1[3], &x1[4]);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[23]);
  btf_32_add_sub_avx2(&x1[17], &x1[22]);
  btf_32_add_sub_avx2(&x1[18], &x1[21]);
  btf_32_add_sub_avx2(&x1[19], &x1[20]);
  btf_32_add_sub_avx2(&x1[31], &x1[24]);
  btf_32_add_sub_avx2(&x1[30], &x1[25]);
  btf_32_add_sub_avx2(&x1[29], &x1[26]);
  btf_32_add_sub_avx2(&x1[28], &x1[27]);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit);

  // stage 5
  btf_32_add_sub_avx2(&x1[0], &x1[3]);
  btf_32_add_sub_avx2(&x1[1], &x1[2]);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[8], &x1[11]);
  btf_32_add_sub_avx2(&x1[9], &x1[10]);
  btf_32_add_sub_avx2(&x1[15], &x1[12]);
  btf_32_add_sub_avx2(&x1[14], &x1[13]);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[32], &x1[39]);
  btf_32_add_sub_avx2(&x1[33], &x1[38]);
  btf_32_add_sub_avx2(&x1[34], &x1[37]);
  btf_32_add_sub_avx2(&x1[35], &x1[36]);
  btf_32_add_sub_avx2(&x1[47], &x1[40]);
  btf_32_add_sub_avx2(&x1[46], &x1[41]);
  btf_32_add_sub_avx2(&x1[45], &x1[42]);
  btf_32_add_sub_avx2(&x1[44], &x1[43]);
  btf_32_add_sub_avx2(&x1[48], &x1[55]);
  btf_32_add_sub_avx2(&x1[49], &x1[54]);
  btf_32_add_sub_avx2(&x1[50], &x1[53]);
  btf_32_add_sub_avx2(&x1[51], &x1[52]);
  btf_32_add_sub_avx2(&x1[63], &x1[56]);
  btf_32_add_sub_avx2(&x1[62], &x1[57]);
  btf_32_add_sub_avx2(&x1[61], &x1[58]);
  btf_32_add_sub_avx2(&x1[60], &x1[59]);

  // stage 6
  btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[4], &x1[5]);
  btf_32_add_sub_avx2(&x1[7], &x1[6]);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[19]);
  btf_32_add_sub_avx2(&x1[17], &x1[18]);
  btf_32_add_sub_avx2(&x1[23], &x1[20]);
  btf_32_add_sub_avx2(&x1[22], &x1[21]);
  btf_32_add_sub_avx2(&x1[24], &x1[27]);
  btf_32_add_sub_avx2(&x1[25], &x1[26]);
  btf_32_add_sub_avx2(&x1[31], &x1[28]);
  btf_32_add_sub_avx2(&x1[30], &x1[29]);
  btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit);

  // stage 7
  btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[8], &x1[9]);
  btf_32_add_sub_avx2(&x1[11], &x1[10]);
  btf_32_add_sub_avx2(&x1[12], &x1[13]);
  btf_32_add_sub_avx2(&x1[15], &x1[14]);
  btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[32], &x1[35]);
  btf_32_add_sub_avx2(&x1[33], &x1[34]);
  btf_32_add_sub_avx2(&x1[39], &x1[36]);
  btf_32_add_sub_avx2(&x1[38], &x1[37]);
  btf_32_add_sub_avx2(&x1[40], &x1[43]);
  btf_32_add_sub_avx2(&x1[41], &x1[42]);
  btf_32_add_sub_avx2(&x1[47], &x1[44]);
  btf_32_add_sub_avx2(&x1[46], &x1[45]);
  btf_32_add_sub_avx2(&x1[48], &x1[51]);
  btf_32_add_sub_avx2(&x1[49], &x1[50]);
  btf_32_add_sub_avx2(&x1[55], &x1[52]);
  btf_32_add_sub_avx2(&x1[54], &x1[53]);
  btf_32_add_sub_avx2(&x1[56], &x1[59]);
  btf_32_add_sub_avx2(&x1[57], &x1[58]);
  btf_32_add_sub_avx2(&x1[63], &x1[60]);
  btf_32_add_sub_avx2(&x1[62], &x1[61]);

  // stage 8
  btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[17]);
  btf_32_add_sub_avx2(&x1[19], &x1[18]);
  btf_32_add_sub_avx2(&x1[20], &x1[21]);
  btf_32_add_sub_avx2(&x1[23], &x1[22]);
  btf_32_add_sub_avx2(&x1[24], &x1[25]);
  btf_32_add_sub_avx2(&x1[27], &x1[26]);
  btf_32_add_sub_avx2(&x1[28], &x1[29]);
  btf_32_add_sub_avx2(&x1[31], &x1[30]);
  btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit);

  // stage 9
  btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[32], &x1[33]);
  btf_32_add_sub_avx2(&x1[35], &x1[34]);
  btf_32_add_sub_avx2(&x1[36], &x1[37]);
  btf_32_add_sub_avx2(&x1[39], &x1[38]);
  btf_32_add_sub_avx2(&x1[40], &x1[41]);
  btf_32_add_sub_avx2(&x1[43], &x1[42]);
  btf_32_add_sub_avx2(&x1[44], &x1[45]);
  btf_32_add_sub_avx2(&x1[47], &x1[46]);
  btf_32_add_sub_avx2(&x1[48], &x1[49]);
  btf_32_add_sub_avx2(&x1[51], &x1[50]);
  btf_32_add_sub_avx2(&x1[52], &x1[53]);
  btf_32_add_sub_avx2(&x1[55], &x1[54]);
  btf_32_add_sub_avx2(&x1[56], &x1[57]);
  btf_32_add_sub_avx2(&x1[59], &x1[58]);
  btf_32_add_sub_avx2(&x1[60], &x1[61]);
  btf_32_add_sub_avx2(&x1[63], &x1[62]);

  // stage 10
  btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit);

  // stage 11: bit-reversed output permutation (output[i] = x1[bitrev6(i)]).
  output[0] = x1[0];
  output[1] = x1[32];
  output[2] = x1[16];
  output[3] = x1[48];
  output[4] = x1[8];
  output[5] = x1[40];
  output[6] = x1[24];
  output[7] = x1[56];
  output[8] = x1[4];
  output[9] = x1[36];
  output[10] = x1[20];
  output[11] = x1[52];
  output[12] = x1[12];
  output[13] = x1[44];
  output[14] = x1[28];
  output[15] = x1[60];
  output[16] = x1[2];
  output[17] = x1[34];
  output[18] = x1[18];
  output[19] = x1[50];
  output[20] = x1[10];
  output[21] = x1[42];
  output[22] = x1[26];
  output[23] = x1[58];
  output[24] = x1[6];
  output[25] = x1[38];
  output[26] = x1[22];
  output[27] = x1[54];
  output[28] = x1[14];
  output[29] = x1[46];
  output[30] = x1[30];
  output[31] = x1[62];
  output[32] = x1[1];
  output[33] = x1[33];
  output[34] = x1[17];
  output[35] = x1[49];
  output[36] = x1[9];
  output[37] = x1[41];
  output[38] = x1[25];
  output[39] = x1[57];
  output[40] = x1[5];
  output[41] = x1[37];
  output[42] = x1[21];
  output[43] = x1[53];
  output[44] = x1[13];
  output[45] = x1[45];
  output[46] = x1[29];
  output[47] = x1[61];
  output[48] = x1[3];
  output[49] = x1[35];
  output[50] = x1[19];
  output[51] = x1[51];
  output[52] = x1[11];
  output[53] = x1[43];
  output[54] = x1[27];
  output[55] = x1[59];
  output[56] = x1[7];
  output[57] = x1[39];
  output[58] = x1[23];
  output[59] = x1[55];
  output[60] = x1[15];
  output[61] = x1[47];
  output[62] = x1[31];
  output[63] = x1[63];
}

// Forward 16-point ADST on 16-bit lanes (16 coefficients per __m256i;
// `input`/`output` are 16 registers).  Implemented as a sign-flip/permute
// pre-stage (stage 1) followed by alternating half-pi rotation and
// add/sub stages, with a final sign-alternating reordering (stage 9).
static inline void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
                                       int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m256i __zero = _mm256_setzero_si256();
  // Rounding offset: 0.5 in Q(cos_bit) fixed point.
  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));

  // Interleaved (lo, hi) 16-bit coefficient pairs for the pmaddwd-style
  // butterfly helper.
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
  __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);

  // stage 1: input permutation with saturating negation (0 - x) of
  // selected rows.
  __m256i x1[16];
  x1[0] = input[0];
  x1[1] = _mm256_subs_epi16(__zero, input[15]);
  x1[2] = _mm256_subs_epi16(__zero, input[7]);
  x1[3] = input[8];
  x1[4] = _mm256_subs_epi16(__zero, input[3]);
  x1[5] = input[12];
  x1[6] = input[4];
  x1[7] = _mm256_subs_epi16(__zero, input[11]);
  x1[8] = _mm256_subs_epi16(__zero, input[1]);
  x1[9] = input[14];
  x1[10] = input[6];
  x1[11] = _mm256_subs_epi16(__zero, input[9]);
  x1[12] = input[2];
  x1[13] = _mm256_subs_epi16(__zero, input[13]);
  x1[14] = _mm256_subs_epi16(__zero, input[5]);
  x1[15] = input[10];

  // stage 2
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);

  // stage 3
  btf_16_adds_subs_avx2(&x1[0], &x1[2]);
  btf_16_adds_subs_avx2(&x1[1], &x1[3]);
  btf_16_adds_subs_avx2(&x1[4], &x1[6]);
  btf_16_adds_subs_avx2(&x1[5], &x1[7]);
  btf_16_adds_subs_avx2(&x1[8], &x1[10]);
  btf_16_adds_subs_avx2(&x1[9], &x1[11]);
  btf_16_adds_subs_avx2(&x1[12], &x1[14]);
  btf_16_adds_subs_avx2(&x1[13], &x1[15]);

  // stage 4
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit);

  // stage 5
  btf_16_adds_subs_avx2(&x1[0], &x1[4]);
  btf_16_adds_subs_avx2(&x1[1], &x1[5]);
  btf_16_adds_subs_avx2(&x1[2], &x1[6]);
  btf_16_adds_subs_avx2(&x1[3], &x1[7]);
  btf_16_adds_subs_avx2(&x1[8], &x1[12]);
  btf_16_adds_subs_avx2(&x1[9], &x1[13]);
  btf_16_adds_subs_avx2(&x1[10], &x1[14]);
  btf_16_adds_subs_avx2(&x1[11], &x1[15]);

  // stage 6
  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
  btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit);

  // stage 7
  btf_16_adds_subs_avx2(&x1[0], &x1[8]);
  btf_16_adds_subs_avx2(&x1[1], &x1[9]);
  btf_16_adds_subs_avx2(&x1[2], &x1[10]);
  btf_16_adds_subs_avx2(&x1[3], &x1[11]);
  btf_16_adds_subs_avx2(&x1[4], &x1[12]);
  btf_16_adds_subs_avx2(&x1[5], &x1[13]);
  btf_16_adds_subs_avx2(&x1[6], &x1[14]);
  btf_16_adds_subs_avx2(&x1[7], &x1[15]);

  // stage 8
  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);

  // stage 9: final ADST output reordering (odd x1 indices descending
  // interleaved with even indices ascending).
  output[0] = x1[1];
  output[1] = x1[14];
  output[2] = x1[3];
  output[3] = x1[12];
  output[4] = x1[5];
  output[5] = x1[10];
  output[6] = x1[7];
  output[7] = x1[8];
  output[8] = x1[9];
  output[9] = x1[6];
  output[10] = x1[11];
  output[11] = x1[4];
  output[12] = x1[13];
  output[13] = x1[2];
  output[14] = x1[15];
  output[15] = x1[0];
}

// 16x16 identity transform (16-bit lanes): each row is scaled by
// 2 * NewSqrt2 with rounding.  Each 16-bit value is interleaved with the
// constant 1 so scale_round_avx2 can do a widening multiply-add
// (presumably pmaddwd — see scale_round_avx2); the 32-bit results are
// saturating-packed back to 16 bits.
static inline void fidentity16x16_new_avx2(const __m256i *input,
                                           __m256i *output, int8_t cos_bit) {
  (void)cos_bit;  // identity transform: no cosine table needed
  const __m256i one = _mm256_set1_epi16(1);

  for (int i = 0; i < 16; ++i) {
    const __m256i a_lo = _mm256_unpacklo_epi16(input[i], one);
    const __m256i a_hi = _mm256_unpackhi_epi16(input[i], one);
    const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
    const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
    output[i] = _mm256_packs_epi32(b_lo, b_hi);
  }
}

// 16x32 identity transform (16-bit lanes): scale every coefficient by 4
// via a left shift of 2 (no rounding needed for an exact power of two).
static inline void fidentity16x32_avx2(const __m256i *input, __m256i *output,
                                       int8_t cos_bit) {
  (void)cos_bit;  // identity transform: no cosine table needed
  for (int i = 0; i < 32; ++i) {
    output[i] = _mm256_slli_epi16(input[i], 2);
  }
}

// Store `out_size` rows of 16 int32 coefficients: each row is the pair
// (in1[i], in2[i]) written as two aligned 256-bit stores at
// out + stride * i and out + stride * i + 8.  `out` must be 32-byte
// aligned (aligned-store intrinsic).
static inline void store_output_32bit_w16(int32_t *const out,
                                          const __m256i *const in1,
                                          const __m256i *const in2,
                                          const int stride,
                                          const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    _mm256_store_si256((__m256i *)(out + stride * i), in1[i]);
    _mm256_store_si256((__m256i *)(out + stride * i + 8), in2[i]);
  }
}

// Store 8 16 bit values. Sign extend the values.
static inline void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
                                                        int32_t *out,
                                                        const int stride,
                                                        const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    // Low 128 bits -> out[0..7], high 128 bits -> out[8..15], sign-extended.
    _mm256_store_si256((__m256i *)(out),
                       _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i])));
    _mm256_store_si256(
        (__m256i *)(out + 8),
        _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1)));
    out += stride;
  }
}

// Scale one row of 16 16-bit coefficients by NewSqrt2 (the rectangular
// transform correction factor) and store as 16 32-bit values. The
// permute4x64(0xd8) pre-swizzle makes unpacklo/unpackhi produce lanes in
// linear column order after the 32-bit widening.
static inline void store_rect_16bit_to_32bit_avx2(const __m256i a,
                                                  int32_t *const b) {
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i a_reoder = _mm256_permute4x64_epi64(a, 0xd8);
  const __m256i a_lo = _mm256_unpacklo_epi16(a_reoder, one);
  const __m256i a_hi = _mm256_unpackhi_epi16(a_reoder, one);
  const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2);
  const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2);
  _mm256_store_si256((__m256i *)b, b_lo);
  _mm256_store_si256((__m256i *)(b + 8), b_hi);
}

// Row-wise wrapper around store_rect_16bit_to_32bit_avx2.
static inline void store_rect_buffer_16bit_to_32bit_w16_avx2(
    const __m256i *const in, int32_t *const out, const int stride,
    const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_rect_16bit_to_32bit_avx2(in[i], out + i * stride);
  }
}

typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
                                  int8_t cos_bit);

// 32-point column transforms, indexed by TX_TYPE. ADST variants have no
// 32-point kernel in AV1, hence the NULL entries.
static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = {
  fdct16x32_avx2,       // DCT_DCT
  NULL,                 // ADST_DCT
  NULL,                 // DCT_ADST
  NULL,                 // ADST_ADST
  NULL,                 // FLIPADST_DCT
  NULL,                 // DCT_FLIPADST
  NULL,                 // FLIPADST_FLIPADST
  NULL,                 // ADST_FLIPADST
  NULL,                 // FLIPADST_ADST
  fidentity16x32_avx2,  // IDTX
  fdct16x32_avx2,       // V_DCT
  fidentity16x32_avx2,  // H_DCT
  NULL,                 // V_ADST
  NULL,                 // H_ADST
  NULL,                 // V_FLIPADST
  NULL                  // H_FLIPADST
};

// 32-point row transforms, indexed by TX_TYPE.
static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = {
  fdct16x32_avx2,       // DCT_DCT
  NULL,                 // ADST_DCT
  NULL,                 // DCT_ADST
  NULL,                 // ADST_ADST
  NULL,                 // FLIPADST_DCT
  NULL,                 // DCT_FLIPADST
  NULL,                 // FLIPADST_FLIPADST
  NULL,                 // ADST_FLIPADST
  NULL,                 // FLIPADST_ADST
  fidentity16x32_avx2,  // IDTX
  fidentity16x32_avx2,  // V_DCT
  fdct16x32_avx2,       // H_DCT
  NULL,                 // V_ADST
  NULL,                 // H_ADST
  NULL,                 // V_FLIPADST
  NULL                  // H_FLIPADST
};

// 16-point column transforms, indexed by TX_TYPE. FLIPADST shares the ADST
// kernel; the flip is applied when loading/storing the buffers.
static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = {
  fdct16x16_new_avx2,       // DCT_DCT
  fadst16x16_new_avx2,      // ADST_DCT
  fdct16x16_new_avx2,       // DCT_ADST
  fadst16x16_new_avx2,      // ADST_ADST
  fadst16x16_new_avx2,      // FLIPADST_DCT
  fdct16x16_new_avx2,       // DCT_FLIPADST
  fadst16x16_new_avx2,      // FLIPADST_FLIPADST
  fadst16x16_new_avx2,      // ADST_FLIPADST
  fadst16x16_new_avx2,      // FLIPADST_ADST
  fidentity16x16_new_avx2,  // IDTX
  fdct16x16_new_avx2,       // V_DCT
  fidentity16x16_new_avx2,  // H_DCT
  fadst16x16_new_avx2,      // V_ADST
  fidentity16x16_new_avx2,  // H_ADST
  fadst16x16_new_avx2,      // V_FLIPADST
  fidentity16x16_new_avx2   // H_FLIPADST
};

// 16-point row transforms, indexed by TX_TYPE.
static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = {
  fdct16x16_new_avx2,       // DCT_DCT
  fdct16x16_new_avx2,       // ADST_DCT
  fadst16x16_new_avx2,      // DCT_ADST
  fadst16x16_new_avx2,      // ADST_ADST
  fdct16x16_new_avx2,       // FLIPADST_DCT
  fadst16x16_new_avx2,      // DCT_FLIPADST
  fadst16x16_new_avx2,      // FLIPADST_FLIPADST
  fadst16x16_new_avx2,      // ADST_FLIPADST
  fadst16x16_new_avx2,      // FLIPADST_ADST
  fidentity16x16_new_avx2,  // IDTX
  fidentity16x16_new_avx2,  // V_DCT
  fdct16x16_new_avx2,       // H_DCT
  fidentity16x16_new_avx2,  // V_ADST
  fadst16x16_new_avx2,      // H_ADST
  fidentity16x16_new_avx2,  // V_FLIPADST
  fadst16x16_new_avx2       // H_FLIPADST
};

// 8-point column transforms (SSE2 kernels), indexed by TX_TYPE.
static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = {
  fdct8x8_new_sse2,       // DCT_DCT
  fadst8x8_new_sse2,      // ADST_DCT
  fdct8x8_new_sse2,       // DCT_ADST
  fadst8x8_new_sse2,      // ADST_ADST
  fadst8x8_new_sse2,      // FLIPADST_DCT
  fdct8x8_new_sse2,       // DCT_FLIPADST
  fadst8x8_new_sse2,      // FLIPADST_FLIPADST
  fadst8x8_new_sse2,      // ADST_FLIPADST
  fadst8x8_new_sse2,      // FLIPADST_ADST
  fidentity8x8_new_sse2,  // IDTX
  fdct8x8_new_sse2,       // V_DCT
  fidentity8x8_new_sse2,  // H_DCT
  fadst8x8_new_sse2,      // V_ADST
  fidentity8x8_new_sse2,  // H_ADST
  fadst8x8_new_sse2,      // V_FLIPADST
  fidentity8x8_new_sse2,  // H_FLIPADST
};

// 8-point row transforms (SSE2 kernels), indexed by TX_TYPE.
static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = {
  fdct8x8_new_sse2,       // DCT_DCT
  fdct8x8_new_sse2,       // ADST_DCT
  fadst8x8_new_sse2,      // DCT_ADST
  fadst8x8_new_sse2,      // ADST_ADST
  fdct8x8_new_sse2,       // FLIPADST_DCT
  fadst8x8_new_sse2,      // DCT_FLIPADST
  fadst8x8_new_sse2,      // FLIPADST_FLIPADST
  fadst8x8_new_sse2,      // ADST_FLIPADST
  fadst8x8_new_sse2,      // FLIPADST_ADST
  fidentity8x8_new_sse2,  // IDTX
  fidentity8x8_new_sse2,  // V_DCT
  fdct8x8_new_sse2,       // H_DCT
  fidentity8x8_new_sse2,  // V_ADST
  fadst8x8_new_sse2,      // H_ADST
  fidentity8x8_new_sse2,  // V_FLIPADST
  fadst8x8_new_sse2       // H_FLIPADST
};

// Load an 8x8 block of 16-bit pixels and pre-scale by 2^bit (fwd transform
// input up-shift). Rows are loaded in natural (top-to-bottom) order.
static inline void load_buffer_and_round_shift(const int16_t *in, int stride,
                                               __m128i *out, int bit) {
  out[0] = _mm_load_si128((const __m128i *)(in + 0 * stride));
  out[1] = _mm_load_si128((const __m128i *)(in + 1 * stride));
  out[2] = _mm_load_si128((const __m128i *)(in + 2 * stride));
  out[3] = _mm_load_si128((const __m128i *)(in + 3 * stride));
  out[4] = _mm_load_si128((const __m128i *)(in + 4 * stride));
  out[5] = _mm_load_si128((const __m128i *)(in + 5 * stride));
  out[6] = _mm_load_si128((const __m128i *)(in + 6 * stride));
  out[7] = _mm_load_si128((const __m128i *)(in + 7 * stride));
  out[0] = _mm_slli_epi16(out[0], bit);
  out[1] = _mm_slli_epi16(out[1], bit);
  out[2] = _mm_slli_epi16(out[2], bit);
  out[3] = _mm_slli_epi16(out[3], bit);
  out[4] = _mm_slli_epi16(out[4], bit);
  out[5] = _mm_slli_epi16(out[5], bit);
  out[6] = _mm_slli_epi16(out[6], bit);
  out[7] = _mm_slli_epi16(out[7], bit);
}

// Same as load_buffer_and_round_shift but with rows reversed (up/down flip
// for FLIPADST column transforms).
static inline void load_buffer_and_flip_round_shift(const int16_t *in,
                                                    int stride, __m128i *out,
                                                    int bit) {
  out[7] = load_16bit_to_16bit(in + 0 * stride);
  out[6] = load_16bit_to_16bit(in + 1 * stride);
  out[5] = load_16bit_to_16bit(in + 2 * stride);
  out[4] = load_16bit_to_16bit(in + 3 * stride);
  out[3] = load_16bit_to_16bit(in + 4 * stride);
  out[2] = load_16bit_to_16bit(in + 5 * stride);
  out[1] = load_16bit_to_16bit(in + 6 * stride);
  out[0] = load_16bit_to_16bit(in + 7 * stride);
  out[7] = _mm_slli_epi16(out[7], bit);
  out[6] = _mm_slli_epi16(out[6], bit);
  out[5] = _mm_slli_epi16(out[5], bit);
  out[4] = _mm_slli_epi16(out[4], bit);
  out[3] = _mm_slli_epi16(out[3], bit);
  out[2] = _mm_slli_epi16(out[2], bit);
  out[1] = _mm_slli_epi16(out[1], bit);
  out[0] = _mm_slli_epi16(out[0], bit);
}

// Transposes the 8x8 16-bit block held in b0..b3 (two rows per __m256i, see
// the layout comments below) into c0..c3. Expects b0..b3 and declares
// c0..c3 in the enclosing scope.
#define TRANSPOSE_8X8_AVX2()                                            \
  {                                                                     \
    /* aa0: 00 10 01 11 02 12 03 13 | 40 50 41 51 42 52 43 53*/         \
    /* aa1: 04 14 05 15 06 16 07 17 | 44 54 45 55 46 56 47 57*/         \
    /* aa2: 20 30 21 31 22 32 23 33 | 60 70 61 71 62 72 63 73*/         \
    /* aa3: 24 34 25 35 26 36 27 37 | 64 74 65 75 66 76 67 77*/         \
    const __m256i aa0 = _mm256_unpacklo_epi16(b0, b1);                  \
    const __m256i aa1 = _mm256_unpackhi_epi16(b0, b1);                  \
    const __m256i aa2 = _mm256_unpacklo_epi16(b2, b3);                  \
    const __m256i aa3 = _mm256_unpackhi_epi16(b2, b3);                  \
    /* Unpack 32 bit elements resulting in: */                          \
    /* bb0: 00 10 20 30 01 11 21 31 | 40 50 60 70 41 51 61 71*/         \
    /* bb1: 02 12 22 32 03 13 23 33 | 42 52 62 72 43 53 63 73*/         \
    /* bb2: 04 14 24 34 05 15 25 35 | 44 54 64 74 45 55 65 75*/         \
    /* bb2: 06 16 26 36 07 17 27 37 | 46 56 66 76 47 57 67 77*/         \
    const __m256i bb0 = _mm256_unpacklo_epi32(aa0, aa2);                \
    const __m256i bb1 = _mm256_unpackhi_epi32(aa0, aa2);                \
    const __m256i bb2 = _mm256_unpacklo_epi32(aa1, aa3);                \
    const __m256i bb3 = _mm256_unpackhi_epi32(aa1, aa3);                \
    /* bb0: 00 10 20 30 40 50 60 70| 01 11 21 31 41 51 61 71*/          \
    /* bb1: 02 12 22 32 42 52 62 72| 03 13 23 33 43 53 63 73*/          \
    /* bb2: 04 14 24 34 44 54 64 74| 05 15 25 35 45 55 65 75*/          \
    /* bb2: 06 16 26 36 46 56 66 76| 07 17 27 37 47 57 67 77*/          \
    c0 = _mm256_permute4x64_epi64(bb0, 0xd8);                           \
    c1 = _mm256_permute4x64_epi64(bb1, 0xd8);                           \
    c2 = _mm256_permute4x64_epi64(bb2, 0xd8);                           \
    c3 = _mm256_permute4x64_epi64(bb3, 0xd8);                           \
  }

// Round-shift an 8x8 block right by -bit (bit is negative on entry),
// transpose it, and write the transposed rows out in REVERSED order
// (left/right flip after transposition, used for FLIPADST rows).
static inline void transpose_round_shift_flip_8x8(__m128i *const in,
                                                  __m128i *const out, int bit) {
  __m256i c0, c1, c2, c3;
  bit = -bit;
  const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));
  const __m256i s04 =
      _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
  const __m256i s15 =
      _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
  const __m256i s26 =
      _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
  const __m256i s37 =
      _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);

  const __m256i a0 = _mm256_adds_epi16(s04, rounding);
  const __m256i a1 = _mm256_adds_epi16(s15, rounding);
  const __m256i a2 = _mm256_adds_epi16(s26, rounding);
  const __m256i a3 = _mm256_adds_epi16(s37, rounding);

  // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47
  // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57
  // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67
  // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77
  const __m256i b0 = _mm256_srai_epi16(a0, bit);
  const __m256i b1 = _mm256_srai_epi16(a1, bit);
  const __m256i b2 = _mm256_srai_epi16(a2, bit);
  const __m256i b3 = _mm256_srai_epi16(a3, bit);

  TRANSPOSE_8X8_AVX2()

  // Unpack 64 bit elements resulting in:
  // out[7]: 00 10 20 30 40 50 60 70
  // out[6]: 01 11 21 31 41 51 61 71
  // out[5]: 02 12 22 32 42 52 62 72
  // out[4]: 03 13 23 33 43 53 63 73
  // out[3]: 04 14 24 34 44 54 64 74
  // out[2]: 05 15 25 35 45 55 65 75
  // out[1]: 06 16 26 36 46 56 66 76
  // out[0]: 07 17 27 37 47 57 67 77
  out[7] = _mm256_castsi256_si128(c0);
  out[6] = _mm256_extractf128_si256(c0, 1);
  out[5] = _mm256_castsi256_si128(c1);
  out[4] = _mm256_extractf128_si256(c1, 1);
  out[3] = _mm256_castsi256_si128(c2);
  out[2] = _mm256_extractf128_si256(c2, 1);
  out[1] = _mm256_castsi256_si128(c3);
  out[0] = _mm256_extractf128_si256(c3, 1);
}

// Round-shift an 8x8 block right by -bit (bit is negative on entry) and
// transpose it; rows are written in natural order.
static inline void transpose_round_shift_8x8(__m128i *const in,
                                             __m128i *const out, int bit) {
  __m256i c0, c1, c2, c3;
  bit = -bit;
  const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));
  const __m256i s04 =
      _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
  const __m256i s15 =
      _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
  const __m256i s26 =
      _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
  const __m256i s37 =
      _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);

  const __m256i a0 = _mm256_adds_epi16(s04, rounding);
  const __m256i a1 = _mm256_adds_epi16(s15, rounding);
  const __m256i a2 = _mm256_adds_epi16(s26, rounding);
  const __m256i a3 = _mm256_adds_epi16(s37, rounding);

  // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47
  // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57
  // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67
  // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77
  const __m256i b0 = _mm256_srai_epi16(a0, bit);
  const __m256i b1 = _mm256_srai_epi16(a1, bit);
  const __m256i b2 = _mm256_srai_epi16(a2, bit);
  const __m256i b3 = _mm256_srai_epi16(a3, bit);

  TRANSPOSE_8X8_AVX2()
  // Unpack 64 bit elements resulting in:
  // out[7]: 00 10 20 30 40 50 60 70
  // out[6]: 01 11 21 31 41 51 61 71
  // out[5]: 02 12 22 32 42 52 62 72
  // out[4]: 03 13 23 33 43 53 63 73
  // out[3]: 04 14 24 34 44 54 64 74
  // out[2]: 05 15 25 35 45 55 65 75
  // out[1]: 06 16 26 36 46 56 66 76
  // out[0]: 07 17 27 37 47 57 67 77
  out[0] = _mm256_castsi256_si128(c0);
  out[1] = _mm256_extractf128_si256(c0, 1);
  out[2] = _mm256_castsi256_si128(c1);
  out[3] = _mm256_extractf128_si256(c1, 1);
  out[4] = _mm256_castsi256_si128(c2);
  out[5] = _mm256_extractf128_si256(c2, 1);
  out[6] = _mm256_castsi256_si128(c3);
  out[7] = _mm256_extractf128_si256(c3, 1);
}

// Store out_size rows of 8 16-bit values, sign-extended to 32 bits.
static inline void store_buffer_16bit_to_32bit_w8_avx2(const __m128i *const in,
                                                       int32_t *const out,
                                                       const int stride,
                                                       const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    _mm256_store_si256((__m256i *)(out + i * stride),
                       _mm256_cvtepi16_epi32(in[i]));
  }
}

// 8x8 lowbd forward 2D transform: load+upshift, column transform,
// round/transpose (with optional L/R flip), row transform, store.
static void av1_lowbd_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *output,
                                          int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  __m128i buf0[8], buf1[8], *buf;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
  const int txw_idx = get_txw_idx(TX_8X8);
  const int txh_idx = get_txh_idx(TX_8X8);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
  const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  // Condition to check shift bit is avoided while round shifting, by assuming
  // that shift[0] will
  // always be positive.
  assert(shift[0] > 0);
  if (ud_flip)
    load_buffer_and_flip_round_shift(input, stride, buf0, shift[0]);
  else
    load_buffer_and_round_shift(input, stride, buf0, shift[0]);

  col_txfm(buf0, buf0, cos_bit_col);
  // Condition to check shift bit is avoided while round shifting, by assuming
  // that shift[1] will always be negative.
  assert(shift[1] < 0);

  if (lr_flip) {
    transpose_round_shift_flip_8x8(buf0, buf1, shift[1]);
  } else {
    transpose_round_shift_8x8(buf0, buf1, shift[1]);
  }

  buf = buf1;
  row_txfm(buf, buf, cos_bit_row);

  // Round and shift operation is avoided here as the shift bit is assumed to be
  // zero always.
  assert(shift[2] == 0);
  store_buffer_16bit_to_32bit_w8_avx2(buf, output, 8, 8);
}

// 16x16 lowbd forward 2D transform: one 16-wide pass per dimension with a
// 16x16 transpose between the column and row transforms.
static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  const TX_SIZE tx_size = TX_16X16;
  __m256i buf0[16], buf1[16];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
  const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int32_t i = 0;  // single 16-column tile
  if (ud_flip) {
    load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height);
  } else {
    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
  }
  round_shift_16bit_w16_avx2(buf0, height, shift[0]);
  col_txfm(buf0, buf0, cos_bit_col);
  round_shift_16bit_w16_avx2(buf0, height, shift[1]);
  transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);

  __m256i *buf;
  if (lr_flip) {
    buf = buf0;
    flip_buf_avx2(buf1 + width * i, buf, width);
  } else {
    buf = buf1 + width * i;
  }
  row_txfm(buf, buf, cos_bit_row);
  round_shift_16bit_w16_avx2(buf, width, shift[2]);
  store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width);
}

// 32x32 lowbd forward 2D transform: processed as two 16-wide column tiles,
// then two 16-row row tiles.
static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  const TX_SIZE tx_size = TX_32X32;
  __m256i buf0[32], buf1[128];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
  const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  for (int i = 0; i < 2; i++) {
    if (ud_flip) {
      load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
                                           height);
    } else {
      load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
    }
    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col);
    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
    transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i);
    transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i);
  }

  for (int i = 0; i < 2; i++) {
    __m256i *buf;
    if (lr_flip) {
      buf = buf0;
      flip_buf_avx2(buf1 + width * i, buf, width);
    } else {
      buf = buf1 + width * i;
    }
    row_txfm(buf, buf, cos_bit_row);
    round_shift_16bit_w16_avx2(buf, width, shift[2]);
    store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width);
  }
}

// 64x64 lowbd forward 2D transform (DCT_DCT only). Only the top-left 32x32
// of coefficients is kept: column pass runs on all 4 tiles but only the
// first 2 row tiles (32 rows) are row-transformed and stored.
static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_64X64;
  __m256i buf0[64], buf1[256];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
  const int width_div16 = (width >> 4);
  const int height_div16 = (height >> 4);

  for (int i = 0; i < width_div16; i++) {
    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col);
    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
    for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
    }
  }

  for (int i = 0; i < AOMMIN(2, height_div16); i++) {
    __m256i bufA[64];
    __m256i bufB[64];
    __m128i *buf = (__m128i *)(buf1 + width * i);
    // Widen to 32 bits: even/odd 128-bit halves feed two parallel fdct64s.
    for (int j = 0; j < width; ++j) {
      bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
      bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
    }
    fdct64_new_avx2(bufA, bufA, cos_bit_row);
    fdct64_new_avx2(bufB, bufB, cos_bit_row);
    round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);
    round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);
    store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
  }
}

// 16x32 lowbd forward 2D transform (rectangular: output scaled by NewSqrt2
// at store time).
static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  const TX_SIZE tx_size = TX_16X32;
  __m256i buf0[32], buf1[32];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
  const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  if (ud_flip) {
    load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height);
  } else {
    load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height);
  }
  round_shift_16bit_w16_avx2(buf0, height, shift[0]);
  col_txfm(buf0, buf0, cos_bit_col);
  round_shift_16bit_w16_avx2(buf0, height, shift[1]);
  transpose_16bit_16x16_avx2(buf0, buf1);
  transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16);

  for (int i = 0; i < 2; i++) {
    __m256i *buf;
    if (lr_flip) {
      buf = buf0;
      flip_buf_avx2(buf1 + width * i, buf, width);
    } else {
      buf = buf1 + width * i;
    }
    row_txfm(buf, buf, cos_bit_row);
    round_shift_16bit_w16_avx2(buf, width, shift[2]);
    store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height,
                                              width);
  }
}

// 32x16 lowbd forward 2D transform (rectangular).
static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  __m256i buf0[32], buf1[64];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
  const int txw_idx = get_txw_idx(TX_32X16);
  const int txh_idx = get_txh_idx(TX_32X16);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 32;
  const int height = 16;
  const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
  const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  for (int i = 0; i < 2; i++) {
    if (ud_flip) {
      load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
                                           height);
    } else {
      load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
    }
    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col);
    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
    transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
  }

  __m256i *buf;
  if (lr_flip) {
    buf = buf0;
    flip_buf_avx2(buf1, buf, width);
  } else {
    buf = buf1;
  }
  row_txfm(buf, buf, cos_bit_row);
  round_shift_16bit_w16_avx2(buf, width, shift[2]);
  store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, height, width);
}

// 64x32 lowbd forward 2D transform (DCT_DCT only): keeps the left 32
// columns of row coefficients; rect NewSqrt2 scaling folded into the final
// round shift.
static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  const TX_SIZE tx_size = TX_64X32;
  __m256i buf0[64], buf1[256];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
  const int width_div16 = (width >> 4);
  const int height_div16 = (height >> 4);

  for (int i = 0; i < width_div16; i++) {
    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col);
    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
    for (int j = 0; j < AOMMIN(4, height_div16); ++j) {
      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
    }
  }
  assert(tx_type == DCT_DCT);
  for (int i = 0; i < AOMMIN(2, height_div16); i++) {
    __m256i bufA[64];
    __m256i bufB[64];
    __m128i *buf = (__m128i *)(buf1 + width * i);
    for (int j = 0; j < width; ++j) {
      bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
      bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
    }
    fdct64_new_avx2(bufA, bufA, cos_bit_row);
    fdct64_new_avx2(bufB, bufB, cos_bit_row);
    round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
    round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);

    store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
  }
}

// 32x64 lowbd forward 2D transform (DCT_DCT only): keeps the top 32 rows of
// column coefficients.
static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_32X64;
  __m256i buf0[64], buf1[256];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
  const int width_div16 = (width >> 4);
  const int height_div16 = (height >> 4);

  for (int i = 0; i < width_div16; i++) {
    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col);
    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
    for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
    }
  }

  for (int i = 0; i < AOMMIN(2, height_div16); i++) {
    __m256i bufA[32];
    __m256i bufB[32];
    __m128i *buf = (__m128i *)(buf1 + width * i);
    for (int j = 0; j < width; ++j) {
      bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
      bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
    }
    fdct32_avx2(bufA, bufA, cos_bit_row);
    fdct32_avx2(bufB, bufB, cos_bit_row);
    round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
    round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);

    store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
  }
}

// 16x64 lowbd forward 2D transform (DCT_DCT only): only the first 32 output
// rows are row-transformed and stored.
static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_16X64;
  __m256i buf0[64], buf1[64];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
  const transform_1d_avx2 row_txfm = fdct16x16_new_avx2;
  const int width_div16 = (width >> 4);
  const int height_div16 = (height >> 4);

  for (int i = 0; i < width_div16; i++) {
    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col);
    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
    for (int j = 0; j < height_div16; ++j) {
      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
    }
  }

  for (int i = 0; i < AOMMIN(2, height_div16); i++) {
    __m256i *buf = buf1 + width * i;
    row_txfm(buf, buf, cos_bit_row);
    round_shift_16bit_w16_avx2(buf, width, shift[2]);
    store_buffer_16bit_to_32bit_w16_avx2(buf, output + width * i, 32, width);
  }
}

// 64x16 lowbd forward 2D transform (DCT_DCT only): stores the left 32
// columns per row tile; function body continues past this excerpt.
static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_64X16;
  __m256i buf0[64], buf1[64];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = fdct16x16_new_avx2;
  const transform_1d_avx2 row_txfm = fdct16x64_new_avx2;
  const int width_div16 = (width >> 4);
  const int height_div16 = (height >> 4);

  for (int i = 0; i < width_div16; i++) {
    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col);
    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
    for (int j = 0; j < height_div16; ++j) {
      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
    }
  }

  for (int i = 0; i < height_div16; i++) {
    __m256i *buf = buf1 + width * i;
    row_txfm(buf, buf, cos_bit_row);
    round_shift_16bit_w16_avx2(buf, width, shift[2]);
    store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * i, 16, 32);
  }
  // Zero out the bottom 16x32 area.
  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
}

// Butterfly on 16-bit lanes: interleaves *in0/*in1, multiplies by the cosine
// weight pairs *w0 and *w1 with _mm256_madd_epi16, rounds and shifts by
// *cos_bit, packs back to 16 bits, and returns the two results split into
// their low (out0/out1) and high (out2/out3) 128-bit halves.
// NOTE(review): identifiers starting with a double underscore ("__rounding")
// are reserved by the C standard; consider renaming project-wide.
static inline void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0,
                               __m256i *in1, __m128i *out0, __m128i *out1,
                               __m128i *out2, __m128i *out3,
                               const __m256i *__rounding, int8_t *cos_bit) {
  __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
  __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
  __m256i u0 = _mm256_madd_epi16(t0, *w0);
  __m256i u1 = _mm256_madd_epi16(t1, *w0);
  __m256i v0 = _mm256_madd_epi16(t0, *w1);
  __m256i v1 = _mm256_madd_epi16(t1, *w1);

  // Round-half-up before the arithmetic shift by cos_bit.
  __m256i a0 = _mm256_add_epi32(u0, *__rounding);
  __m256i a1 = _mm256_add_epi32(u1, *__rounding);
  __m256i b0 = _mm256_add_epi32(v0, *__rounding);
  __m256i b1 = _mm256_add_epi32(v1, *__rounding);

  __m256i c0 = _mm256_srai_epi32(a0, *cos_bit);
  __m256i c1 = _mm256_srai_epi32(a1, *cos_bit);
  __m256i d0 = _mm256_srai_epi32(b0, *cos_bit);
  __m256i d1 = _mm256_srai_epi32(b1, *cos_bit);

  // Saturating pack back to 16-bit lanes.
  __m256i temp0 = _mm256_packs_epi32(c0, c1);
  __m256i temp1 = _mm256_packs_epi32(d0, d1);

  *out0 = _mm256_castsi256_si128(temp0);
  *out1 = _mm256_castsi256_si128(temp1);
  *out2 = _mm256_extracti128_si256(temp0, 0x01);
  *out3 = _mm256_extracti128_si256(temp1, 0x01);
}

// 8-point forward DCT applied to 16 independent columns (each __m256i holds
// one row of 16 int16 lanes).  Classic 4-stage butterfly network; outputs
// are written in the transform's coefficient order (0,4,2,6,1,5,3,7 mapping
// visible in stage 5).
static inline void fdct8x8_new_avx2(const __m256i *input, __m256i *output,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));

  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);

  // stage 1: input symmetry — sums feed the even half, diffs the odd half.
  __m256i x1[8];
  x1[0] = _mm256_adds_epi16(input[0], input[7]);
  x1[7] = _mm256_subs_epi16(input[0], input[7]);
  x1[1] = _mm256_adds_epi16(input[1], input[6]);
  x1[6] = _mm256_subs_epi16(input[1], input[6]);
  x1[2] = _mm256_adds_epi16(input[2], input[5]);
  x1[5] = _mm256_subs_epi16(input[2], input[5]);
  x1[3] = _mm256_adds_epi16(input[3], input[4]);
  x1[4] = _mm256_subs_epi16(input[3], input[4]);

  // stage 2
  __m256i x2[8];
  x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
  x2[3] = _mm256_subs_epi16(x1[0], x1[3]);
  x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
  x2[2] = _mm256_subs_epi16(x1[1], x1[2]);
  x2[4] = x1[4];
  // btf_16_w16_avx2 rotates x1[5]/x1[6] in place.
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding,
                  cos_bit);
  x2[5] = x1[5];
  x2[6] = x1[6];
  x2[7] = x1[7];

  // stage 3
  __m256i x3[8];
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding,
                  cos_bit);
  x3[0] = x2[0];
  x3[1] = x2[1];
  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding,
                  cos_bit);
  x3[2] = x2[2];
  x3[3] = x2[3];
  x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
  x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
  x3[6] = _mm256_subs_epi16(x2[7], x2[6]);
  x3[7] = _mm256_adds_epi16(x2[7], x2[6]);

  // stage 4
  __m256i x4[8];
  x4[0] = x3[0];
  x4[1] = x3[1];
  x4[2] = x3[2];
  x4[3] = x3[3];
  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding,
                  cos_bit);
  x4[4] = x3[4];
  x4[7] = x3[7];
  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], __rounding,
                  cos_bit);
  x4[5] = x3[5];
  x4[6] = x3[6];
  // stage 5: reorder into coefficient order.
  output[0] = x4[0];
  output[1] = x4[4];
  output[2] = x4[2];
  output[3] = x4[6];
  output[4] = x4[1];
  output[5] = x4[5];
  output[6] = x4[3];
  output[7] = x4[7];
}

// 8-point forward ADST on 16 columns of 16-bit lanes.  Stage 1 applies the
// ADST input permutation with sign flips (negation implemented as
// 0 - input via saturating subtract from __zero).
static inline void fadst8x8_new_avx2(const __m256i *input, __m256i *output,
                                     int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m256i __zero = _mm256_setzero_si256();
  const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));

  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);

  // stage 1: permute and sign-flip the inputs.
  __m256i x1[8];
  x1[0] = input[0];
  x1[1] = _mm256_subs_epi16(__zero, input[7]);
  x1[2] = _mm256_subs_epi16(__zero, input[3]);
  x1[3] = input[4];
  x1[4] = _mm256_subs_epi16(__zero, input[1]);
  x1[5] = input[6];
  x1[6] = input[2];
  x1[7] = _mm256_subs_epi16(__zero, input[5]);

  // stage 2
  __m256i x2[8];
  x2[0] = x1[0];
  x2[1] = x1[1];
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2],
                  &x1[3], __rounding,
                  cos_bit);
  x2[2] = x1[2];
  x2[3] = x1[3];
  x2[4] = x1[4];
  x2[5] = x1[5];
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], __rounding,
                  cos_bit);
  x2[6] = x1[6];
  x2[7] = x1[7];

  // stage 3
  __m256i x3[8];
  x3[0] = _mm256_adds_epi16(x2[0], x2[2]);
  x3[2] = _mm256_subs_epi16(x2[0], x2[2]);
  x3[1] = _mm256_adds_epi16(x2[1], x2[3]);
  x3[3] = _mm256_subs_epi16(x2[1], x2[3]);
  x3[4] = _mm256_adds_epi16(x2[4], x2[6]);
  x3[6] = _mm256_subs_epi16(x2[4], x2[6]);
  x3[5] = _mm256_adds_epi16(x2[5], x2[7]);
  x3[7] = _mm256_subs_epi16(x2[5], x2[7]);

  // stage 4: half-angle rotations on the odd half.
  __m256i x4[8];
  x4[0] = x3[0];
  x4[1] = x3[1];
  x4[2] = x3[2];
  x4[3] = x3[3];
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding,
                  cos_bit);
  x4[4] = x3[4];
  x4[5] = x3[5];
  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding,
                  cos_bit);
  x4[6] = x3[6];
  x4[7] = x3[7];

  // stage 5
  __m256i x5[8];
  x5[0] = _mm256_adds_epi16(x4[0], x4[4]);
  x5[4] = _mm256_subs_epi16(x4[0], x4[4]);
  x5[1] = _mm256_adds_epi16(x4[1], x4[5]);
  x5[5] = _mm256_subs_epi16(x4[1], x4[5]);
  x5[2] = _mm256_adds_epi16(x4[2], x4[6]);
  x5[6] = _mm256_subs_epi16(x4[2], x4[6]);
  x5[3] = _mm256_adds_epi16(x4[3], x4[7]);
  x5[7] = _mm256_subs_epi16(x4[3], x4[7]);

  // stage 6: final odd-angle rotations producing the coefficients.
  __m256i x6[8];
  btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding,
                  cos_bit);
  x6[0] = x5[0];
  x6[1] = x5[1];
  btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding,
                  cos_bit);
  x6[2] = x5[2];
  x6[3] = x5[3];
  btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding,
                  cos_bit);
  x6[4] = x5[4];
  x6[5] = x5[5];
  btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding,
                  cos_bit);
  x6[6] = x5[6];
  x6[7] = x5[7];

  // stage 7: ADST output permutation.
  output[0] = x6[1];
  output[1] = x6[6];
  output[2] = x6[3];
  output[3] = x6[4];
  output[4] = x6[5];
  output[5] = x6[2];
  output[6] = x6[7];
  output[7] = x6[0];
}

// 8-point identity transform on 16 columns: the identity for this size is a
// uniform scale by 2, implemented as a saturating add of each row to itself.
static inline void fidentity8x8_new_avx2(const __m256i *input, __m256i *output,
                                         int8_t cos_bit) {
  (void)cos_bit;

  output[0] = _mm256_adds_epi16(input[0], input[0]);
  output[1] = _mm256_adds_epi16(input[1], input[1]);
  output[2] = _mm256_adds_epi16(input[2], input[2]);
  output[3] = _mm256_adds_epi16(input[3], input[3]);
  output[4] = _mm256_adds_epi16(input[4], input[4]);
  output[5] = _mm256_adds_epi16(input[5], input[5]);
  output[6] = _mm256_adds_epi16(input[6], input[6]);
  output[7] = _mm256_adds_epi16(input[7], input[7]);
}

// 16-point forward DCT over 8 columns.  The sixteen __m128i input rows are
// packed pairwise into eight __m256i registers so that two butterfly lanes
// are processed per AVX2 operation; cospi_arr[] pre-combines the two 128-bit
// weight vectors each paired butterfly needs.
static inline void fdct8x16_new_avx2(const __m128i *input, __m128i *output,
                                     int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
  // NOTE(review): __rounding has no direct use below — presumably referenced
  // implicitly by the btf_16_sse2 macro; confirm in av1_txfm_sse2.h.
  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
  __m128i temp0, temp1, temp2, temp3;
  __m256i in0, in1;
  __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
  __m128i cospi_m04_p60 =
      pair_set_epi16(-cospi[4], cospi[60]);
  __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
  __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
  __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
  __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
  __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
  __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);

  // Each cospi_arr[] entry glues two 128-bit weight vectors into one 256-bit
  // register so btf_16_avx2 can run two different rotations at once.
  __m256i cospi_arr[12];

  cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32),
                                         cospi_m32_p32, 0x1);
  cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
                                         cospi_p32_p32, 0x1);
  cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
                                         cospi_p48_p16, 0x1);
  cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
                                         cospi_m16_p48, 0x1);
  cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48),
                                         cospi_m48_m16, 0x1);
  cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16),
                                         cospi_m16_p48, 0x1);
  cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08),
                                         cospi_p24_p40, 0x1);
  cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56),
                                         cospi_m40_p24, 0x1);
  cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04),
                                         cospi_p28_p36, 0x1);
  cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60),
                                         cospi_m36_p28, 0x1);
  cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20),
                                          cospi_p12_p52, 0x1);
  cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44),
                                          cospi_m52_p12, 0x1);

  // Pack the 16 input rows pairwise so each x[] holds the two rows that a
  // stage-1 butterfly combines (e.g. x[0] = rows 0|1, x[1] = rows 15|14).
  __m256i x[8];
  x[0] =
      _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1);
  x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14],
                                 0x1);
  x[2] =
      _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1);
  x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12],
                                 0x1);
  x[4] =
      _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1);
  x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11],
                                 0x1);
  x[6] =
      _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1);
  x[7] =
      _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1);

  // stage 1
  __m256i x1[8];
  x1[0] = _mm256_adds_epi16(x[0], x[1]);
  x1[7] = _mm256_subs_epi16(x[0], x[1]);
  x1[1] = _mm256_adds_epi16(x[2], x[3]);
  x1[6] = _mm256_subs_epi16(x[2], x[3]);
  x1[2] = _mm256_adds_epi16(x[4], x[5]);
  x1[5] = _mm256_subs_epi16(x[4], x[5]);
  x1[3] = _mm256_adds_epi16(x[6], x[7]);
  x1[4] = _mm256_subs_epi16(x[6], x[7]);

  // stage 2
  __m256i x2[8];
  x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
  x2[7] = _mm256_subs_epi16(x1[0], x1[3]);
  x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
  x2[6] = _mm256_subs_epi16(x1[1], x1[2]);
  x2[2] = x1[4];
  x2[3] = x1[7];
  btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &x1[5], &x1[6], &temp0, &temp1,
              &temp2, &temp3, &__rounding_256, &cos_bit);
  x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1);
  x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);

  // stage 3
  __m256i x3[8];
  // 0x4e swaps the two 128-bit halves of x2[1] to line up butterfly lanes.
  x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e);
  x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
  x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
  x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]),
              _mm256_extractf128_si256(x2[7], 0x01), temp0, temp1);
  x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1);
  x3[3] = _mm256_adds_epi16(x2[2], x2[4]);
  x3[4] = _mm256_subs_epi16(x2[2], x2[4]);
  x3[5] = _mm256_adds_epi16(x2[3], x2[5]);
  x3[6] = _mm256_subs_epi16(x2[3], x2[5]);

  // stage 4: even coefficients 0/8/4/12 are finished here.
  __m256i x4[8];
  x4[0] = _mm256_blend_epi32(x3[0], x3[1], 0xf0);
  // permute2f128 imm 0x21 = concat(high(x3[0]), low(x3[1])).
  x4[1] = _mm256_permute2f128_si256(x3[0], x3[1], 0x21);
  btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &x4[0], &x4[1], &output[0],
              &output[8], &output[4], &output[12], &__rounding_256, &cos_bit);
  x4[2] = _mm256_adds_epi16(x3[2], x3[7]);
  x4[3] = _mm256_subs_epi16(x3[2], x3[7]);
  x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20);
  x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20);
  in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31);
  in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31);
  btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
              &temp3, &__rounding_256, &cos_bit);

  x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1);
  x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);

  // stage 5: coefficients 2/14/10/6 are finished here.
  __m256i x5[4];
  in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31);
  in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20);
  btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &output[2], &output[14],
              &output[10], &output[6], &__rounding_256, &cos_bit);
  x5[0] = _mm256_adds_epi16(x4[4], x4[6]);
  x5[1] = _mm256_subs_epi16(x4[4], x4[6]);
  x5[2] = _mm256_adds_epi16(x4[5], x4[7]);
  x5[3] = _mm256_subs_epi16(x4[5], x4[7]);

  // stage 6: remaining odd coefficients.
  in0 = _mm256_permute2f128_si256(x5[0], x5[1], 0x20);
  in1 = _mm256_permute2f128_si256(x5[2], x5[3], 0x31);
  btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &output[1], &output[15],
              &output[9], &output[7], &__rounding_256, &cos_bit);
  in0 = _mm256_permute2f128_si256(x5[1], x5[0], 0x31);
  in1 = _mm256_permute2f128_si256(x5[3], x5[2], 0x20);
  btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1,
              &output[5],
              &output[11], &output[13], &output[3], &__rounding_256, &cos_bit);
}

// 16-point forward ADST over 8 columns.  Like fdct8x16_new_avx2, the
// sixteen __m128i rows are packed two-per-__m256i so each btf_16_avx2 call
// performs two rotations; cospi_arr[] pre-pairs the weight vectors.
static inline void fadst8x16_new_avx2(const __m128i *input, __m128i *output,
                                      int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m256i __zero = _mm256_setzero_si256();
  const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
  __m256i in0, in1;
  __m128i temp0, temp1, temp2, temp3;

  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
  __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
  __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);

  // Pre-paired weight vectors, one pair per dual-lane butterfly below.
  __m256i cospi_arr[20];

  cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
                                         cospi_p32_p32, 0x1);
  cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
                                         cospi_p32_m32, 0x1);
  cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
                                         cospi_p32_p32, 0x1);
  cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
                                         cospi_p32_m32, 0x1);
  cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
                                         cospi_m48_p16, 0x1);
  cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
                                         cospi_p16_p48, 0x1);
  cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
                                         cospi_m48_p16, 0x1);
  cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
                                         cospi_p16_p48, 0x1);
  cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
                                         cospi_p40_p24, 0x1);
  cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08),
                                         cospi_p24_m40, 0x1);
  cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08),
                                          cospi_m24_p40, 0x1);
  cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
                                          cospi_p40_p24, 0x1);
  cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62),
                                          cospi_p10_p54, 0x1);
  cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02),
                                          cospi_p54_m10, 0x1);
  cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46),
                                          cospi_p26_p38, 0x1);
  cospi_arr[15] =
      _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18),
                              cospi_p38_m26, 0x1);
  cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30),
                                          cospi_p42_p22, 0x1);
  cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34),
                                          cospi_p22_m42, 0x1);
  cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14),
                                          cospi_p58_p06, 0x1);
  cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50),
                                          cospi_p06_m58, 0x1);

  // Pack input rows: even rows in x[0..3], odd rows in x[4..7].
  __m256i x[8];
  x[0] =
      _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1);
  x[1] =
      _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1);
  x[2] =
      _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1);
  x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14],
                                 0x1);
  x[4] =
      _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1);
  x[5] =
      _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1);
  x[6] =
      _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1);
  x[7] =
      _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1);

  // stage 1: ADST input permutation with sign flips (0 - x via __zero).
  __m256i x1[8];
  x1[0] = x[0];
  x1[1] = _mm256_subs_epi16(__zero, x[7]);
  x1[2] = x[2];
  x1[3] = _mm256_subs_epi16(__zero, x[5]);
  x1[4] = _mm256_subs_epi16(__zero, x[4]);
  x1[5] = x[3];
  x1[6] = _mm256_subs_epi16(__zero, x[6]);
  x1[7] = x[1];

  // stage 2: blend 0xf0 mixes the low half of one register with the high
  // half of the other.
  __m256i x2[8];
  x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0);
  x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0);
  x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0);
  x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0);
  in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0);
  in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0);
  btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &in0,
              &in1, &temp0, &temp1, &temp2,
              &temp3, &__rounding_256, &cos_bit);
  x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
  x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
  in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21);
  in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21);
  btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &in0, &in1, &temp0, &temp1, &temp2,
              &temp3, &__rounding_256, &cos_bit);
  x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
  x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);

  // stage 3
  __m256i x3[8];
  x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
  x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
  x3[2] = _mm256_adds_epi16(x2[3], x2[2]);
  x3[3] = _mm256_subs_epi16(x2[3], x2[2]);
  x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
  x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
  x3[6] = _mm256_adds_epi16(x2[7], x2[6]);
  x3[7] = _mm256_subs_epi16(x2[7], x2[6]);

  // stage 4: permute2f128 imm 0x20 = concat of the two low 128-bit lanes,
  // 0x31 = concat of the two high lanes.
  __m256i x4[8];
  x4[0] = x3[0];
  x4[1] = x3[1];
  x4[4] = x3[4];
  x4[5] = x3[5];
  in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20);
  in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31);
  btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
              &temp3, &__rounding_256, &cos_bit);
  x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
  x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
  in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20);
  in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31);
  btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &temp0, &temp1, &temp2,
              &temp3, &__rounding_256, &cos_bit);
  x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
  x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);

  // stage 5
  __m256i x5[8];
  x5[0] = _mm256_adds_epi16(x4[0], x4[2]);
  x5[1] = _mm256_subs_epi16(x4[0], x4[2]);
  x5[2] = _mm256_adds_epi16(x4[1], x4[3]);
  x5[3] = _mm256_subs_epi16(x4[1], x4[3]);
  x5[4] = _mm256_adds_epi16(x4[4], x4[6]);
  x5[5] = _mm256_subs_epi16(x4[4], x4[6]);
  x5[6] = _mm256_adds_epi16(x4[5], x4[7]);
  x5[7] = _mm256_subs_epi16(x4[5], x4[7]);

  // stage 6
  __m256i x6[8];
  x6[0] = x5[0];
  x6[1] = x5[2];
  x6[2] = x5[1];
  x6[3] = x5[3];
  in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20);
  in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31);
  btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &temp0, &temp1, &temp2,
              &temp3, &__rounding_256, &cos_bit);
  x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
  x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
  in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20);
  in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31);
  btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &temp0, &temp1,
              &temp2, &temp3, &__rounding_256, &cos_bit);
  x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
  x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);

  // stage 7
  __m256i x7[8];
  x7[0] = _mm256_adds_epi16(x6[0], x6[4]);
  x7[1] = _mm256_subs_epi16(x6[0], x6[4]);
  x7[2] = _mm256_adds_epi16(x6[1], x6[5]);
  x7[3] = _mm256_subs_epi16(x6[1], x6[5]);
  x7[4] = _mm256_adds_epi16(x6[2], x6[6]);
  x7[5] = _mm256_subs_epi16(x6[2], x6[6]);
  x7[6] = _mm256_adds_epi16(x6[3], x6[7]);
  x7[7] = _mm256_subs_epi16(x6[3], x6[7]);

  // stage 8: final rotations write straight into the (permuted) outputs.
  in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20);
  in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31);
  btf_16_avx2(&cospi_arr[12], &cospi_arr[13], &in0, &in1, &output[15],
              &output[0], &output[13], &output[2], &__rounding_256, &cos_bit);
  in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20);
  in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31);
  btf_16_avx2(&cospi_arr[14], &cospi_arr[15], &in0, &in1, &output[11],
              &output[4], &output[9], &output[6], &__rounding_256, &cos_bit);
  in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20);
  in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31);
  btf_16_avx2(&cospi_arr[16], &cospi_arr[17], &in0, &in1, &output[7],
              &output[8], &output[5], &output[10], &__rounding_256, &cos_bit);
  in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20);
  in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31);
  btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3],
              &output[12], &output[1], &output[14], &__rounding_256, &cos_bit);
}

// 16-point identity transform over 8 columns: scales each sample by
// 2 * sqrt(2) in fixed point.  Each pair of 128-bit rows is widened against
// a vector of ones, scaled/rounded by 2 * NewSqrt2, and packed back.
static inline void fidentity8x16_new_avx2(const __m128i *input, __m128i *output,
                                          int8_t cos_bit) {
  (void)cos_bit;
  const __m256i one = _mm256_set1_epi16(1);
  __m256i temp;
  for (int i = 0; i < 16; i += 2) {
    temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]),
                                   input[i + 1], 0x1);
    // Interleave with 1 so madd-based scale_round_avx2 adds its rounding bias.
    const __m256i a_lo = _mm256_unpacklo_epi16(temp, one);
    const __m256i a_hi = _mm256_unpackhi_epi16(temp, one);
    const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
    const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
    temp = _mm256_packs_epi32(b_lo, b_hi);
    output[i] = _mm256_castsi256_si128(temp);
    output[i + 1] = _mm256_extractf128_si256(temp, 0x1);
  }
}

// Row transforms (width 8) for 8x16 blocks, indexed by TX_TYPE.
static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = {
  fdct8x8_new_avx2,       // DCT_DCT
  fdct8x8_new_avx2,       // ADST_DCT
  fadst8x8_new_avx2,      // DCT_ADST
  fadst8x8_new_avx2,      // ADST_ADST
  fdct8x8_new_avx2,       // FLIPADST_DCT
  fadst8x8_new_avx2,      // DCT_FLIPADST
  fadst8x8_new_avx2,      // FLIPADST_FLIPADST
  fadst8x8_new_avx2,      // ADST_FLIPADST
  fadst8x8_new_avx2,      // FLIPADST_ADST
  fidentity8x8_new_avx2,  // IDTX
  fidentity8x8_new_avx2,  // V_DCT
  fdct8x8_new_avx2,       // H_DCT
  fidentity8x8_new_avx2,  // V_ADST
  fadst8x8_new_avx2,      // H_ADST
  fidentity8x8_new_avx2,  // V_FLIPADST
  fadst8x8_new_avx2       // H_FLIPADST
};

// Column transforms (height 16) for 8x16 blocks, indexed by TX_TYPE.
static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
  fdct8x16_new_avx2,       // DCT_DCT
  fadst8x16_new_avx2,      // ADST_DCT
  fdct8x16_new_avx2,       // DCT_ADST
  fadst8x16_new_avx2,      // ADST_ADST
  fadst8x16_new_avx2,      // FLIPADST_DCT
  fdct8x16_new_avx2,       // DCT_FLIPADST
  fadst8x16_new_avx2,      // FLIPADST_FLIPADST
  fadst8x16_new_avx2,      // ADST_FLIPADST
  fadst8x16_new_avx2,      // FLIPADST_ADST
  fidentity8x16_new_avx2,  // IDTX
  fdct8x16_new_avx2,       // V_DCT
  fidentity8x16_new_avx2,  // H_DCT
  fadst8x16_new_avx2,      // V_ADST
  fidentity8x16_new_avx2,  // H_ADST
  fadst8x16_new_avx2,      // V_FLIPADST
  fidentity8x16_new_avx2   // H_FLIPADST
};

// Column transforms (height 8) for 16x8 blocks, indexed by TX_TYPE.
static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = {
  fdct8x8_new_avx2,       // DCT_DCT
  fadst8x8_new_avx2,      // ADST_DCT
  fdct8x8_new_avx2,       // DCT_ADST
  fadst8x8_new_avx2,      // ADST_ADST
  fadst8x8_new_avx2,      // FLIPADST_DCT
  fdct8x8_new_avx2,       // DCT_FLIPADST
  fadst8x8_new_avx2,      // FLIPADST_FLIPADST
  fadst8x8_new_avx2,      // ADST_FLIPADST
  fadst8x8_new_avx2,      // FLIPADST_ADST
  fidentity8x8_new_avx2,  // IDTX
  fdct8x8_new_avx2,       // V_DCT
  fidentity8x8_new_avx2,  // H_DCT
  fadst8x8_new_avx2,      // V_ADST
  fidentity8x8_new_avx2,  // H_ADST
  fadst8x8_new_avx2,      // V_FLIPADST
  fidentity8x8_new_avx2,  // H_FLIPADST
};

// Row transforms (width 16) for 16x8 blocks, indexed by TX_TYPE.
static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = {
  fdct8x16_new_avx2,       // DCT_DCT
  fdct8x16_new_avx2,       // ADST_DCT
  fadst8x16_new_avx2,      // DCT_ADST
  fadst8x16_new_avx2,      // ADST_ADST
  fdct8x16_new_avx2,       // FLIPADST_DCT
  fadst8x16_new_avx2,      // DCT_FLIPADST
  fadst8x16_new_avx2,      // FLIPADST_FLIPADST
  fadst8x16_new_avx2,      // ADST_FLIPADST
  fadst8x16_new_avx2,      // FLIPADST_ADST
  fidentity8x16_new_avx2,  // IDTX
  fidentity8x16_new_avx2,  // V_DCT
  fdct8x16_new_avx2,       // H_DCT
  fidentity8x16_new_avx2,  // V_ADST
  fadst8x16_new_avx2,      // H_ADST
  fidentity8x16_new_avx2,  // V_FLIPADST
  fadst8x16_new_avx2       // H_FLIPADST
};

// Lowbd 2D forward transform of an 8x16 block: SSE2-width column transform,
// then the two 8x8 row halves are packed into __m256i registers for an
// AVX2-width row transform.  Handles the up/down and left/right flips
// implied by the FLIPADST transform types.
static void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output,
                                       int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  __m128i buf0[16], buf1[16];
  __m256i buf2[8];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
  const int txw_idx = get_txw_idx(TX_8X16);
  const int txh_idx = get_txh_idx(TX_8X16);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 8;
  const int height = 16;
  const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
  const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  // ud_flip loads the rows bottom-to-top for vertical FLIPADST types.
  if (ud_flip) {
    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
  } else {
    load_buffer_16bit_to_16bit(input, stride, buf0, height);
  }
  round_shift_16bit(buf0, height, shift[0]);
  col_txfm(buf0, buf0, cos_bit_col);
  round_shift_16bit(buf0, height, shift[1]);
  // Transpose the two 8x8 halves so rows become contiguous.
  transpose_16bit_8x8(buf0, buf1);
  transpose_16bit_8x8(buf0 + 8, buf1 + 8);

  __m128i *bufl, *bufu;
  if (lr_flip) {
    // lr_flip reverses each row for horizontal FLIPADST types.
    bufl = buf0;
    bufu = buf0 + 8;
    flip_buf_sse2(buf1 + width * 0, bufl, width);
    flip_buf_sse2(buf1 + width * 1, bufu, width);
  } else {
    bufl = buf1 + width * 0;
    bufu = buf1 + width * 1;
  }
  // Pack both 8-wide halves into 16-lane registers for one AVX2 row pass.
  pack_reg(bufl, bufu, buf2);
  row_txfm(buf2, buf2, cos_bit_row);
  round_shift_16bit_w16_avx2(buf2, width, shift[2]);
  store_rect_buffer_16bit_to_32bit_w16_avx2(buf2, output, height, width);
}

static void lowbd_fwd_txfm2d_16x8_avx2(const
                                       int16_t *input, int32_t *output,
                                       int stride, TX_TYPE tx_type, int bd) {
  // Lowbd 2D forward transform of a 16x8 block: mirror image of the 8x16
  // path — the two 8-wide column halves are packed for an AVX2 column pass,
  // unpacked after the transpose, and the row pass runs at SSE2 width.
  (void)bd;
  __m128i buf0[16], buf1[16];
  __m256i buf2[8];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
  const int txw_idx = get_txw_idx(TX_16X8);
  const int txh_idx = get_txh_idx(TX_16X8);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 16;
  const int height = 8;
  const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type];
  const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type];
  __m128i *buf;
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Load the left and right 8-column halves, flipped vertically if needed.
  if (ud_flip) {
    load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height);
    load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], height);
  } else {
    load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height);
    load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height);
  }
  pack_reg(buf0, &buf0[8], buf2);
  round_shift_16bit_w16_avx2(buf2, height, shift[0]);
  col_txfm(buf2, buf2, cos_bit_col);
  round_shift_16bit_w16_avx2(buf2, height, shift[1]);
  transpose_16bit_16x8_avx2(buf2, buf2);
  extract_reg(buf2, buf1);

  if (lr_flip) {
    // Reverse each row for horizontal FLIPADST types.
    buf = buf0;
    flip_buf_sse2(buf1, buf, width);
  } else {
    buf = buf1;
  }
  row_txfm(buf, buf, cos_bit_row);
  round_shift_16bit(buf, width, shift[2]);
  store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width);
}

// Per-TX_SIZE dispatch table for the lowbd forward 2D transforms; mixes
// AVX2 implementations from this file with SSE2 fallbacks for small sizes.
static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
  av1_lowbd_fwd_txfm2d_4x4_sse2,   // 4x4 transform
  av1_lowbd_fwd_txfm2d_8x8_avx2,   // 8x8 transform
  lowbd_fwd_txfm2d_16x16_avx2,     // 16x16 transform
  lowbd_fwd_txfm2d_32x32_avx2,     // 32x32 transform
  lowbd_fwd_txfm2d_64x64_avx2,     // 64x64 transform
  av1_lowbd_fwd_txfm2d_4x8_sse2,   // 4x8 transform
2986 av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform 2987 lowbd_fwd_txfm2d_8x16_avx2, // 8x16 transform 2988 lowbd_fwd_txfm2d_16x8_avx2, // 16x8 transform 2989 lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform 2990 lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform 2991 lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform 2992 lowbd_fwd_txfm2d_64x32_avx2, // 64x32 transform 2993 av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform 2994 av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform 2995 av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform 2996 av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform 2997 lowbd_fwd_txfm2d_16x64_avx2, // 16x64 transform 2998 lowbd_fwd_txfm2d_64x16_avx2, // 64x16 transform 2999 }; 3000 3001 void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, 3002 int diff_stride, TxfmParam *txfm_param) { 3003 FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; 3004 if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) { 3005 av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); 3006 } else { 3007 fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, 3008 txfm_param->bd); 3009 } 3010 }