av1_fwd_txfm1d_sse4.c (52232B)
1 /* 2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include "av1/encoder/x86/av1_txfm1d_sse4.h" 13 14 void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit, 15 const int stride) { 16 __m128i buf0[32]; 17 __m128i buf1[32]; 18 const int32_t *cospi; 19 20 int startidx = 0 * stride; 21 int endidx = 31 * stride; 22 // stage 0 23 // stage 1 24 buf1[0] = _mm_add_epi32(input[startidx], input[endidx]); 25 buf1[31] = _mm_sub_epi32(input[startidx], input[endidx]); 26 startidx += stride; 27 endidx -= stride; 28 buf1[1] = _mm_add_epi32(input[startidx], input[endidx]); 29 buf1[30] = _mm_sub_epi32(input[startidx], input[endidx]); 30 startidx += stride; 31 endidx -= stride; 32 buf1[2] = _mm_add_epi32(input[startidx], input[endidx]); 33 buf1[29] = _mm_sub_epi32(input[startidx], input[endidx]); 34 startidx += stride; 35 endidx -= stride; 36 buf1[3] = _mm_add_epi32(input[startidx], input[endidx]); 37 buf1[28] = _mm_sub_epi32(input[startidx], input[endidx]); 38 startidx += stride; 39 endidx -= stride; 40 buf1[4] = _mm_add_epi32(input[startidx], input[endidx]); 41 buf1[27] = _mm_sub_epi32(input[startidx], input[endidx]); 42 startidx += stride; 43 endidx -= stride; 44 buf1[5] = _mm_add_epi32(input[startidx], input[endidx]); 45 buf1[26] = _mm_sub_epi32(input[startidx], input[endidx]); 46 startidx += stride; 47 endidx -= stride; 48 buf1[6] = _mm_add_epi32(input[startidx], input[endidx]); 49 buf1[25] = _mm_sub_epi32(input[startidx], input[endidx]); 50 startidx += stride; 51 endidx -= stride; 52 buf1[7] = _mm_add_epi32(input[startidx], input[endidx]); 53 buf1[24] = _mm_sub_epi32(input[startidx], input[endidx]); 54 startidx += stride; 55 endidx -= stride; 56 buf1[8] = _mm_add_epi32(input[startidx], input[endidx]); 57 buf1[23] = _mm_sub_epi32(input[startidx], input[endidx]); 58 startidx += stride; 59 endidx -= stride; 60 buf1[9] = _mm_add_epi32(input[startidx], input[endidx]); 61 buf1[22] = _mm_sub_epi32(input[startidx], input[endidx]); 62 startidx += stride; 63 endidx -= stride; 64 buf1[10] = _mm_add_epi32(input[startidx], input[endidx]); 65 buf1[21] = _mm_sub_epi32(input[startidx], input[endidx]); 66 startidx += stride; 67 endidx -= stride; 68 buf1[11] = _mm_add_epi32(input[startidx], input[endidx]); 69 buf1[20] = _mm_sub_epi32(input[startidx], input[endidx]); 70 startidx += stride; 71 endidx -= stride; 72 buf1[12] = _mm_add_epi32(input[startidx], input[endidx]); 73 buf1[19] = _mm_sub_epi32(input[startidx], input[endidx]); 74 startidx += stride; 75 endidx -= stride; 76 buf1[13] = _mm_add_epi32(input[startidx], input[endidx]); 77 buf1[18] = _mm_sub_epi32(input[startidx], input[endidx]); 78 startidx += stride; 79 endidx -= stride; 80 buf1[14] = _mm_add_epi32(input[startidx], input[endidx]); 81 buf1[17] = _mm_sub_epi32(input[startidx], input[endidx]); 82 startidx += stride; 83 endidx -= stride; 84 buf1[15] = _mm_add_epi32(input[startidx], input[endidx]); 85 buf1[16] = _mm_sub_epi32(input[startidx], input[endidx]); 86 87 // stage 2 88 cospi = cospi_arr(cos_bit); 89 buf0[0] = _mm_add_epi32(buf1[0], buf1[15]); 90 buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]); 91 buf0[1] = _mm_add_epi32(buf1[1], buf1[14]); 92 buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]); 93 buf0[2] = _mm_add_epi32(buf1[2], buf1[13]); 94 buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]); 95 buf0[3] = _mm_add_epi32(buf1[3], buf1[12]); 96 buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]); 97 buf0[4] = _mm_add_epi32(buf1[4], buf1[11]); 98 buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]); 99 buf0[5] = _mm_add_epi32(buf1[5], buf1[10]); 100 buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]); 101 buf0[6] = _mm_add_epi32(buf1[6], buf1[9]); 102 buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]); 103 buf0[7] = _mm_add_epi32(buf1[7], buf1[8]); 104 buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]); 105 buf0[16] = buf1[16]; 106 buf0[17] = buf1[17]; 107 buf0[18] = buf1[18]; 108 buf0[19] = buf1[19]; 109 btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], 110 buf0[27], cos_bit); 111 btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], 112 buf0[26], cos_bit); 113 btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], 114 buf0[25], cos_bit); 115 btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], 116 buf0[24], cos_bit); 117 buf0[28] = buf1[28]; 118 buf0[29] = buf1[29]; 119 buf0[30] = buf1[30]; 120 buf0[31] = buf1[31]; 121 122 // stage 3 123 cospi = cospi_arr(cos_bit); 124 buf1[0] = _mm_add_epi32(buf0[0], buf0[7]); 125 buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]); 126 buf1[1] = _mm_add_epi32(buf0[1], buf0[6]); 127 buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]); 128 buf1[2] = _mm_add_epi32(buf0[2], buf0[5]); 129 buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]); 130 buf1[3] = _mm_add_epi32(buf0[3], buf0[4]); 131 buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]); 132 buf1[8] = buf0[8]; 133 buf1[9] = buf0[9]; 134 btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], 135 buf1[13], cos_bit); 136 btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], 137 buf1[12], cos_bit); 138 buf1[14] = buf0[14]; 139 buf1[15] = buf0[15]; 140 buf1[16] = _mm_add_epi32(buf0[16], buf0[23]); 141 buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]); 142 buf1[17] = _mm_add_epi32(buf0[17], buf0[22]); 143 buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]); 144 buf1[18] = _mm_add_epi32(buf0[18], buf0[21]); 145 buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]); 146 buf1[19] = _mm_add_epi32(buf0[19], buf0[20]); 147 buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]); 148 buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]); 149 buf1[31] = _mm_add_epi32(buf0[31], buf0[24]); 150 buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]); 151 buf1[30] = _mm_add_epi32(buf0[30], buf0[25]); 152 buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]); 153 buf1[29] = _mm_add_epi32(buf0[29], buf0[26]); 154 buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]); 155 buf1[28] = _mm_add_epi32(buf0[28], buf0[27]); 156 157 // stage 4 158 cospi = cospi_arr(cos_bit); 159 buf0[0] = _mm_add_epi32(buf1[0], buf1[3]); 160 buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]); 161 buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); 162 buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]); 163 buf0[4] = buf1[4]; 164 btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], 165 cos_bit); 166 buf0[7] = buf1[7]; 167 buf0[8] = _mm_add_epi32(buf1[8], buf1[11]); 168 buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]); 169 buf0[9] = _mm_add_epi32(buf1[9], buf1[10]); 170 buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]); 171 buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]); 172 buf0[15] = _mm_add_epi32(buf1[15], buf1[12]); 173 buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]); 174 buf0[14] = _mm_add_epi32(buf1[14], buf1[13]); 175 buf0[16] = buf1[16]; 176 buf0[17] = buf1[17]; 177 btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], 178 buf0[29], cos_bit); 179 btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], 180 buf0[28], cos_bit); 181 btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], 182 buf0[27], cos_bit); 183 btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], 184 buf0[26], cos_bit); 185 buf0[22] = buf1[22]; 186 buf0[23] = buf1[23]; 187 buf0[24] = buf1[24]; 188 buf0[25] = buf1[25]; 189 buf0[30] = buf1[30]; 190 buf0[31] = buf1[31]; 191 192 // stage 5 193 cospi = cospi_arr(cos_bit); 194 btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], 195 cos_bit); 196 btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3], 197 cos_bit); 198 buf1[4] = _mm_add_epi32(buf0[4], buf0[5]); 199 buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]); 200 buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]); 201 buf1[7] = _mm_add_epi32(buf0[7], buf0[6]); 202 buf1[8] = buf0[8]; 203 btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], 204 buf1[14], cos_bit); 205 btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], 206 buf1[13], cos_bit); 207 buf1[11] = buf0[11]; 208 buf1[12] = buf0[12]; 209 buf1[15] = buf0[15]; 210 buf1[16] = _mm_add_epi32(buf0[16], buf0[19]); 211 buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]); 212 buf1[17] = _mm_add_epi32(buf0[17], buf0[18]); 213 buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]); 214 buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]); 215 buf1[23] = _mm_add_epi32(buf0[23], buf0[20]); 216 buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]); 217 buf1[22] = _mm_add_epi32(buf0[22], buf0[21]); 218 buf1[24] = _mm_add_epi32(buf0[24], buf0[27]); 219 buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]); 220 buf1[25] = _mm_add_epi32(buf0[25], buf0[26]); 221 buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]); 222 buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]); 223 buf1[31] = _mm_add_epi32(buf0[31], buf0[28]); 224 buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]); 225 buf1[30] = _mm_add_epi32(buf0[30], buf0[29]); 226 227 // stage 6 228 cospi = cospi_arr(cos_bit); 229 buf0[0] = buf1[0]; 230 buf0[1] = buf1[1]; 231 buf0[2] = buf1[2]; 232 buf0[3] = buf1[3]; 233 btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], 234 cos_bit); 235 btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6], 236 cos_bit); 237 buf0[8] = _mm_add_epi32(buf1[8], buf1[9]); 238 buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]); 239 buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]); 240 buf0[11] = _mm_add_epi32(buf1[11], buf1[10]); 241 buf0[12] = _mm_add_epi32(buf1[12], buf1[13]); 242 buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]); 243 buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]); 244 buf0[15] = _mm_add_epi32(buf1[15], buf1[14]); 245 buf0[16] = buf1[16]; 246 btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], 247 buf0[30], cos_bit); 248 btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], 249 buf0[29], cos_bit); 250 buf0[19] = buf1[19]; 251 buf0[20] = buf1[20]; 252 btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], 253 buf0[26], cos_bit); 254 btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], 255 buf0[25], cos_bit); 256 buf0[23] = buf1[23]; 257 buf0[24] = buf1[24]; 258 buf0[27] = buf1[27]; 259 buf0[28] = buf1[28]; 260 buf0[31] = buf1[31]; 261 262 // stage 7 263 cospi = cospi_arr(cos_bit); 264 buf1[0] = buf0[0]; 265 buf1[1] = buf0[1]; 266 buf1[2] = buf0[2]; 267 buf1[3] = buf0[3]; 268 buf1[4] = buf0[4]; 269 buf1[5] = buf0[5]; 270 buf1[6] = buf0[6]; 271 buf1[7] = buf0[7]; 272 btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15], 273 cos_bit); 274 btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], 275 buf1[14], cos_bit); 276 btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10], 277 buf1[13], cos_bit); 278 btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11], 279 buf1[12], cos_bit); 280 buf1[16] = _mm_add_epi32(buf0[16], buf0[17]); 281 buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]); 282 buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]); 283 buf1[19] = _mm_add_epi32(buf0[19], buf0[18]); 284 buf1[20] = _mm_add_epi32(buf0[20], buf0[21]); 285 buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]); 286 buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]); 287 buf1[23] = _mm_add_epi32(buf0[23], buf0[22]); 288 buf1[24] = _mm_add_epi32(buf0[24], buf0[25]); 289 buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]); 290 buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]); 291 buf1[27] = _mm_add_epi32(buf0[27], buf0[26]); 292 buf1[28] = _mm_add_epi32(buf0[28], buf0[29]); 293 buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]); 294 buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]); 295 buf1[31] = _mm_add_epi32(buf0[31], buf0[30]); 296 297 // stage 8 298 cospi = cospi_arr(cos_bit); 299 buf0[0] = buf1[0]; 300 buf0[1] = buf1[1]; 301 buf0[2] = buf1[2]; 302 buf0[3] = buf1[3]; 303 buf0[4] = buf1[4]; 304 buf0[5] = buf1[5]; 305 buf0[6] = buf1[6]; 306 buf0[7] = buf1[7]; 307 buf0[8] = buf1[8]; 308 buf0[9] = buf1[9]; 309 buf0[10] = buf1[10]; 310 buf0[11] = buf1[11]; 311 buf0[12] = buf1[12]; 312 buf0[13] = buf1[13]; 313 buf0[14] = buf1[14]; 314 buf0[15] = buf1[15]; 315 btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], 316 buf0[31], cos_bit); 317 btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17], 318 buf0[30], cos_bit); 319 btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18], 320 buf0[29], cos_bit); 321 btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19], 322 buf0[28], cos_bit); 323 btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20], 324 buf0[27], cos_bit); 325 btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21], 326 buf0[26], cos_bit); 327 btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22], 328 buf0[25], cos_bit); 329 btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], 330 buf0[24], cos_bit); 331 332 startidx = 0 * stride; 333 endidx = 31 * stride; 334 // stage 9 335 output[startidx] = buf0[0]; 336 output[endidx] = buf0[31]; 337 startidx += stride; 338 endidx -= stride; 339 output[startidx] = buf0[16]; 340 output[endidx] = buf0[15]; 341 startidx += stride; 342 endidx -= stride; 343 output[startidx] = buf0[8]; 344 output[endidx] = buf0[23]; 345 startidx += stride; 346 endidx -= stride; 347 output[startidx] = buf0[24]; 348 output[endidx] = buf0[7]; 349 startidx += stride; 350 endidx -= stride; 351 output[startidx] = buf0[4]; 352 output[endidx] = buf0[27]; 353 startidx += stride; 354 endidx -= stride; 355 output[startidx] = buf0[20]; 356 output[endidx] = buf0[11]; 357 startidx += stride; 358 endidx -= stride; 359 output[startidx] = buf0[12]; 360 output[endidx] = buf0[19]; 361 startidx += stride; 362 endidx -= stride; 363 output[startidx] = buf0[28]; 364 output[endidx] = buf0[3]; 365 startidx += stride; 366 endidx -= stride; 367 output[startidx] = buf0[2]; 368 output[endidx] = buf0[29]; 369 startidx += stride; 370 endidx -= stride; 371 output[startidx] = buf0[18]; 372 output[endidx] = buf0[13]; 373 startidx += stride; 374 endidx -= stride; 375 output[startidx] = buf0[10]; 376 output[endidx] = buf0[21]; 377 startidx += stride; 378 endidx -= stride; 379 output[startidx] = buf0[26]; 380 output[endidx] = buf0[5]; 381 startidx += stride; 382 endidx -= stride; 383 output[startidx] = buf0[6]; 384 output[endidx] = buf0[25]; 385 startidx += stride; 386 endidx -= stride; 387 output[startidx] = buf0[22]; 388 output[endidx] = buf0[9]; 389 startidx += stride; 390 endidx -= stride; 391 output[startidx] = buf0[14]; 392 output[endidx] = buf0[17]; 393 startidx += stride; 394 endidx -= stride; 395 output[startidx] = buf0[30]; 396 output[endidx] = buf0[1]; 397 } 398 399 void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit, 400 const int instride, const int outstride) { 401 const int32_t *cospi = cospi_arr(cos_bit); 402 const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); 403 404 __m128i cospi_m32 = _mm_set1_epi32(-cospi[32]); 405 __m128i cospi_p32 = _mm_set1_epi32(cospi[32]); 406 __m128i cospi_m16 = _mm_set1_epi32(-cospi[16]); 407 __m128i cospi_p48 = _mm_set1_epi32(cospi[48]); 408 __m128i cospi_m48 = _mm_set1_epi32(-cospi[48]); 409 __m128i cospi_p16 = _mm_set1_epi32(cospi[16]); 410 __m128i cospi_m08 = _mm_set1_epi32(-cospi[8]); 411 __m128i cospi_p56 = _mm_set1_epi32(cospi[56]); 412 __m128i cospi_m56 = _mm_set1_epi32(-cospi[56]); 413 __m128i cospi_m40 = _mm_set1_epi32(-cospi[40]); 414 __m128i cospi_p24 = _mm_set1_epi32(cospi[24]); 415 __m128i cospi_m24 = _mm_set1_epi32(-cospi[24]); 416 __m128i cospi_p08 = _mm_set1_epi32(cospi[8]); 417 __m128i cospi_p40 = _mm_set1_epi32(cospi[40]); 418 __m128i cospi_p60 = _mm_set1_epi32(cospi[60]); 419 __m128i cospi_p04 = _mm_set1_epi32(cospi[4]); 420 __m128i cospi_p28 = _mm_set1_epi32(cospi[28]); 421 __m128i cospi_p36 = _mm_set1_epi32(cospi[36]); 422 __m128i cospi_p44 = _mm_set1_epi32(cospi[44]); 423 __m128i cospi_p20 = _mm_set1_epi32(cospi[20]); 424 __m128i cospi_p12 = _mm_set1_epi32(cospi[12]); 425 __m128i cospi_p52 = _mm_set1_epi32(cospi[52]); 426 __m128i cospi_m04 = _mm_set1_epi32(-cospi[4]); 427 __m128i cospi_m60 = _mm_set1_epi32(-cospi[60]); 428 __m128i cospi_m36 = _mm_set1_epi32(-cospi[36]); 429 __m128i cospi_m28 = _mm_set1_epi32(-cospi[28]); 430 __m128i cospi_m20 = _mm_set1_epi32(-cospi[20]); 431 __m128i cospi_m44 = _mm_set1_epi32(-cospi[44]); 432 __m128i cospi_m52 = _mm_set1_epi32(-cospi[52]); 433 __m128i cospi_m12 = _mm_set1_epi32(-cospi[12]); 434 __m128i cospi_p62 = _mm_set1_epi32(cospi[62]); 435 __m128i cospi_p02 = _mm_set1_epi32(cospi[2]); 436 __m128i cospi_p30 = _mm_set1_epi32(cospi[30]); 437 __m128i cospi_p34 = _mm_set1_epi32(cospi[34]); 438 __m128i cospi_p46 = _mm_set1_epi32(cospi[46]); 439 __m128i cospi_p18 = _mm_set1_epi32(cospi[18]); 440 __m128i cospi_p14 = _mm_set1_epi32(cospi[14]); 441 __m128i cospi_p50 = _mm_set1_epi32(cospi[50]); 442 __m128i cospi_p54 = _mm_set1_epi32(cospi[54]); 443 __m128i cospi_p10 = _mm_set1_epi32(cospi[10]); 444 __m128i cospi_p22 = _mm_set1_epi32(cospi[22]); 445 __m128i cospi_p42 = _mm_set1_epi32(cospi[42]); 446 __m128i cospi_p38 = _mm_set1_epi32(cospi[38]); 447 __m128i cospi_p26 = _mm_set1_epi32(cospi[26]); 448 __m128i cospi_p06 = _mm_set1_epi32(cospi[6]); 449 __m128i cospi_p58 = _mm_set1_epi32(cospi[58]); 450 __m128i cospi_p63 = _mm_set1_epi32(cospi[63]); 451 __m128i cospi_p01 = _mm_set1_epi32(cospi[1]); 452 __m128i cospi_p31 = _mm_set1_epi32(cospi[31]); 453 __m128i cospi_p33 = _mm_set1_epi32(cospi[33]); 454 __m128i cospi_p47 = _mm_set1_epi32(cospi[47]); 455 __m128i cospi_p17 = _mm_set1_epi32(cospi[17]); 456 __m128i cospi_p15 = _mm_set1_epi32(cospi[15]); 457 __m128i cospi_p49 = _mm_set1_epi32(cospi[49]); 458 __m128i cospi_p55 = _mm_set1_epi32(cospi[55]); 459 __m128i cospi_p09 = _mm_set1_epi32(cospi[9]); 460 __m128i cospi_p23 = _mm_set1_epi32(cospi[23]); 461 __m128i cospi_p41 = _mm_set1_epi32(cospi[41]); 462 __m128i cospi_p39 = _mm_set1_epi32(cospi[39]); 463 __m128i cospi_p25 = _mm_set1_epi32(cospi[25]); 464 __m128i cospi_p07 = _mm_set1_epi32(cospi[7]); 465 __m128i cospi_p57 = _mm_set1_epi32(cospi[57]); 466 __m128i cospi_p59 = _mm_set1_epi32(cospi[59]); 467 __m128i cospi_p05 = _mm_set1_epi32(cospi[5]); 468 __m128i cospi_p27 = _mm_set1_epi32(cospi[27]); 469 __m128i cospi_p37 = _mm_set1_epi32(cospi[37]); 470 __m128i cospi_p43 = _mm_set1_epi32(cospi[43]); 471 __m128i cospi_p21 = _mm_set1_epi32(cospi[21]); 472 __m128i cospi_p11 = _mm_set1_epi32(cospi[11]); 473 __m128i cospi_p53 = _mm_set1_epi32(cospi[53]); 474 __m128i cospi_p51 = _mm_set1_epi32(cospi[51]); 475 __m128i cospi_p13 = _mm_set1_epi32(cospi[13]); 476 __m128i cospi_p19 = _mm_set1_epi32(cospi[19]); 477 __m128i cospi_p45 = _mm_set1_epi32(cospi[45]); 478 __m128i cospi_p35 = _mm_set1_epi32(cospi[35]); 479 __m128i cospi_p29 = _mm_set1_epi32(cospi[29]); 480 __m128i cospi_p03 = _mm_set1_epi32(cospi[3]); 481 __m128i cospi_p61 = _mm_set1_epi32(cospi[61]); 482 483 int startidx = 0 * instride; 484 int endidx = 63 * instride; 485 // stage 1 486 __m128i x1[64]; 487 x1[0] = _mm_add_epi32(input[startidx], input[endidx]); 488 x1[63] = _mm_sub_epi32(input[startidx], input[endidx]); 489 startidx += instride; 490 endidx -= instride; 491 x1[1] = _mm_add_epi32(input[startidx], input[endidx]); 492 x1[62] = _mm_sub_epi32(input[startidx], input[endidx]); 493 startidx += instride; 494 endidx -= instride; 495 x1[2] = _mm_add_epi32(input[startidx], input[endidx]); 496 x1[61] = _mm_sub_epi32(input[startidx], input[endidx]); 497 startidx += instride; 498 endidx -= instride; 499 x1[3] = _mm_add_epi32(input[startidx], input[endidx]); 500 x1[60] = _mm_sub_epi32(input[startidx], input[endidx]); 501 startidx += instride; 502 endidx -= instride; 503 x1[4] = _mm_add_epi32(input[startidx], input[endidx]); 504 x1[59] = _mm_sub_epi32(input[startidx], input[endidx]); 505 startidx += instride; 506 endidx -= instride; 507 x1[5] = _mm_add_epi32(input[startidx], input[endidx]); 508 x1[58] = _mm_sub_epi32(input[startidx], input[endidx]); 509 startidx += instride; 510 endidx -= instride; 511 x1[6] = _mm_add_epi32(input[startidx], input[endidx]); 512 x1[57] = _mm_sub_epi32(input[startidx], input[endidx]); 513 startidx += instride; 514 endidx -= instride; 515 x1[7] = _mm_add_epi32(input[startidx], input[endidx]); 516 x1[56] = _mm_sub_epi32(input[startidx], input[endidx]); 517 startidx += instride; 518 endidx -= instride; 519 x1[8] = _mm_add_epi32(input[startidx], input[endidx]); 520 x1[55] = _mm_sub_epi32(input[startidx], input[endidx]); 521 startidx += instride; 522 endidx -= instride; 523 x1[9] = _mm_add_epi32(input[startidx], input[endidx]); 524 x1[54] = _mm_sub_epi32(input[startidx], input[endidx]); 525 startidx += instride; 526 endidx -= instride; 527 x1[10] = _mm_add_epi32(input[startidx], input[endidx]); 528 x1[53] = _mm_sub_epi32(input[startidx], input[endidx]); 529 startidx += instride; 530 endidx -= instride; 531 x1[11] = _mm_add_epi32(input[startidx], input[endidx]); 532 x1[52] = _mm_sub_epi32(input[startidx], input[endidx]); 533 startidx += instride; 534 endidx -= instride; 535 x1[12] = _mm_add_epi32(input[startidx], input[endidx]); 536 x1[51] = _mm_sub_epi32(input[startidx], input[endidx]); 537 startidx += instride; 538 endidx -= instride; 539 x1[13] = _mm_add_epi32(input[startidx], input[endidx]); 540 x1[50] = _mm_sub_epi32(input[startidx], input[endidx]); 541 startidx += instride; 542 endidx -= instride; 543 x1[14] = _mm_add_epi32(input[startidx], input[endidx]); 544 x1[49] = _mm_sub_epi32(input[startidx], input[endidx]); 545 startidx += instride; 546 endidx -= instride; 547 x1[15] = _mm_add_epi32(input[startidx], input[endidx]); 548 x1[48] = _mm_sub_epi32(input[startidx], input[endidx]); 549 startidx += instride; 550 endidx -= instride; 551 x1[16] = _mm_add_epi32(input[startidx], input[endidx]); 552 x1[47] = _mm_sub_epi32(input[startidx], input[endidx]); 553 startidx += instride; 554 endidx -= instride; 555 x1[17] = _mm_add_epi32(input[startidx], input[endidx]); 556 x1[46] = _mm_sub_epi32(input[startidx], input[endidx]); 557 startidx += instride; 558 endidx -= instride; 559 x1[18] = _mm_add_epi32(input[startidx], input[endidx]); 560 x1[45] = _mm_sub_epi32(input[startidx], input[endidx]); 561 startidx += instride; 562 endidx -= instride; 563 x1[19] = _mm_add_epi32(input[startidx], input[endidx]); 564 x1[44] = _mm_sub_epi32(input[startidx], input[endidx]); 565 startidx += instride; 566 endidx -= instride; 567 x1[20] = _mm_add_epi32(input[startidx], input[endidx]); 568 x1[43] = _mm_sub_epi32(input[startidx], input[endidx]); 569 startidx += instride; 570 endidx -= instride; 571 x1[21] = _mm_add_epi32(input[startidx], input[endidx]); 572 x1[42] = _mm_sub_epi32(input[startidx], input[endidx]); 573 startidx += instride; 574 endidx -= instride; 575 x1[22] = _mm_add_epi32(input[startidx], input[endidx]); 576 x1[41] = _mm_sub_epi32(input[startidx], input[endidx]); 577 startidx += instride; 578 endidx -= instride; 579 x1[23] = _mm_add_epi32(input[startidx], input[endidx]); 580 x1[40] = _mm_sub_epi32(input[startidx], input[endidx]); 581 startidx += instride; 582 endidx -= instride; 583 x1[24] = _mm_add_epi32(input[startidx], input[endidx]); 584 x1[39] = _mm_sub_epi32(input[startidx], input[endidx]); 585 startidx += instride; 586 endidx -= instride; 587 x1[25] = _mm_add_epi32(input[startidx], input[endidx]); 588 x1[38] = _mm_sub_epi32(input[startidx], input[endidx]); 589 startidx += instride; 590 endidx -= instride; 591 x1[26] = _mm_add_epi32(input[startidx], input[endidx]); 592 x1[37] = _mm_sub_epi32(input[startidx], input[endidx]); 593 startidx += instride; 594 endidx -= instride; 595 x1[27] = _mm_add_epi32(input[startidx], input[endidx]); 596 x1[36] = _mm_sub_epi32(input[startidx], input[endidx]); 597 startidx += instride; 598 endidx -= instride; 599 x1[28] = _mm_add_epi32(input[startidx], input[endidx]); 600 x1[35] = _mm_sub_epi32(input[startidx], input[endidx]); 601 startidx += instride; 602 endidx -= instride; 603 x1[29] = _mm_add_epi32(input[startidx], input[endidx]); 604 x1[34] = _mm_sub_epi32(input[startidx], input[endidx]); 605 startidx += instride; 606 endidx -= instride; 607 x1[30] = _mm_add_epi32(input[startidx], input[endidx]); 608 x1[33] = _mm_sub_epi32(input[startidx], input[endidx]); 609 startidx += instride; 610 endidx -= instride; 611 x1[31] = _mm_add_epi32(input[startidx], input[endidx]); 612 x1[32] = _mm_sub_epi32(input[startidx], input[endidx]); 613 614 // stage 2 615 __m128i x2[64]; 616 x2[0] = _mm_add_epi32(x1[0], x1[31]); 617 x2[31] = _mm_sub_epi32(x1[0], x1[31]); 618 x2[1] = _mm_add_epi32(x1[1], x1[30]); 619 x2[30] = _mm_sub_epi32(x1[1], x1[30]); 620 x2[2] = _mm_add_epi32(x1[2], x1[29]); 621 x2[29] = _mm_sub_epi32(x1[2], x1[29]); 622 x2[3] = _mm_add_epi32(x1[3], x1[28]); 623 x2[28] = _mm_sub_epi32(x1[3], x1[28]); 624 x2[4] = _mm_add_epi32(x1[4], x1[27]); 625 x2[27] = _mm_sub_epi32(x1[4], x1[27]); 626 x2[5] = _mm_add_epi32(x1[5], x1[26]); 627 x2[26] = _mm_sub_epi32(x1[5], x1[26]); 628 x2[6] = _mm_add_epi32(x1[6], x1[25]); 629 x2[25] = _mm_sub_epi32(x1[6], x1[25]); 630 x2[7] = _mm_add_epi32(x1[7], x1[24]); 631 x2[24] = _mm_sub_epi32(x1[7], x1[24]); 632 x2[8] = _mm_add_epi32(x1[8], x1[23]); 633 x2[23] = _mm_sub_epi32(x1[8], x1[23]); 634 x2[9] = _mm_add_epi32(x1[9], x1[22]); 635 x2[22] = _mm_sub_epi32(x1[9], x1[22]); 636 x2[10] = _mm_add_epi32(x1[10], x1[21]); 637 x2[21] = _mm_sub_epi32(x1[10], x1[21]); 638 x2[11] = _mm_add_epi32(x1[11], x1[20]); 639 x2[20] = _mm_sub_epi32(x1[11], x1[20]); 640 x2[12] = _mm_add_epi32(x1[12], x1[19]); 641 x2[19] = _mm_sub_epi32(x1[12], x1[19]); 642 x2[13] = _mm_add_epi32(x1[13], x1[18]); 643 x2[18] = _mm_sub_epi32(x1[13], x1[18]); 644 x2[14] = _mm_add_epi32(x1[14], x1[17]); 645 x2[17] = _mm_sub_epi32(x1[14], x1[17]); 646 x2[15] = _mm_add_epi32(x1[15], x1[16]); 647 x2[16] = _mm_sub_epi32(x1[15], x1[16]); 648 x2[32] = x1[32]; 649 x2[33] = x1[33]; 650 x2[34] = x1[34]; 651 x2[35] = x1[35]; 652 x2[36] = x1[36]; 653 x2[37] = x1[37]; 654 x2[38] = x1[38]; 655 x2[39] = x1[39]; 656 btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55], 657 __rounding, cos_bit); 658 btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54], 659 __rounding, cos_bit); 660 btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53], 661 __rounding, cos_bit); 662 btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52], 663 __rounding, cos_bit); 664 btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51], 665 __rounding, cos_bit); 666 btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50], 667 __rounding, cos_bit); 668 btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49], 669 __rounding, cos_bit); 670 btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48], 671 __rounding, cos_bit); 672 x2[56] = x1[56]; 673 x2[57] = x1[57]; 674 x2[58] = x1[58]; 675 x2[59] = x1[59]; 676 x2[60] = x1[60]; 677 x2[61] = x1[61]; 678 x2[62] = x1[62]; 679 x2[63] = x1[63]; 680 681 // stage 3 682 __m128i x3[64]; 683 x3[0] = _mm_add_epi32(x2[0], x2[15]); 684 x3[15] = _mm_sub_epi32(x2[0], x2[15]); 685 x3[1] = _mm_add_epi32(x2[1], x2[14]); 686 x3[14] = _mm_sub_epi32(x2[1], x2[14]); 687 x3[2] = _mm_add_epi32(x2[2], x2[13]); 688 x3[13] = _mm_sub_epi32(x2[2], x2[13]); 689 x3[3] = _mm_add_epi32(x2[3], x2[12]); 690 x3[12] = _mm_sub_epi32(x2[3], x2[12]); 691 x3[4] = _mm_add_epi32(x2[4], x2[11]); 692 x3[11] = _mm_sub_epi32(x2[4], x2[11]); 693 x3[5] = _mm_add_epi32(x2[5], x2[10]); 694 x3[10] = _mm_sub_epi32(x2[5], x2[10]); 695 x3[6] = _mm_add_epi32(x2[6], x2[9]); 696 x3[9] = _mm_sub_epi32(x2[6], x2[9]); 697 x3[7] = _mm_add_epi32(x2[7], x2[8]); 698 x3[8] = _mm_sub_epi32(x2[7], x2[8]); 699 x3[16] = x2[16]; 700 x3[17] = x2[17]; 701 x3[18] = x2[18]; 702 x3[19] = x2[19]; 703 btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27], 704 __rounding, cos_bit); 705 btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26], 706 __rounding, cos_bit); 707 btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25], 708 __rounding, cos_bit); 709 btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24], 710 __rounding, cos_bit); 711 x3[28] = x2[28]; 712 x3[29] = x2[29]; 713 x3[30] = x2[30]; 714 x3[31] = x2[31]; 715 x3[32] = _mm_add_epi32(x2[32], x2[47]); 716 x3[47] = _mm_sub_epi32(x2[32], x2[47]); 717 x3[33] = _mm_add_epi32(x2[33], x2[46]); 718 x3[46] = _mm_sub_epi32(x2[33], x2[46]); 719 x3[34] = _mm_add_epi32(x2[34], x2[45]); 720 x3[45] = _mm_sub_epi32(x2[34], x2[45]); 721 x3[35] = _mm_add_epi32(x2[35], x2[44]); 722 x3[44] = _mm_sub_epi32(x2[35], x2[44]); 723 x3[36] = _mm_add_epi32(x2[36], x2[43]); 724 x3[43] = _mm_sub_epi32(x2[36], x2[43]); 725 x3[37] = _mm_add_epi32(x2[37], x2[42]); 726 x3[42] = _mm_sub_epi32(x2[37], x2[42]); 727 x3[38] = _mm_add_epi32(x2[38], x2[41]); 728 x3[41] = _mm_sub_epi32(x2[38], x2[41]); 729 x3[39] = _mm_add_epi32(x2[39], x2[40]); 730 x3[40] = _mm_sub_epi32(x2[39], x2[40]); 731 x3[48] = _mm_sub_epi32(x2[63], x2[48]); 732 x3[63] = _mm_add_epi32(x2[63], x2[48]); 733 x3[49] = _mm_sub_epi32(x2[62], x2[49]); 734 x3[62] = _mm_add_epi32(x2[62], x2[49]); 735 x3[50] = _mm_sub_epi32(x2[61], x2[50]); 736 x3[61] = _mm_add_epi32(x2[61], x2[50]); 737 x3[51] = _mm_sub_epi32(x2[60], x2[51]); 738 x3[60] = _mm_add_epi32(x2[60], x2[51]); 739 x3[52] = _mm_sub_epi32(x2[59], x2[52]); 740 x3[59] = _mm_add_epi32(x2[59], x2[52]); 741 x3[53] = _mm_sub_epi32(x2[58], x2[53]); 742 x3[58] = _mm_add_epi32(x2[58], x2[53]); 743 x3[54] = _mm_sub_epi32(x2[57], x2[54]); 744 x3[57] = _mm_add_epi32(x2[57], x2[54]); 745 x3[55] = _mm_sub_epi32(x2[56], x2[55]); 746 x3[56] = _mm_add_epi32(x2[56], x2[55]); 747 748 // stage 4 749 __m128i x4[64]; 750 x4[0] = _mm_add_epi32(x3[0], x3[7]); 751 x4[7] = _mm_sub_epi32(x3[0], x3[7]); 752 x4[1] = _mm_add_epi32(x3[1], x3[6]); 753 x4[6] = _mm_sub_epi32(x3[1], x3[6]); 754 x4[2] = _mm_add_epi32(x3[2], x3[5]); 755 x4[5] = _mm_sub_epi32(x3[2], x3[5]); 756 x4[3] = _mm_add_epi32(x3[3], x3[4]); 757 x4[4] = _mm_sub_epi32(x3[3], x3[4]); 758 x4[8] = x3[8]; 759 x4[9] = x3[9]; 760 btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13], 761 __rounding, cos_bit); 762 btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12], 763 __rounding, cos_bit); 764 x4[14] = x3[14]; 765 x4[15] = x3[15]; 766 x4[16] = _mm_add_epi32(x3[16], x3[23]); 767 x4[23] = _mm_sub_epi32(x3[16], x3[23]); 768 x4[17] = _mm_add_epi32(x3[17], x3[22]); 769 x4[22] = _mm_sub_epi32(x3[17], x3[22]); 770 x4[18] = _mm_add_epi32(x3[18], x3[21]); 771 x4[21] = _mm_sub_epi32(x3[18], x3[21]); 772 x4[19] = _mm_add_epi32(x3[19], x3[20]); 773 x4[20] = _mm_sub_epi32(x3[19], x3[20]); 774 x4[24] = _mm_sub_epi32(x3[31], x3[24]); 775 x4[31] = _mm_add_epi32(x3[31], x3[24]); 776 x4[25] = _mm_sub_epi32(x3[30], x3[25]); 777 x4[30] = _mm_add_epi32(x3[30], x3[25]); 778 x4[26] = _mm_sub_epi32(x3[29], x3[26]); 779 x4[29] = _mm_add_epi32(x3[29], x3[26]); 780 x4[27] = _mm_sub_epi32(x3[28], x3[27]); 781 x4[28] = _mm_add_epi32(x3[28], x3[27]); 782 x4[32] = x3[32]; 783 x4[33] = x3[33]; 784 x4[34] = x3[34]; 785 x4[35] = x3[35]; 786 btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59], 787 __rounding, cos_bit); 788 btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58], 789 __rounding, cos_bit); 790 btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57], 791 __rounding, cos_bit); 792 btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56], 793 __rounding, cos_bit); 794 btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55], 795 __rounding, cos_bit); 796 btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54], 797 __rounding, cos_bit); 798 btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53], 799 __rounding, cos_bit); 800 btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52], 801 __rounding, cos_bit); 802 x4[44] = x3[44]; 803 x4[45] = x3[45]; 804 x4[46] = x3[46]; 805 x4[47] = x3[47]; 806 x4[48] = x3[48]; 807 x4[49] = x3[49]; 808 x4[50] = x3[50]; 809 x4[51] = x3[51]; 810 x4[60] = x3[60]; 811 x4[61] = x3[61]; 812 x4[62] = x3[62]; 813 x4[63] = x3[63]; 814 815 // stage 5 816 __m128i x5[64]; 817 x5[0] = _mm_add_epi32(x4[0], x4[3]); 818 x5[3] = _mm_sub_epi32(x4[0], x4[3]); 819 x5[1] = _mm_add_epi32(x4[1], x4[2]); 820 x5[2] = _mm_sub_epi32(x4[1], x4[2]); 821 x5[4] = x4[4]; 822 btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6], 823 __rounding, cos_bit); 824 x5[7] = x4[7]; 825 x5[8] = _mm_add_epi32(x4[8], x4[11]); 826 x5[11] = _mm_sub_epi32(x4[8], x4[11]); 827 x5[9] = _mm_add_epi32(x4[9], x4[10]); 828 x5[10] = _mm_sub_epi32(x4[9], x4[10]); 829 x5[12] = _mm_sub_epi32(x4[15], x4[12]); 830 x5[15] = _mm_add_epi32(x4[15], x4[12]); 831 x5[13] = _mm_sub_epi32(x4[14], x4[13]); 832 x5[14] = _mm_add_epi32(x4[14], x4[13]); 833 x5[16] = x4[16]; 834 x5[17] = x4[17]; 835 btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29], 836 __rounding, cos_bit); 837 btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28], 838 __rounding, cos_bit); 839 btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27], 840 __rounding, cos_bit); 841 btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26], 842 __rounding, cos_bit); 843 x5[22] = x4[22]; 844 x5[23] = x4[23]; 845 x5[24] = x4[24]; 846 x5[25] = x4[25]; 847 x5[30] = x4[30]; 848 x5[31] = x4[31]; 849 x5[32] = _mm_add_epi32(x4[32], x4[39]); 850 x5[39] = _mm_sub_epi32(x4[32], x4[39]); 851 x5[33] = _mm_add_epi32(x4[33], x4[38]); 852 x5[38] = _mm_sub_epi32(x4[33], x4[38]); 853 x5[34] = _mm_add_epi32(x4[34], x4[37]); 854 x5[37] = _mm_sub_epi32(x4[34], x4[37]); 855 x5[35] = _mm_add_epi32(x4[35], x4[36]); 856 x5[36] = _mm_sub_epi32(x4[35], x4[36]); 857 x5[40] = _mm_sub_epi32(x4[47], x4[40]); 858 x5[47] = _mm_add_epi32(x4[47], x4[40]); 859 x5[41] = _mm_sub_epi32(x4[46], x4[41]); 860 x5[46] = _mm_add_epi32(x4[46], x4[41]); 861 x5[42] = _mm_sub_epi32(x4[45], x4[42]); 862 x5[45] = _mm_add_epi32(x4[45], x4[42]); 863 x5[43] = _mm_sub_epi32(x4[44], x4[43]); 864 x5[44] = _mm_add_epi32(x4[44], x4[43]); 865 x5[48] = _mm_add_epi32(x4[48], x4[55]); 866 x5[55] = _mm_sub_epi32(x4[48], x4[55]); 867 x5[49] = _mm_add_epi32(x4[49], x4[54]); 868 x5[54] = _mm_sub_epi32(x4[49], x4[54]); 869 x5[50] = _mm_add_epi32(x4[50], x4[53]); 870 x5[53] = _mm_sub_epi32(x4[50], x4[53]); 871 x5[51] = _mm_add_epi32(x4[51], x4[52]); 872 x5[52] = _mm_sub_epi32(x4[51], x4[52]); 873 x5[56] = _mm_sub_epi32(x4[63], x4[56]); 874 x5[63] = _mm_add_epi32(x4[63], x4[56]); 875 x5[57] = _mm_sub_epi32(x4[62], x4[57]); 876 x5[62] = _mm_add_epi32(x4[62], x4[57]); 877 x5[58] = _mm_sub_epi32(x4[61], x4[58]); 878 x5[61] = _mm_add_epi32(x4[61], x4[58]); 879 x5[59] = _mm_sub_epi32(x4[60], x4[59]); 880 x5[60] = _mm_add_epi32(x4[60], x4[59]); 881 882 // stage 6 883 __m128i x6[64]; 884 btf_32_type0_sse4_1_new(cospi_p32, cospi_p32, x5[0], x5[1], x6[0], x6[1], 885 __rounding, cos_bit); 886 btf_32_type1_sse4_1_new(cospi_p48, cospi_p16, x5[2], x5[3], x6[2], x6[3], 887 __rounding, cos_bit); 888 x6[4] = _mm_add_epi32(x5[4], x5[5]); 889 x6[5] = _mm_sub_epi32(x5[4], x5[5]); 890 x6[6] = _mm_sub_epi32(x5[7], x5[6]); 891 x6[7] = _mm_add_epi32(x5[7], x5[6]); 892 x6[8] = x5[8]; 893 btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14], 894 __rounding, cos_bit); 895 btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13], 896 __rounding, cos_bit); 897 x6[11] = x5[11]; 898 x6[12] = x5[12]; 899 x6[15] = x5[15]; 900 x6[16] = _mm_add_epi32(x5[16], x5[19]); 901 x6[19] = _mm_sub_epi32(x5[16], x5[19]); 902 x6[17] = _mm_add_epi32(x5[17], x5[18]); 903 x6[18] = _mm_sub_epi32(x5[17], x5[18]); 904 x6[20] = _mm_sub_epi32(x5[23], x5[20]); 905 x6[23] = _mm_add_epi32(x5[23], x5[20]); 906 x6[21] = _mm_sub_epi32(x5[22], x5[21]); 907 x6[22] = _mm_add_epi32(x5[22], x5[21]); 908 x6[24] = _mm_add_epi32(x5[24], x5[27]); 909 x6[27] = _mm_sub_epi32(x5[24], x5[27]); 910 x6[25] = _mm_add_epi32(x5[25], x5[26]); 911 x6[26] = _mm_sub_epi32(x5[25], x5[26]); 912 x6[28] = _mm_sub_epi32(x5[31], x5[28]); 913 x6[31] = _mm_add_epi32(x5[31], x5[28]); 914 x6[29] = _mm_sub_epi32(x5[30], x5[29]); 915 x6[30] = _mm_add_epi32(x5[30], x5[29]); 916 x6[32] = x5[32]; 917 x6[33] = x5[33]; 918 btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61], 919 __rounding, cos_bit); 920 btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60], 921 __rounding, cos_bit); 922 btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59], 923 __rounding, cos_bit); 924 btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58], 925 __rounding, cos_bit); 926 x6[38] = x5[38]; 927 x6[39] = x5[39]; 928 x6[40] = x5[40]; 929 x6[41] = x5[41]; 930 btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53], 931 __rounding, cos_bit); 932 btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52], 933 __rounding, cos_bit); 934 btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51], 935 __rounding, cos_bit); 936 btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50], 937 __rounding, cos_bit); 938 x6[46] = x5[46]; 939 x6[47] = x5[47]; 940 x6[48] = x5[48]; 941 x6[49] = x5[49]; 942 x6[54] = x5[54]; 943 x6[55] = x5[55]; 944 x6[56] = x5[56]; 945 x6[57] = x5[57]; 946 x6[62] = x5[62]; 947 x6[63] = x5[63]; 948 949 // stage 7 950 __m128i x7[64]; 951 x7[0] = x6[0]; 952 x7[1] = x6[1]; 953 x7[2] = x6[2]; 954 x7[3] = x6[3]; 955 btf_32_type1_sse4_1_new(cospi_p56, cospi_p08, x6[4], x6[7], x7[4], x7[7], 956 __rounding, cos_bit); 957 btf_32_type1_sse4_1_new(cospi_p24, cospi_p40, x6[5], x6[6], x7[5], x7[6], 958 __rounding, cos_bit); 959 x7[8] = _mm_add_epi32(x6[8], x6[9]); 960 x7[9] = _mm_sub_epi32(x6[8], x6[9]); 961 x7[10] = _mm_sub_epi32(x6[11], x6[10]); 962 x7[11] = _mm_add_epi32(x6[11], x6[10]); 963 x7[12] = _mm_add_epi32(x6[12], x6[13]); 964 x7[13] = _mm_sub_epi32(x6[12], x6[13]); 965 x7[14] = _mm_sub_epi32(x6[15], x6[14]); 966 x7[15] = _mm_add_epi32(x6[15], x6[14]); 967 x7[16] = x6[16]; 968 btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30], 969 __rounding, cos_bit); 970 btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29], 971 __rounding, cos_bit); 972 x7[19] = x6[19]; 973 x7[20] = x6[20]; 974 btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26], 975 __rounding, cos_bit); 976 btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25], 977 __rounding, cos_bit); 978 x7[23] = x6[23]; 979 x7[24] = x6[24]; 980 x7[27] = x6[27]; 981 x7[28] = x6[28]; 982 x7[31] = x6[31]; 983 x7[32] = _mm_add_epi32(x6[32], x6[35]); 984 x7[35] = _mm_sub_epi32(x6[32], x6[35]); 985 x7[33] = _mm_add_epi32(x6[33], x6[34]); 986 x7[34] = _mm_sub_epi32(x6[33], x6[34]); 987 x7[36] = _mm_sub_epi32(x6[39], x6[36]); 988 x7[39] = _mm_add_epi32(x6[39], x6[36]); 989 x7[37] = _mm_sub_epi32(x6[38], x6[37]); 990 x7[38] = _mm_add_epi32(x6[38], x6[37]); 991 x7[40] = _mm_add_epi32(x6[40], x6[43]); 992 x7[43] = _mm_sub_epi32(x6[40], x6[43]); 993 x7[41] = _mm_add_epi32(x6[41], x6[42]); 994 x7[42] = _mm_sub_epi32(x6[41], x6[42]); 995 x7[44] = _mm_sub_epi32(x6[47], x6[44]); 996 x7[47] = _mm_add_epi32(x6[47], x6[44]); 997 x7[45] = _mm_sub_epi32(x6[46], x6[45]); 998 x7[46] = _mm_add_epi32(x6[46], x6[45]); 999 x7[48] = _mm_add_epi32(x6[48], x6[51]); 1000 x7[51] = _mm_sub_epi32(x6[48], x6[51]); 1001 x7[49] = _mm_add_epi32(x6[49], x6[50]); 1002 x7[50] = _mm_sub_epi32(x6[49], x6[50]); 1003 x7[52] = _mm_sub_epi32(x6[55], x6[52]); 1004 x7[55] = _mm_add_epi32(x6[55], x6[52]); 1005 x7[53] = _mm_sub_epi32(x6[54], x6[53]); 1006 x7[54] = _mm_add_epi32(x6[54], x6[53]); 1007 x7[56] = _mm_add_epi32(x6[56], x6[59]); 1008 x7[59] = _mm_sub_epi32(x6[56], x6[59]); 1009 x7[57] = _mm_add_epi32(x6[57], x6[58]); 1010 x7[58] = _mm_sub_epi32(x6[57], x6[58]); 1011 x7[60] = _mm_sub_epi32(x6[63], x6[60]); 1012 x7[63] = _mm_add_epi32(x6[63], x6[60]); 1013 x7[61] = _mm_sub_epi32(x6[62], x6[61]); 1014 x7[62] = _mm_add_epi32(x6[62], x6[61]); 1015 1016 // stage 8 1017 __m128i x8[64]; 1018 x8[0] = x7[0]; 1019 x8[1] = x7[1]; 1020 x8[2] = x7[2]; 1021 x8[3] = x7[3]; 1022 x8[4] = x7[4]; 1023 x8[5] = x7[5]; 1024 x8[6] = x7[6]; 1025 x8[7] = x7[7]; 1026 btf_32_type1_sse4_1_new(cospi_p60, cospi_p04, x7[8], x7[15], x8[8], x8[15], 1027 __rounding, cos_bit); 1028 btf_32_type1_sse4_1_new(cospi_p28, cospi_p36, x7[9], x7[14], x8[9], x8[14], 1029 __rounding, cos_bit); 1030 btf_32_type1_sse4_1_new(cospi_p44, cospi_p20, x7[10], x7[13], x8[10], x8[13], 1031 __rounding, cos_bit); 1032 btf_32_type1_sse4_1_new(cospi_p12, cospi_p52, x7[11], x7[12], x8[11], x8[12], 1033 __rounding, cos_bit); 1034 x8[16] = _mm_add_epi32(x7[16], x7[17]); 1035 x8[17] = _mm_sub_epi32(x7[16], x7[17]); 1036 x8[18] = _mm_sub_epi32(x7[19], x7[18]); 1037 x8[19] = _mm_add_epi32(x7[19], x7[18]); 1038 x8[20] = _mm_add_epi32(x7[20], x7[21]); 1039 x8[21] = _mm_sub_epi32(x7[20], x7[21]); 1040 x8[22] = _mm_sub_epi32(x7[23], x7[22]); 1041 x8[23] = _mm_add_epi32(x7[23], x7[22]); 1042 x8[24] = _mm_add_epi32(x7[24], x7[25]); 1043 x8[25] = _mm_sub_epi32(x7[24], x7[25]); 1044 x8[26] = _mm_sub_epi32(x7[27], x7[26]); 1045 x8[27] = _mm_add_epi32(x7[27], x7[26]); 1046 x8[28] = _mm_add_epi32(x7[28], x7[29]); 1047 x8[29] = _mm_sub_epi32(x7[28], x7[29]); 1048 x8[30] = _mm_sub_epi32(x7[31], x7[30]); 1049 x8[31] = _mm_add_epi32(x7[31], x7[30]); 1050 x8[32] = x7[32]; 1051 btf_32_type0_sse4_1_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62], 1052 __rounding, cos_bit); 1053 btf_32_type0_sse4_1_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61], 1054 __rounding, cos_bit); 1055 x8[35] = x7[35]; 1056 x8[36] = x7[36]; 1057 btf_32_type0_sse4_1_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58], 1058 __rounding, cos_bit); 1059 btf_32_type0_sse4_1_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57], 1060 __rounding, cos_bit); 1061 x8[39] = x7[39]; 1062 x8[40] = x7[40]; 1063 btf_32_type0_sse4_1_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54], 1064 __rounding, cos_bit); 1065 btf_32_type0_sse4_1_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53], 1066 __rounding, cos_bit); 1067 x8[43] = x7[43]; 1068 x8[44] = x7[44]; 1069 btf_32_type0_sse4_1_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50], 1070 __rounding, cos_bit); 1071 btf_32_type0_sse4_1_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49], 1072 __rounding, cos_bit); 1073 x8[47] = x7[47]; 1074 x8[48] = x7[48]; 1075 x8[51] = x7[51]; 1076 x8[52] = x7[52]; 1077 x8[55] = x7[55]; 1078 x8[56] = x7[56]; 1079 x8[59] = x7[59]; 1080 x8[60] = x7[60]; 1081 x8[63] = x7[63]; 1082 1083 // stage 9 1084 __m128i x9[64]; 1085 x9[0] = x8[0]; 1086 x9[1] = x8[1]; 1087 x9[2] = x8[2]; 1088 x9[3] = x8[3]; 1089 x9[4] = x8[4]; 1090 x9[5] = x8[5]; 1091 x9[6] = x8[6]; 1092 x9[7] = x8[7]; 1093 x9[8] = x8[8]; 1094 x9[9] = x8[9]; 1095 x9[10] = x8[10]; 1096 x9[11] = x8[11]; 1097 x9[12] = x8[12]; 1098 x9[13] = x8[13]; 1099 x9[14] = x8[14]; 1100 x9[15] = x8[15]; 1101 btf_32_type1_sse4_1_new(cospi_p62, cospi_p02, x8[16], x8[31], x9[16], x9[31], 1102 __rounding, cos_bit); 1103 btf_32_type1_sse4_1_new(cospi_p30, cospi_p34, x8[17], x8[30], x9[17], x9[30], 1104 __rounding, cos_bit); 1105 btf_32_type1_sse4_1_new(cospi_p46, cospi_p18, x8[18], x8[29], x9[18], x9[29], 1106 __rounding, cos_bit); 1107 btf_32_type1_sse4_1_new(cospi_p14, cospi_p50, x8[19], x8[28], x9[19], x9[28], 1108 __rounding, cos_bit); 1109 btf_32_type1_sse4_1_new(cospi_p54, cospi_p10, x8[20], x8[27], x9[20], x9[27], 1110 __rounding, cos_bit); 1111 btf_32_type1_sse4_1_new(cospi_p22, cospi_p42, x8[21], x8[26], x9[21], x9[26], 1112 __rounding, cos_bit); 1113 btf_32_type1_sse4_1_new(cospi_p38, cospi_p26, x8[22], x8[25], x9[22], x9[25], 1114 __rounding, cos_bit); 1115 btf_32_type1_sse4_1_new(cospi_p06, cospi_p58, x8[23], x8[24], x9[23], x9[24], 1116 __rounding, cos_bit); 1117 x9[32] = _mm_add_epi32(x8[32], x8[33]); 1118 x9[33] = _mm_sub_epi32(x8[32], x8[33]); 1119 x9[34] = _mm_sub_epi32(x8[35], x8[34]); 1120 x9[35] = _mm_add_epi32(x8[35], x8[34]); 1121 x9[36] = _mm_add_epi32(x8[36], x8[37]); 1122 x9[37] = _mm_sub_epi32(x8[36], x8[37]); 1123 x9[38] = _mm_sub_epi32(x8[39], x8[38]); 1124 x9[39] = _mm_add_epi32(x8[39], x8[38]); 1125 x9[40] = _mm_add_epi32(x8[40], x8[41]); 1126 x9[41] = _mm_sub_epi32(x8[40], x8[41]); 1127 x9[42] = _mm_sub_epi32(x8[43], x8[42]); 1128 x9[43] = _mm_add_epi32(x8[43], x8[42]); 1129 x9[44] = _mm_add_epi32(x8[44], x8[45]); 1130 x9[45] = _mm_sub_epi32(x8[44], x8[45]); 1131 x9[46] = _mm_sub_epi32(x8[47], x8[46]); 1132 x9[47] = _mm_add_epi32(x8[47], x8[46]); 1133 x9[48] = _mm_add_epi32(x8[48], x8[49]); 1134 x9[49] = _mm_sub_epi32(x8[48], x8[49]); 1135 x9[50] = _mm_sub_epi32(x8[51], x8[50]); 1136 x9[51] = _mm_add_epi32(x8[51], x8[50]); 1137 x9[52] = _mm_add_epi32(x8[52], x8[53]); 1138 x9[53] = _mm_sub_epi32(x8[52], x8[53]); 1139 x9[54] = _mm_sub_epi32(x8[55], x8[54]); 1140 x9[55] = _mm_add_epi32(x8[55], x8[54]); 1141 x9[56] = _mm_add_epi32(x8[56], x8[57]); 1142 x9[57] = _mm_sub_epi32(x8[56], x8[57]); 1143 x9[58] = _mm_sub_epi32(x8[59], x8[58]); 1144 x9[59] = _mm_add_epi32(x8[59], x8[58]); 1145 x9[60] = _mm_add_epi32(x8[60], x8[61]); 1146 x9[61] = _mm_sub_epi32(x8[60], x8[61]); 1147 x9[62] = _mm_sub_epi32(x8[63], x8[62]); 1148 x9[63] = _mm_add_epi32(x8[63], x8[62]); 1149 1150 // stage 10 1151 __m128i x10[64]; 1152 x10[0] = x9[0]; 1153 x10[1] = x9[1]; 1154 x10[2] = x9[2]; 1155 x10[3] = x9[3]; 1156 x10[4] = x9[4]; 1157 x10[5] = x9[5]; 1158 x10[6] = x9[6]; 1159 x10[7] = x9[7]; 1160 x10[8] = x9[8]; 1161 x10[9] = x9[9]; 1162 x10[10] = x9[10]; 1163 x10[11] = x9[11]; 1164 x10[12] = x9[12]; 1165 x10[13] = x9[13]; 1166 x10[14] = x9[14]; 1167 x10[15] = x9[15]; 1168 x10[16] = x9[16]; 1169 x10[17] = x9[17]; 1170 x10[18] = x9[18]; 1171 x10[19] = x9[19]; 1172 x10[20] = x9[20]; 1173 x10[21] = x9[21]; 1174 x10[22] = x9[22]; 1175 x10[23] = x9[23]; 1176 x10[24] = x9[24]; 1177 x10[25] = x9[25]; 1178 x10[26] = x9[26]; 1179 x10[27] = x9[27]; 1180 x10[28] = x9[28]; 1181 x10[29] = x9[29]; 1182 x10[30] = x9[30]; 1183 x10[31] = x9[31]; 1184 btf_32_type1_sse4_1_new(cospi_p63, cospi_p01, x9[32], x9[63], x10[32], 1185 x10[63], __rounding, cos_bit); 1186 btf_32_type1_sse4_1_new(cospi_p31, cospi_p33, x9[33], x9[62], x10[33], 1187 x10[62], __rounding, cos_bit); 1188 btf_32_type1_sse4_1_new(cospi_p47, cospi_p17, x9[34], x9[61], x10[34], 1189 x10[61], __rounding, cos_bit); 1190 btf_32_type1_sse4_1_new(cospi_p15, cospi_p49, x9[35], x9[60], x10[35], 1191 x10[60], __rounding, cos_bit); 1192 btf_32_type1_sse4_1_new(cospi_p55, cospi_p09, x9[36], x9[59], x10[36], 1193 x10[59], __rounding, cos_bit); 1194 btf_32_type1_sse4_1_new(cospi_p23, cospi_p41, x9[37], x9[58], x10[37], 1195 x10[58], __rounding, cos_bit); 1196 btf_32_type1_sse4_1_new(cospi_p39, cospi_p25, x9[38], x9[57], x10[38], 1197 x10[57], __rounding, cos_bit); 1198 btf_32_type1_sse4_1_new(cospi_p07, cospi_p57, x9[39], x9[56], x10[39], 1199 x10[56], __rounding, cos_bit); 1200 btf_32_type1_sse4_1_new(cospi_p59, cospi_p05, x9[40], x9[55], x10[40], 1201 x10[55], __rounding, cos_bit); 1202 btf_32_type1_sse4_1_new(cospi_p27, cospi_p37, x9[41], x9[54], x10[41], 1203 x10[54], __rounding, cos_bit); 1204 btf_32_type1_sse4_1_new(cospi_p43, cospi_p21, x9[42], x9[53], x10[42], 1205 x10[53], __rounding, cos_bit); 1206 btf_32_type1_sse4_1_new(cospi_p11, cospi_p53, x9[43], x9[52], x10[43], 1207 x10[52], __rounding, cos_bit); 1208 btf_32_type1_sse4_1_new(cospi_p51, cospi_p13, x9[44], x9[51], x10[44], 1209 x10[51], __rounding, cos_bit); 1210 btf_32_type1_sse4_1_new(cospi_p19, cospi_p45, x9[45], x9[50], x10[45], 1211 x10[50], __rounding, cos_bit); 1212 btf_32_type1_sse4_1_new(cospi_p35, cospi_p29, x9[46], x9[49], x10[46], 1213 x10[49], __rounding, cos_bit); 1214 btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47], 1215 x10[48], __rounding, cos_bit); 1216 1217 startidx = 0 * outstride; 1218 endidx = 63 * outstride; 1219 // stage 11 1220 output[startidx] = x10[0]; 1221 output[endidx] = x10[63]; 1222 startidx += outstride; 1223 endidx -= outstride; 1224 output[startidx] = x10[32]; 1225 output[endidx] = x10[31]; 1226 startidx += outstride; 1227 endidx -= outstride; 1228 output[startidx] = x10[16]; 1229 output[endidx] = x10[47]; 1230 startidx += outstride; 1231 endidx -= outstride; 1232 output[startidx] = x10[48]; 1233 output[endidx] = x10[15]; 1234 startidx += outstride; 1235 endidx -= outstride; 1236 output[startidx] = x10[8]; 1237 output[endidx] = x10[55]; 1238 startidx += outstride; 1239 endidx -= outstride; 1240 output[startidx] = x10[40]; 1241 output[endidx] = x10[23]; 1242 startidx += outstride; 1243 endidx -= outstride; 1244 output[startidx] = x10[24]; 1245 output[endidx] = x10[39]; 1246 startidx += outstride; 1247 endidx -= outstride; 1248 output[startidx] = x10[56]; 1249 output[endidx] = x10[7]; 1250 startidx += outstride; 1251 endidx -= outstride; 1252 output[startidx] = x10[4]; 1253 output[endidx] = x10[59]; 1254 startidx += outstride; 1255 endidx -= outstride; 1256 output[startidx] = x10[36]; 1257 output[endidx] = x10[27]; 1258 startidx += outstride; 1259 endidx -= outstride; 1260 output[startidx] = x10[20]; 1261 output[endidx] = x10[43]; 1262 startidx += outstride; 1263 endidx -= outstride; 1264 output[startidx] = x10[52]; 1265 output[endidx] = x10[11]; 1266 startidx += outstride; 1267 endidx -= outstride; 1268 output[startidx] = x10[12]; 1269 output[endidx] = x10[51]; 1270 startidx += outstride; 1271 endidx -= outstride; 1272 output[startidx] = x10[44]; 1273 output[endidx] = x10[19]; 1274 startidx += outstride; 1275 endidx -= outstride; 1276 output[startidx] = x10[28]; 1277 output[endidx] = x10[35]; 1278 startidx += outstride; 1279 endidx -= outstride; 1280 output[startidx] = x10[60]; 1281 output[endidx] = x10[3]; 1282 startidx += outstride; 1283 endidx -= outstride; 1284 output[startidx] = x10[2]; 1285 output[endidx] = x10[61]; 1286 startidx += outstride; 1287 endidx -= outstride; 1288 output[startidx] = x10[34]; 1289 output[endidx] = x10[29]; 1290 startidx += outstride; 1291 endidx -= outstride; 1292 output[startidx] = x10[18]; 1293 output[endidx] = x10[45]; 1294 startidx += outstride; 1295 endidx -= outstride; 1296 output[startidx] = x10[50]; 1297 output[endidx] = x10[13]; 1298 startidx += outstride; 1299 endidx -= outstride; 1300 output[startidx] = x10[10]; 1301 output[endidx] = x10[53]; 1302 startidx += outstride; 1303 endidx -= outstride; 1304 output[startidx] = x10[42]; 1305 output[endidx] = x10[21]; 1306 startidx += outstride; 1307 endidx -= outstride; 1308 output[startidx] = x10[26]; 1309 output[endidx] = x10[37]; 1310 startidx += outstride; 1311 endidx -= outstride; 1312 output[startidx] = x10[58]; 1313 output[endidx] = x10[5]; 1314 startidx += outstride; 1315 endidx -= outstride; 1316 output[startidx] = x10[6]; 1317 output[endidx] = x10[57]; 1318 startidx += outstride; 1319 endidx -= outstride; 1320 output[startidx] = x10[38]; 1321 output[endidx] = x10[25]; 1322 startidx += outstride; 1323 endidx -= outstride; 1324 output[startidx] = x10[22]; 1325 output[endidx] = x10[41]; 1326 startidx += outstride; 1327 endidx -= outstride; 1328 output[startidx] = x10[54]; 1329 output[endidx] = x10[9]; 1330 startidx += outstride; 1331 endidx -= outstride; 1332 output[startidx] = x10[14]; 1333 output[endidx] = x10[49]; 1334 startidx += outstride; 1335 endidx -= outstride; 1336 output[startidx] = x10[46]; 1337 output[endidx] = x10[17]; 1338 startidx += outstride; 1339 endidx -= outstride; 1340 output[startidx] = x10[30]; 1341 output[endidx] = x10[33]; 1342 startidx += outstride; 1343 endidx -= outstride; 1344 output[startidx] = x10[62]; 1345 output[endidx] = x10[1]; 1346 } 1347 1348 void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit, 1349 const int col_num) { 1350 (void)cos_bit; 1351 for (int i = 0; i < 32; i++) { 1352 output[i * col_num] = _mm_slli_epi32(input[i * col_num], 2); 1353 } 1354 }