av1_inv_txfm_ssse3.c (113960B)
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
#include "av1/common/x86/av1_txfm_sse2.h"

// TODO(venkatsanampudi@ittiam.com): move this to header file

// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
static const int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793,
                                                4 * 4096, 4 * 5793 };

// TODO(binpengsmail@gmail.com): replace some for loop with do {} while

// 4-point inverse DCT applied to eight 16-bit columns at once (one __m128i
// per transform row).  `input` rows are consumed in butterfly order
// (0, 2, 1, 3); `output` receives rows in natural order.
static void idct4_sse2(const __m128i *input, __m128i *output) {
  // NOTE: cos_bit and __rounding look unused here, but they are apparently
  // consumed inside the btf_16_sse2 macro expansion (see av1_txfm_sse2.h) —
  // do not remove or rename them.
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed cosine coefficient pairs for the two stage-2 butterflies.
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

  // stage 1: reorder inputs into butterfly order.
  __m128i x[4];
  x[0] = input[0];
  x[1] = input[2];
  x[2] = input[1];
  x[3] = input[3];

  // stage 2: rotation butterflies.
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);

  // stage 3: final add/sub butterflies straight into the output.
  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
}

// Width-4 variant of idct4_sse2: identical dataflow, but uses the
// btf_16_4p_sse2 butterfly, which operates on four packed 16-bit columns.
static void idct4_w4_sse2(const __m128i *input, __m128i *output) {
  // cos_bit / __rounding are apparently referenced inside the btf_16_4p_sse2
  // macro expansion — keep them even though they look unused here.
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

  // stage 1: gather inputs in butterfly order.
  __m128i x[4];
  x[0] = input[0];
  x[1] = input[2];
  x[2] = input[1];
  x[3] = input[3];

  // stage 2: rotation butterflies (4-column form).
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);

  // stage 3: add/sub butterflies into the output.
  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
}

// 8-point inverse DCT when only the DC coefficient (input[0]) is nonzero.
// A single rotation produces two values that are fanned out to all eight
// outputs in the butterfly symmetry pattern.
void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  __m128i x[2];
  x[0] = input[0];

  // stage 2
  // stage 3
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);

  // stage 4
  // stage 5: fan the two results out to the eight outputs.
  output[0] = x[0];
  output[7] = x[0];
  output[1] = x[1];
  output[6] = x[1];
  output[2] = x[1];
  output[5] = x[1];
  output[3] = x[0];
  output[4] = x[0];
}

// Full 8-point inverse DCT over eight 16-bit columns.  Inputs are consumed
// in butterfly order (even indices first, then odds bit-reversed).
void av1_idct8_sse2(const __m128i *input, __m128i *output) {
  // Implicitly used by the btf_16_sse2 macro — do not remove.
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: gather inputs in butterfly order.
  __m128i x[8];
  x[0] = input[0];
  x[1] = input[4];
  x[2] = input[2];
  x[3] = input[6];
  x[4] = input[1];
  x[5] = input[5];
  x[6] = input[3];
  x[7] = input[7];

  // stage 2: odd-half rotations.
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);

  // stage 3: even-half rotations + odd-half add/sub.
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);

  // stage 4
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);

  // stage 5: final butterflies straight into the output.
  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
}

// Width-4 variant of av1_idct8_sse2 (btf_16_4p_sse2 butterflies).
static void idct8_w4_sse2(const __m128i *input, __m128i *output) {
  // Implicitly used by the btf_16_4p_sse2 macro — do not remove.
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1
  __m128i x[8];
  x[0] = input[0];
  x[1] = input[4];
  x[2] = input[2];
  x[3] = input[6];
  x[4] = input[1];
  x[5] = input[5];
  x[6] = input[3];
  x[7] = input[7];

  // stage 2
  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);

  // stage 3
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);

  // stage 4
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);

  // stage 5
  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
}

// Shared stage-5 butterflies of the 16-point iDCT (in-place on x[0..15]).
static inline void idct16_stage5_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
}

// Shared stage-6 butterflies of the 16-point iDCT (in-place on x[0..15]).
static inline void idct16_stage6_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
}

// Final (stage-7) butterflies of the 16-point iDCT: x -> output.
static inline void idct16_stage7_sse2(__m128i *output, __m128i *x) {
  btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
  btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
  btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
  btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
  btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
  btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
  btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
  btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
}

// 16-point inverse DCT when only the DC coefficient is nonzero.
static void idct16_low1_ssse3(const __m128i *input, __m128i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  __m128i x[2];
  x[0] = input[0];

  // stage 2
  // stage 3
  // stage 4
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);

  // stage 5
  // stage 6
  // stage 7: fan the two results out to all sixteen outputs.
  output[0] = x[0];
  output[15] = x[0];
  output[1] = x[1];
  output[14] = x[1];
  output[2] = x[1];
  output[13] = x[1];
  output[3] = x[0];
  output[12] = x[0];
  output[4] = x[0];
  output[11] = x[0];
  output[5] = x[1];
  output[10] = x[1];
  output[6] = x[1];
  output[9] = x[1];
  output[7] = x[0];
  output[8] = x[0];
}

// 16-point inverse DCT when only the first 8 coefficients are nonzero;
// the missing inputs let stages 2-3 use single-input btf_16_ssse3 rotations.
static void idct16_low8_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);

  // stage 1: only even butterfly slots are fed (odd inputs are zero).
  __m128i x[16];
  x[0] = input[0];
  x[2] = input[4];
  x[4] = input[2];
  x[6] = input[6];
  x[8] = input[1];
  x[10] = input[5];
  x[12] = input[3];
  x[14] = input[7];

  // stage 2
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);

  // stage 3
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stages 5-7 are shared with the full 16-point transform.
  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage7_sse2(output, x);
}

// Full 16-point inverse DCT over eight 16-bit columns.
static void idct16_sse2(const __m128i *input, __m128i *output) {
  // Implicitly used by the btf_16_sse2 macro — do not remove.
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);

  // stage 1: gather inputs in butterfly order.
  __m128i x[16];
  x[0] = input[0];
  x[1] = input[8];
  x[2] = input[4];
  x[3] = input[12];
  x[4] = input[2];
  x[5] = input[10];
  x[6] = input[6];
  x[7] = input[14];
  x[8] = input[1];
  x[9] = input[9];
  x[10] = input[5];
  x[11] = input[13];
  x[12] = input[3];
  x[13] = input[11];
  x[14] = input[7];
  x[15] = input[15];

  // stage 2
  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);

  // stage 3
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stage 5~7
  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage7_sse2(output, x);
}

// Width-4 variant of idct16_sse2.  Stages 5-6 are inlined here (rather than
// using the shared helpers) because their rotations need btf_16_4p_sse2.
static void idct16_w4_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1
  __m128i x[16];
  x[0] = input[0];
  x[1] = input[8];
  x[2] = input[4];
  x[3] = input[12];
  x[4] = input[2];
  x[5] = input[10];
  x[6] = input[6];
  x[7] = input[14];
  x[8] = input[1];
  x[9] = input[9];
  x[10] = input[5];
  x[11] = input[13];
  x[12] = input[3];
  x[13] = input[11];
  x[14] = input[7];
  x[15] = input[15];

  // stage 2
  btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);

  // stage 3
  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stage 5 (4-column form of idct16_stage5_sse2)
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);

  // stage 6 (4-column form of idct16_stage6_sse2)
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);

  // stage 7
  idct16_stage7_sse2(output, x);
}

// Stage-3 add/sub butterflies on the high half (x[16..31]) of the 32-point
// iDCT state.
static inline void idct32_high16_stage3_sse2(__m128i *x) {
  btf_16_adds_subs_sse2(x[16], x[17]);
  btf_16_subs_adds_sse2(x[19], x[18]);
  btf_16_adds_subs_sse2(x[20], x[21]);
  btf_16_subs_adds_sse2(x[23], x[22]);
  btf_16_adds_subs_sse2(x[24], x[25]);
  btf_16_subs_adds_sse2(x[27], x[26]);
  btf_16_adds_subs_sse2(x[28], x[29]);
  btf_16_subs_adds_sse2(x[31], x[30]);
}

// Stage-4 rotations on the high half (x[16..31]) of the 32-point iDCT state.
static inline void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
}

// Stage-5 work on x[9..31] of the 32-point iDCT state (rotations on
// x[9..14], add/sub butterflies on the high half).
static inline void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  btf_16_adds_subs_sse2(x[16], x[19]);
  btf_16_adds_subs_sse2(x[17], x[18]);
  btf_16_subs_adds_sse2(x[23], x[20]);
  btf_16_subs_adds_sse2(x[22], x[21]);
  btf_16_adds_subs_sse2(x[24], x[27]);
  btf_16_adds_subs_sse2(x[25], x[26]);
  btf_16_subs_adds_sse2(x[31], x[28]);
  btf_16_subs_adds_sse2(x[30], x[29]);
}

// Stage-6 work on x[5..31] of the 32-point iDCT state.
static inline void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
}

// Stage-7 butterflies of the 32-point iDCT (in-place on x[0..31]).
static inline void idct32_stage7_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  btf_16_adds_subs_sse2(x[16], x[23]);
  btf_16_adds_subs_sse2(x[17], x[22]);
  btf_16_adds_subs_sse2(x[18], x[21]);
  btf_16_adds_subs_sse2(x[19], x[20]);
  btf_16_subs_adds_sse2(x[31], x[24]);
  btf_16_subs_adds_sse2(x[30], x[25]);
  btf_16_subs_adds_sse2(x[29], x[26]);
  btf_16_subs_adds_sse2(x[28], x[27]);
}

// Stage-8 butterflies of the 32-point iDCT (in-place on x[0..31]).
static inline void idct32_stage8_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[15]);
  btf_16_adds_subs_sse2(x[1], x[14]);
  btf_16_adds_subs_sse2(x[2], x[13]);
  btf_16_adds_subs_sse2(x[3], x[12]);
  btf_16_adds_subs_sse2(x[4], x[11]);
  btf_16_adds_subs_sse2(x[5], x[10]);
  btf_16_adds_subs_sse2(x[6], x[9]);
  btf_16_adds_subs_sse2(x[7], x[8]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
}

// Final (stage-9) butterflies of the 32-point iDCT: x -> output.
static inline void idct32_stage9_sse2(__m128i *output, __m128i *x) {
  btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
  btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
  btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
  btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
  btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
  btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
  btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
  btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
  btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
  btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
  btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
  btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
  btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
  btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
  btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
  btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
}

// 32-point inverse DCT when only the DC coefficient is nonzero.
static void idct32_low1_ssse3(const __m128i *input, __m128i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  __m128i x[2];
  x[0] = input[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);

  // stage 6
  // stage 7
  // stage 8
  // stage 9: fan the two results out to all 32 outputs.
  output[0] = x[0];
  output[31] = x[0];
  output[1] = x[1];
  output[30] = x[1];
  output[2] = x[1];
  output[29] = x[1];
  output[3] = x[0];
  output[28] = x[0];
  output[4] = x[0];
  output[27] = x[0];
  output[5] = x[1];
  output[26] = x[1];
  output[6] = x[1];
  output[25] = x[1];
  output[7] = x[0];
  output[24] = x[0];
  output[8] = x[0];
  output[23] = x[0];
  output[9] = x[1];
  output[22] = x[1];
  output[10] = x[1];
  output[21] = x[1];
  output[11] = x[0];
  output[20] = x[0];
  output[12] = x[0];
  output[19] = x[0];
  output[13] = x[1];
  output[18] = x[1];
  output[14] = x[1];
  output[17] = x[1];
  output[15] = x[0];
  output[16] = x[0];
}

// 32-point inverse DCT when only the first 8 coefficients are nonzero.
// Zero inputs collapse some butterflies into plain copies (x[17] = x[16]...).
static void idct32_low8_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1
  __m128i x[32];
  x[0] = input[0];
  x[4] = input[4];
  x[8] = input[2];
  x[12] = input[6];
  x[16] = input[1];
  x[20] = input[5];
  x[24] = input[3];
  x[28] = input[7];

  // stage 2
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];

  // stage 4
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
  // stage 6
  x[3] = x[0];
  x[2] = x[1];
  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage9_sse2(output, x);
}

// 32-point inverse DCT when only the first 16 coefficients are nonzero.
static void idct32_low16_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1
  __m128i x[32];
  x[0] = input[0];
  x[2] = input[8];
  x[4] = input[4];
  x[6] = input[12];
  x[8] = input[2];
  x[10] = input[10];
  x[12] = input[6];
  x[14] = input[14];
  x[16] = input[1];
  x[18] = input[9];
  x[20] = input[5];
  x[22] = input[13];
  x[24] = input[3];
  x[26] = input[11];
  x[28] = input[7];
  x[30] = input[15];

  // stage 2
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  idct32_high16_stage3_sse2(x);

  // stage 4
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);
  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage9_sse2(output, x);
}

static
// Full 32-point inverse DCT (all 32 input rows present), operating on
// 16-bit coefficients packed 8-per-__m128i. Stages 1-6 are inlined here;
// stages 7-9 are shared helpers. The `static` storage-class specifier
// precedes this definition on the previous line.
void idct32_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset added before the >> cos_bit shift inside btf_16_sse2.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed (coeff, coeff) 16-bit pairs consumed by _mm_madd_epi16 inside
  // the butterfly macros; pNN = +cospi[NN], mNN = -cospi[NN].
  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

  // stage 1
  // Reorder input rows into butterfly order (bit-reversal-style permutation).
  __m128i x[32];
  x[0] = input[0];
  x[1] = input[16];
  x[2] = input[8];
  x[3] = input[24];
  x[4] = input[4];
  x[5] = input[20];
  x[6] = input[12];
  x[7] = input[28];
  x[8] = input[2];
  x[9] = input[18];
  x[10] = input[10];
  x[11] = input[26];
  x[12] = input[6];
  x[13] = input[22];
  x[14] = input[14];
  x[15] = input[30];
  x[16] = input[1];
  x[17] = input[17];
  x[18] = input[9];
  x[19] = input[25];
  x[20] = input[5];
  x[21] = input[21];
  x[22] = input[13];
  x[23] = input[29];
  x[24] = input[3];
  x[25] = input[19];
  x[26] = input[11];
  x[27] = input[27];
  x[28] = input[7];
  x[29] = input[23];
  x[30] = input[15];
  x[31] = input[31];

  // stage 2
  // Odd-index rotations for x[16..31]; each btf_16_sse2 writes both outputs
  // back in place.
  btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
  btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
  btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
  btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);

  // stage 3
  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
  idct32_high16_stage3_sse2(x);

  // stage 4
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);
  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  // NOTE(review): the reduced-coefficient variant earlier in this file uses
  // btf_16_subs_adds_sse2 here (orig line 759). If both macros save their
  // operands before writing, the two forms compute the same sum/difference
  // pair — confirm against the macro definitions in av1_txfm_sse2.h.
  btf_16_adds_subs_sse2(x[7], x[6]);
  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

  // stage 7~8
  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage9_sse2(output, x);
}

// Stage-4 butterflies for the upper 32 lanes (x[32..63]) of the 64-point
// inverse DCT; shared by the idct64_low*_ssse3 variants below.
// (Definition continues on the following line of the file.)
static inline void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
  const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
  const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
  const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
  const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
  const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
  const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
  const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44],
cospi[20]);
  const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
  const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
  const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
  const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
  btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
  btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
  btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
  btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
  btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
  btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
}

// Stage-5 butterflies for the upper 48 lanes of the 64-point inverse DCT:
// rotations on x[17..30] pairs plus sum/difference butterflies on x[32..63].
static inline void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
  btf_16_adds_subs_sse2(x[32], x[35]);
  btf_16_adds_subs_sse2(x[33], x[34]);
  btf_16_subs_adds_sse2(x[39], x[36]);
  btf_16_subs_adds_sse2(x[38], x[37]);
  btf_16_adds_subs_sse2(x[40], x[43]);
  btf_16_adds_subs_sse2(x[41], x[42]);
  btf_16_subs_adds_sse2(x[47], x[44]);
  btf_16_subs_adds_sse2(x[46], x[45]);
  btf_16_adds_subs_sse2(x[48], x[51]);
  btf_16_adds_subs_sse2(x[49], x[50]);
  btf_16_subs_adds_sse2(x[55], x[52]);
  btf_16_subs_adds_sse2(x[54], x[53]);
  btf_16_adds_subs_sse2(x[56], x[59]);
  btf_16_adds_subs_sse2(x[57], x[58]);
  btf_16_subs_adds_sse2(x[63], x[60]);
  btf_16_subs_adds_sse2(x[62], x[61]);
}

// Stage-6 rotations restricted to the upper 32 lanes (x[34..61] pairs).
static inline void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
}

// Stage-6 butterflies for the upper 48 lanes: sum/difference on x[16..31],
// then delegates the x[32..63] rotations to idct64_stage6_high32_sse2.
// (Definition continues on the following line of the file.)
static inline void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  btf_16_adds_subs_sse2(x[16], x[19]);
  btf_16_adds_subs_sse2(x[17], x[18]);
  btf_16_subs_adds_sse2(x[23], x[20]);
  btf_16_subs_adds_sse2(x[22], x[21]);
  btf_16_adds_subs_sse2(x[24], x[27]);
btf_16_adds_subs_sse2(x[25], x[26]);
  btf_16_subs_adds_sse2(x[31], x[28]);
  btf_16_subs_adds_sse2(x[30], x[29]);
  idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
}

// Stage-7 butterflies for the upper 48 lanes of the 64-point inverse DCT:
// rotations on x[18..29] pairs plus sum/difference butterflies on x[32..63].
static inline void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
  btf_16_adds_subs_sse2(x[32], x[39]);
  btf_16_adds_subs_sse2(x[33], x[38]);
  btf_16_adds_subs_sse2(x[34], x[37]);
  btf_16_adds_subs_sse2(x[35], x[36]);
  btf_16_subs_adds_sse2(x[47], x[40]);
  btf_16_subs_adds_sse2(x[46], x[41]);
  btf_16_subs_adds_sse2(x[45], x[42]);
  btf_16_subs_adds_sse2(x[44], x[43]);
  btf_16_adds_subs_sse2(x[48], x[55]);
  btf_16_adds_subs_sse2(x[49], x[54]);
  btf_16_adds_subs_sse2(x[50], x[53]);
  btf_16_adds_subs_sse2(x[51], x[52]);
  btf_16_subs_adds_sse2(x[63], x[56]);
  btf_16_subs_adds_sse2(x[62], x[57]);
  btf_16_subs_adds_sse2(x[61], x[58]);
  btf_16_subs_adds_sse2(x[60], x[59]);
}

// Stage-8 butterflies for the upper 48 lanes: sum/difference on x[16..31],
// then cospi[16]/cospi[48] rotations on x[36..59] pairs.
static inline void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_adds_subs_sse2(x[16], x[23]);
  btf_16_adds_subs_sse2(x[17], x[22]);
  btf_16_adds_subs_sse2(x[18], x[21]);
  btf_16_adds_subs_sse2(x[19], x[20]);
  btf_16_subs_adds_sse2(x[31], x[24]);
  btf_16_subs_adds_sse2(x[30], x[25]);
  btf_16_subs_adds_sse2(x[29], x[26]);
  btf_16_subs_adds_sse2(x[28], x[27]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
}

// Stage 9 of the 64-point inverse DCT, applied to all 64 lanes.
static inline void idct64_stage9_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[15]);
  btf_16_adds_subs_sse2(x[1], x[14]);
  btf_16_adds_subs_sse2(x[2], x[13]);
  btf_16_adds_subs_sse2(x[3], x[12]);
  btf_16_adds_subs_sse2(x[4], x[11]);
  btf_16_adds_subs_sse2(x[5], x[10]);
  btf_16_adds_subs_sse2(x[6], x[9]);
  btf_16_adds_subs_sse2(x[7], x[8]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
  btf_16_adds_subs_sse2(x[32], x[47]);
  btf_16_adds_subs_sse2(x[33], x[46]);
  btf_16_adds_subs_sse2(x[34], x[45]);
  btf_16_adds_subs_sse2(x[35], x[44]);
  btf_16_adds_subs_sse2(x[36], x[43]);
  btf_16_adds_subs_sse2(x[37], x[42]);
  btf_16_adds_subs_sse2(x[38], x[41]);
  btf_16_adds_subs_sse2(x[39], x[40]);
  btf_16_subs_adds_sse2(x[63], x[48]);
  btf_16_subs_adds_sse2(x[62], x[49]);
  btf_16_subs_adds_sse2(x[61], x[50]);
  btf_16_subs_adds_sse2(x[60], x[51]);
  btf_16_subs_adds_sse2(x[59], x[52]);
  btf_16_subs_adds_sse2(x[58], x[53]);
  btf_16_subs_adds_sse2(x[57], x[54]);
  btf_16_subs_adds_sse2(x[56], x[55]);
}

// Stage 10 of the 64-point inverse DCT: sum/difference on x[0..31] mirrored
// pairs plus +/-cospi[32] rotations on x[40..55] pairs.
static inline void idct64_stage10_sse2(__m128i *x, const int32_t *cospi,
                                       const __m128i __rounding,
                                       int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[31]);
  btf_16_adds_subs_sse2(x[1], x[30]);
  btf_16_adds_subs_sse2(x[2], x[29]);
  btf_16_adds_subs_sse2(x[3], x[28]);
  btf_16_adds_subs_sse2(x[4], x[27]);
  btf_16_adds_subs_sse2(x[5], x[26]);
  btf_16_adds_subs_sse2(x[6], x[25]);
  btf_16_adds_subs_sse2(x[7], x[24]);
  btf_16_adds_subs_sse2(x[8], x[23]);
  btf_16_adds_subs_sse2(x[9], x[22]);
  btf_16_adds_subs_sse2(x[10], x[21]);
  btf_16_adds_subs_sse2(x[11], x[20]);
  btf_16_adds_subs_sse2(x[12], x[19]);
  btf_16_adds_subs_sse2(x[13], x[18]);
  btf_16_adds_subs_sse2(x[14], x[17]);
  btf_16_adds_subs_sse2(x[15], x[16]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
}
// Final output stage (stage 11) of the 64-point inverse DCT: writes
// output[i] = x[i] + x[63-i] and output[63-i] = x[i] - x[63-i].
static inline void idct64_stage11_sse2(__m128i *output, __m128i *x) {
  btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]);
  btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]);
  btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]);
  btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]);
  btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]);
  btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]);
  btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]);
  btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]);
  btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]);
  btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]);
  btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]);
  btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]);
  btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]);
  btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]);
  btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]);
  btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]);
  btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]);
  btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]);
  btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]);
  btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]);
  btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]);
  btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]);
  btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]);
  btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]);
  btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]);
  btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]);
  btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]);
  btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]);
  btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]);
  btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]);
  btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]);
  btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
}

// DC-only 64-point inverse DCT: only input[0] is nonzero, so every stage
// except the single cospi[32] rotation collapses and the 64 outputs are a
// fixed sign-pattern replication of x[0] and x[1].
static void idct64_low1_ssse3(const __m128i *input, __m128i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  __m128i x[32];
  x[0] = input[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  // stage 6
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);

  // stage 7
  // stage 8
  // stage 9
  // stage 10
  // stage 11
  output[0] = x[0];
  output[63] = x[0];
  output[1] = x[1];
  output[62] = x[1];
  output[2] = x[1];
  output[61] = x[1];
  output[3] = x[0];
  output[60] = x[0];
  output[4] = x[0];
  output[59] = x[0];
  output[5] = x[1];
  output[58] = x[1];
  output[6] = x[1];
  output[57] = x[1];
  output[7] = x[0];
  output[56] = x[0];
  output[8] = x[0];
  output[55] = x[0];
  output[9] = x[1];
  output[54] = x[1];
  output[10] = x[1];
  output[53] = x[1];
  output[11] = x[0];
  output[52] = x[0];
  output[12] = x[0];
  output[51] = x[0];
  output[13] = x[1];
  output[50] = x[1];
  output[14] = x[1];
  output[49] = x[1];
  output[15] = x[0];
  output[48] = x[0];
  output[16] = x[0];
  output[47] = x[0];
  output[17] = x[1];
  output[46] = x[1];
  output[18] = x[1];
  output[45] = x[1];
  output[19] = x[0];
  output[44] = x[0];
  output[20] = x[0];
  output[43] = x[0];
  output[21] = x[1];
  output[42] = x[1];
  output[22] = x[1];
  output[41] = x[1];
  output[23] = x[0];
  output[40] = x[0];
  output[24] = x[0];
  output[39] = x[0];
  output[25] = x[1];
  output[38] = x[1];
  output[26] = x[1];
  output[37] = x[1];
  output[27] = x[0];
  output[36] = x[0];
  output[28] = x[0];
  output[35] = x[0];
  output[29] = x[1];
  output[34] = x[1];
  output[30] = x[1];
  output[33] = x[1];
  output[31] = x[0];
  output[32] = x[0];
}

// 64-point inverse DCT specialized for inputs with only the first 8 rows
// nonzero; intermediate lanes that would be zero are propagated by copies
// instead of full butterflies. (Definition continues on the following line.)
static void idct64_low8_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
  const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
  const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
  const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
  const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
  const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
  const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
  const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1
  __m128i x[64];
  x[0] = input[0];
  x[8] = input[4];
  x[16] = input[2];
  x[24] = input[6];
  x[32] = input[1];
  x[40] = input[5];
  x[48] = input[3];
  x[56] = input[7];

  // stage 2
  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  // Copies replace butterflies whose second operand is known zero.
  x[33] = x[32];
  x[38] = x[39];
  x[41] = x[40];
  x[46] = x[47];
  x[49] = x[48];
  x[54] = x[55];
  x[57] = x[56];
  x[62] = x[63];

  // stage 4
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  x[17] = x[16];
  x[22] = x[23];
  x[25] = x[24];
  x[30] = x[31];
  btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
  btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
  btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);

  // stage 5
  x[9] = x[8];
  x[14] = x[15];
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
  x[35] = x[32];
  x[34] = x[33];
  x[36] = x[39];
  x[37] = x[38];
  x[43] = x[40];
  x[42] = x[41];
  x[44] = x[47];
  x[45] = x[46];
  x[51] = x[48];
  x[50] = x[49];
  x[52] = x[55];
  x[53] = x[54];
  x[59] = x[56];
  x[58] = x[57];
  x[60] = x[63];
  x[61] = x[62];

  // stage 6
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  x[19] = x[16];
  x[18] = x[17];
  x[20] = x[23];
  x[21] = x[22];
  x[27] = x[24];
  x[26] = x[25];
  x[28] = x[31];
  x[29] = x[30];
  idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  x[11] = x[8];
  x[10] = x[9];
  x[12] = x[15];
  x[13] = x[14];
  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 8
  x[7] = x[0];
  x[6] = x[1];
  x[5] = x[2];
  x[4] = x[3];
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 9~11
  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage11_sse2(output, x);
}

// 64-point inverse DCT specialized for inputs with only the first 16 rows
// nonzero. (Definition continues on the following line of the file.)
static void idct64_low16_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1
  __m128i x[64];
  x[0] = input[0];
  x[4] = input[8];
  x[8] = input[4];
  x[12] = input[12];
  x[16] = input[2];
  x[20] = input[10];
  x[24] = input[6];
  x[28] = input[14];
  x[32] = input[1];
  x[36] = input[9];
  x[40] = input[5];
  x[44] = input[13];
  x[48] = input[3];
  x[52] = input[11];
  x[56] = input[7];
  x[60] = input[15];

  // stage 2
  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_ssse3(cospi[51], cospi[13], x[44],
x[44], x[51]);
  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  // Copies replace butterflies whose second operand is known zero.
  x[33] = x[32];
  x[34] = x[35];
  x[37] = x[36];
  x[38] = x[39];
  x[41] = x[40];
  x[42] = x[43];
  x[45] = x[44];
  x[46] = x[47];
  x[49] = x[48];
  x[50] = x[51];
  x[53] = x[52];
  x[54] = x[55];
  x[57] = x[56];
  x[58] = x[59];
  x[61] = x[60];
  x[62] = x[63];

  // stage 4
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];
  idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 8
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 9~11
  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage11_sse2(output, x);
}

// 64-point inverse DCT specialized for inputs with only the first 32 rows
// nonzero. (Definition continues on the following line of the file.)
static void idct64_low32_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1
  __m128i x[64];
  x[0] = input[0];
  x[2] = input[16];
  x[4] = input[8];
  x[6] = input[24];
  x[8] = input[4];
  x[10] = input[20];
  x[12] = input[12];
  x[14] = input[28];
  x[16] = input[2];
  x[18] = input[18];
  x[20] = input[10];
  x[22] = input[26];
  x[24] = input[6];
  x[26] = input[22];
  x[28] = input[14];
  x[30] = input[30];
  x[32] = input[1];
  x[34] = input[17];
  x[36] = input[9];
  x[38] = input[25];
  x[40] = input[5];
  x[42] = input[21];
  x[44] = input[13];
  x[46] = input[29];
  x[48] = input[3];
  x[50] = input[19];
  x[52] = input[11];
  x[54] = input[27];
  x[56] = input[7];
  x[58] = input[23];
  x[60] = input[15];
  x[62] = input[31];

  // stage 2
  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
  btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
  btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
  btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
  btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
  btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
  btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  btf_16_adds_subs_sse2(x[32], x[33]);
  btf_16_subs_adds_sse2(x[35], x[34]);
  btf_16_adds_subs_sse2(x[36], x[37]);
  btf_16_subs_adds_sse2(x[39], x[38]);
  btf_16_adds_subs_sse2(x[40], x[41]);
  btf_16_subs_adds_sse2(x[43], x[42]);
  btf_16_adds_subs_sse2(x[44], x[45]);
  btf_16_subs_adds_sse2(x[47], x[46]);
  btf_16_adds_subs_sse2(x[48], x[49]);
  btf_16_subs_adds_sse2(x[51], x[50]);
  btf_16_adds_subs_sse2(x[52], x[53]);
  btf_16_subs_adds_sse2(x[55], x[54]);
  btf_16_adds_subs_sse2(x[56], x[57]);
  btf_16_subs_adds_sse2(x[59], x[58]);
  btf_16_adds_subs_sse2(x[60], x[61]);
  btf_16_subs_adds_sse2(x[63], x[62]);

  // stage 4
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  btf_16_adds_subs_sse2(x[16], x[17]);
  btf_16_subs_adds_sse2(x[19], x[18]);
  btf_16_adds_subs_sse2(x[20], x[21]);
  btf_16_subs_adds_sse2(x[23], x[22]);
  btf_16_adds_subs_sse2(x[24], x[25]);
  btf_16_subs_adds_sse2(x[27], x[26]);
  btf_16_adds_subs_sse2(x[28], x[29]);
  btf_16_subs_adds_sse2(x[31], x[30]);
  idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);
  idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 7
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 8
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 9~11
  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage11_sse2(output, x);
}

// 4-point inverse ADST over 8 columns of 16-bit coefficients. Uses the
// sinpi[] basis directly via _mm_madd_epi16 (16x16->32-bit multiply-add),
// then rounds, shifts by INV_COS_BIT and re-packs to saturated 16-bit.
static void iadst4_sse2(const __m128i *input, __m128i *output) {
  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
  __m128i x0[4];
  x0[0] = input[0];
  x0[1] = input[1];
  x0[2] = input[2];
  x0[3] = input[3];

  // Interleave rows (0,2) and (1,3) so each 32-bit lane of u[] holds the
  // two 16-bit operands that _mm_madd_epi16 multiplies against a sinpi pair.
  __m128i u[4];
  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
  u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
  u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
  u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);

  __m128i x1[16];
  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
  x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
  x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
  x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
  x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02);  // x1*sin3 + x3*sin2
  x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
  x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04);  // x1*sin3 - x3*sin4
  x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
  x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
  x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
  x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03);  // x2*sin3
  x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
  x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
  x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02);
  x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
  x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01);

  // Pairwise 32-bit sums: even entries are the low 4 columns, odd the high 4.
  __m128i x2[8];
  x2[0] = _mm_add_epi32(x1[0], x1[4]);  // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2
  x2[1] = _mm_add_epi32(x1[1], x1[5]);
  x2[2] = _mm_add_epi32(x1[2], x1[6]);  // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4
  x2[3] = _mm_add_epi32(x1[3], x1[7]);
  x2[4] = _mm_add_epi32(x1[8], x1[10]);  // x0*sin3 -x2*sin3 +x3*sin3
  x2[5] = _mm_add_epi32(x1[9], x1[11]);
  x2[6] = _mm_add_epi32(x1[12], x1[14]);  // x0*sin1 +x2*sin4 +x0*sin2 -x2*sin1
  x2[7] = _mm_add_epi32(x1[13], x1[15]);

  // Round, shift back to 16-bit precision and pack with saturation.
  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  for (int i = 0; i < 4; ++i) {
    __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
    __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
    out1 = _mm_srai_epi32(out1, INV_COS_BIT);
    output[i] = _mm_packs_epi32(out0, out1);
  }
}

// 4-point inverse ADST, 4-column variant; same sinpi basis as iadst4_sse2.
// (Definition continues past the end of this chunk.)
static void iadst4_w4_sse2(const __m128i *input, __m128i *output) {
  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
  const __m128i sinpi_0_p03 =
pair_set_epi16(0, sinpi[3]); 1665 const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]); 1666 const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]); 1667 __m128i x0[4]; 1668 x0[0] = input[0]; 1669 x0[1] = input[1]; 1670 x0[2] = input[2]; 1671 x0[3] = input[3]; 1672 1673 __m128i u[2]; 1674 u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); 1675 u[1] = _mm_unpacklo_epi16(x0[1], x0[3]); 1676 1677 __m128i x1[8]; 1678 x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4 1679 x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 1680 x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02); // x1*sin3 + x3*sin2 1681 x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04); // x1*sin3 - x3*sin4 1682 x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3 1683 x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03); // x2*sin3 1684 x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2 1685 x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01); // -x1*sin3 - x3*sin1 1686 1687 __m128i x2[4]; 1688 x2[0] = _mm_add_epi32(x1[0], x1[2]); // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2 1689 x2[1] = _mm_add_epi32(x1[1], x1[3]); // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4 1690 x2[2] = _mm_add_epi32(x1[4], x1[5]); // x0*sin3 - x2*sin3 + x3*sin3 1691 x2[3] = _mm_add_epi32(x1[6], x1[7]); // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1 1692 1693 const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); 1694 for (int i = 0; i < 4; ++i) { 1695 __m128i out0 = _mm_add_epi32(x2[i], rounding); 1696 out0 = _mm_srai_epi32(out0, INV_COS_BIT); 1697 output[i] = _mm_packs_epi32(out0, out0); 1698 } 1699 } 1700 1701 void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output) { 1702 const int8_t cos_bit = INV_COS_BIT; 1703 const int32_t *cospi = cospi_arr(INV_COS_BIT); 1704 const __m128i __zero = _mm_setzero_si128(); 1705 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); 1706 1707 const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); 1708 const __m128i 
cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); 1709 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); 1710 const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); 1711 1712 // stage 1 1713 __m128i x[8]; 1714 x[1] = input[0]; 1715 1716 // stage 2 1717 btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]); 1718 1719 // stage 3 1720 x[4] = x[0]; 1721 x[5] = x[1]; 1722 1723 // stage 4 1724 btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); 1725 1726 // stage 5 1727 x[2] = x[0]; 1728 x[3] = x[1]; 1729 x[6] = x[4]; 1730 x[7] = x[5]; 1731 1732 // stage 6 1733 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); 1734 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); 1735 1736 // stage 7 1737 output[0] = x[0]; 1738 output[1] = _mm_subs_epi16(__zero, x[4]); 1739 output[2] = x[6]; 1740 output[3] = _mm_subs_epi16(__zero, x[2]); 1741 output[4] = x[3]; 1742 output[5] = _mm_subs_epi16(__zero, x[7]); 1743 output[6] = x[5]; 1744 output[7] = _mm_subs_epi16(__zero, x[1]); 1745 } 1746 1747 void av1_iadst8_sse2(const __m128i *input, __m128i *output) { 1748 const int8_t cos_bit = INV_COS_BIT; 1749 const int32_t *cospi = cospi_arr(INV_COS_BIT); 1750 const __m128i __zero = _mm_setzero_si128(); 1751 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); 1752 1753 const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); 1754 const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); 1755 const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); 1756 const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); 1757 const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); 1758 const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); 1759 const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); 1760 const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); 1761 const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], 
cospi[48]); 1762 const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); 1763 const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); 1764 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); 1765 const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); 1766 1767 // stage 1 1768 __m128i x[8]; 1769 x[0] = input[7]; 1770 x[1] = input[0]; 1771 x[2] = input[5]; 1772 x[3] = input[2]; 1773 x[4] = input[3]; 1774 x[5] = input[4]; 1775 x[6] = input[1]; 1776 x[7] = input[6]; 1777 1778 // stage 2 1779 btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]); 1780 btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]); 1781 btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]); 1782 btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]); 1783 1784 // stage 3 1785 btf_16_adds_subs_sse2(x[0], x[4]); 1786 btf_16_adds_subs_sse2(x[1], x[5]); 1787 btf_16_adds_subs_sse2(x[2], x[6]); 1788 btf_16_adds_subs_sse2(x[3], x[7]); 1789 1790 // stage 4 1791 btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); 1792 btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); 1793 1794 // stage 5 1795 btf_16_adds_subs_sse2(x[0], x[2]); 1796 btf_16_adds_subs_sse2(x[1], x[3]); 1797 btf_16_adds_subs_sse2(x[4], x[6]); 1798 btf_16_adds_subs_sse2(x[5], x[7]); 1799 1800 // stage 6 1801 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); 1802 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); 1803 1804 // stage 7 1805 output[0] = x[0]; 1806 output[1] = _mm_subs_epi16(__zero, x[4]); 1807 output[2] = x[6]; 1808 output[3] = _mm_subs_epi16(__zero, x[2]); 1809 output[4] = x[3]; 1810 output[5] = _mm_subs_epi16(__zero, x[7]); 1811 output[6] = x[5]; 1812 output[7] = _mm_subs_epi16(__zero, x[1]); 1813 } 1814 1815 static void iadst8_w4_sse2(const __m128i *input, __m128i *output) { 1816 const int8_t cos_bit = INV_COS_BIT; 1817 const int32_t *cospi = 
cospi_arr(INV_COS_BIT); 1818 const __m128i __zero = _mm_setzero_si128(); 1819 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); 1820 1821 const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); 1822 const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); 1823 const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); 1824 const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); 1825 const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); 1826 const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); 1827 const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); 1828 const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); 1829 const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); 1830 const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); 1831 const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); 1832 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); 1833 const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); 1834 1835 // stage 1 1836 __m128i x[8]; 1837 x[0] = input[7]; 1838 x[1] = input[0]; 1839 x[2] = input[5]; 1840 x[3] = input[2]; 1841 x[4] = input[3]; 1842 x[5] = input[4]; 1843 x[6] = input[1]; 1844 x[7] = input[6]; 1845 1846 // stage 2 1847 btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]); 1848 btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]); 1849 btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]); 1850 btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]); 1851 1852 // stage 3 1853 btf_16_adds_subs_sse2(x[0], x[4]); 1854 btf_16_adds_subs_sse2(x[1], x[5]); 1855 btf_16_adds_subs_sse2(x[2], x[6]); 1856 btf_16_adds_subs_sse2(x[3], x[7]); 1857 1858 // stage 4 1859 btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); 1860 btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); 1861 
1862 // stage 5 1863 btf_16_adds_subs_sse2(x[0], x[2]); 1864 btf_16_adds_subs_sse2(x[1], x[3]); 1865 btf_16_adds_subs_sse2(x[4], x[6]); 1866 btf_16_adds_subs_sse2(x[5], x[7]); 1867 1868 // stage 6 1869 btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); 1870 btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); 1871 1872 // stage 7 1873 output[0] = x[0]; 1874 output[1] = _mm_subs_epi16(__zero, x[4]); 1875 output[2] = x[6]; 1876 output[3] = _mm_subs_epi16(__zero, x[2]); 1877 output[4] = x[3]; 1878 output[5] = _mm_subs_epi16(__zero, x[7]); 1879 output[6] = x[5]; 1880 output[7] = _mm_subs_epi16(__zero, x[1]); 1881 } 1882 1883 static inline void iadst16_stage3_ssse3(__m128i *x) { 1884 btf_16_adds_subs_sse2(x[0], x[8]); 1885 btf_16_adds_subs_sse2(x[1], x[9]); 1886 btf_16_adds_subs_sse2(x[2], x[10]); 1887 btf_16_adds_subs_sse2(x[3], x[11]); 1888 btf_16_adds_subs_sse2(x[4], x[12]); 1889 btf_16_adds_subs_sse2(x[5], x[13]); 1890 btf_16_adds_subs_sse2(x[6], x[14]); 1891 btf_16_adds_subs_sse2(x[7], x[15]); 1892 } 1893 1894 static inline void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi, 1895 const __m128i __rounding, 1896 int8_t cos_bit) { 1897 const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); 1898 const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); 1899 const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); 1900 const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); 1901 const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); 1902 const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); 1903 btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); 1904 btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); 1905 btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); 1906 btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); 1907 } 1908 1909 static inline void iadst16_stage5_ssse3(__m128i *x) { 
1910 btf_16_adds_subs_sse2(x[0], x[4]); 1911 btf_16_adds_subs_sse2(x[1], x[5]); 1912 btf_16_adds_subs_sse2(x[2], x[6]); 1913 btf_16_adds_subs_sse2(x[3], x[7]); 1914 btf_16_adds_subs_sse2(x[8], x[12]); 1915 btf_16_adds_subs_sse2(x[9], x[13]); 1916 btf_16_adds_subs_sse2(x[10], x[14]); 1917 btf_16_adds_subs_sse2(x[11], x[15]); 1918 } 1919 1920 static inline void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi, 1921 const __m128i __rounding, 1922 int8_t cos_bit) { 1923 const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); 1924 const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); 1925 const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); 1926 btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); 1927 btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); 1928 btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); 1929 btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); 1930 } 1931 1932 static inline void iadst16_stage7_ssse3(__m128i *x) { 1933 btf_16_adds_subs_sse2(x[0], x[2]); 1934 btf_16_adds_subs_sse2(x[1], x[3]); 1935 btf_16_adds_subs_sse2(x[4], x[6]); 1936 btf_16_adds_subs_sse2(x[5], x[7]); 1937 btf_16_adds_subs_sse2(x[8], x[10]); 1938 btf_16_adds_subs_sse2(x[9], x[11]); 1939 btf_16_adds_subs_sse2(x[12], x[14]); 1940 btf_16_adds_subs_sse2(x[13], x[15]); 1941 } 1942 1943 static inline void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi, 1944 const __m128i __rounding, 1945 int8_t cos_bit) { 1946 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); 1947 const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); 1948 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); 1949 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); 1950 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]); 1951 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]); 1952 } 1953 1954 static 
inline void iadst16_stage9_ssse3(__m128i *output, __m128i *x) { 1955 const __m128i __zero = _mm_setzero_si128(); 1956 output[0] = x[0]; 1957 output[1] = _mm_subs_epi16(__zero, x[8]); 1958 output[2] = x[12]; 1959 output[3] = _mm_subs_epi16(__zero, x[4]); 1960 output[4] = x[6]; 1961 output[5] = _mm_subs_epi16(__zero, x[14]); 1962 output[6] = x[10]; 1963 output[7] = _mm_subs_epi16(__zero, x[2]); 1964 output[8] = x[3]; 1965 output[9] = _mm_subs_epi16(__zero, x[11]); 1966 output[10] = x[15]; 1967 output[11] = _mm_subs_epi16(__zero, x[7]); 1968 output[12] = x[5]; 1969 output[13] = _mm_subs_epi16(__zero, x[13]); 1970 output[14] = x[9]; 1971 output[15] = _mm_subs_epi16(__zero, x[1]); 1972 } 1973 1974 static void iadst16_low1_ssse3(const __m128i *input, __m128i *output) { 1975 const int8_t cos_bit = INV_COS_BIT; 1976 const int32_t *cospi = cospi_arr(INV_COS_BIT); 1977 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); 1978 1979 const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); 1980 const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); 1981 const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); 1982 const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); 1983 1984 // stage 1 1985 __m128i x[16]; 1986 x[1] = input[0]; 1987 1988 // stage 2 1989 btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]); 1990 1991 // stage 3 1992 x[8] = x[0]; 1993 x[9] = x[1]; 1994 1995 // stage 4 1996 btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); 1997 1998 // stage 5 1999 x[4] = x[0]; 2000 x[5] = x[1]; 2001 x[12] = x[8]; 2002 x[13] = x[9]; 2003 2004 // stage 6 2005 btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); 2006 btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); 2007 2008 // stage 7 2009 x[2] = x[0]; 2010 x[3] = x[1]; 2011 x[6] = x[4]; 2012 x[7] = x[5]; 2013 x[10] = x[8]; 2014 x[11] = x[9]; 2015 x[14] = x[12]; 2016 x[15] = x[13]; 2017 2018 iadst16_stage8_ssse3(x, 
cospi, __rounding, cos_bit); 2019 iadst16_stage9_ssse3(output, x); 2020 } 2021 2022 static void iadst16_low8_ssse3(const __m128i *input, __m128i *output) { 2023 const int8_t cos_bit = INV_COS_BIT; 2024 const int32_t *cospi = cospi_arr(INV_COS_BIT); 2025 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); 2026 2027 // stage 1 2028 __m128i x[16]; 2029 x[1] = input[0]; 2030 x[3] = input[2]; 2031 x[5] = input[4]; 2032 x[7] = input[6]; 2033 x[8] = input[7]; 2034 x[10] = input[5]; 2035 x[12] = input[3]; 2036 x[14] = input[1]; 2037 2038 // stage 2 2039 btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]); 2040 btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]); 2041 btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]); 2042 btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]); 2043 btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]); 2044 btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]); 2045 btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]); 2046 btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]); 2047 2048 // stage 3 2049 iadst16_stage3_ssse3(x); 2050 iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit); 2051 iadst16_stage5_ssse3(x); 2052 iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit); 2053 iadst16_stage7_ssse3(x); 2054 iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); 2055 iadst16_stage9_ssse3(output, x); 2056 } 2057 static void iadst16_sse2(const __m128i *input, __m128i *output) { 2058 const int8_t cos_bit = INV_COS_BIT; 2059 const int32_t *cospi = cospi_arr(INV_COS_BIT); 2060 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); 2061 const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); 2062 const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); 2063 const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); 2064 const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); 2065 const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); 2066 const __m128i 
cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); 2067 const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); 2068 const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); 2069 const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); 2070 const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); 2071 const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); 2072 const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); 2073 const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); 2074 const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); 2075 const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); 2076 const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); 2077 2078 // stage 1 2079 __m128i x[16]; 2080 x[0] = input[15]; 2081 x[1] = input[0]; 2082 x[2] = input[13]; 2083 x[3] = input[2]; 2084 x[4] = input[11]; 2085 x[5] = input[4]; 2086 x[6] = input[9]; 2087 x[7] = input[6]; 2088 x[8] = input[7]; 2089 x[9] = input[8]; 2090 x[10] = input[5]; 2091 x[11] = input[10]; 2092 x[12] = input[3]; 2093 x[13] = input[12]; 2094 x[14] = input[1]; 2095 x[15] = input[14]; 2096 2097 // stage 2 2098 btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]); 2099 btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]); 2100 btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]); 2101 btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]); 2102 btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]); 2103 btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]); 2104 btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]); 2105 btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]); 2106 2107 // stage 3~9 2108 iadst16_stage3_ssse3(x); 2109 iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit); 2110 iadst16_stage5_ssse3(x); 2111 iadst16_stage6_ssse3(x, cospi, __rounding, 
cos_bit); 2112 iadst16_stage7_ssse3(x); 2113 iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); 2114 iadst16_stage9_ssse3(output, x); 2115 } 2116 2117 static void iadst16_w4_sse2(const __m128i *input, __m128i *output) { 2118 const int8_t cos_bit = INV_COS_BIT; 2119 const int32_t *cospi = cospi_arr(INV_COS_BIT); 2120 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); 2121 2122 const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); 2123 const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); 2124 const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); 2125 const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); 2126 const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); 2127 const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); 2128 const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); 2129 const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); 2130 const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); 2131 const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); 2132 const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); 2133 const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); 2134 const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); 2135 const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); 2136 const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); 2137 const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); 2138 const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); 2139 const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); 2140 const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); 2141 const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); 2142 const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); 2143 const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], 
cospi[40]); 2144 const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); 2145 const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); 2146 const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); 2147 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); 2148 const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); 2149 2150 // stage 1 2151 __m128i x[16]; 2152 x[0] = input[15]; 2153 x[1] = input[0]; 2154 x[2] = input[13]; 2155 x[3] = input[2]; 2156 x[4] = input[11]; 2157 x[5] = input[4]; 2158 x[6] = input[9]; 2159 x[7] = input[6]; 2160 x[8] = input[7]; 2161 x[9] = input[8]; 2162 x[10] = input[5]; 2163 x[11] = input[10]; 2164 x[12] = input[3]; 2165 x[13] = input[12]; 2166 x[14] = input[1]; 2167 x[15] = input[14]; 2168 2169 // stage 2 2170 btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]); 2171 btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]); 2172 btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]); 2173 btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]); 2174 btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]); 2175 btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]); 2176 btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]); 2177 btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]); 2178 2179 // stage 3 2180 iadst16_stage3_ssse3(x); 2181 2182 // stage 4 2183 btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); 2184 btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); 2185 btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); 2186 btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); 2187 2188 // stage 5 2189 iadst16_stage5_ssse3(x); 2190 2191 // stage 6 2192 btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); 2193 btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, 
x[6], x[7], x[6], x[7]);
  // (continued) tail of a 4-pixel-wide iadst16: stages 6-9. The function's
  // opening stages precede this chunk; x[] holds 16 intermediate vectors.
  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);

  // stage 7
  iadst16_stage7_ssse3(x);

  // stage 8: butterflies with the +/-cos(pi/4) pair on the odd-position lanes
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);

  // stage 9
  iadst16_stage9_ssse3(output, x);
}

// 4-point inverse identity transform: output = input * Sqrt2.
// Computed as input + input * (Sqrt2 - 1), with the fractional part
// (NewSqrt2 - 2^NewSqrt2Bits) applied in Q15 via _mm_mulhrs_epi16, so the
// result keeps 16-bit saturation semantics.
static void iidentity4_ssse3(const __m128i *input, __m128i *output) {
  const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
  const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
  for (int i = 0; i < 4; ++i) {
    __m128i x = _mm_mulhrs_epi16(input[i], scale);
    output[i] = _mm_adds_epi16(x, input[i]);
  }
}

// 8-point inverse identity transform: output = input * 2 (saturating add).
static void iidentity8_sse2(const __m128i *input, __m128i *output) {
  for (int i = 0; i < 8; ++i) {
    output[i] = _mm_adds_epi16(input[i], input[i]);
  }
}

// 16-point inverse identity transform: output = input * 2 * Sqrt2.
// Split as 2*input + input * 2*(Sqrt2 - 1), fractional part in Q15.
static void iidentity16_ssse3(const __m128i *input, __m128i *output) {
  const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
  const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
  for (int i = 0; i < 16; ++i) {
    __m128i x = _mm_mulhrs_epi16(input[i], scale);
    __m128i srcx2 = _mm_adds_epi16(input[i], input[i]);
    output[i] = _mm_adds_epi16(x, srcx2);
  }
}

// Reconstruct 8 pixels: widen 8 predictor bytes to 16 bits, add the 16-bit
// residual with saturation, and pack back to bytes with unsigned saturation.
static inline __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
                                               __m128i res) {
  const __m128i zero = _mm_setzero_si128();
  __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
  return _mm_packus_epi16(x0, x0);
}

// Add a 4-wide residual column (one __m128i row per output row, low 4 lanes
// used) to the destination, optionally reading the residual rows bottom-up
// when flipud is set.
static inline void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
                                               int stride, int flipud,
                                               const int height) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  const __m128i zero = _mm_setzero_si128();
  for (int i = 0; i < height; ++i, j += step) {
    // 4 predictor bytes are moved through a 32-bit scalar load/store.
    const __m128i v = _mm_cvtsi32_si128(*((int *)(output + i * stride)));
    __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
    u = _mm_packus_epi16(u, zero);
    *((int *)(output + i * stride)) = _mm_cvtsi128_si32(u);
  }
}

// Same as above for an 8-wide residual column (64-bit loads/stores).
static inline void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
                                               int stride, int flipud,
                                               const int height) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
    const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
    _mm_storel_epi64((__m128i *)(output + i * stride), u);
  }
}

// 1D functions process 8 pixels at one time.
// Full-length 1D transforms, indexed as [tx-size][DCT/ADST/identity].
// NULL entries are size/type combinations that never occur in the bitstream.
static const transform_1d_ssse3
    lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
      { idct4_sse2, iadst4_sse2, iidentity4_ssse3 },
      { av1_idct8_sse2, av1_iadst8_sse2, iidentity8_sse2 },
      { idct16_sse2, iadst16_sse2, iidentity16_ssse3 },
      { idct32_sse2, NULL, NULL },
      { idct64_low32_ssse3, NULL, NULL },
    };

// functions for blocks with eob at DC and within
// topleft 8x8, 16x16, 32x32 corner
// Third index selects a pruned variant (low1/low8/low16/full) based on how
// many leading coefficients are non-zero (see lowbd_txfm_all_1d_zeros_idx).
static const transform_1d_ssse3
    lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { idct4_sse2, idct4_sse2, NULL, NULL },
          { iadst4_sse2, iadst4_sse2, NULL, NULL },
          { iidentity4_ssse3, iidentity4_ssse3, NULL, NULL },
      },
      { { av1_idct8_low1_ssse3, av1_idct8_sse2, NULL, NULL },
        { av1_iadst8_low1_ssse3, av1_iadst8_sse2, NULL, NULL },
        { iidentity8_sse2, iidentity8_sse2, NULL, NULL } },
      {
          { idct16_low1_ssse3, idct16_low8_ssse3, idct16_sse2, NULL },
          { iadst16_low1_ssse3, iadst16_low8_ssse3, iadst16_sse2, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32_low1_ssse3, idct32_low8_ssse3, idct32_low16_ssse3,
          idct32_sse2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      { { idct64_low1_ssse3, idct64_low8_ssse3, idct64_low16_ssse3,
          idct64_low32_ssse3 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };

// 1D functions process 4 pixels at one time.
// used in 4x4, 4x8, 4x16, 8x4, 16x4
static const transform_1d_ssse3
    lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
      { idct4_w4_sse2, iadst4_w4_sse2, iidentity4_ssse3 },
      { idct8_w4_sse2, iadst8_w4_sse2, iidentity8_sse2 },
      { idct16_w4_sse2, iadst16_w4_sse2, iidentity16_ssse3 },
      { NULL, NULL, NULL },
      { NULL, NULL, NULL },
    };

// Row identity transform for `height` rows of 8 coefficients each:
// loads 32-bit coefficients narrowed to 16 bits, scales by
// NewSqrt2list[txw_idx] / 2^NewSqrt2Bits, and applies the row shift in the
// same pass.  The rounding constant is paired with the scale so one
// _mm_madd_epi16 against (value, 1) computes value*scale + rounding.
// For +/-1 rect blocks the input is pre-scaled by 1/Sqrt2 (Q15 mulhrs).
static inline void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input,
                                           int stride, int shift, int height,
                                           int txw_idx, int rect_type) {
  const int32_t *input_row = input;
  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
  // Combined rounding for the scale shift and the row shift (shift <= 0 here,
  // since it is folded into a right shift by NewSqrt2Bits - shift).
  const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
                                          (1 << (NewSqrt2Bits - shift - 1)));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
  if (rect_type != 1 && rect_type != -1) {
    for (int i = 0; i < height; ++i) {
      const __m128i src = load_32bit_to_16bit(input_row);
      input_row += stride;
      // Interleave with 1 so madd yields src*scale + 1*rounding per lane.
      __m128i lo = _mm_unpacklo_epi16(src, one);
      __m128i hi = _mm_unpackhi_epi16(src, one);
      lo = _mm_madd_epi16(lo, scale_rounding);
      hi = _mm_madd_epi16(hi, scale_rounding);
      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm_packs_epi32(lo, hi);
    }
  } else {
    // rect special code: rectangular blocks get an extra 1/Sqrt2 factor.
    const __m128i rect_scale =
        _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
    for (int i = 0; i < height; ++i) {
      __m128i src = load_32bit_to_16bit(input_row);
      src = _mm_mulhrs_epi16(src, rect_scale);
      input_row += stride;
      __m128i lo = _mm_unpacklo_epi16(src, one);
      __m128i hi = _mm_unpackhi_epi16(src, one);
      lo = _mm_madd_epi16(lo, scale_rounding);
      hi = _mm_madd_epi16(hi, scale_rounding);
      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm_packs_epi32(lo, hi);
    }
  }
}
// Column identity transform plus reconstruction for `height` rows of 8:
// scales by NewSqrt2list[txh_idx] / 2^NewSqrt2Bits, applies the (negative)
// column shift with rounding, then adds the result onto the predictor bytes.
static inline void iidentity_col_8xn_ssse3(uint8_t *output, int stride,
                                           __m128i *buf, int shift, int height,
                                           int txh_idx) {
  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
  const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
  // shift is negative at this point: round then arithmetic-shift by -shift.
  const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
  const __m128i zero = _mm_setzero_si128();
  for (int h = 0; h < height; ++h) {
    // madd against (value, 1) pairs computes value*scale + rounding.
    __m128i lo = _mm_unpacklo_epi16(buf[h], one);
    __m128i hi = _mm_unpackhi_epi16(buf[h], one);
    lo = _mm_madd_epi16(lo, scale_coeff);
    hi = _mm_madd_epi16(hi, scale_coeff);
    lo = _mm_srai_epi32(lo, NewSqrt2Bits);
    hi = _mm_srai_epi32(hi, NewSqrt2Bits);
    lo = _mm_add_epi32(lo, shift_rounding);
    hi = _mm_add_epi32(hi, shift_rounding);
    lo = _mm_srai_epi32(lo, -shift);
    hi = _mm_srai_epi32(hi, -shift);
    __m128i x = _mm_packs_epi32(lo, hi);

    // Add residual to the 8 predictor bytes and store with saturation.
    const __m128i pred = _mm_loadl_epi64((__m128i const *)(output));
    x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
    const __m128i u = _mm_packus_epi16(x, x);
    _mm_storel_epi64((__m128i *)(output), u);
    output += stride;
  }
}

// 2D identity (IDTX) inverse transform + add, processed in 8x8 tiles.
// Row pass, 8x8 transpose, then column pass with reconstruction.
// Coefficients beyond a 32x32 top-left region are never coded, hence the
// col_max/row_max clamps and input_stride = row_max.
void av1_lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output,
                                         int stride, TX_SIZE tx_size) {
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int col_max = AOMMIN(32, txfm_size_col);
  const int row_max = AOMMIN(32, txfm_size_row);
  const int input_stride = row_max;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  for (int i = 0; i < (col_max >> 3); ++i) {
    for (int j = 0; j < (row_max >> 3); j++) {
      __m128i buf[8];
      iidentity_row_8xn_ssse3(buf, input + j * 8 + i * 8 * input_stride,
                              row_max, shift[0], 8, txw_idx, rect_type);
      transpose_16bit_8x8(buf, buf);
      iidentity_col_8xn_ssse3(output + i * 8 + j * 8 * stride, stride, buf,
                              shift[1], 8, txh_idx);
    }
  }
}

// 4x4 inverse transform + add: row pass, 4x4 transpose (with optional
// left-right flip), column pass, final shift, then write-back with optional
// up-down flip.  shift[0] is not applied here (no round_shift after rows).
static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, TX_SIZE tx_size_,
                                           int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[4];
  const TX_SIZE tx_size = TX_4X4;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
  row_txfm(buf, buf);
  if (lr_flip) {
    __m128i temp[4];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_4x4(temp, buf);
  } else {
    transpose_16bit_4x4(buf, buf);
  }
  col_txfm(buf, buf);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}

// Reconstruct 16 pixels: widen 16 predictor bytes, add the two 8-wide
// residual vectors, pack back with unsigned saturation.
static inline __m128i lowbd_get_recon_16x16_sse2(const __m128i pred,
                                                 __m128i res0, __m128i res1) {
  const __m128i zero = _mm_setzero_si128();
  __m128i x0 = _mm_unpacklo_epi8(pred, zero);
  __m128i x1 = _mm_unpackhi_epi8(pred, zero);
  x0 = _mm_adds_epi16(res0, x0);
  x1 = _mm_adds_epi16(res1, x1);
  return _mm_packus_epi16(x0, x1);
}

// Add a 16-wide residual column to the destination; the left half is in
// in[0..height-1], the right half in in[height..2*height-1].
static inline void
lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output,
                             int stride, int flipud,
                             int height) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
    __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]);
    _mm_storeu_si128((__m128i *)(output + i * stride), u);
  }
}

// Multiply each vector by 1/Sqrt2.  NewInvSqrt2 * 8 puts the constant in
// Q15 for _mm_mulhrs_epi16 (NewInvSqrt2 is in Q12).
static inline void round_shift_ssse3(const __m128i *input, __m128i *output,
                                     int size) {
  const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8);
  for (int i = 0; i < size; ++i) {
    output[i] = _mm_mulhrs_epi16(input[i], scale);
  }
}

// General 2D inverse transform + add for DCT/ADST combinations (no identity
// pass).  eob is decomposed into per-direction extents (eobx/eoby) which
// select pruned row/column kernels and bound the amount of data processed.
// Rows are transformed 8 at a time, transposed into buf1 (stored as 8-wide
// column groups, each txfm_size_row vectors long), then columns are
// transformed in place and the result is added to the destination.
static inline void lowbd_inv_txfm2d_add_no_identity_ssse3(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m128i buf1[64 * 8];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;
  // Round the non-zero extents up to multiples of 8 rows/columns.
  const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  // Coefficients beyond 32 in either direction are never coded.
  const int input_stride = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    __m128i buf0[64];
    load_buffer_32bit_to_16bit(input + 8 * i, input_stride, buf0,
                               buf_size_nonzero_w);
    if (rect_type == 1 || rect_type == -1) {
      round_shift_ssse3(buf0, buf0, buf_size_nonzero_w);  // rect special code
    }
    row_txfm(buf0, buf0);
    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
    __m128i *_buf1 = buf1 + i * 8;
    if (lr_flip) {
      // Reverse the 8-wide groups and the lanes within each group so the
      // transposed output lands mirrored left-to-right.
      for (int j = 0; j < buf_size_w_div8; ++j) {
        __m128i temp[8];
        flip_buf_sse2(buf0 + 8 * j, temp, 8);
        transpose_16bit_8x8(temp,
                            _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
      }
    }
  }
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row);
    round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
  }

  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      // Two adjacent 8-wide column groups form one 16-wide store.
      lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
                                   output + 16 * i, stride, ud_flip,
                                   txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row);
  }
}

// 2D inverse transform + add where the *horizontal* pass is identity
// (V_DCT / V_ADST / V_FLIPADST): only the column transform does real work,
// the row pass is a scaled copy via iidentity_row_8xn_ssse3.
void av1_lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input,
                                               uint8_t *output, int stride,
                                               TX_TYPE tx_type, TX_SIZE tx_size,
                                               int eob) {
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  int eobx, eoby;
  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int
      buf_size_w_div8 = (eobx + 8) >> 3;
  const int buf_size_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
  assert(fun_idx < 5);
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];

  assert(col_txfm != NULL);

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_w_div8; i++) {
    __m128i buf0[64];
    // Identity row pass + transpose for each 8x8 tile of this column group.
    for (int j = 0; j < buf_size_h_div8; j++) {
      __m128i *buf0_cur = buf0 + j * 8;
      const int32_t *input_cur = input + i * 8 * input_stride + j * 8;
      iidentity_row_8xn_ssse3(buf0_cur, input_cur, input_stride, shift[0], 8,
                              txw_idx, rect_type);
      transpose_16bit_8x8(buf0_cur, buf0_cur);
    }
    col_txfm(buf0, buf0);
    // shift[1] is negative; mulhrs by 2^(15+shift[1]) in Q15 performs the
    // final rounded right shift by -shift[1].
    __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
    int k = ud_flip ? (txfm_size_row - 1) : 0;
    const int step = ud_flip ? -1 : 1;
    uint8_t *out = output + 8 * i;
    for (int j = 0; j < txfm_size_row; ++j, k += step) {
      const __m128i v = _mm_loadl_epi64((__m128i const *)(out));
      __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
      const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
      _mm_storel_epi64((__m128i *)(out), u);
      out += stride;
    }
  }
}

// 2D inverse transform + add where the *vertical* pass is identity
// (H_DCT / H_ADST / H_FLIPADST): real row transform, identity column pass
// fused with reconstruction in iidentity_col_8xn_ssse3.
void av1_lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input,
                                               uint8_t *output, int stride,
                                               TX_TYPE tx_type, TX_SIZE tx_size,
                                               int eob) {
  __m128i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;
  const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
  const int buf_size_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];

  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_h_div8; i++) {
    __m128i buf0[64];
    load_buffer_32bit_to_16bit(input + i * 8, input_stride, buf0,
                               buf_size_nonzero_w);
    if (rect_type == 1 || rect_type == -1) {
      round_shift_ssse3(buf0, buf0, buf_size_nonzero_w);  // rect special code
    }
    row_txfm(buf0, buf0);
    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
    __m128i *_buf1 = buf1;
    if (lr_flip) {
      // Mirror the row output left-to-right while transposing.
      for (int j = 0; j < buf_size_w_div8; ++j) {
        __m128i temp[8];
        flip_buf_sse2(buf0 + 8 * j, temp, 8);
        transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
      }
    }

    // Identity column pass fused with reconstruction, 8 columns at a time.
    for (int j = 0; j < buf_size_w_div8; ++j) {
      iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride,
                              buf1 + j * 8, shift[1], 8, txh_idx);
    }
  }
}

// for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64
// Dispatch on transform type to the identity-aware implementations above.
static inline void lowbd_inv_txfm2d_add_universe_ssse3(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  switch (tx_type) {
    case DCT_DCT:
      lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
                                             tx_size, eob);
      break;
    case IDTX:
      av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
      break;
    case V_DCT:
    case V_ADST:
    case V_FLIPADST:
      av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
                                                tx_size, eob);
      break;
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
      av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
                                                tx_size, eob);
      break;
    default:
      lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
                                             tx_size, eob);
      break;
  }
}

// 4x8 inverse transform + add.  Rectangular, so input is pre-scaled by
// 1/Sqrt2 before the 8-point row pass; the 4-point column pass follows the
// 8x4 transpose.
static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, TX_SIZE tx_size_,
                                           int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[8];
  const TX_SIZE tx_size = TX_4X8;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit(input, txfm_size_row, buf, txfm_size_col);
  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
  row_txfm(buf, buf);
  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
  if (lr_flip) {
    __m128i temp[4];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_8x4(temp, buf);
  } else {
    transpose_16bit_8x4(buf, buf);
  }
  col_txfm(buf, buf);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}

// 8x4 inverse transform + add: mirror of the 4x8 case (4-point rows,
// 8-point columns, 4x8 transpose, 1/Sqrt2 rect pre-scale).
static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, TX_SIZE tx_size_,
                                           int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[8];
  const TX_SIZE tx_size = TX_8X4;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
  row_txfm(buf, buf);
  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
  if (lr_flip) {
    __m128i temp[8];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_4x8(temp, buf);
  } else {
    transpose_16bit_4x8(buf, buf);
  }
  col_txfm(buf, buf);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}

// 4x16 inverse transform + add, processed as two stacked 4x8 halves on the
// row side.  The identity row case is special-cased: the combined
// Sqrt2 scale and shift[0] are applied exactly in 32-bit via madd, instead
// of calling iidentity4_ssse3 (whose Q15 path would round differently).
static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input,
                                            uint8_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size_,
                                            int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[16];
  const TX_SIZE tx_size = TX_4X16;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  const int row_one_loop = 8;
  for (int i = 0; i < 2; ++i) {
    const int32_t *input_cur = input + i * row_one_loop;
    __m128i *buf_cur = buf + i * row_one_loop;
    load_buffer_32bit_to_16bit(input_cur, txfm_size_row, buf_cur,
                               txfm_size_col);
    if (row_txfm == iidentity4_ssse3) {
      // value * NewSqrt2 + rounding for a right shift by NewSqrt2Bits + 1
      // (Sqrt2 scale and shift[0] = -1 folded together).
      const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1));
      const __m128i ones = _mm_set1_epi16(1);
      for (int j = 0; j < 4; ++j) {
        const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones);
        const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones);
        const __m128i buf_32_lo =
            _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
        const __m128i buf_32_hi =
            _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
        buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
      }
    } else {
      row_txfm(buf_cur, buf_cur);
      round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
    }
    if (lr_flip) {
      __m128i temp[8];
      flip_buf_sse2(buf_cur, temp, txfm_size_col);
      transpose_16bit_8x4(temp, buf_cur);
    } else {
      transpose_16bit_8x4(buf_cur, buf_cur);
    }
  }
  col_txfm(buf, buf);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}

// 16x4 inverse transform + add.  As with 4x16, the identity row case folds
// the 2*Sqrt2 scale and shift[0] into one exact 32-bit multiply-shift
// instead of calling iidentity16_ssse3.
static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input,
                                            uint8_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size_,
                                            int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[16];
  const TX_SIZE tx_size = TX_16X4;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int row_one_loop = 8;
  load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
  if (row_txfm == iidentity16_ssse3) {
    // value * 2*NewSqrt2 + rounding for a right shift by NewSqrt2Bits + 1.
    const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1));
    const __m128i ones = _mm_set1_epi16(1);
    for (int j = 0; j < 16; ++j) {
      const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones);
      const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones);
      const __m128i buf_32_lo =
          _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
      const __m128i buf_32_hi =
          _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
      buf[j] = _mm_packs_epi32(buf_32_lo,
                               buf_32_hi);
    }
  } else {
    row_txfm(buf, buf);
    round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
  }
  if (lr_flip) {
    __m128i temp[16];
    flip_buf_sse2(buf, temp, 16);
    transpose_16bit_4x8(temp, buf);
    transpose_16bit_4x8(temp + 8, buf + 8);
  } else {
    transpose_16bit_4x8(buf, buf);
    transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
  }
  // Column pass over the two 8-wide halves, then reconstruct each half.
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf + i * row_one_loop, buf + i * row_one_loop);
    round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
  }
  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
  lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
}

// Top-level low-bitdepth 2D inverse transform + add: small/narrow sizes get
// dedicated kernels, everything else goes through the universe dispatcher.
void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
                                    int stride, TX_TYPE tx_type,
                                    TX_SIZE tx_size, int eob) {
  switch (tx_size) {
    case TX_4X4:
      lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size,
                                     eob);
      break;
    case TX_4X8:
      lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size,
                                     eob);
      break;
    case TX_8X4:
      lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size,
                                     eob);
      break;
    case TX_4X16:
      lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size,
                                      eob);
      break;
    case TX_16X4:
      lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size,
                                      eob);
      break;
    default:
      lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type,
                                          tx_size, eob);
      break;
  }
}

// Public entry point: SIMD path for non-lossless blocks, C fallback for
// lossless (WHT) blocks.
void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                            const TxfmParam *txfm_param) {
  if (!txfm_param->lossless) {
    const TX_TYPE tx_type = txfm_param->tx_type;
    av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
                                   txfm_param->tx_size, txfm_param->eob);

  } else {
    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
  }
}