highbd_inv_txfm_sse4.c
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <smmintrin.h> /* SSE4.1 */

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/idct.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
#include "av1/common/x86/av1_txfm_sse2.h"
#include "av1/common/x86/av1_txfm_sse4.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"

static inline __m128i highbd_clamp_epi16(__m128i u, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  __m128i clamped, mask;

  mask = _mm_cmpgt_epi16(u, max);
  clamped = _mm_andnot_si128(mask, u);
  mask = _mm_and_si128(mask, max);
  clamped = _mm_or_si128(mask, clamped);
  mask = _mm_cmpgt_epi16(clamped, zero);
  clamped = _mm_and_si128(clamped, mask);

  return clamped;
}

static inline void round_shift_4x4(__m128i *in, int shift) {
  if (shift != 0) {
    __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
    in[0] = _mm_add_epi32(in[0], rnding);
    in[1] = _mm_add_epi32(in[1], rnding);
    in[2] = _mm_add_epi32(in[2], rnding);
    in[3] = _mm_add_epi32(in[3], rnding);

    in[0] = _mm_srai_epi32(in[0], shift);
    in[1] = _mm_srai_epi32(in[1], shift);
    in[2] = _mm_srai_epi32(in[2], shift);
    in[3] = _mm_srai_epi32(in[3], shift);
  }
}

static void round_shift_8x8(__m128i *in, int shift) {
  round_shift_4x4(&in[0], shift);
  round_shift_4x4(&in[4], shift);
  round_shift_4x4(&in[8], shift);
  round_shift_4x4(&in[12], shift);
}

static void highbd_clamp_epi32_sse4_1(__m128i *in, __m128i *out,
                                      const __m128i *clamp_lo,
                                      const __m128i *clamp_hi, int size) {
  __m128i a0, a1;
  for (int i = 0; i < size; i += 4) {
    a0 = _mm_max_epi32(in[i], *clamp_lo);
    out[i] = _mm_min_epi32(a0, *clamp_hi);

    a1 = _mm_max_epi32(in[i + 1], *clamp_lo);
    out[i + 1] = _mm_min_epi32(a1, *clamp_hi);

    a0 = _mm_max_epi32(in[i + 2], *clamp_lo);
    out[i + 2] = _mm_min_epi32(a0, *clamp_hi);

    a1 = _mm_max_epi32(in[i + 3], *clamp_lo);
    out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
  }
}

static inline __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
                                                  __m128i res0, __m128i res1,
                                                  const int bd) {
  __m128i x0 = _mm_cvtepi16_epi32(pred);
  __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));
  __m128i min_clip_val = _mm_setzero_si128();
  __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1);
  x0 = _mm_add_epi32(res0, x0);
  x1 = _mm_add_epi32(res1, x1);
  x0 = _mm_max_epi32(x0, min_clip_val);
  x0 = _mm_min_epi32(x0, max_clip_val);
  x1 = _mm_max_epi32(x1, min_clip_val);
  x1 = _mm_min_epi32(x1, max_clip_val);
  x0 = _mm_packus_epi32(x0, x1);
  return x0;
}

static inline __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred,
                                                  __m128i res0, const int bd) {
  __m128i x0 = _mm_cvtepi16_epi32(pred);

  x0 = _mm_add_epi32(res0, x0);
  x0 = _mm_packus_epi32(x0, x0);
  x0 = highbd_clamp_epi16(x0, bd);
  return x0;
}
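// For reference, the pixel-domain clamp performed by the recon helpers above
// is, per lane, equivalent to this scalar sketch (illustrative only, not part
// of the build):
//   uint16_t clip_pixel_highbd(int32_t val, int bd) {
//     const int32_t max = (1 << bd) - 1;  // 1023 for 10-bit, 4095 for 12-bit
//     return (uint16_t)(val < 0 ? 0 : (val > max ? max : val));
//   }
// highbd_get_recon_8x8_sse4_1 / highbd_get_recon_4xn_sse4_1 add the 32-bit
// residue to the widened 16-bit prediction, apply this clamp, and pack the
// result back to uint16_t.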
static inline void highbd_write_buffer_4xn_sse4_1(__m128i *in,
                                                  uint16_t *output, int stride,
                                                  int flipud, int height,
                                                  const int bd) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
    __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd);

    _mm_storel_epi64((__m128i *)(output + i * stride), u);
  }
}

static inline void highbd_write_buffer_8xn_sse4_1(__m128i *in,
                                                  uint16_t *output, int stride,
                                                  int flipud, int height,
                                                  const int bd) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
    __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd);

    _mm_storeu_si128((__m128i *)(output + i * stride), u);
  }
}

static inline void load_buffer_32bit_input(const int32_t *in, int stride,
                                           __m128i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride));
  }
}

static inline void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
}

void av1_highbd_iwht4x4_16_add_sse4_1(const tran_low_t *input, uint8_t *dest8,
                                      int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  __m128i op[4];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  load_buffer_4x4(input, op);

  // Shift before-hand.
  op[0] = _mm_srai_epi32(op[0], UNIT_QUANT_SHIFT);
  op[1] = _mm_srai_epi32(op[1], UNIT_QUANT_SHIFT);
  op[2] = _mm_srai_epi32(op[2], UNIT_QUANT_SHIFT);
  op[3] = _mm_srai_epi32(op[3], UNIT_QUANT_SHIFT);

  for (int i = 0; i < 2; ++i) {
    __m128i a1 = op[0];
    __m128i c1 = op[1];
    __m128i d1 = op[2];
    __m128i b1 = op[3];
    a1 = _mm_add_epi32(a1, c1);          // a1 += c1
    d1 = _mm_sub_epi32(d1, b1);          // d1 -= b1
    __m128i e1 = _mm_sub_epi32(a1, d1);  // e1 = (a1 - d1) >> 1
    e1 = _mm_srai_epi32(e1, 1);
    b1 = _mm_sub_epi32(e1, b1);  // b1 = e1 - b1
    c1 = _mm_sub_epi32(e1, c1);  // c1 = e1 - c1
    a1 = _mm_sub_epi32(a1, b1);  // a1 -= b1
    d1 = _mm_add_epi32(d1, c1);  // d1 += c1

    op[0] = a1;
    op[1] = b1;
    op[2] = c1;
    op[3] = d1;
    if (i == 0) {
      transpose_32bit_4x4(op, op);
    }
  }

  // Convert to int16_t. The C code checks that we are in range.
  op[0] = _mm_packs_epi32(op[0], op[1]);
  op[1] = _mm_packs_epi32(op[2], op[3]);

  // Load uint16_t.
  __m128i dst[2];
  __m128i tmp[4];
  tmp[0] = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
  tmp[1] = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
  dst[0] = _mm_unpacklo_epi64(tmp[0], tmp[1]);
  tmp[2] = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
  tmp[3] = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
  dst[1] = _mm_unpacklo_epi64(tmp[2], tmp[3]);

  // Add to the previous results.
  dst[0] = _mm_add_epi16(dst[0], op[0]);
  dst[1] = _mm_add_epi16(dst[1], op[1]);

  // Clamp.
  dst[0] = highbd_clamp_epi16(dst[0], bd);
  dst[1] = highbd_clamp_epi16(dst[1], bd);

  // Store.
  _mm_storel_epi64((__m128i *)(dest + 0 * stride), dst[0]);
  dst[0] = _mm_srli_si128(dst[0], 8);
  _mm_storel_epi64((__m128i *)(dest + 1 * stride), dst[0]);
  _mm_storel_epi64((__m128i *)(dest + 2 * stride), dst[1]);
  dst[1] = _mm_srli_si128(dst[1], 8);
  _mm_storel_epi64((__m128i *)(dest + 3 * stride), dst[1]);
}
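// The loop above applies the same lifting steps as the scalar reference
// (av1_highbd_iwht4x4_16_add_c): a1 += c1; d1 -= b1; e1 = (a1 - d1) >> 1;
// b1 = e1 - b1; c1 = e1 - c1; a1 -= b1; d1 += c1. Running it once per
// dimension, with a transpose in between, yields the 2D inverse WHT on all
// four columns at a time.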
static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
                          __m128i *out1, const __m128i *clamp_lo,
                          const __m128i *clamp_hi) {
  __m128i a0 = _mm_add_epi32(in0, in1);
  __m128i a1 = _mm_sub_epi32(in0, in1);

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}

static void shift_and_clamp_sse4_1(__m128i *in0, __m128i *in1,
                                   const __m128i *clamp_lo,
                                   const __m128i *clamp_hi, int shift) {
  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  __m128i in0_w_offset = _mm_add_epi32(*in0, offset);
  __m128i in1_w_offset = _mm_add_epi32(*in1, offset);

  in0_w_offset = _mm_sra_epi32(in0_w_offset, _mm_cvtsi32_si128(shift));
  in1_w_offset = _mm_sra_epi32(in1_w_offset, _mm_cvtsi32_si128(shift));

  in0_w_offset = _mm_max_epi32(in0_w_offset, *clamp_lo);
  in0_w_offset = _mm_min_epi32(in0_w_offset, *clamp_hi);
  in1_w_offset = _mm_max_epi32(in1_w_offset, *clamp_lo);
  in1_w_offset = _mm_min_epi32(in1_w_offset, *clamp_hi);

  *in0 = in0_w_offset;
  *in1 = in1_w_offset;
}

static inline void idct32_stage4_sse4_1(
    __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56,
    const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40,
    const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24,
    const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
  bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
  bf1[17] = temp1;

  temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
  bf1[29] =
      half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
  bf1[18] = temp2;

  temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
  bf1[21] = temp1;

  temp2 =
      half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
  bf1[25] =
      half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
  bf1[22] = temp2;
}
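// half_btf_sse4_1 (from highbd_txfm_utility_sse4.h) computes one output of a
// butterfly rotation: (w0 * in0 + w1 * in1 + (1 << (bit - 1))) >> bit, with
// the cospi weights in Q`bit` fixed point. Each temp1/temp2 pair in these
// idct32 stage helpers is therefore a full 2x2 rotation of (bf1[i], bf1[j]),
// evaluated into a temporary so neither input is overwritten mid-rotation.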
static inline void idct32_stage5_sse4_1(
    __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48,
    const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo,
    const __m128i *clamp_hi, const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
  bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
  bf1[9] = temp1;

  temp2 =
      half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
  bf1[13] =
      half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
  bf1[10] = temp2;

  addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
}

static inline void idct32_stage6_sse4_1(
    __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32,
    const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
    const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
    const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  bf1[5] = temp1;

  addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
  bf1[29] =
      half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
  bf1[18] = temp1;
  temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
  bf1[28] =
      half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
  bf1[19] = temp2;
  temp1 =
      half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
  bf1[27] =
      half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
  bf1[20] = temp1;
  temp2 =
      half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
  bf1[21] = temp2;
}
static inline void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  bf1[13] =
      half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  bf1[10] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  bf1[12] =
      half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  bf1[11] = temp2;

  addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
}

static inline void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  bf1[27] =
      half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  bf1[20] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  bf1[21] = temp2;
  temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  bf1[25] =
      half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  bf1[22] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  bf1[24] =
      half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  bf1[23] = temp2;
}

static inline void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
                                        const int do_cols, const int bd,
                                        const int out_shift,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi) {
  addsub_sse4_1(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out =
        _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
    for (int i = 0; i < 32; i += 8) {
      round_shift_4x4(out + i, out_shift);
      round_shift_4x4(out + i + 4, out_shift);
    }
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
  }
}
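// neg_shift_sse4_1 below is the rounding shift used for ADST outputs whose
// sign must be flipped: it returns (offset + in0) >> shift and
// (offset - in1) >> shift, i.e. round_shift(in0) and round_shift(-in1) in one
// call, then clamps both results to the given range.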
static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
                             __m128i *out0, __m128i *out1,
                             const __m128i *clamp_lo, const __m128i *clamp_hi,
                             int shift) {
  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  __m128i a0 = _mm_add_epi32(offset, in0);
  __m128i a1 = _mm_sub_epi32(offset, in1);

  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}

static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                           int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3, x, y;

  // Stage 0
  // Stage 1
  // Stage 2
  u0 = in[0];
  u1 = in[1];
  u2 = in[2];
  u3 = in[3];

  x = _mm_mullo_epi32(u0, cospi32);
  y = _mm_mullo_epi32(u2, cospi32);
  v0 = _mm_add_epi32(x, y);
  v0 = _mm_add_epi32(v0, rnding);
  v0 = _mm_srai_epi32(v0, bit);

  v1 = _mm_sub_epi32(x, y);
  v1 = _mm_add_epi32(v1, rnding);
  v1 = _mm_srai_epi32(v1, bit);

  x = _mm_mullo_epi32(u1, cospi48);
  y = _mm_mullo_epi32(u3, cospim16);
  v2 = _mm_add_epi32(x, y);
  v2 = _mm_add_epi32(v2, rnding);
  v2 = _mm_srai_epi32(v2, bit);

  x = _mm_mullo_epi32(u1, cospi16);
  y = _mm_mullo_epi32(u3, cospi48);
  v3 = _mm_add_epi32(x, y);
  v3 = _mm_add_epi32(v3, rnding);
  v3 = _mm_srai_epi32(v3, bit);

  // Stage 3
  addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);

  if (!do_cols) {
    log_range = AOMMAX(16, bd + 6);
    clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

    shift_and_clamp_sse4_1(out + 0, out + 3, &clamp_lo, &clamp_hi, out_shift);
    shift_and_clamp_sse4_1(out + 1, out + 2, &clamp_lo, &clamp_hi, out_shift);
  }
}
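// iadst4x4_sse4_1 below needs a wider intermediate than the other 4-point
// transforms: the sinpi sums are scaled in 64-bit lanes so the multiply
// cannot overflow before the final rounding shift. With bit == INV_COS_BIT
// (12), the multiply by (1 << 4), add of (1 << 15), and 2-byte (16-bit)
// _mm_srli_si128 together implement a per-lane round(x >> 12), since
//   (x * 16 + (1 << 15)) >> 16 == (x + (1 << 11)) >> 12.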
static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                            int bd, int out_shift) {
  const int32_t *sinpi = sinpi_arr(bit);
  const __m128i zero = _mm_setzero_si128();
  __m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1));
  rnding = _mm_unpacklo_epi32(rnding, zero);
  const __m128i mul = _mm_set1_epi32(1 << 4);
  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
  __m128i t;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i x0, x1, x2, x3;
  __m128i u0, u1, u2, u3;
  __m128i u0_low, u1_low, u2_low, u3_low;
  __m128i u0_high, u1_high, u2_high, u3_high;

  x0 = in[0];
  x1 = in[1];
  x2 = in[2];
  x3 = in[3];

  s0 = _mm_mullo_epi32(x0, sinpi1);
  s1 = _mm_mullo_epi32(x0, sinpi2);
  s2 = _mm_mullo_epi32(x1, sinpi3);
  s3 = _mm_mullo_epi32(x2, sinpi4);
  s4 = _mm_mullo_epi32(x2, sinpi1);
  s5 = _mm_mullo_epi32(x3, sinpi2);
  s6 = _mm_mullo_epi32(x3, sinpi4);
  t = _mm_sub_epi32(x0, x2);
  s7 = _mm_add_epi32(t, x3);

  t = _mm_add_epi32(s0, s3);
  s0 = _mm_add_epi32(t, s5);
  t = _mm_sub_epi32(s1, s4);
  s1 = _mm_sub_epi32(t, s6);
  s3 = s2;
  s2 = _mm_mullo_epi32(s7, sinpi3);

  u0 = _mm_add_epi32(s0, s3);
  u1 = _mm_add_epi32(s1, s3);
  u2 = s2;
  t = _mm_add_epi32(s0, s1);
  u3 = _mm_sub_epi32(t, s3);

  // u0
  u0_low = _mm_mul_epi32(u0, mul);
  u0_low = _mm_add_epi64(u0_low, rnding);

  u0 = _mm_srli_si128(u0, 4);
  u0_high = _mm_mul_epi32(u0, mul);
  u0_high = _mm_add_epi64(u0_high, rnding);

  u0_low = _mm_srli_si128(u0_low, 2);
  u0_high = _mm_srli_si128(u0_high, 2);

  u0 = _mm_unpacklo_epi32(u0_low, u0_high);
  u0_high = _mm_unpackhi_epi32(u0_low, u0_high);
  u0 = _mm_unpacklo_epi64(u0, u0_high);

  // u1
  u1_low = _mm_mul_epi32(u1, mul);
  u1_low = _mm_add_epi64(u1_low, rnding);

  u1 = _mm_srli_si128(u1, 4);
  u1_high = _mm_mul_epi32(u1, mul);
  u1_high = _mm_add_epi64(u1_high, rnding);

  u1_low = _mm_srli_si128(u1_low, 2);
  u1_high = _mm_srli_si128(u1_high, 2);

  u1 = _mm_unpacklo_epi32(u1_low, u1_high);
  u1_high = _mm_unpackhi_epi32(u1_low, u1_high);
  u1 = _mm_unpacklo_epi64(u1, u1_high);

  // u2
  u2_low = _mm_mul_epi32(u2, mul);
  u2_low = _mm_add_epi64(u2_low, rnding);

  u2 = _mm_srli_si128(u2, 4);
  u2_high = _mm_mul_epi32(u2, mul);
  u2_high = _mm_add_epi64(u2_high, rnding);

  u2_low = _mm_srli_si128(u2_low, 2);
  u2_high = _mm_srli_si128(u2_high, 2);

  u2 = _mm_unpacklo_epi32(u2_low, u2_high);
  u2_high = _mm_unpackhi_epi32(u2_low, u2_high);
  u2 = _mm_unpacklo_epi64(u2, u2_high);

  // u3
  u3_low = _mm_mul_epi32(u3, mul);
  u3_low = _mm_add_epi64(u3_low, rnding);

  u3 = _mm_srli_si128(u3, 4);
  u3_high = _mm_mul_epi32(u3, mul);
  u3_high = _mm_add_epi64(u3_high, rnding);

  u3_low = _mm_srli_si128(u3_low, 2);
  u3_high = _mm_srli_si128(u3_high, 2);

  u3 = _mm_unpacklo_epi32(u3_low, u3_high);
  u3_high = _mm_unpackhi_epi32(u3_low, u3_high);
  u3 = _mm_unpacklo_epi64(u3, u3_high);

  out[0] = u0;
  out[1] = u1;
  out[2] = u2;
  out[3] = u3;

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    round_shift_4x4(out, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
  }
}

static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
                             int fliplr, int flipud, int shift, int bd) {
  const __m128i zero = _mm_setzero_si128();
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  round_shift_4x4(in, shift);

  v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
  v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
  v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
  v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));

  v0 = _mm_unpacklo_epi16(v0, zero);
  v1 = _mm_unpacklo_epi16(v1, zero);
  v2 = _mm_unpacklo_epi16(v2, zero);
  v3 = _mm_unpacklo_epi16(v3, zero);

  if (fliplr) {
    in[0] = _mm_shuffle_epi32(in[0], 0x1B);
    in[1] = _mm_shuffle_epi32(in[1], 0x1B);
    in[2] = _mm_shuffle_epi32(in[2], 0x1B);
    in[3] = _mm_shuffle_epi32(in[3], 0x1B);
  }

  if (flipud) {
    u0 = _mm_add_epi32(in[3], v0);
    u1 = _mm_add_epi32(in[2], v1);
    u2 = _mm_add_epi32(in[1], v2);
    u3 = _mm_add_epi32(in[0], v3);
  } else {
    u0 = _mm_add_epi32(in[0], v0);
    u1 = _mm_add_epi32(in[1], v1);
    u2 = _mm_add_epi32(in[2], v2);
    u3 = _mm_add_epi32(in[3], v3);
  }

  v0 = _mm_packus_epi32(u0, u1);
  v2 = _mm_packus_epi32(u2, u3);

  u0 = highbd_clamp_epi16(v0, bd);
  u2 = highbd_clamp_epi16(v2, bd);

  v0 = _mm_unpacklo_epi64(u0, u0);
  v1 = _mm_unpackhi_epi64(u0, u0);
  v2 = _mm_unpacklo_epi64(u2, u2);
  v3 = _mm_unpackhi_epi64(u2, u2);

  _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
  _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
  _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
  _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
}
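// iidentity4_sse4_1 below scales by sqrt(2) in fixed point: NewSqrt2 over
// (1 << NewSqrt2Bits), approximately 5793 / 4096 in the AV1 definitions
// (av1/common/av1_txfm.h). The products are formed with _mm_mul_epi32 in
// 64-bit lanes so the multiply cannot overflow before the shift by
// NewSqrt2Bits.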
static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  (void)bit;
  __m128i zero = _mm_setzero_si128();
  __m128i fact = _mm_set1_epi32(NewSqrt2);
  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
  __m128i a0_low, a1_low;
  __m128i a0_high, a1_high;

  offset = _mm_unpacklo_epi32(offset, zero);

  for (int i = 0; i < 4; i++) {
    a0_low = _mm_mul_epi32(in[i], fact);
    a0_low = _mm_add_epi32(a0_low, offset);
    a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);

    a0_high = _mm_srli_si128(in[i], 4);
    a0_high = _mm_mul_epi32(a0_high, fact);
    a0_high = _mm_add_epi32(a0_high, offset);
    a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);

    a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
    a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
    out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
  }

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    round_shift_4x4(out, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
  }
}

void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  __m128i in[4];
  const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_4x4(input, in);
      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_DCT:
      load_buffer_4x4(input, in);
      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case DCT_ADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_ADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case FLIPADST_DCT:
      load_buffer_4x4(input, in);
      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case DCT_FLIPADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
      break;
    case ADST_FLIPADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_ADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case IDTX:
      load_buffer_4x4(input, in);
      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case V_DCT:
      load_buffer_4x4(input, in);
      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case H_DCT:
      load_buffer_4x4(input, in);
      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case V_ADST:
      load_buffer_4x4(input, in);
      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case H_ADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case V_FLIPADST:
      load_buffer_4x4(input, in);
      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case H_FLIPADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    default: assert(0);
  }
}

// 8x8
static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
  in[4] = _mm_load_si128((const __m128i *)(coeff + 16));
  in[5] = _mm_load_si128((const __m128i *)(coeff + 20));
  in[6] = _mm_load_si128((const __m128i *)(coeff + 24));
  in[7] = _mm_load_si128((const __m128i *)(coeff + 28));
  in[8] = _mm_load_si128((const __m128i *)(coeff + 32));
  in[9] = _mm_load_si128((const __m128i *)(coeff + 36));
  in[10] = _mm_load_si128((const __m128i *)(coeff + 40));
  in[11] = _mm_load_si128((const __m128i *)(coeff + 44));
  in[12] = _mm_load_si128((const __m128i *)(coeff + 48));
  in[13] = _mm_load_si128((const __m128i *)(coeff + 52));
  in[14] = _mm_load_si128((const __m128i *)(coeff + 56));
  in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
}
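// Each av1_inv_txfm2d_add_*_sse4_1 entry point above and below follows the
// same plan: load the 32-bit coefficients, run the row transform
// (do_cols == 0, which also applies the -shift[0] rounding where nonzero),
// transpose, run the column transform (do_cols == 1), then round by
// -shift[1] and add the residue to the prediction with a clamp to
// [0, (1 << bd) - 1]. A minimal caller sketch (buffer names hypothetical):
//   DECLARE_ALIGNED(16, int32_t, coeffs[64]);  // 8x8 dequantized input
//   uint16_t *recon = ...;                     // prediction, updated in place
//   av1_inv_txfm2d_add_8x8_sse4_1(coeffs, recon, /*stride=*/recon_stride,
//                                 DCT_DCT, /*bd=*/10);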
static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                           int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i x, y;
  int col;

  // Note:
  //  Even column: 0, 2, ..., 14
  //  Odd column: 1, 3, ..., 15
  //  one even column plus one odd column constructs one row (8 coeffs)
  //  total we have 8 rows (8x8).
  for (col = 0; col < 2; ++col) {
    // stage 0
    // stage 1
    // stage 2
    u0 = in[0 * 2 + col];
    u1 = in[4 * 2 + col];
    u2 = in[2 * 2 + col];
    u3 = in[6 * 2 + col];

    x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
    y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
    u4 = _mm_add_epi32(x, y);
    u4 = _mm_add_epi32(u4, rnding);
    u4 = _mm_srai_epi32(u4, bit);

    x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
    y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
    u7 = _mm_add_epi32(x, y);
    u7 = _mm_add_epi32(u7, rnding);
    u7 = _mm_srai_epi32(u7, bit);

    x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
    y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
    u5 = _mm_add_epi32(x, y);
    u5 = _mm_add_epi32(u5, rnding);
    u5 = _mm_srai_epi32(u5, bit);

    x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
    y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
    u6 = _mm_add_epi32(x, y);
    u6 = _mm_add_epi32(u6, rnding);
    u6 = _mm_srai_epi32(u6, bit);

    // stage 3
    x = _mm_mullo_epi32(u0, cospi32);
    y = _mm_mullo_epi32(u1, cospi32);
    v0 = _mm_add_epi32(x, y);
    v0 = _mm_add_epi32(v0, rnding);
    v0 = _mm_srai_epi32(v0, bit);

    v1 = _mm_sub_epi32(x, y);
    v1 = _mm_add_epi32(v1, rnding);
    v1 = _mm_srai_epi32(v1, bit);

    x = _mm_mullo_epi32(u2, cospi48);
    y = _mm_mullo_epi32(u3, cospim16);
    v2 = _mm_add_epi32(x, y);
    v2 = _mm_add_epi32(v2, rnding);
    v2 = _mm_srai_epi32(v2, bit);

    x = _mm_mullo_epi32(u2, cospi16);
    y = _mm_mullo_epi32(u3, cospi48);
    v3 = _mm_add_epi32(x, y);
    v3 = _mm_add_epi32(v3, rnding);
    v3 = _mm_srai_epi32(v3, bit);

    addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
    addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

    // stage 4
    addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
    u4 = v4;
    u7 = v7;

    x = _mm_mullo_epi32(v5, cospi32);
    y = _mm_mullo_epi32(v6, cospi32);
    u6 = _mm_add_epi32(y, x);
    u6 = _mm_add_epi32(u6, rnding);
    u6 = _mm_srai_epi32(u6, bit);

    u5 = _mm_sub_epi32(y, x);
    u5 = _mm_add_epi32(u5, rnding);
    u5 = _mm_srai_epi32(u5, bit);

    // stage 5
    addsub_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo,
                  &clamp_hi);
    addsub_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo,
                  &clamp_hi);
    addsub_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo,
                  &clamp_hi);
    addsub_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo,
                  &clamp_hi);
  }

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out =
        _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
    round_shift_8x8(out, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
  }
}
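// iadst8x8_sse4_1 below uses the same 16-register layout as idct8x8_sse4_1:
// even-indexed registers hold the left four columns of each row and
// odd-indexed registers the right four. Instead of a col loop, the 8-point
// ADST flow graph is written out twice, once over each set of registers.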
static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                            int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i kZero = _mm_setzero_si128();
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[8], v[8], x;

  // Even 8 points: 0, 2, ..., 14
  // stage 0
  // stage 1
  // stage 2
  // (1)
  u[0] = _mm_mullo_epi32(in[14], cospi4);
  x = _mm_mullo_epi32(in[0], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[14], cospi60);
  x = _mm_mullo_epi32(in[0], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[10], cospi20);
  x = _mm_mullo_epi32(in[4], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[10], cospi44);
  x = _mm_mullo_epi32(in[4], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[6], cospi36);
  x = _mm_mullo_epi32(in[8], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[6], cospi28);
  x = _mm_mullo_epi32(in[8], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[2], cospi52);
  x = _mm_mullo_epi32(in[12], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[2], cospi12);
  x = _mm_mullo_epi32(in[12], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);
  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[2] = _mm_sub_epi32(kZero, u[4]);
    out[4] = u[6];
    out[6] = _mm_sub_epi32(kZero, u[2]);
    out[8] = u[3];
    out[10] = _mm_sub_epi32(kZero, u[7]);
    out[12] = u[5];
    out[14] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out =
        _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }

  // Odd 8 points: 1, 3, ..., 15
  // stage 0
  // stage 1
  // stage 2
  // (1)
  u[0] = _mm_mullo_epi32(in[15], cospi4);
  x = _mm_mullo_epi32(in[1], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[15], cospi60);
  x = _mm_mullo_epi32(in[1], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[11], cospi20);
  x = _mm_mullo_epi32(in[5], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[11], cospi44);
  x = _mm_mullo_epi32(in[5], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[7], cospi36);
  x = _mm_mullo_epi32(in[9], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[7], cospi28);
  x = _mm_mullo_epi32(in[9], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[3], cospi52);
  x = _mm_mullo_epi32(in[13], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[3], cospi12);
  x = _mm_mullo_epi32(in[13], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);
  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[1] = u[0];
    out[3] = _mm_sub_epi32(kZero, u[4]);
    out[5] = u[6];
    out[7] = _mm_sub_epi32(kZero, u[2]);
    out[9] = u[3];
    out[11] = _mm_sub_epi32(kZero, u[7]);
    out[13] = u[5];
    out[15] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out =
        _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }
}
static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  (void)bit;
  out[0] = _mm_add_epi32(in[0], in[0]);
  out[1] = _mm_add_epi32(in[1], in[1]);
  out[2] = _mm_add_epi32(in[2], in[2]);
  out[3] = _mm_add_epi32(in[3], in[3]);
  out[4] = _mm_add_epi32(in[4], in[4]);
  out[5] = _mm_add_epi32(in[5], in[5]);
  out[6] = _mm_add_epi32(in[6], in[6]);
  out[7] = _mm_add_epi32(in[7], in[7]);

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    round_shift_4x4(out, out_shift);
    round_shift_4x4(out + 4, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 8);
  }
}
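// The 8-point identity transform scales by exactly 2, so iidentity8_sse4_1
// above uses an add (in + in) instead of a fixed-point multiply; only the
// row pass (!do_cols) needs the extra rounding shift and clamp.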
static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo,
                             __m128i res_hi, int fliplr, int bd) {
  __m128i x0, x1;
  const __m128i zero = _mm_setzero_si128();

  x0 = _mm_unpacklo_epi16(pred, zero);
  x1 = _mm_unpackhi_epi16(pred, zero);

  if (fliplr) {
    res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
    res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
    x0 = _mm_add_epi32(res_hi, x0);
    x1 = _mm_add_epi32(res_lo, x1);

  } else {
    x0 = _mm_add_epi32(res_lo, x0);
    x1 = _mm_add_epi32(res_hi, x1);
  }

  x0 = _mm_packus_epi32(x0, x1);
  return highbd_clamp_epi16(x0, bd);
}

static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
                             int fliplr, int flipud, int shift, int bd) {
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;

  round_shift_8x8(in, shift);

  v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
  v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
  v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
  v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
  v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
  v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
  v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
  v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));

  if (flipud) {
    u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
    u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
    u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
    u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
    u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
    u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
    u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
    u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
  } else {
    u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
    u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
    u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
    u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
    u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
    u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
    u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
    u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
  }

  _mm_store_si128((__m128i *)(output + 0 * stride), u0);
  _mm_store_si128((__m128i *)(output + 1 * stride), u1);
  _mm_store_si128((__m128i *)(output + 2 * stride), u2);
  _mm_store_si128((__m128i *)(output + 3 * stride), u3);
  _mm_store_si128((__m128i *)(output + 4 * stride), u4);
  _mm_store_si128((__m128i *)(output + 5 * stride), u5);
  _mm_store_si128((__m128i *)(output + 6 * stride), u6);
  _mm_store_si128((__m128i *)(output + 7 * stride), u7);
}

void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  __m128i in[16], out[16];
  const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8];

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x8(input, in);
      idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
      break;
    case DCT_ADST:
      load_buffer_8x8(input, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_DCT:
      load_buffer_8x8(input, in);
      idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_ADST:
      load_buffer_8x8(input, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
      break;
    case FLIPADST_DCT:
      load_buffer_8x8(input, in);
      idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd);
      break;
    case DCT_FLIPADST:
      load_buffer_8x8(input, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd);
      break;
    case ADST_FLIPADST:
      load_buffer_8x8(input, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x8(input, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 1, 1, -shift[1], bd);
      break;
    case FLIPADST_ADST:
      load_buffer_8x8(input, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd);
      break;
    default: assert(0);
  }
}
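// The *_low1 variants below are fast paths for the common case where only
// the DC (or first-column) coefficient survives quantization: idct8x8_low1
// reduces the whole flow graph to one cospi32 multiply whose result is
// broadcast to all eight outputs.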
static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i x;

  // stage 0
  // stage 1
  // stage 2
  // stage 3
  x = _mm_mullo_epi32(in[0], cospi32);
  x = _mm_add_epi32(x, rnding);
  x = _mm_srai_epi32(x, bit);

  // stage 4
  // stage 5
  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
    x = _mm_add_epi32(x, offset);
    x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
  }

  x = _mm_max_epi32(x, clamp_lo);
  x = _mm_min_epi32(x, clamp_hi);
  out[0] = x;
  out[1] = x;
  out[2] = x;
  out[3] = x;
  out[4] = x;
  out[5] = x;
  out[6] = x;
  out[7] = x;
}
static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                               int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i x, y;

  // stage 0
  // stage 1
  // stage 2
  u0 = in[0];
  u1 = in[4];
  u2 = in[2];
  u3 = in[6];

  x = _mm_mullo_epi32(in[1], cospi56);
  y = _mm_mullo_epi32(in[7], cospim8);
  u4 = _mm_add_epi32(x, y);
  u4 = _mm_add_epi32(u4, rnding);
  u4 = _mm_srai_epi32(u4, bit);

  x = _mm_mullo_epi32(in[1], cospi8);
  y = _mm_mullo_epi32(in[7], cospi56);
  u7 = _mm_add_epi32(x, y);
  u7 = _mm_add_epi32(u7, rnding);
  u7 = _mm_srai_epi32(u7, bit);

  x = _mm_mullo_epi32(in[5], cospi24);
  y = _mm_mullo_epi32(in[3], cospim40);
  u5 = _mm_add_epi32(x, y);
  u5 = _mm_add_epi32(u5, rnding);
  u5 = _mm_srai_epi32(u5, bit);

  x = _mm_mullo_epi32(in[5], cospi40);
  y = _mm_mullo_epi32(in[3], cospi24);
  u6 = _mm_add_epi32(x, y);
  u6 = _mm_add_epi32(u6, rnding);
  u6 = _mm_srai_epi32(u6, bit);

  // stage 3
  x = _mm_mullo_epi32(u0, cospi32);
  y = _mm_mullo_epi32(u1, cospi32);
  v0 = _mm_add_epi32(x, y);
  v0 = _mm_add_epi32(v0, rnding);
  v0 = _mm_srai_epi32(v0, bit);

  v1 = _mm_sub_epi32(x, y);
  v1 = _mm_add_epi32(v1, rnding);
  v1 = _mm_srai_epi32(v1, bit);

  x = _mm_mullo_epi32(u2, cospi48);
  y = _mm_mullo_epi32(u3, cospim16);
  v2 = _mm_add_epi32(x, y);
  v2 = _mm_add_epi32(v2, rnding);
  v2 = _mm_srai_epi32(v2, bit);

  x = _mm_mullo_epi32(u2, cospi16);
  y = _mm_mullo_epi32(u3, cospi48);
  v3 = _mm_add_epi32(x, y);
  v3 = _mm_add_epi32(v3, rnding);
  v3 = _mm_srai_epi32(v3, bit);

  addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

  // stage 4
  addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
  u4 = v4;
  u7 = v7;

  x = _mm_mullo_epi32(v5, cospi32);
  y = _mm_mullo_epi32(v6, cospi32);
  u6 = _mm_add_epi32(y, x);
  u6 = _mm_add_epi32(u6, rnding);
  u6 = _mm_srai_epi32(u6, bit);

  u5 = _mm_sub_epi32(y, x);
  u5 = _mm_add_epi32(u5, rnding);
  u5 = _mm_srai_epi32(u5, bit);

  // stage 5
  addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out =
        _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    round_shift_4x4(out, out_shift);
    round_shift_4x4(out + 4, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8);
  }
}
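// iadst8x8_low1_sse4_1 below is the matching ADST fast path: with only in[0]
// nonzero, stages 2-6 collapse to the four products of u[0]/u[1] with
// cospi16/cospi48/cospi32, and stage 7 only reorders and negates the
// results.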
static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                 int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i kZero = _mm_setzero_si128();
  __m128i u[8], x;

  // stage 0
  // stage 1
  // stage 2

  x = _mm_mullo_epi32(in[0], cospi60);
  u[0] = _mm_add_epi32(x, rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  x = _mm_mullo_epi32(in[0], cospi4);
  u[1] = _mm_sub_epi32(kZero, x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // stage 3
  // stage 4
  __m128i temp1, temp2;
  temp1 = _mm_mullo_epi32(u[0], cospi16);
  x = _mm_mullo_epi32(u[1], cospi48);
  temp1 = _mm_add_epi32(temp1, x);
  temp1 = _mm_add_epi32(temp1, rnding);
  temp1 = _mm_srai_epi32(temp1, bit);
  u[4] = temp1;

  temp2 = _mm_mullo_epi32(u[0], cospi48);
  x = _mm_mullo_epi32(u[1], cospi16);
  u[5] = _mm_sub_epi32(temp2, x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // stage 5
  // stage 6
  temp1 = _mm_mullo_epi32(u[0], cospi32);
  x = _mm_mullo_epi32(u[1], cospi32);
  u[2] = _mm_add_epi32(temp1, x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(temp1, x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  temp1 = _mm_mullo_epi32(u[4], cospi32);
  x = _mm_mullo_epi32(u[5], cospi32);
  u[6] = _mm_add_epi32(temp1, x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(temp1, x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[1] = _mm_sub_epi32(kZero, u[4]);
    out[2] = u[6];
    out[3] = _mm_sub_epi32(kZero, u[2]);
    out[4] = u[3];
    out[5] = _mm_sub_epi32(kZero, u[7]);
    out[6] = u[5];
    out[7] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
  }
}
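/* The do_cols and !do_cols branches above store the same value pattern: the
   ADST output permutation negates every odd output. In the !do_cols case,
   neg_shift_sse4_1() (defined earlier in this file) fuses that sign flip with
   the output rounding shift and clamp, so each call emits one positive and
   one negated result. */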
static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                                int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i kZero = _mm_setzero_si128();
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[8], v[8], x;

  // stage 0
  // stage 1
  // stage 2

  u[0] = _mm_mullo_epi32(in[7], cospi4);
  x = _mm_mullo_epi32(in[0], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[7], cospi60);
  x = _mm_mullo_epi32(in[0], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[5], cospi20);
  x = _mm_mullo_epi32(in[2], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[5], cospi44);
  x = _mm_mullo_epi32(in[2], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[3], cospi36);
  x = _mm_mullo_epi32(in[4], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[3], cospi28);
  x = _mm_mullo_epi32(in[4], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[1], cospi52);
  x = _mm_mullo_epi32(in[6], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[1], cospi12);
  x = _mm_mullo_epi32(in[6], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[1] = _mm_sub_epi32(kZero, u[4]);
    out[2] = u[6];
    out[3] = _mm_sub_epi32(kZero, u[2]);
    out[4] = u[3];
    out[5] = _mm_sub_epi32(kZero, u[7]);
    out[6] = u[5];
    out[7] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
  }
}
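/* Clamp-range convention used throughout this file: intermediate values of a
   row transform (!do_cols) are kept within bd + 8 bits and column-transform
   values within bd + 6 bits, with a floor of 16 bits in both cases; the final
   output stage always re-clamps to AOMMAX(16, bd + 6) bits. */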
static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  // stage 0
  // stage 1
  // stage 2
  // stage 3
  // stage 4
  in[0] = _mm_mullo_epi32(in[0], cospi32);
  in[0] = _mm_add_epi32(in[0], rnding);
  in[0] = _mm_srai_epi32(in[0], bit);

  // stage 5
  // stage 6
  // stage 7
  if (!do_cols) {
    log_range = AOMMAX(16, bd + 6);
    clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    if (out_shift != 0) {
      __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
      in[0] = _mm_add_epi32(in[0], offset);
      in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
    }
  }

  in[0] = _mm_max_epi32(in[0], clamp_lo);
  in[0] = _mm_min_epi32(in[0], clamp_hi);
  out[0] = in[0];
  out[1] = in[0];
  out[2] = in[0];
  out[3] = in[0];
  out[4] = in[0];
  out[5] = in[0];
  out[6] = in[0];
  out[7] = in[0];
  out[8] = in[0];
  out[9] = in[0];
  out[10] = in[0];
  out[11] = in[0];
  out[12] = in[0];
  out[13] = in[0];
  out[14] = in[0];
  out[15] = in[0];
}
static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[16], x, y;
  // stage 0
  // stage 1
  u[0] = in[0];
  u[2] = in[4];
  u[4] = in[2];
  u[6] = in[6];
  u[8] = in[1];
  u[10] = in[5];
  u[12] = in[3];
  u[14] = in[7];

  // stage 2
  u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
  u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);

  u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
  u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);

  u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
  u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);

  u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
  u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);

  // stage 3
  u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
  u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
  u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
  u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);

  addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);

  // stage 4
  x = _mm_mullo_epi32(u[0], cospi32);
  u[0] = _mm_add_epi32(x, rnding);
  u[0] = _mm_srai_epi32(u[0], bit);
  u[1] = u[0];

  u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
  u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);

  addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);

  x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
  u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
  u[9] = x;
  y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
  u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
  u[10] = y;

  // stage 5
  addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);

  x = _mm_mullo_epi32(u[5], cospi32);
  y = _mm_mullo_epi32(u[6], cospi32);
  u[5] = _mm_sub_epi32(y, x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_add_epi32(y, x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

  // stage 6
  addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);

  x = _mm_mullo_epi32(u[10], cospi32);
  y = _mm_mullo_epi32(u[13], cospi32);
  u[10] = _mm_sub_epi32(y, x);
  u[10] = _mm_add_epi32(u[10], rnding);
  u[10] = _mm_srai_epi32(u[10], bit);

  u[13] = _mm_add_epi32(x, y);
  u[13] = _mm_add_epi32(u[13], rnding);
  u[13] = _mm_srai_epi32(u[13], bit);

  x = _mm_mullo_epi32(u[11], cospi32);
  y = _mm_mullo_epi32(u[12], cospi32);
  u[11] = _mm_sub_epi32(y, x);
  u[11] = _mm_add_epi32(u[11], rnding);
  u[11] = _mm_srai_epi32(u[11], bit);

  u[12] = _mm_add_epi32(x, y);
  u[12] = _mm_add_epi32(u[12], rnding);
  u[12] = _mm_srai_epi32(u[12], bit);
  // stage 7
  addsub_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
    round_shift_8x8(out, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
  }
}
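/* half_btf_0_sse4_1() is the degenerate butterfly used above when one of the
   two inputs is known to be zero: it reduces to a single multiply plus the
   round-and-shift, which is why the "low8" path can skip half the products
   of the full 16-point transform. */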
static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i zero = _mm_setzero_si128();
  __m128i v[16], x, y, temp1, temp2;
  // stage 0
  // stage 1
  // stage 2
  x = _mm_mullo_epi32(in[0], cospi62);
  v[0] = _mm_add_epi32(x, rnding);
  v[0] = _mm_srai_epi32(v[0], bit);

  x = _mm_mullo_epi32(in[0], cospi2);
  v[1] = _mm_sub_epi32(zero, x);
  v[1] = _mm_add_epi32(v[1], rnding);
  v[1] = _mm_srai_epi32(v[1], bit);

  // stage 3
  v[8] = v[0];
  v[9] = v[1];

  // stage 4
  temp1 = _mm_mullo_epi32(v[8], cospi8);
  x = _mm_mullo_epi32(v[9], cospi56);
  temp1 = _mm_add_epi32(temp1, x);
  temp1 = _mm_add_epi32(temp1, rnding);
  temp1 = _mm_srai_epi32(temp1, bit);

  temp2 = _mm_mullo_epi32(v[8], cospi56);
  x = _mm_mullo_epi32(v[9], cospi8);
  temp2 = _mm_sub_epi32(temp2, x);
  temp2 = _mm_add_epi32(temp2, rnding);
  temp2 = _mm_srai_epi32(temp2, bit);
  v[8] = temp1;
  v[9] = temp2;

  // stage 5
  v[4] = v[0];
  v[5] = v[1];
  v[12] = v[8];
  v[13] = v[9];

  // stage 6
  temp1 = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  temp1 = _mm_add_epi32(temp1, x);
  temp1 = _mm_add_epi32(temp1, rnding);
  temp1 = _mm_srai_epi32(temp1, bit);

  temp2 = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  temp2 = _mm_sub_epi32(temp2, x);
  temp2 = _mm_add_epi32(temp2, rnding);
  temp2 = _mm_srai_epi32(temp2, bit);
  v[4] = temp1;
  v[5] = temp2;

  temp1 = _mm_mullo_epi32(v[12], cospi16);
  x = _mm_mullo_epi32(v[13], cospi48);
  temp1 = _mm_add_epi32(temp1, x);
  temp1 = _mm_add_epi32(temp1, rnding);
  temp1 = _mm_srai_epi32(temp1, bit);

  temp2 = _mm_mullo_epi32(v[12], cospi48);
  x = _mm_mullo_epi32(v[13], cospi16);
  temp2 = _mm_sub_epi32(temp2, x);
  temp2 = _mm_add_epi32(temp2, rnding);
  temp2 = _mm_srai_epi32(temp2, bit);
  v[12] = temp1;
  v[13] = temp2;

  // stage 7
  v[2] = v[0];
  v[3] = v[1];
  v[6] = v[4];
  v[7] = v[5];
  v[10] = v[8];
  v[11] = v[9];
  v[14] = v[12];
  v[15] = v[13];

  // stage 8
  y = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  v[2] = _mm_add_epi32(y, x);
  v[2] = _mm_add_epi32(v[2], rnding);
  v[2] = _mm_srai_epi32(v[2], bit);

  v[3] = _mm_sub_epi32(y, x);
  v[3] = _mm_add_epi32(v[3], rnding);
  v[3] = _mm_srai_epi32(v[3], bit);

  y = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  v[6] = _mm_add_epi32(y, x);
  v[6] = _mm_add_epi32(v[6], rnding);
  v[6] = _mm_srai_epi32(v[6], bit);

  v[7] = _mm_sub_epi32(y, x);
  v[7] = _mm_add_epi32(v[7], rnding);
  v[7] = _mm_srai_epi32(v[7], bit);

  y = _mm_mullo_epi32(v[10], cospi32);
  x = _mm_mullo_epi32(v[11], cospi32);
  v[10] = _mm_add_epi32(y, x);
  v[10] = _mm_add_epi32(v[10], rnding);
  v[10] = _mm_srai_epi32(v[10], bit);

  v[11] = _mm_sub_epi32(y, x);
  v[11] = _mm_add_epi32(v[11], rnding);
  v[11] = _mm_srai_epi32(v[11], bit);

  y = _mm_mullo_epi32(v[14], cospi32);
  x = _mm_mullo_epi32(v[15], cospi32);
  v[14] = _mm_add_epi32(y, x);
  v[14] = _mm_add_epi32(v[14], rnding);
  v[14] = _mm_srai_epi32(v[14], bit);

  v[15] = _mm_sub_epi32(y, x);
  v[15] = _mm_add_epi32(v[15], rnding);
  v[15] = _mm_srai_epi32(v[15], bit);

  // stage 9
  if (do_cols) {
    out[0] = v[0];
    out[1] = _mm_sub_epi32(zero, v[8]);
    out[2] = v[12];
    out[3] = _mm_sub_epi32(zero, v[4]);
    out[4] = v[6];
    out[5] = _mm_sub_epi32(zero, v[14]);
    out[6] = v[10];
    out[7] = _mm_sub_epi32(zero, v[2]);
    out[8] = v[3];
    out[9] = _mm_sub_epi32(zero, v[11]);
    out[10] = v[15];
    out[11] = _mm_sub_epi32(zero, v[7]);
    out[12] = v[5];
    out[13] = _mm_sub_epi32(zero, v[13]);
    out[14] = v[9];
    out[15] = _mm_sub_epi32(zero, v[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }
}
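/* In the "low1" ADST path above only in[0] is nonzero, so every addsub stage
   degenerates to plain copies (e.g. v[8] = v[0]) and only the rotation
   stages do real arithmetic; the addsub/clamp machinery of the full
   transform is not needed. */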
static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i zero = _mm_setzero_si128();
  __m128i u[16], x, y;

  // stage 0
  // stage 1
  // stage 2
  x = _mm_mullo_epi32(in[0], cospi62);
  u[0] = _mm_add_epi32(x, rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  x = _mm_mullo_epi32(in[0], cospi2);
  u[1] = _mm_sub_epi32(zero, x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  x = _mm_mullo_epi32(in[2], cospi54);
  u[2] = _mm_add_epi32(x, rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  x = _mm_mullo_epi32(in[2], cospi10);
  u[3] = _mm_sub_epi32(zero, x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  x = _mm_mullo_epi32(in[4], cospi46);
  u[4] = _mm_add_epi32(x, rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  x = _mm_mullo_epi32(in[4], cospi18);
  u[5] = _mm_sub_epi32(zero, x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  x = _mm_mullo_epi32(in[6], cospi38);
  u[6] = _mm_add_epi32(x, rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  x = _mm_mullo_epi32(in[6], cospi26);
  u[7] = _mm_sub_epi32(zero, x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  u[8] = _mm_mullo_epi32(in[7], cospi34);
  u[8] = _mm_add_epi32(u[8], rnding);
  u[8] = _mm_srai_epi32(u[8], bit);

  u[9] = _mm_mullo_epi32(in[7], cospi30);
  u[9] = _mm_add_epi32(u[9], rnding);
  u[9] = _mm_srai_epi32(u[9], bit);

  u[10] = _mm_mullo_epi32(in[5], cospi42);
  u[10] = _mm_add_epi32(u[10], rnding);
  u[10] = _mm_srai_epi32(u[10], bit);

  u[11] = _mm_mullo_epi32(in[5], cospi22);
  u[11] = _mm_add_epi32(u[11], rnding);
  u[11] = _mm_srai_epi32(u[11], bit);

  u[12] = _mm_mullo_epi32(in[3], cospi50);
  u[12] = _mm_add_epi32(u[12], rnding);
  u[12] = _mm_srai_epi32(u[12], bit);

  u[13] = _mm_mullo_epi32(in[3], cospi14);
  u[13] = _mm_add_epi32(u[13], rnding);
  u[13] = _mm_srai_epi32(u[13], bit);

  u[14] = _mm_mullo_epi32(in[1], cospi58);
  u[14] = _mm_add_epi32(u[14], rnding);
  u[14] = _mm_srai_epi32(u[14], bit);

  u[15] = _mm_mullo_epi32(in[1], cospi6);
  u[15] = _mm_add_epi32(u[15], rnding);
  u[15] = _mm_srai_epi32(u[15], bit);

  // stage 3
  addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);

  // stage 4
  y = _mm_mullo_epi32(u[8], cospi56);
  x = _mm_mullo_epi32(u[9], cospi56);
  u[8] = _mm_mullo_epi32(u[8], cospi8);
  u[8] = _mm_add_epi32(u[8], x);
  u[8] = _mm_add_epi32(u[8], rnding);
  u[8] = _mm_srai_epi32(u[8], bit);

  x = _mm_mullo_epi32(u[9], cospi8);
  u[9] = _mm_sub_epi32(y, x);
  u[9] = _mm_add_epi32(u[9], rnding);
  u[9] = _mm_srai_epi32(u[9], bit);

  x = _mm_mullo_epi32(u[11], cospi24);
  y = _mm_mullo_epi32(u[10], cospi24);
  u[10] = _mm_mullo_epi32(u[10], cospi40);
  u[10] = _mm_add_epi32(u[10], x);
  u[10] = _mm_add_epi32(u[10], rnding);
  u[10] = _mm_srai_epi32(u[10], bit);

  x = _mm_mullo_epi32(u[11], cospi40);
  u[11] = _mm_sub_epi32(y, x);
  u[11] = _mm_add_epi32(u[11], rnding);
  u[11] = _mm_srai_epi32(u[11], bit);

  x = _mm_mullo_epi32(u[13], cospi8);
  y = _mm_mullo_epi32(u[12], cospi8);
  u[12] = _mm_mullo_epi32(u[12], cospim56);
  u[12] = _mm_add_epi32(u[12], x);
  u[12] = _mm_add_epi32(u[12], rnding);
  u[12] = _mm_srai_epi32(u[12], bit);

  x = _mm_mullo_epi32(u[13], cospim56);
  u[13] = _mm_sub_epi32(y, x);
  u[13] = _mm_add_epi32(u[13], rnding);
  u[13] = _mm_srai_epi32(u[13], bit);

  x = _mm_mullo_epi32(u[15], cospi40);
  y = _mm_mullo_epi32(u[14], cospi40);
  u[14] = _mm_mullo_epi32(u[14], cospim24);
  u[14] = _mm_add_epi32(u[14], x);
  u[14] = _mm_add_epi32(u[14], rnding);
  u[14] = _mm_srai_epi32(u[14], bit);

  x = _mm_mullo_epi32(u[15], cospim24);
  u[15] = _mm_sub_epi32(y, x);
  u[15] = _mm_add_epi32(u[15], rnding);
  u[15] = _mm_srai_epi32(u[15], bit);

  // stage 5
  addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);

  // stage 6
  x = _mm_mullo_epi32(u[5], cospi48);
  y = _mm_mullo_epi32(u[4], cospi48);
  u[4] = _mm_mullo_epi32(u[4], cospi16);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  x = _mm_mullo_epi32(u[5], cospi16);
  u[5] = _mm_sub_epi32(y, x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  x = _mm_mullo_epi32(u[7], cospi16);
  y = _mm_mullo_epi32(u[6], cospi16);
  u[6] = _mm_mullo_epi32(u[6], cospim48);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  x = _mm_mullo_epi32(u[7], cospim48);
  u[7] = _mm_sub_epi32(y, x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  x = _mm_mullo_epi32(u[13], cospi48);
  y = _mm_mullo_epi32(u[12], cospi48);
  u[12] = _mm_mullo_epi32(u[12], cospi16);
  u[12] = _mm_add_epi32(u[12], x);
  u[12] = _mm_add_epi32(u[12], rnding);
  u[12] = _mm_srai_epi32(u[12], bit);

  x = _mm_mullo_epi32(u[13], cospi16);
  u[13] = _mm_sub_epi32(y, x);
  u[13] = _mm_add_epi32(u[13], rnding);
  u[13] = _mm_srai_epi32(u[13], bit);

  x = _mm_mullo_epi32(u[15], cospi16);
  y = _mm_mullo_epi32(u[14], cospi16);
  u[14] = _mm_mullo_epi32(u[14], cospim48);
  u[14] = _mm_add_epi32(u[14], x);
  u[14] = _mm_add_epi32(u[14], rnding);
  u[14] = _mm_srai_epi32(u[14], bit);

  x = _mm_mullo_epi32(u[15], cospim48);
  u[15] = _mm_sub_epi32(y, x);
  u[15] = _mm_add_epi32(u[15], rnding);
  u[15] = _mm_srai_epi32(u[15], bit);

  // stage 7
  addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);

  // stage 8
  y = _mm_mullo_epi32(u[2], cospi32);
  x = _mm_mullo_epi32(u[3], cospi32);
  u[2] = _mm_add_epi32(y, x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(y, x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);
  y = _mm_mullo_epi32(u[6], cospi32);
  x = _mm_mullo_epi32(u[7], cospi32);
  u[6] = _mm_add_epi32(y, x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(y, x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  y = _mm_mullo_epi32(u[10], cospi32);
  x = _mm_mullo_epi32(u[11], cospi32);
  u[10] = _mm_add_epi32(y, x);
  u[10] = _mm_add_epi32(u[10], rnding);
  u[10] = _mm_srai_epi32(u[10], bit);

  u[11] = _mm_sub_epi32(y, x);
  u[11] = _mm_add_epi32(u[11], rnding);
  u[11] = _mm_srai_epi32(u[11], bit);

  y = _mm_mullo_epi32(u[14], cospi32);
  x = _mm_mullo_epi32(u[15], cospi32);
  u[14] = _mm_add_epi32(y, x);
  u[14] = _mm_add_epi32(u[14], rnding);
  u[14] = _mm_srai_epi32(u[14], bit);

  u[15] = _mm_sub_epi32(y, x);
  u[15] = _mm_add_epi32(u[15], rnding);
  u[15] = _mm_srai_epi32(u[15], bit);

  // stage 9
  if (do_cols) {
    out[0] = u[0];
    out[1] = _mm_sub_epi32(zero, u[8]);
    out[2] = u[12];
    out[3] = _mm_sub_epi32(zero, u[4]);
    out[4] = u[6];
    out[5] = _mm_sub_epi32(zero, u[14]);
    out[6] = u[10];
    out[7] = _mm_sub_epi32(zero, u[2]);
    out[8] = u[3];
    out[9] = _mm_sub_epi32(zero, u[11]);
    out[10] = u[15];
    out[11] = _mm_sub_epi32(zero, u[7]);
    out[12] = u[5];
    out[13] = _mm_sub_epi32(zero, u[13]);
    out[14] = u[9];
    out[15] = _mm_sub_epi32(zero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }
}
static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                             int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[16], v[16], x, y;

  {
    // stage 0
    // stage 1
    u[0] = in[0];
    u[1] = in[8];
    u[2] = in[4];
    u[3] = in[12];
    u[4] = in[2];
    u[5] = in[10];
    u[6] = in[6];
    u[7] = in[14];
    u[8] = in[1];
    u[9] = in[9];
    u[10] = in[5];
    u[11] = in[13];
    u[12] = in[3];
    u[13] = in[11];
    u[14] = in[7];
    u[15] = in[15];

    // stage 2
    v[0] = u[0];
    v[1] = u[1];
    v[2] = u[2];
    v[3] = u[3];
    v[4] = u[4];
    v[5] = u[5];
    v[6] = u[6];
    v[7] = u[7];

    v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
    v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
    v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
    v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
    v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
    v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
    v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
    v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);

    // stage 3
    u[0] = v[0];
    u[1] = v[1];
    u[2] = v[2];
    u[3] = v[3];
    u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
    u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
    u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
    u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
    addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);

    // stage 4
    x = _mm_mullo_epi32(u[0], cospi32);
    y = _mm_mullo_epi32(u[1], cospi32);
    v[0] = _mm_add_epi32(x, y);
    v[0] = _mm_add_epi32(v[0], rnding);
    v[0] = _mm_srai_epi32(v[0], bit);

    v[1] = _mm_sub_epi32(x, y);
    v[1] = _mm_add_epi32(v[1], rnding);
    v[1] = _mm_srai_epi32(v[1], bit);

    v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
    v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
    v[8] = u[8];
    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    v[11] = u[11];
    v[12] = u[12];
    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    v[15] = u[15];

    // stage 5
    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
    u[4] = v[4];

    x = _mm_mullo_epi32(v[5], cospi32);
    y = _mm_mullo_epi32(v[6], cospi32);
    u[5] = _mm_sub_epi32(y, x);
    u[5] = _mm_add_epi32(u[5], rnding);
    u[5] = _mm_srai_epi32(u[5], bit);

    u[6] = _mm_add_epi32(y, x);
    u[6] = _mm_add_epi32(u[6], rnding);
    u[6] = _mm_srai_epi32(u[6], bit);

    u[7] = v[7];
    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    // stage 6
    addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
    v[8] = u[8];
    v[9] = u[9];

    x = _mm_mullo_epi32(u[10], cospi32);
    y = _mm_mullo_epi32(u[13], cospi32);
    v[10] = _mm_sub_epi32(y, x);
    v[10] = _mm_add_epi32(v[10], rnding);
    v[10] = _mm_srai_epi32(v[10], bit);

    v[13] = _mm_add_epi32(x, y);
    v[13] = _mm_add_epi32(v[13], rnding);
    v[13] = _mm_srai_epi32(v[13], bit);

    x = _mm_mullo_epi32(u[11], cospi32);
    y = _mm_mullo_epi32(u[12], cospi32);
    v[11] = _mm_sub_epi32(y, x);
    v[11] = _mm_add_epi32(v[11], rnding);
    v[11] = _mm_srai_epi32(v[11], bit);

    v[12] = _mm_add_epi32(x, y);
    v[12] = _mm_add_epi32(v[12], rnding);
    v[12] = _mm_srai_epi32(v[12], bit);

    v[14] = u[14];
    v[15] = u[15];

    // stage 7
    addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);

    if (!do_cols) {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
      const __m128i clamp_hi_out =
          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
      round_shift_8x8(out, out_shift);
      highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
    }
  }
}
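/* The full 16-point IDCT above ping-pongs between the u[] and v[] arrays
   from stage to stage instead of renaming registers; half_btf_sse4_1() does
   the two-coefficient butterflies and the cospi32 blocks handle the
   symmetric +/- pairs that need both outputs of one product pair. */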
static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  const __m128i zero = _mm_setzero_si128();
  __m128i u[16], v[16], x, y;
  // Calculate the column 0, 1, 2, 3
  // stage 0
  // stage 1
  // stage 2
  v[0] = _mm_mullo_epi32(in[15], cospi2);
  x = _mm_mullo_epi32(in[0], cospi62);
  v[0] = _mm_add_epi32(v[0], x);
  v[0] = _mm_add_epi32(v[0], rnding);
  v[0] = _mm_srai_epi32(v[0], bit);

  v[1] = _mm_mullo_epi32(in[15], cospi62);
  x = _mm_mullo_epi32(in[0], cospi2);
  v[1] = _mm_sub_epi32(v[1], x);
  v[1] = _mm_add_epi32(v[1], rnding);
  v[1] = _mm_srai_epi32(v[1], bit);

  v[2] = _mm_mullo_epi32(in[13], cospi10);
  x = _mm_mullo_epi32(in[2], cospi54);
  v[2] = _mm_add_epi32(v[2], x);
  v[2] = _mm_add_epi32(v[2], rnding);
  v[2] = _mm_srai_epi32(v[2], bit);

  v[3] = _mm_mullo_epi32(in[13], cospi54);
  x = _mm_mullo_epi32(in[2], cospi10);
  v[3] = _mm_sub_epi32(v[3], x);
  v[3] = _mm_add_epi32(v[3], rnding);
  v[3] = _mm_srai_epi32(v[3], bit);

  v[4] = _mm_mullo_epi32(in[11], cospi18);
  x = _mm_mullo_epi32(in[4], cospi46);
  v[4] = _mm_add_epi32(v[4], x);
  v[4] = _mm_add_epi32(v[4], rnding);
  v[4] = _mm_srai_epi32(v[4], bit);

  v[5] = _mm_mullo_epi32(in[11], cospi46);
  x = _mm_mullo_epi32(in[4], cospi18);
  v[5] = _mm_sub_epi32(v[5], x);
  v[5] = _mm_add_epi32(v[5], rnding);
  v[5] = _mm_srai_epi32(v[5], bit);

  v[6] = _mm_mullo_epi32(in[9], cospi26);
  x = _mm_mullo_epi32(in[6], cospi38);
  v[6] = _mm_add_epi32(v[6], x);
  v[6] = _mm_add_epi32(v[6], rnding);
  v[6] = _mm_srai_epi32(v[6], bit);

  v[7] = _mm_mullo_epi32(in[9], cospi38);
  x = _mm_mullo_epi32(in[6], cospi26);
  v[7] = _mm_sub_epi32(v[7], x);
  v[7] = _mm_add_epi32(v[7], rnding);
  v[7] = _mm_srai_epi32(v[7], bit);

  v[8] = _mm_mullo_epi32(in[7], cospi34);
  x = _mm_mullo_epi32(in[8], cospi30);
  v[8] = _mm_add_epi32(v[8], x);
  v[8] = _mm_add_epi32(v[8], rnding);
  v[8] = _mm_srai_epi32(v[8], bit);

  v[9] = _mm_mullo_epi32(in[7], cospi30);
  x = _mm_mullo_epi32(in[8], cospi34);
  v[9] = _mm_sub_epi32(v[9], x);
  v[9] = _mm_add_epi32(v[9], rnding);
  v[9] = _mm_srai_epi32(v[9], bit);

  v[10] = _mm_mullo_epi32(in[5], cospi42);
  x = _mm_mullo_epi32(in[10], cospi22);
  v[10] = _mm_add_epi32(v[10], x);
  v[10] = _mm_add_epi32(v[10], rnding);
  v[10] = _mm_srai_epi32(v[10], bit);

  v[11] = _mm_mullo_epi32(in[5], cospi22);
  x = _mm_mullo_epi32(in[10], cospi42);
  v[11] = _mm_sub_epi32(v[11], x);
  v[11] = _mm_add_epi32(v[11], rnding);
  v[11] = _mm_srai_epi32(v[11], bit);

  v[12] = _mm_mullo_epi32(in[3], cospi50);
  x = _mm_mullo_epi32(in[12], cospi14);
  v[12] = _mm_add_epi32(v[12], x);
  v[12] = _mm_add_epi32(v[12], rnding);
  v[12] = _mm_srai_epi32(v[12], bit);

  v[13] = _mm_mullo_epi32(in[3], cospi14);
  x = _mm_mullo_epi32(in[12], cospi50);
  v[13] = _mm_sub_epi32(v[13], x);
  v[13] = _mm_add_epi32(v[13], rnding);
  v[13] = _mm_srai_epi32(v[13], bit);

  v[14] = _mm_mullo_epi32(in[1], cospi58);
  x = _mm_mullo_epi32(in[14], cospi6);
  v[14] = _mm_add_epi32(v[14], x);
  v[14] = _mm_add_epi32(v[14], rnding);
  v[14] = _mm_srai_epi32(v[14], bit);

  v[15] = _mm_mullo_epi32(in[1], cospi6);
  x = _mm_mullo_epi32(in[14], cospi58);
  v[15] = _mm_sub_epi32(v[15], x);
  v[15] = _mm_add_epi32(v[15], rnding);
  v[15] = _mm_srai_epi32(v[15], bit);

  // stage 3
  addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);

  // stage 4
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  v[4] = u[4];
  v[5] = u[5];
  v[6] = u[6];
  v[7] = u[7];

  v[8] = _mm_mullo_epi32(u[8], cospi8);
  x = _mm_mullo_epi32(u[9], cospi56);
  v[8] = _mm_add_epi32(v[8], x);
  v[8] = _mm_add_epi32(v[8], rnding);
  v[8] = _mm_srai_epi32(v[8], bit);

  v[9] = _mm_mullo_epi32(u[8], cospi56);
  x = _mm_mullo_epi32(u[9], cospi8);
  v[9] = _mm_sub_epi32(v[9], x);
  v[9] = _mm_add_epi32(v[9], rnding);
  v[9] = _mm_srai_epi32(v[9], bit);

  v[10] = _mm_mullo_epi32(u[10], cospi40);
  x = _mm_mullo_epi32(u[11], cospi24);
  v[10] = _mm_add_epi32(v[10], x);
  v[10] = _mm_add_epi32(v[10], rnding);
  v[10] = _mm_srai_epi32(v[10], bit);

  v[11] = _mm_mullo_epi32(u[10], cospi24);
  x = _mm_mullo_epi32(u[11], cospi40);
  v[11] = _mm_sub_epi32(v[11], x);
  v[11] = _mm_add_epi32(v[11], rnding);
  v[11] = _mm_srai_epi32(v[11], bit);

  v[12] = _mm_mullo_epi32(u[12], cospim56);
  x = _mm_mullo_epi32(u[13], cospi8);
  v[12] = _mm_add_epi32(v[12], x);
  v[12] = _mm_add_epi32(v[12], rnding);
  v[12] = _mm_srai_epi32(v[12], bit);

  v[13] = _mm_mullo_epi32(u[12], cospi8);
  x = _mm_mullo_epi32(u[13], cospim56);
  v[13] = _mm_sub_epi32(v[13], x);
  v[13] = _mm_add_epi32(v[13], rnding);
  v[13] = _mm_srai_epi32(v[13], bit);

  v[14] = _mm_mullo_epi32(u[14], cospim24);
  x = _mm_mullo_epi32(u[15], cospi40);
  v[14] = _mm_add_epi32(v[14], x);
  v[14] = _mm_add_epi32(v[14], rnding);
  v[14] = _mm_srai_epi32(v[14], bit);

  v[15] = _mm_mullo_epi32(u[14], cospi40);
  x = _mm_mullo_epi32(u[15], cospim24);
  v[15] = _mm_sub_epi32(v[15], x);
  v[15] = _mm_add_epi32(v[15], rnding);
  v[15] = _mm_srai_epi32(v[15], bit);

  // stage 5
  addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);

  // stage 6
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];

  v[4] = _mm_mullo_epi32(u[4], cospi16);
  x = _mm_mullo_epi32(u[5], cospi48);
  v[4] = _mm_add_epi32(v[4], x);
  v[4] = _mm_add_epi32(v[4], rnding);
  v[4] = _mm_srai_epi32(v[4], bit);

  v[5] = _mm_mullo_epi32(u[4], cospi48);
  x = _mm_mullo_epi32(u[5], cospi16);
  v[5] = _mm_sub_epi32(v[5], x);
  v[5] = _mm_add_epi32(v[5], rnding);
  v[5] = _mm_srai_epi32(v[5], bit);

  v[6] = _mm_mullo_epi32(u[6], cospim48);
  x = _mm_mullo_epi32(u[7], cospi16);
  v[6] = _mm_add_epi32(v[6], x);
  v[6] = _mm_add_epi32(v[6], rnding);
  v[6] = _mm_srai_epi32(v[6], bit);

  v[7] = _mm_mullo_epi32(u[6], cospi16);
  x = _mm_mullo_epi32(u[7], cospim48);
  v[7] = _mm_sub_epi32(v[7], x);
  v[7] = _mm_add_epi32(v[7], rnding);
  v[7] = _mm_srai_epi32(v[7], bit);

  v[8] = u[8];
  v[9] = u[9];
  v[10] = u[10];
  v[11] = u[11];

  v[12] = _mm_mullo_epi32(u[12], cospi16);
  x = _mm_mullo_epi32(u[13], cospi48);
  v[12] = _mm_add_epi32(v[12], x);
  v[12] = _mm_add_epi32(v[12], rnding);
  v[12] = _mm_srai_epi32(v[12], bit);

  v[13] = _mm_mullo_epi32(u[12], cospi48);
  x = _mm_mullo_epi32(u[13], cospi16);
  v[13] = _mm_sub_epi32(v[13], x);
  v[13] = _mm_add_epi32(v[13], rnding);
  v[13] = _mm_srai_epi32(v[13], bit);

  v[14] = _mm_mullo_epi32(u[14], cospim48);
  x = _mm_mullo_epi32(u[15], cospi16);
  v[14] = _mm_add_epi32(v[14], x);
  v[14] = _mm_add_epi32(v[14], rnding);
  v[14] = _mm_srai_epi32(v[14], bit);

  v[15] = _mm_mullo_epi32(u[14], cospi16);
  x = _mm_mullo_epi32(u[15], cospim48);
  v[15] = _mm_sub_epi32(v[15], x);
  v[15] = _mm_add_epi32(v[15], rnding);
  v[15] = _mm_srai_epi32(v[15], bit);

  // stage 7
  addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);

  // stage 8
  v[0] = u[0];
  v[1] = u[1];

  y = _mm_mullo_epi32(u[2], cospi32);
  x = _mm_mullo_epi32(u[3], cospi32);
  v[2] = _mm_add_epi32(y, x);
  v[2] = _mm_add_epi32(v[2], rnding);
  v[2] = _mm_srai_epi32(v[2], bit);

  v[3] = _mm_sub_epi32(y, x);
  v[3] = _mm_add_epi32(v[3], rnding);
  v[3] = _mm_srai_epi32(v[3], bit);

  v[4] = u[4];
  v[5] = u[5];

  y = _mm_mullo_epi32(u[6], cospi32);
  x = _mm_mullo_epi32(u[7], cospi32);
  v[6] = _mm_add_epi32(y, x);
  v[6] = _mm_add_epi32(v[6], rnding);
  v[6] = _mm_srai_epi32(v[6], bit);

  v[7] = _mm_sub_epi32(y, x);
  v[7] = _mm_add_epi32(v[7], rnding);
  v[7] = _mm_srai_epi32(v[7], bit);

  v[8] = u[8];
  v[9] = u[9];

  y = _mm_mullo_epi32(u[10], cospi32);
  x = _mm_mullo_epi32(u[11], cospi32);
  v[10] = _mm_add_epi32(y, x);
  v[10] = _mm_add_epi32(v[10], rnding);
  v[10] = _mm_srai_epi32(v[10], bit);

  v[11] = _mm_sub_epi32(y, x);
  v[11] = _mm_add_epi32(v[11], rnding);
  v[11] = _mm_srai_epi32(v[11], bit);

  v[12] = u[12];
  v[13] = u[13];

  y = _mm_mullo_epi32(u[14], cospi32);
  x = _mm_mullo_epi32(u[15], cospi32);
  v[14] = _mm_add_epi32(y, x);
  v[14] = _mm_add_epi32(v[14], rnding);
  v[14] = _mm_srai_epi32(v[14], bit);

  v[15] = _mm_sub_epi32(y, x);
  v[15] = _mm_add_epi32(v[15], rnding);
  v[15] = _mm_srai_epi32(v[15], bit);

  // stage 9
  if (do_cols) {
    out[0] = v[0];
    out[1] = _mm_sub_epi32(zero, v[8]);
    out[2] = v[12];
    out[3] = _mm_sub_epi32(zero, v[4]);
    out[4] = v[6];
    out[5] = _mm_sub_epi32(zero, v[14]);
    out[6] = v[10];
    out[7] = _mm_sub_epi32(zero, v[2]);
    out[8] = v[3];
    out[9] = _mm_sub_epi32(zero, v[11]);
    out[10] = v[15];
    out[11] = _mm_sub_epi32(zero, v[7]);
    out[12] = v[5];
    out[13] = _mm_sub_epi32(zero, v[13]);
    out[14] = v[9];
    out[15] = _mm_sub_epi32(zero, v[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }
}

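/* The full ADST above follows the same stage layout as the low8 variant;
   the only difference is that stage 2 pairs every high-frequency input
   in[15 - 2k] with its low-frequency partner in[2k] instead of skipping the
   upper half of the coefficients. */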
static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                               int bd, int out_shift) {
  (void)bit;
  __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
  __m128i a0_low, a0_high, a1_low, a1_high;
  __m128i zero = _mm_setzero_si128();
  offset = _mm_unpacklo_epi32(offset, zero);

  for (int i = 0; i < 16; i++) {
    a0_low = _mm_mul_epi32(in[i], fact);
    a0_low = _mm_add_epi32(a0_low, offset);
    a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);

    a0_high = _mm_srli_si128(in[i], 4);
    a0_high = _mm_mul_epi32(a0_high, fact);
    a0_high = _mm_add_epi32(a0_high, offset);
    a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);

    a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
    a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
    out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
  }

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    round_shift_8x8(out, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16);
  }
}

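/* iidentity16 scales each coefficient by 2*sqrt(2) in Q(NewSqrt2Bits) fixed
   point. The _mm_mul_epi32/unpack dance above exists because the product
   needs 64-bit headroom before the shift. A minimal scalar sketch of one
   lane (the constants are the real ones from the codebase; the helper name
   itself is hypothetical): */
static inline int32_t iidentity16_sketch(int32_t x) {
  // round(x * 2*sqrt(2)) with NewSqrt2 = round(sqrt(2) * 2^NewSqrt2Bits).
  const int64_t t = (int64_t)x * (2 * NewSqrt2) + (1 << (NewSqrt2Bits - 1));
  return (int32_t)(t >> NewSqrt2Bits);
}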
static inline void idct64_stage8_sse4_1(
    __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
    const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
    const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
    const __m128i *rnding, int bit) {
  int i;
  __m128i temp1, temp2, temp3, temp4;
  temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit);
  u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit);
  u[10] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit);
  u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit);
  u[11] = temp2;

  for (i = 16; i < 20; ++i) {
    addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
    addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo,
                  clamp_hi);
  }

  temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit);
  temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit);
  temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit);
  temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit);
  u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit);
  u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit);
  u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit);
  u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit);
  u[36] = temp1;
  u[37] = temp2;
  u[38] = temp3;
  u[39] = temp4;

  temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit);
  temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit);
  temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit);
  temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit);
  u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit);
  u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit);
  u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit);
  u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit);
  u[40] = temp1;
  u[41] = temp2;
  u[42] = temp3;
  u[43] = temp4;
}

static inline void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rnding, int bit) {
  int i;
  __m128i temp1, temp2, temp3, temp4;
  for (i = 0; i < 8; ++i) {
    addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
  }

  temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit);
  temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit);
  temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit);
  temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit);
  u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit);
  u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit);
  u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit);
  u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit);
  u[20] = temp1;
  u[21] = temp2;
  u[22] = temp3;
  u[23] = temp4;
  for (i = 32; i < 40; i++) {
    addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
  }

  for (i = 48; i < 56; i++) {
    addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
  }
}
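
/* A note on the XOR addressing used by the stage-8/9 loops above: for
 * i in [16, 20), i ^ 7 mirrors the index within its group of eight
 * (16 <-> 23, 17 <-> 22, ...), while i ^ 15 and i ^ 8 form the matching
 * pairs in the upper half (31 <-> 24, 30 <-> 25, ...). Each iteration
 * therefore performs one butterfly from the front of a group and one
 * from its mirrored back half.
 */
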
static inline void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
                                         const __m128i *cospi32,
                                         const __m128i *clamp_lo,
                                         const __m128i *clamp_hi,
                                         const __m128i *rnding, int bit) {
  __m128i temp1, temp2, temp3, temp4;
  for (int i = 0; i < 16; i++) {
    addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
  }

  temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit);
  temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit);
  temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit);
  temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit);
  u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit);
  u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit);
  u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit);
  u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit);
  u[40] = temp1;
  u[41] = temp2;
  u[42] = temp3;
  u[43] = temp4;

  temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit);
  temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit);
  temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit);
  temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit);
  u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit);
  u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit);
  u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit);
  u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit);
  u[44] = temp1;
  u[45] = temp2;
  u[46] = temp3;
  u[47] = temp4;
}

static inline void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
                                         int bd, int out_shift,
                                         const __m128i *clamp_lo,
                                         const __m128i *clamp_hi) {
  for (int i = 0; i < 32; i++) {
    addsub_sse4_1(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi);
  }

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    for (int i = 0; i < 64; i += 4) {
      round_shift_4x4(out + i, out_shift);
      highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, &clamp_hi_out,
                                4);
    }
  }
}
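
/* All of the stage helpers above are built from two primitives:
 * half_btf_sse4_1(w0, a, w1, b, rnding, bit), which computes the rotation
 * (w0 * a + w1 * b + (1 << (bit - 1))) >> bit per 32-bit lane, and
 * addsub_sse4_1, which produces the clamped sum and difference of a
 * butterfly pair in one call.
 */
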
static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);

  {
    __m128i x;

    // stage 1
    // stage 2
    // stage 3
    // stage 4
    // stage 5
    // stage 6
    x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);

    // stage 8
    // stage 9
    // stage 10
    // stage 11
    if (!do_cols) {
      const int log_range_out = AOMMAX(16, bd + 6);
      clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
      clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
      if (out_shift != 0) {
        __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
        x = _mm_add_epi32(x, offset);
        x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
      }
    }
    x = _mm_max_epi32(x, clamp_lo);
    x = _mm_min_epi32(x, clamp_hi);
    for (int i = 0; i < 64; ++i) out[i] = x;
  }
}
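
/* With only the DC coefficient present, every butterfly stage of the
 * 64-point IDCT degenerates to the same value, so idct64x64_low1 above
 * just scales in[0] by cospi[32] (about 1/sqrt(2) in Q(bit) fixed point),
 * applies the optional output rounding shift, clamps, and broadcasts the
 * result to all 64 output vectors.
 */
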
static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  int i, j;
  const int32_t *cospi = cospi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);

  {
    __m128i u[64];

    // stage 1
    u[0] = in[0];
    u[8] = in[4];
    u[16] = in[2];
    u[24] = in[6];
    u[32] = in[1];
    u[40] = in[5];
    u[48] = in[3];
    u[56] = in[7];

    // stage 2
    u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
    u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
    u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
    u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
    u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
    u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
    u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
    u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);

    // stage 3
    u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
    u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
    u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
    u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
    u[33] = u[32];
    u[38] = u[39];
    u[41] = u[40];
    u[46] = u[47];
    u[49] = u[48];
    u[54] = u[55];
    u[57] = u[56];
    u[62] = u[63];
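
    /* half_btf_0_sse4_1 is the single-input form of the butterfly:
     * (w * a + (1 << (bit - 1))) >> bit per lane. Stages 2-4 of this
     * reduced path can use it because, with so few non-zero inputs, each
     * output depends on only one source register. */
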
    // stage 4
    __m128i temp1, temp2;
    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
    u[17] = u[16];
    u[22] = u[23];
    u[25] = u[24];
    u[30] = u[31];

    temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
    u[33] = temp1;

    temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    u[57] = temp2;

    temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    u[41] = temp1;

    temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    u[46] = temp2;

    // stage 5
    u[9] = u[8];
    u[14] = u[15];

    temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
    u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
    u[17] = temp1;

    temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
    u[22] = temp2;

    u[35] = u[32];
    u[34] = u[33];
    u[36] = u[39];
    u[37] = u[38];
    u[43] = u[40];
    u[42] = u[41];
    u[44] = u[47];
    u[45] = u[46];
    u[51] = u[48];
    u[50] = u[49];
    u[52] = u[55];
    u[53] = u[54];
    u[59] = u[56];
    u[58] = u[57];
    u[60] = u[63];
    u[61] = u[62];

    // stage 6
    temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[0] = temp1;

    temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    u[9] = temp2;
    u[19] = u[16];
    u[18] = u[17];
    u[20] = u[23];
    u[21] = u[22];
    u[27] = u[24];
    u[26] = u[25];
    u[28] = u[31];
    u[29] = u[30];

    temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
    u[34] = temp1;
    temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    u[35] = temp2;
    temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    u[36] = temp1;
    temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    u[37] = temp2;
    temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    u[42] = temp1;
    temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    u[43] = temp2;
    temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    u[44] = temp1;
    temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    u[45] = temp2;

    // stage 7
    u[3] = u[0];
    u[2] = u[1];
    u[11] = u[8];
    u[10] = u[9];
    u[12] = u[15];
    u[13] = u[14];

    temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
    u[18] = temp1;
    temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
    u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
    u[19] = temp2;
    temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
    u[20] = temp1;
    temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
    u[21] = temp2;
    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                      &clamp_hi);
      }
    }

    // stage 8
    u[7] = u[0];
    u[6] = u[1];
    u[5] = u[2];
    u[4] = u[3];

    idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                         &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);

    // stage 9
    idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                         bit);

    // stage 10
    idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                          &rnding, bit);

    // stage 11
    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
  }
}
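
/* idct64x64_low8 above assumes only the first eight coefficients of each
 * input row/column (in[0]..in[7]) are non-zero, so stages 1-5 are mostly
 * copies and single-input butterflies; once all 64 lanes are populated,
 * the shared idct64_stage8..11 helpers finish the transform.
 */
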
static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  int i, j;
  const int32_t *cospi = cospi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);

  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);

  {
    __m128i u[64];
    __m128i tmp1, tmp2, tmp3, tmp4;
    // stage 1
    u[0] = in[0];
    u[32] = in[1];
    u[36] = in[9];
    u[40] = in[5];
    u[44] = in[13];
    u[48] = in[3];
    u[52] = in[11];
    u[56] = in[7];
    u[60] = in[15];
    u[16] = in[2];
    u[20] = in[10];
    u[24] = in[6];
    u[28] = in[14];
    u[4] = in[8];
    u[8] = in[4];
    u[12] = in[12];

    // stage 2
    u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
    u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
    u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
    u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
    u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
    u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
    u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
    u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
    u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
    u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
    u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
    u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
    u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
    u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
    u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
    u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);

    // stage 3
    u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
    u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
    u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
    u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
    u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
    u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
    u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
    u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
    u[33] = u[32];
    u[34] = u[35];
    u[37] = u[36];
    u[38] = u[39];
    u[41] = u[40];
    u[42] = u[43];
    u[45] = u[44];
    u[46] = u[47];
    u[49] = u[48];
    u[50] = u[51];
    u[53] = u[52];
    u[54] = u[55];
    u[57] = u[56];
    u[58] = u[59];
    u[61] = u[60];
    u[62] = u[63];

    // stage 4
    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
    u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
    u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);

    u[17] = u[16];
    u[18] = u[19];
    u[21] = u[20];
    u[22] = u[23];
    u[25] = u[24];
    u[26] = u[27];
    u[29] = u[28];
    u[30] = u[31];

    tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
    u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
    u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
    u[33] = tmp1;
    u[34] = tmp2;
    u[37] = tmp3;
    u[38] = tmp4;

    tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
    u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
    u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    u[41] = tmp1;
    u[42] = tmp2;
    u[45] = tmp3;
    u[46] = tmp4;

    // stage 5
    u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
    u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);

    u[9] = u[8];
    u[10] = u[11];
    u[13] = u[12];
    u[14] = u[15];

    tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
    u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
    u[17] = tmp1;
    u[18] = tmp2;
    u[21] = tmp3;
    u[22] = tmp4;

    for (i = 32; i < 64; i += 8) {
      addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    // stage 6
    tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[0] = tmp1;
    u[5] = u[4];
    u[6] = u[7];

    tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    u[9] = tmp1;
    tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    u[10] = tmp2;

    for (i = 16; i < 32; i += 8) {
      addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
    u[34] = tmp1;
    u[35] = tmp2;
    u[36] = tmp3;
    u[37] = tmp4;

    tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    u[42] = tmp1;
    u[43] = tmp2;
    u[44] = tmp3;
    u[45] = tmp4;

    // stage 7
    u[3] = u[0];
    u[2] = u[1];
    tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
    u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
    u[5] = tmp1;
    addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
    u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
    u[18] = tmp1;
    u[19] = tmp2;
    u[20] = tmp3;
    u[21] = tmp4;

    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                      &clamp_hi);
      }
    }

    // stage 8
    for (i = 0; i < 4; ++i) {
      addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
    }

    idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                         &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);

    // stage 9
    idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                         bit);

    // stage 10
    idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                          &rnding, bit);

    // stage 11
    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
  }
}
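
/* The full-coefficient 64-point IDCT below follows the same 11-stage
 * dataflow as the pruned paths, but ping-pongs between two scratch arrays
 * u[] and v[] instead of updating one array in place, since every lane is
 * live at every stage.
 */
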
static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                             int bd, int out_shift) {
  int i, j;
  const int32_t *cospi = cospi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);

  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
  const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
  const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);

  {
    __m128i u[64], v[64];

    // stage 1
    u[32] = in[1];
    u[34] = in[17];
    u[36] = in[9];
    u[38] = in[25];
    u[40] = in[5];
    u[42] = in[21];
    u[44] = in[13];
    u[46] = in[29];
    u[48] = in[3];
    u[50] = in[19];
    u[52] = in[11];
    u[54] = in[27];
    u[56] = in[7];
    u[58] = in[23];
    u[60] = in[15];
    u[62] = in[31];

    v[16] = in[2];
    v[18] = in[18];
    v[20] = in[10];
    v[22] = in[26];
    v[24] = in[6];
    v[26] = in[22];
    v[28] = in[14];
    v[30] = in[30];

    u[8] = in[4];
    u[10] = in[20];
    u[12] = in[12];
    u[14] = in[28];

    v[4] = in[8];
    v[6] = in[24];

    u[0] = in[0];
    u[2] = in[16];

    // stage 2
    v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
    v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
    v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
    v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
    v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
    v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
    v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
    v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
    v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
    v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
    v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
    v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
    v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
    v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
    v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
    v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
    v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
    v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
    v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
    v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
    v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
    v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
    v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
    v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
    v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
    v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
    v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
    v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
    v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
    v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
    v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
    v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);

    // stage 3
    u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
    u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
    u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
    u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
    u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
    u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
    u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit);
    u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit);
    u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit);
    u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit);
    u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit);
    u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit);
    u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit);
    u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit);
    u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit);
    u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit);

    for (i = 32; i < 64; i += 4) {
      addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
                    &clamp_hi);
    }

    // stage 4
    v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
    v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
    v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
    v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
    v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
    v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
    v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
    v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);

    for (i = 16; i < 32; i += 4) {
      addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
                    &clamp_hi);
    }

    for (i = 32; i < 64; i += 4) {
      v[i + 0] = u[i + 0];
      v[i + 3] = u[i + 3];
    }

    v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
    v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
    v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
    v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
    v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
    v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
    v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
    v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);

    // stage 5
    u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
    u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
    u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
    u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);

    for (i = 8; i < 16; i += 4) {
      addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
                    &clamp_hi);
    }

    for (i = 16; i < 32; i += 4) {
      u[i + 0] = v[i + 0];
      u[i + 3] = v[i + 3];
    }

    u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
    u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
    u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
    u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
    u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);

    for (i = 32; i < 64; i += 8) {
      addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    // stage 6
    v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
    v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);

    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);

    for (i = 8; i < 16; i += 4) {
      v[i + 0] = u[i + 0];
      v[i + 3] = u[i + 3];
    }

    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);

    for (i = 16; i < 32; i += 8) {
      addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    for (i = 32; i < 64; i += 8) {
      v[i + 0] = u[i + 0];
      v[i + 1] = u[i + 1];
      v[i + 6] = u[i + 6];
      v[i + 7] = u[i + 7];
    }

    v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);

    // stage 7
    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);

    u[4] = v[4];
    u[7] = v[7];
    u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
    u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);

    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    for (i = 16; i < 32; i += 8) {
      u[i + 0] = v[i + 0];
      u[i + 1] = v[i + 1];
      u[i + 6] = v[i + 6];
      u[i + 7] = v[i + 7];
    }

    u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
    u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
    u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
    u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
    u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);

    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                      &clamp_hi);
      }
    }
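
    /* Stages 8-10 below perform the same operations as the shared
     * idct64_stage8/9/10 helpers, written out against the u[]/v[]
     * ping-pong buffers that this full path uses. */
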
    // stage 8
    for (i = 0; i < 4; ++i) {
      addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
    }

    v[8] = u[8];
    v[9] = u[9];
    v[14] = u[14];
    v[15] = u[15];

    v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
    v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
    v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
    v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);

    for (i = 16; i < 20; ++i) {
      addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
      addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
                    &clamp_hi);
    }

    for (i = 32; i < 36; ++i) {
      v[i] = u[i];
      v[i + 12] = u[i + 12];
      v[i + 16] = u[i + 16];
      v[i + 28] = u[i + 28];
    }

    v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
    v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
    v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
    v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
    v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
    v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
    v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
    v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
    v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
    v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
    v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
    v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
    v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
    v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);

    // stage 9
    for (i = 0; i < 8; ++i) {
      addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
    }

    for (i = 16; i < 20; ++i) {
      u[i] = v[i];
      u[i + 12] = v[i + 12];
    }

    u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
    u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
    u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
    u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
    u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);

    for (i = 32; i < 40; i++) {
      addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
    }

    for (i = 48; i < 56; i++) {
      addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
    }

    // stage 10
    for (i = 0; i < 16; i++) {
      addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
    }

    for (i = 32; i < 40; i++) v[i] = u[i];

    v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
    v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
    v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
    v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
    v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
    v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
    v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
    v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
    v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
    v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
    v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
    v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
    v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
    v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);

    for (i = 56; i < 64; i++) v[i] = u[i];

    // stage 11
    for (i = 0; i < 32; i++) {
      addsub_sse4_1(v[i], v[63 - i], out + i, out + 63 - i, &clamp_lo,
                    &clamp_hi);
    }

    if (!do_cols) {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
      const __m128i clamp_hi_out =
          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
      for (i = 0; i < 64; i += 4) {
        round_shift_4x4(out + i, out_shift);
        highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out,
                                  &clamp_hi_out, 4);
      }
    }
  }
}

static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1;

  // stage 0
  // stage 1
  bf1 = in[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit);

  // stage 6
  // stage 7
  // stage 8
  // stage 9
  if (do_cols) {
    bf1 = _mm_max_epi32(bf1, clamp_lo);
    bf1 = _mm_min_epi32(bf1, clamp_hi);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
    if (out_shift != 0) {
      __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
      bf1 = _mm_add_epi32(bf1, offset);
      bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
    }
  }

  bf1 = _mm_max_epi32(bf1, clamp_lo);
  bf1 = _mm_min_epi32(bf1, clamp_hi);
  for (int i = 0; i < 32; ++i) out[i] = bf1;
}
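
/* As with idct64x64_low1, the DC-only 32x32 path above reduces to one
 * half-butterfly (in[0] scaled by cospi[32], roughly 1/sqrt(2) in Q(bit)
 * fixed point), optional output rounding, clamping, and a broadcast of
 * the result to all 32 output vectors.
 */
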
static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1[32];

  // stage 0
  // stage 1
  bf1[0] = in[0];
  bf1[4] = in[4];
  bf1[8] = in[2];
  bf1[12] = in[6];
  bf1[16] = in[1];
  bf1[20] = in[5];
  bf1[24] = in[3];
  bf1[28] = in[7];

  // stage 2
  bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
  bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
  bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
  bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
  bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
  bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
  bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
  bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);

  // stage 3
  bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
  bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);

  bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
  bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
  bf1[17] = bf1[16];
  bf1[18] = bf1[19];
  bf1[21] = bf1[20];
  bf1[22] = bf1[23];
  bf1[25] = bf1[24];
  bf1[26] = bf1[27];
  bf1[29] = bf1[28];
  bf1[30] = bf1[31];

  // stage 4
  bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
  bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);

  bf1[9] = bf1[8];
  bf1[10] = bf1[11];
  bf1[13] = bf1[12];
  bf1[14] = bf1[15];

  idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
                       &cospi24, &cospi40, &cospim24, &rounding, bit);

  // stage 5
  bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
  bf1[1] = bf1[0];
  bf1[5] = bf1[4];
  bf1[6] = bf1[7];

  idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
                       &clamp_hi, &rounding, bit);

  // stage 6
  bf1[3] = bf1[0];
  bf1[2] = bf1[1];

  idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);

static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1[32];

  // stage 0
  // stage 1

  bf1[0] = in[0];
  bf1[2] = in[8];
  bf1[4] = in[4];
  bf1[6] = in[12];
  bf1[8] = in[2];
  bf1[10] = in[10];
  bf1[12] = in[6];
  bf1[14] = in[14];
  bf1[16] = in[1];
  bf1[18] = in[9];
  bf1[20] = in[5];
  bf1[22] = in[13];
  bf1[24] = in[3];
  bf1[26] = in[11];
  bf1[28] = in[7];
  bf1[30] = in[15];

  // stage 2
  bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
  bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
  bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit);
  bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit);
  bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit);
  bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit);
  bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
  bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
  bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
  bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
  bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit);
  bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit);
  bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit);
  bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit);
  bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
  bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);

  // stage 3
  bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
  bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
  bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit);
  bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit);
  bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit);
  bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit);
  bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
  bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);

  addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);

  // stage 4
  bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
  bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
  bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit);
  bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit);

  addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);

  idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
                       &cospi24, &cospi40, &cospim24, &rounding, bit);

  // stage 5
  bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
  bf1[1] = bf1[0];
  bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit);
  bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit);

  addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);

  idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
                       &clamp_hi, &rounding, bit);

  // stage 6
  addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);

  idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);

  // stage 7
  idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 8
  idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 9
  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
}
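
// Full 32-point IDCT, used once the eob exceeds the low16 cutoff. The nine
// stages ping-pong between bf1[] and bf0[]; each half_btf_sse4_1 call is one
// butterfly output, roughly
//
//   y = (w0 * a + w1 * b + (1 << (bit - 1))) >> bit
//
// with the cospi constants below supplying the weights w0/w1, while
// addsub_sse4_1 produces the clamped sum/difference pairs.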

static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                             int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
  const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
  const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1[32], bf0[32];

  // stage 0
  // stage 1
  bf1[0] = in[0];
  bf1[1] = in[16];
  bf1[2] = in[8];
  bf1[3] = in[24];
  bf1[4] = in[4];
  bf1[5] = in[20];
  bf1[6] = in[12];
  bf1[7] = in[28];
  bf1[8] = in[2];
  bf1[9] = in[18];
  bf1[10] = in[10];
  bf1[11] = in[26];
  bf1[12] = in[6];
  bf1[13] = in[22];
  bf1[14] = in[14];
  bf1[15] = in[30];
  bf1[16] = in[1];
  bf1[17] = in[17];
  bf1[18] = in[9];
  bf1[19] = in[25];
  bf1[20] = in[5];
  bf1[21] = in[21];
  bf1[22] = in[13];
  bf1[23] = in[29];
  bf1[24] = in[3];
  bf1[25] = in[19];
  bf1[26] = in[11];
  bf1[27] = in[27];
  bf1[28] = in[7];
  bf1[29] = in[23];
  bf1[30] = in[15];
  bf1[31] = in[31];

  // stage 2
  bf0[0] = bf1[0];
  bf0[1] = bf1[1];
  bf0[2] = bf1[2];
  bf0[3] = bf1[3];
  bf0[4] = bf1[4];
  bf0[5] = bf1[5];
  bf0[6] = bf1[6];
  bf0[7] = bf1[7];
  bf0[8] = bf1[8];
  bf0[9] = bf1[9];
  bf0[10] = bf1[10];
  bf0[11] = bf1[11];
  bf0[12] = bf1[12];
  bf0[13] = bf1[13];
  bf0[14] = bf1[14];
  bf0[15] = bf1[15];
  bf0[16] =
      half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
  bf0[17] =
      half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
  bf0[18] =
      half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
  bf0[19] =
      half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
  bf0[20] =
      half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
  bf0[21] =
      half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
  bf0[22] =
      half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
  bf0[23] =
      half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
  bf0[24] =
      half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
  bf0[25] =
      half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
  bf0[26] =
      half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
  bf0[27] =
      half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
  bf0[28] =
      half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
  bf0[29] =
      half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
  bf0[30] =
      half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
  bf0[31] =
      half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);

  // stage 3
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] =
      half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
  bf1[9] =
      half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
  bf1[10] =
      half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
  bf1[11] =
      half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
  bf1[12] =
      half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
  bf1[13] =
      half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
  bf1[14] =
      half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
  bf1[15] =
      half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);

  addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);

  // stage 4
  bf0[0] = bf1[0];
  bf0[1] = bf1[1];
  bf0[2] = bf1[2];
  bf0[3] = bf1[3];
  bf0[4] =
      half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
  bf0[5] =
      half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
  bf0[6] =
      half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
  bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);

  addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);

  bf0[16] = bf1[16];
  bf0[17] =
      half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
  bf0[18] =
      half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
  bf0[19] = bf1[19];
  bf0[20] = bf1[20];
  bf0[21] =
      half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
  bf0[22] =
      half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
  bf0[23] = bf1[23];
  bf0[24] = bf1[24];
  bf0[25] =
      half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
  bf0[26] =
      half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
  bf0[27] = bf1[27];
  bf0[28] = bf1[28];
  bf0[29] =
      half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
  bf0[30] =
      half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
  bf0[31] = bf1[31];

  // stage 5
  bf1[0] =
      half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
  bf1[1] =
      half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
  bf1[2] =
      half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
  bf1[3] =
      half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
  addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
  bf1[8] = bf0[8];
  bf1[9] =
      half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
  bf1[10] =
      half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
  bf1[11] = bf0[11];
  bf1[12] = bf0[12];
  bf1[13] =
      half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
  bf1[14] =
      half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
  bf1[15] = bf0[15];
  addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);

  // stage 6
  addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
  bf0[4] = bf1[4];
  bf0[5] =
      half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
  bf0[6] =
      half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
  bf0[7] = bf1[7];
  addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
  bf0[16] = bf1[16];
  bf0[17] = bf1[17];
  bf0[18] =
      half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
  bf0[19] =
      half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
  bf0[20] =
      half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
  bf0[21] =
      half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
  bf0[22] = bf1[22];
  bf0[23] = bf1[23];
  bf0[24] = bf1[24];
  bf0[25] = bf1[25];
  bf0[26] =
      half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
  bf0[27] =
      half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
  bf0[28] =
      half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
  bf0[29] =
      half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
  bf0[30] = bf1[30];
  bf0[31] = bf1[31];

  // stage 7
  addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] =
      half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
  bf1[11] =
      half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
  bf1[12] =
      half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
  bf1[13] =
      half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
  bf1[14] = bf0[14];
  bf1[15] = bf0[15];
  addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);

  // stage 8
  addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
  bf0[16] = bf1[16];
  bf0[17] = bf1[17];
  bf0[18] = bf1[18];
  bf0[19] = bf1[19];
  bf0[20] =
      half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
  bf0[21] =
      half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
  bf0[22] =
      half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
  bf0[23] =
      half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
  bf0[24] =
      half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
  bf0[25] =
      half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
  bf0[26] =
      half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
  bf0[27] =
      half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
  bf0[28] = bf1[28];
  bf0[29] = bf1[29];
  bf0[30] = bf1[30];
  bf0[31] = bf1[31];

  // stage 9
  addsub_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out =
        _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
    round_shift_8x8(out, out_shift);
    round_shift_8x8(out + 16, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
  }
}

static void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input,
                                               uint8_t *dest, int stride,
                                               const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const int32_t *src = cast_to_int32(input);
  switch (tx_type) {
    case IDTX:
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
    case V_DCT:
    case V_ADST:
    case V_FLIPADST:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
                                                txfm_param->tx_size,
                                                txfm_param->eob, bd);
      break;
    default:
      av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, bd);
      break;
  }
}

static void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input,
                                               uint8_t *dest, int stride,
                                               const TxfmParam *txfm_param) {
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
  int eob = txfm_param->eob;
  int bd = txfm_param->bd;
  int lossless = txfm_param->lossless;
  const int32_t *src = cast_to_int32(input);
  const TX_TYPE tx_type = txfm_param->tx_type;
  if (lossless) {
    assert(tx_type == DCT_DCT);
    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
    return;
  }
  av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                                bd);
}

static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                               int bd, int out_shift) {
  (void)bit;
  for (int i = 0; i < 32; i += 16) {
    out[i] = _mm_slli_epi32(in[i], 2);
    out[i + 1] = _mm_slli_epi32(in[i + 1], 2);
    out[i + 2] = _mm_slli_epi32(in[i + 2], 2);
    out[i + 3] = _mm_slli_epi32(in[i + 3], 2);
    out[i + 4] = _mm_slli_epi32(in[i + 4], 2);
    out[i + 5] = _mm_slli_epi32(in[i + 5], 2);
    out[i + 6] = _mm_slli_epi32(in[i + 6], 2);
    out[i + 7] = _mm_slli_epi32(in[i + 7], 2);
    out[i + 8] = _mm_slli_epi32(in[i + 8], 2);
    out[i + 9] = _mm_slli_epi32(in[i + 9], 2);
    out[i + 10] = _mm_slli_epi32(in[i + 10], 2);
    out[i + 11] = _mm_slli_epi32(in[i + 11], 2);
    out[i + 12] = _mm_slli_epi32(in[i + 12], 2);
    out[i + 13] = _mm_slli_epi32(in[i + 13], 2);
    out[i + 14] = _mm_slli_epi32(in[i + 14], 2);
    out[i + 15] = _mm_slli_epi32(in[i + 15], 2);
  }

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out =
        _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
    round_shift_8x8(out, out_shift);
    round_shift_8x8(out + 16, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
  }
}
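
// 1-D kernels for the row/column passes, indexed as
// [tx size][transform family (DCT/ADST/identity)][eob bucket]. The last index
// picks an implementation by how much of the input is known to be nonzero;
// for the 32-point DCT, slot 0 is the DC-only path, slot 1 assumes 8 nonzero
// rows, slot 2 assumes 16, and slot 3 is the full transform (smaller sizes
// populate fewer slots). Callers derive the slot from the eob through
// lowbd_txfm_all_1d_zeros_idx[], so mostly-empty coefficient blocks never pay
// for a full-length transform.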

static const transform_1d_sse4_1
    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { idct4x4_sse4_1, NULL, NULL, NULL },
          { iadst4x4_sse4_1, NULL, NULL, NULL },
          { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL },
      },
      { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
        { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
        { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } },
      {
          { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
            NULL },
          { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
            NULL },
          { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL },
      },
      { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
          idct32x32_sse4_1 },
        { NULL, NULL, NULL, NULL },
        { iidentity32_sse4_1, NULL, NULL, NULL } },
      { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
          idct64x64_sse4_1 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };

static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
  __m128i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w = AOMMIN(32, txfm_size_col);
  const int buf_size_w_div4 = buf_size_w >> 2;
  const int buf_size_h_div8 = (eoby + 8) >> 3;
  const int row_max = AOMMIN(32, txfm_size_row);
  const int input_stride = row_max;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
    __m128i buf0[16];
    load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0,
                                           NewInvSqrt2);
    }
    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;

    for (int j = 0; j < buf_size_w_div4; ++j) {
      __m128i *buf0_cur = buf0 + j * 4;
      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
      _buf1[j * txfm_size_row + 0] = buf0_cur[0];
      _buf1[j * txfm_size_row + 1] = buf0_cur[1];
      _buf1[j * txfm_size_row + 2] = buf0_cur[2];
      _buf1[j * txfm_size_row + 3] = buf0_cur[3];
    }
  }
  for (int i = 0; i < buf_size_w_div4; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
             bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  for (int i = 0; i < (txfm_size_col >> 3); i++) {
    highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i,
                                   stride, ud_flip, txfm_size_row, bd);
  }
}
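
// Note on the intermediate layout shared by these 2-D drivers: after the row
// pass, each 4x4 tile is transposed in registers (TRANSPOSE_4X4) and stored
// so that one output column occupies txfm_size_row consecutive __m128i
// entries of buf1. The column kernel can then run in place on
// buf1 + i * txfm_size_row without any further shuffling.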

static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
  __m128i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div4 = AOMMIN(32, txfm_size_col) >> 2;
  const int row_max = AOMMIN(32, txfm_size_row);
  const int input_stride = row_max;
  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
  const int buf_size_nonzero_w = buf_size_nonzero_w_div8 << 3;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  for (int i = 0; i < (row_max >> 2); ++i) {
    __m128i buf0[16];
    load_buffer_32bit_input(input + i * 4, input_stride, buf0,
                            buf_size_nonzero_w);
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_nonzero_w, 0,
                                           NewInvSqrt2);
    }
    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div4; ++j) {
        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                      buf0[4 * j],
                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
      }
    } else {
      for (int j = 0; j < buf_size_w_div4; ++j) {
        TRANSPOSE_4X4(
            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
      }
    }
  }
  for (int i = 0; i < buf_size_w_div4; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
             bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  {
    for (int i = 0; i < (txfm_size_col >> 3); i++) {
      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
                                     output + 8 * i, stride, ud_flip,
                                     txfm_size_row, bd);
    }
  }
}

static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input,
                                              uint16_t *output, int stride,
                                              TX_TYPE tx_type, TX_SIZE tx_size,
                                              int eob, const int bd) {
  (void)eob;
  __m128i buf1[64 * 4];
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int row_max = AOMMIN(32, txfm_size_row);
  const int input_stride = row_max;
  const int buf_size_w = AOMMIN(32, txfm_size_col);
  const int buf_size_w_div4 = buf_size_w >> 2;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];

  for (int i = 0; i < (row_max >> 2); ++i) {
    __m128i buf0[32];
    load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0,
                                           NewInvSqrt2);
    }
    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;
    for (int j = 0; j < buf_size_w_div4; ++j) {
      __m128i *buf0_cur = buf0 + j * 4;
      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
      _buf1[j * txfm_size_row + 0] = buf0_cur[0];
      _buf1[j * txfm_size_row + 1] = buf0_cur[1];
      _buf1[j * txfm_size_row + 2] = buf0_cur[2];
      _buf1[j * txfm_size_row + 3] = buf0_cur[3];
    }
  }
  for (int i = 0; i < buf_size_w_div4; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
             bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  {
    for (int i = 0; i < (txfm_size_col >> 3); i++) {
      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
                                     output + 8 * i, stride, 0, txfm_size_row,
                                     bd);
    }
  }
}
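
// General 2-D path for the DCT/ADST/FLIPADST combinations: a row transform
// over just the nonzero region implied by the eob, a 4x4-tile transpose (with
// an optional left/right flip), a full-height column transform, and a final
// round shift before highbd_write_buffer_8xn_sse4_1 adds the residual to the
// prediction.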

static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
  __m128i buf1[64 * 16];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div4 = txfm_size_col >> 2;
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
    __m128i buf0[64];
    load_buffer_32bit_input(input + i * 4, input_stride, buf0,
                            buf_size_nonzero_w);
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_nonzero_w, 0,
                                           NewInvSqrt2);
    }
    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div4; ++j) {
        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                      buf0[4 * j],
                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
      }
    } else {
      for (int j = 0; j < buf_size_w_div4; ++j) {
        TRANSPOSE_4X4(
            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
      }
    }
  }
  // 2nd stage: column transform
  for (int i = 0; i < buf_size_w_div4; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
             bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  {
    for (int i = 0; i < (txfm_size_col >> 3); i++) {
      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
                                     output + 8 * i, stride, ud_flip,
                                     txfm_size_row, bd);
    }
  }
}

static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input,
                                            uint16_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size,
                                            int eob, const int bd) {
  (void)eob;
  __m128i buf1[8];
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];
  const int input_stride = AOMMIN(32, txfm_size_row);

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[8];
  load_buffer_32bit_input(input, input_stride, buf0, txfm_size_col);
  load_buffer_32bit_input(input + 4, input_stride, buf0 + 4, txfm_size_col);
  av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0,
                                       NewInvSqrt2);
  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]);

  if (lr_flip) {
    TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
                  buf1[3]);

    TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
                  buf1[7]);
  } else {
    TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
                  buf1[3]);

    TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
                  buf1[7]);
  }

  // 2nd stage: column transform
  col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);

  av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);

  // write to buffer
  highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
                                 bd);
}
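
// The 4x8 driver above and the 8x4 driver below are 2:1 rectangles, whose
// 2-D transform gain is off by a factor of sqrt(2); they therefore pre-scale
// the coefficients by 1/sqrt(2) unconditionally
// (av1_round_shift_rect_array_32_sse4_1 with NewInvSqrt2), whereas the
// generic drivers apply the same correction only when
// get_rect_tx_log_ratio reports rect_type +/-1.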

static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input,
                                            uint16_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size,
                                            int eob, const int bd) {
  (void)eob;
  __m128i buf1[8];
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[8];
  const int32_t *input_row = input;
  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);

  av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_col, 0,
                                       NewInvSqrt2);
  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

  __m128i *buf1_ptr;
  if (lr_flip) {
    flip_buf_sse2(buf0, buf1, txfm_size_col);
    buf1_ptr = buf1;
  } else {
    buf1_ptr = buf0;
  }

  // 2nd stage: column transform
  for (int i = 0; i < 2; i++) {
    __m128i *buf1_cur = buf1_ptr + i * txfm_size_row;
    transpose_32bit_4x4(buf1_cur, buf1_cur);
    col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
  }
  av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
  // write to buffer
  highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip,
                                 txfm_size_row, bd);
}

static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input,
                                              uint16_t *output, int stride,
                                              TX_TYPE tx_type, TX_SIZE tx_size,
                                              int eob, const int bd) {
  (void)eob;
  __m128i buf1[16];
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_h_div8 = txfm_size_row >> 2;
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
  const int input_stride = AOMMIN(32, txfm_size_row);

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[16];
  for (int i = 0; i < (txfm_size_row >> 2); i++) {
    const int32_t *input_row = input + i * 4;
    __m128i *buf0_cur = buf0 + i * 4;
    load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_col);
    row_txfm(buf0_cur, buf0_cur, INV_COS_BIT, 0, bd, -shift[0]);
  }

  if (lr_flip) {
    for (int j = 0; j < buf_size_h_div8; ++j) {
      TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                    buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
                    buf1[4 * j + 3]);
    }
  } else {
    for (int j = 0; j < buf_size_h_div8; ++j) {
      TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
                    buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
                    buf1[4 * j + 2], buf1[4 * j + 3]);
    }
  }

  // 2nd stage: column transform
  col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);

  av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);

  // write to buffer
  highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
                                 bd);
}

static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input,
                                              uint16_t *output, int stride,
                                              TX_TYPE tx_type, TX_SIZE tx_size,
                                              int eob, const int bd) {
  (void)eob;
  __m128i buf1[16];
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 2;
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[16];
  const int32_t *input_row = input;
  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);

  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

  __m128i *buf1_ptr;
  if (lr_flip) {
    flip_buf_sse2(buf0, buf1, txfm_size_col);
    buf1_ptr = buf1;
  } else {
    buf1_ptr = buf0;
  }

  // 2nd stage: column transform
  for (int i = 0; i < buf_size_w_div8; i++) {
    __m128i *buf1_cur = buf1_ptr + i * txfm_size_row;
    transpose_32bit_4x4(buf1_cur, buf1_cur);
    col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
  }
  av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);

  // write to buffer
  for (int i = 0; i < (txfm_size_col >> 3); i++) {
    highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2,
                                   output + 8 * i, stride, ud_flip,
                                   txfm_size_row, bd);
  }
}

void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
                                               uint8_t *output, int stride,
                                               TX_TYPE tx_type, TX_SIZE tx_size,
                                               int eob, const int bd) {
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
    case FLIPADST_DCT:
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
      highbd_inv_txfm2d_add_no_identity_sse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    case V_DCT:
    case V_ADST:
    case V_FLIPADST:
      highbd_inv_txfm2d_add_h_identity_ssse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
      highbd_inv_txfm2d_add_v_identity_ssse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    case IDTX:
      highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output),
                                        stride, tx_type, tx_size, eob, bd);
      break;
    default: assert(0); break;
  }
}
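
// Thin adapters from the TxfmParam-based entry points to the 2-D drivers
// above: they only unpack bd, tx_type, tx_size and eob, and convert the
// destination pointer with CONVERT_TO_SHORTPTR.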

static void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input,
                                               uint8_t *dest, int stride,
                                               const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, tx_size, eob, bd);
}

static void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input,
                                               uint8_t *dest, int stride,
                                               const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, tx_size, eob, bd);
}

static void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input,
                                                uint8_t *dest, int stride,
                                                const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, tx_size, eob, bd);
}

static void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input,
                                                uint8_t *dest, int stride,
                                                const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, tx_size, eob, bd);
}

void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
                                    int stride, const TxfmParam *txfm_param) {
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
  const TX_SIZE tx_size = txfm_param->tx_size;
  switch (tx_size) {
    case TX_8X8:
      av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_4X8:
      av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_8X4:
      av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_4X4:
      av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_16X4:
      av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_4X16:
      av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
      break;
    default:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(
          input, dest, stride, txfm_param->tx_type, tx_size, txfm_param->eob,
          txfm_param->bd);
      break;
  }
}