highbd_fwd_txfm_sse4.c (95993B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 #include <assert.h> 12 #include <smmintrin.h> /* SSE4.1 */ 13 14 #include "aom_dsp/txfm_common.h" 15 #include "aom_dsp/x86/transpose_sse2.h" 16 #include "aom_dsp/x86/txfm_common_sse2.h" 17 #include "aom_ports/mem.h" 18 #include "av1/common/av1_txfm.h" 19 #include "av1/common/x86/highbd_txfm_utility_sse4.h" 20 #include "av1/encoder/av1_fwd_txfm1d_cfg.h" 21 #include "av1/encoder/x86/av1_txfm1d_sse4.h" 22 #include "config/aom_config.h" 23 #include "config/av1_rtcd.h" 24 25 static inline void store_output_w4(int32_t *const out, const __m128i *const in, 26 const int stride, const int out_size) { 27 for (int i = 0; i < out_size; ++i) { 28 _mm_store_si128((__m128i *)(out + i * stride), in[i]); 29 } 30 } 31 32 void av1_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output, int stride) { 33 __m128i in[4]; 34 in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 35 in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); 36 in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); 37 in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); 38 39 // Convert to int32_t. 40 __m128i op[4]; 41 op[0] = _mm_cvtepi16_epi32(in[0]); 42 op[1] = _mm_cvtepi16_epi32(in[1]); 43 op[2] = _mm_cvtepi16_epi32(in[2]); 44 op[3] = _mm_cvtepi16_epi32(in[3]); 45 46 for (int i = 0; i < 2; ++i) { 47 __m128i a1 = op[0]; 48 __m128i b1 = op[1]; 49 __m128i c1 = op[2]; 50 __m128i d1 = op[3]; 51 __m128i e1; 52 53 a1 = _mm_add_epi32(a1, b1); // a1 += b1 54 d1 = _mm_sub_epi32(d1, c1); // d1 = d1 - c1 55 e1 = _mm_sub_epi32(a1, d1); // e1 = (a1 - d1) >> 1 56 e1 = _mm_srai_epi32(e1, 1); 57 b1 = _mm_sub_epi32(e1, b1); // b1 = e1 - b1 58 c1 = _mm_sub_epi32(e1, c1); // c1 = e1 - c1 59 a1 = _mm_sub_epi32(a1, c1); // a1 -= c1 60 d1 = _mm_add_epi32(d1, b1); // d1 += b1 61 62 op[0] = a1; 63 op[1] = c1; 64 op[2] = d1; 65 op[3] = b1; 66 67 if (i == 0) { 68 transpose_32bit_4x4(op, op); 69 } 70 } 71 72 op[0] = _mm_slli_epi32(op[0], UNIT_QUANT_SHIFT); 73 op[1] = _mm_slli_epi32(op[1], UNIT_QUANT_SHIFT); 74 op[2] = _mm_slli_epi32(op[2], UNIT_QUANT_SHIFT); 75 op[3] = _mm_slli_epi32(op[3], UNIT_QUANT_SHIFT); 76 77 _mm_storeu_si128((__m128i *)(output + 0), op[0]); 78 _mm_storeu_si128((__m128i *)(output + 4), op[1]); 79 _mm_storeu_si128((__m128i *)(output + 8), op[2]); 80 _mm_storeu_si128((__m128i *)(output + 12), op[3]); 81 } 82 83 static inline void load_buffer_4x4(const int16_t *input, __m128i *in, 84 int stride, int flipud, int fliplr, 85 int shift) { 86 if (!flipud) { 87 in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 88 in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); 89 in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); 90 in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); 91 } else { 92 in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); 93 in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); 94 in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); 95 in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 96 } 97 98 if (fliplr) { 99 in[0] = _mm_shufflelo_epi16(in[0], 0x1b); 100 in[1] = _mm_shufflelo_epi16(in[1], 0x1b); 101 in[2] = _mm_shufflelo_epi16(in[2], 0x1b); 102 in[3] = _mm_shufflelo_epi16(in[3], 0x1b); 103 } 104 105 in[0] = _mm_cvtepi16_epi32(in[0]); 106 in[1] = _mm_cvtepi16_epi32(in[1]); 107 in[2] = _mm_cvtepi16_epi32(in[2]); 108 in[3] = _mm_cvtepi16_epi32(in[3]); 109 110 in[0] = _mm_slli_epi32(in[0], shift); 111 in[1] = _mm_slli_epi32(in[1], shift); 112 in[2] = _mm_slli_epi32(in[2], shift); 113 in[3] = _mm_slli_epi32(in[3], shift); 114 } 115 116 // We only use stage-2 bit; 117 // shift[0] is used in load_buffer_4x4() 118 // shift[1] is used in txfm_func_col() 119 // shift[2] is used in txfm_func_row() 120 static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int bit, 121 const int num_col) { 122 const int32_t *cospi = cospi_arr(bit); 123 const __m128i cospi32 = _mm_set1_epi32(cospi[32]); 124 const __m128i cospi48 = _mm_set1_epi32(cospi[48]); 125 const __m128i cospi16 = _mm_set1_epi32(cospi[16]); 126 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); 127 __m128i s0, s1, s2, s3; 128 __m128i u0, u1, u2, u3; 129 __m128i v0, v1, v2, v3; 130 131 int endidx = 3 * num_col; 132 s0 = _mm_add_epi32(in[0], in[endidx]); 133 s3 = _mm_sub_epi32(in[0], in[endidx]); 134 endidx -= num_col; 135 s1 = _mm_add_epi32(in[num_col], in[endidx]); 136 s2 = _mm_sub_epi32(in[num_col], in[endidx]); 137 138 // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit); 139 u0 = _mm_mullo_epi32(s0, cospi32); 140 u1 = _mm_mullo_epi32(s1, cospi32); 141 u2 = _mm_add_epi32(u0, u1); 142 v0 = _mm_sub_epi32(u0, u1); 143 144 u3 = _mm_add_epi32(u2, rnding); 145 v1 = _mm_add_epi32(v0, rnding); 146 147 u0 = _mm_srai_epi32(u3, bit); 148 u2 = _mm_srai_epi32(v1, bit); 149 150 // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit); 151 v0 = _mm_mullo_epi32(s2, cospi48); 152 v1 = _mm_mullo_epi32(s3, cospi16); 153 v2 = _mm_add_epi32(v0, v1); 154 155 v3 = _mm_add_epi32(v2, rnding); 156 u1 = _mm_srai_epi32(v3, bit); 157 158 v0 = _mm_mullo_epi32(s2, cospi16); 159 v1 = _mm_mullo_epi32(s3, cospi48); 160 v2 = _mm_sub_epi32(v1, v0); 161 162 v3 = _mm_add_epi32(v2, rnding); 163 u3 = _mm_srai_epi32(v3, bit); 164 165 // Note: shift[1] and shift[2] are zeros 166 167 out[0] = u0; 168 out[1] = u1; 169 out[2] = u2; 170 out[3] = u3; 171 } 172 173 static inline void write_buffer_4x4(__m128i *res, int32_t *output) { 174 _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); 175 _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); 176 _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); 177 _mm_store_si128((__m128i *)(output + 3 * 4), res[3]); 178 } 179 180 static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, 181 const int num_col) { 182 const int32_t *sinpi = sinpi_arr(bit); 183 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); 184 const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); 185 const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]); 186 const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]); 187 const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]); 188 __m128i t; 189 __m128i s0, s1, s2, s3, s4, s5, s6, s7; 190 __m128i x0, x1, x2, x3; 191 __m128i u0, u1, u2, u3; 192 193 int idx = 0 * num_col; 194 s0 = _mm_mullo_epi32(in[idx], sinpi1); 195 s1 = _mm_mullo_epi32(in[idx], sinpi4); 196 t = _mm_add_epi32(in[idx], in[idx + num_col]); 197 idx += num_col; 198 s2 = _mm_mullo_epi32(in[idx], sinpi2); 199 s3 = _mm_mullo_epi32(in[idx], sinpi1); 200 idx += num_col; 201 s4 = _mm_mullo_epi32(in[idx], sinpi3); 202 idx += num_col; 203 s5 = _mm_mullo_epi32(in[idx], sinpi4); 204 s6 = _mm_mullo_epi32(in[idx], sinpi2); 205 s7 = _mm_sub_epi32(t, in[idx]); 206 207 t = _mm_add_epi32(s0, s2); 208 x0 = _mm_add_epi32(t, s5); 209 x1 = _mm_mullo_epi32(s7, sinpi3); 210 t = _mm_sub_epi32(s1, s3); 211 x2 = _mm_add_epi32(t, s6); 212 x3 = s4; 213 214 s0 = _mm_add_epi32(x0, x3); 215 s1 = x1; 216 s2 = _mm_sub_epi32(x2, x3); 217 t = _mm_sub_epi32(x2, x0); 218 s3 = _mm_add_epi32(t, x3); 219 220 u0 = _mm_add_epi32(s0, rnding); 221 u0 = _mm_srai_epi32(u0, bit); 222 223 u1 = _mm_add_epi32(s1, rnding); 224 u1 = _mm_srai_epi32(u1, bit); 225 226 u2 = _mm_add_epi32(s2, rnding); 227 u2 = _mm_srai_epi32(u2, bit); 228 229 u3 = _mm_add_epi32(s3, rnding); 230 u3 = _mm_srai_epi32(u3, bit); 231 232 out[0] = u0; 233 out[1] = u1; 234 out[2] = u2; 235 out[3] = u3; 236 } 237 static void idtx4x4_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { 238 (void)bit; 239 __m128i fact = _mm_set1_epi32(NewSqrt2); 240 __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); 241 __m128i a_low; 242 243 for (int i = 0; i < 4; i++) { 244 a_low = _mm_mullo_epi32(in[i * col_num], fact); 245 a_low = _mm_add_epi32(a_low, offset); 246 out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits); 247 } 248 } 249 void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff, 250 int input_stride, TX_TYPE tx_type, int bd) { 251 __m128i in[4]; 252 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4]; 253 const int txw_idx = get_txw_idx(TX_4X4); 254 const int txh_idx = get_txh_idx(TX_4X4); 255 256 switch (tx_type) { 257 case DCT_DCT: 258 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); 259 fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); 260 transpose_32bit_4x4(in, in); 261 fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 262 write_buffer_4x4(in, coeff); 263 break; 264 case ADST_DCT: 265 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); 266 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); 267 transpose_32bit_4x4(in, in); 268 fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 269 write_buffer_4x4(in, coeff); 270 break; 271 case DCT_ADST: 272 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); 273 fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); 274 transpose_32bit_4x4(in, in); 275 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 276 write_buffer_4x4(in, coeff); 277 break; 278 case ADST_ADST: 279 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); 280 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); 281 transpose_32bit_4x4(in, in); 282 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 283 write_buffer_4x4(in, coeff); 284 break; 285 case FLIPADST_DCT: 286 load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); 287 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); 288 transpose_32bit_4x4(in, in); 289 fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 290 write_buffer_4x4(in, coeff); 291 break; 292 case DCT_FLIPADST: 293 load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); 294 fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); 295 transpose_32bit_4x4(in, in); 296 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 297 write_buffer_4x4(in, coeff); 298 break; 299 case FLIPADST_FLIPADST: 300 load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]); 301 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); 302 transpose_32bit_4x4(in, in); 303 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 304 write_buffer_4x4(in, coeff); 305 break; 306 case ADST_FLIPADST: 307 load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); 308 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); 309 transpose_32bit_4x4(in, in); 310 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 311 write_buffer_4x4(in, coeff); 312 break; 313 case FLIPADST_ADST: 314 load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); 315 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); 316 transpose_32bit_4x4(in, in); 317 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 318 write_buffer_4x4(in, coeff); 319 break; 320 case IDTX: 321 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); 322 idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); 323 transpose_32bit_4x4(in, in); 324 idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 325 write_buffer_4x4(in, coeff); 326 break; 327 case V_DCT: 328 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); 329 fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); 330 transpose_32bit_4x4(in, in); 331 idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 332 write_buffer_4x4(in, coeff); 333 break; 334 case H_DCT: 335 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); 336 idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 337 transpose_32bit_4x4(in, in); 338 fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); 339 write_buffer_4x4(in, coeff); 340 break; 341 case V_ADST: 342 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); 343 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); 344 transpose_32bit_4x4(in, in); 345 idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 346 write_buffer_4x4(in, coeff); 347 break; 348 case H_ADST: 349 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); 350 idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 351 transpose_32bit_4x4(in, in); 352 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); 353 write_buffer_4x4(in, coeff); 354 break; 355 case V_FLIPADST: 356 load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); 357 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 358 transpose_32bit_4x4(in, in); 359 idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 360 write_buffer_4x4(in, coeff); 361 break; 362 case H_FLIPADST: 363 load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); 364 idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 365 transpose_32bit_4x4(in, in); 366 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); 367 write_buffer_4x4(in, coeff); 368 break; 369 default: assert(0); 370 } 371 (void)bd; 372 } 373 374 static inline void load_buffer_8x8(const int16_t *input, __m128i *in, 375 int stride, int flipud, int fliplr, 376 int shift) { 377 __m128i u; 378 if (!flipud) { 379 in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); 380 in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); 381 in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); 382 in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); 383 in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); 384 in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); 385 in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); 386 in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); 387 } else { 388 in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride)); 389 in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride)); 390 in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride)); 391 in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride)); 392 in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride)); 393 in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride)); 394 in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride)); 395 in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride)); 396 } 397 398 if (fliplr) { 399 in[0] = mm_reverse_epi16(in[0]); 400 in[1] = mm_reverse_epi16(in[1]); 401 in[2] = mm_reverse_epi16(in[2]); 402 in[3] = mm_reverse_epi16(in[3]); 403 in[4] = mm_reverse_epi16(in[4]); 404 in[5] = mm_reverse_epi16(in[5]); 405 in[6] = mm_reverse_epi16(in[6]); 406 in[7] = mm_reverse_epi16(in[7]); 407 } 408 409 u = _mm_unpackhi_epi64(in[4], in[4]); 410 in[8] = _mm_cvtepi16_epi32(in[4]); 411 in[9] = _mm_cvtepi16_epi32(u); 412 413 u = _mm_unpackhi_epi64(in[5], in[5]); 414 in[10] = _mm_cvtepi16_epi32(in[5]); 415 in[11] = _mm_cvtepi16_epi32(u); 416 417 u = _mm_unpackhi_epi64(in[6], in[6]); 418 in[12] = _mm_cvtepi16_epi32(in[6]); 419 in[13] = _mm_cvtepi16_epi32(u); 420 421 u = _mm_unpackhi_epi64(in[7], in[7]); 422 in[14] = _mm_cvtepi16_epi32(in[7]); 423 in[15] = _mm_cvtepi16_epi32(u); 424 425 u = _mm_unpackhi_epi64(in[3], in[3]); 426 in[6] = _mm_cvtepi16_epi32(in[3]); 427 in[7] = _mm_cvtepi16_epi32(u); 428 429 u = _mm_unpackhi_epi64(in[2], in[2]); 430 in[4] = _mm_cvtepi16_epi32(in[2]); 431 in[5] = _mm_cvtepi16_epi32(u); 432 433 u = _mm_unpackhi_epi64(in[1], in[1]); 434 in[2] = _mm_cvtepi16_epi32(in[1]); 435 in[3] = _mm_cvtepi16_epi32(u); 436 437 u = _mm_unpackhi_epi64(in[0], in[0]); 438 in[0] = _mm_cvtepi16_epi32(in[0]); 439 in[1] = _mm_cvtepi16_epi32(u); 440 441 in[0] = _mm_slli_epi32(in[0], shift); 442 in[1] = _mm_slli_epi32(in[1], shift); 443 in[2] = _mm_slli_epi32(in[2], shift); 444 in[3] = _mm_slli_epi32(in[3], shift); 445 in[4] = _mm_slli_epi32(in[4], shift); 446 in[5] = _mm_slli_epi32(in[5], shift); 447 in[6] = _mm_slli_epi32(in[6], shift); 448 in[7] = _mm_slli_epi32(in[7], shift); 449 450 in[8] = _mm_slli_epi32(in[8], shift); 451 in[9] = _mm_slli_epi32(in[9], shift); 452 in[10] = _mm_slli_epi32(in[10], shift); 453 in[11] = _mm_slli_epi32(in[11], shift); 454 in[12] = _mm_slli_epi32(in[12], shift); 455 in[13] = _mm_slli_epi32(in[13], shift); 456 in[14] = _mm_slli_epi32(in[14], shift); 457 in[15] = _mm_slli_epi32(in[15], shift); 458 } 459 460 static inline void col_txfm_8x8_rounding(__m128i *in, int shift) { 461 const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); 462 463 in[0] = _mm_add_epi32(in[0], rounding); 464 in[1] = _mm_add_epi32(in[1], rounding); 465 in[2] = _mm_add_epi32(in[2], rounding); 466 in[3] = _mm_add_epi32(in[3], rounding); 467 in[4] = _mm_add_epi32(in[4], rounding); 468 in[5] = _mm_add_epi32(in[5], rounding); 469 in[6] = _mm_add_epi32(in[6], rounding); 470 in[7] = _mm_add_epi32(in[7], rounding); 471 in[8] = _mm_add_epi32(in[8], rounding); 472 in[9] = _mm_add_epi32(in[9], rounding); 473 in[10] = _mm_add_epi32(in[10], rounding); 474 in[11] = _mm_add_epi32(in[11], rounding); 475 in[12] = _mm_add_epi32(in[12], rounding); 476 in[13] = _mm_add_epi32(in[13], rounding); 477 in[14] = _mm_add_epi32(in[14], rounding); 478 in[15] = _mm_add_epi32(in[15], rounding); 479 480 in[0] = _mm_srai_epi32(in[0], shift); 481 in[1] = _mm_srai_epi32(in[1], shift); 482 in[2] = _mm_srai_epi32(in[2], shift); 483 in[3] = _mm_srai_epi32(in[3], shift); 484 in[4] = _mm_srai_epi32(in[4], shift); 485 in[5] = _mm_srai_epi32(in[5], shift); 486 in[6] = _mm_srai_epi32(in[6], shift); 487 in[7] = _mm_srai_epi32(in[7], shift); 488 in[8] = _mm_srai_epi32(in[8], shift); 489 in[9] = _mm_srai_epi32(in[9], shift); 490 in[10] = _mm_srai_epi32(in[10], shift); 491 in[11] = _mm_srai_epi32(in[11], shift); 492 in[12] = _mm_srai_epi32(in[12], shift); 493 in[13] = _mm_srai_epi32(in[13], shift); 494 in[14] = _mm_srai_epi32(in[14], shift); 495 in[15] = _mm_srai_epi32(in[15], shift); 496 } 497 498 static inline void col_txfm_4x8_rounding(__m128i *in, int shift) { 499 const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); 500 501 in[0] = _mm_add_epi32(in[0], rounding); 502 in[1] = _mm_add_epi32(in[1], rounding); 503 in[2] = _mm_add_epi32(in[2], rounding); 504 in[3] = _mm_add_epi32(in[3], rounding); 505 in[4] = _mm_add_epi32(in[4], rounding); 506 in[5] = _mm_add_epi32(in[5], rounding); 507 in[6] = _mm_add_epi32(in[6], rounding); 508 in[7] = _mm_add_epi32(in[7], rounding); 509 510 in[0] = _mm_srai_epi32(in[0], shift); 511 in[1] = _mm_srai_epi32(in[1], shift); 512 in[2] = _mm_srai_epi32(in[2], shift); 513 in[3] = _mm_srai_epi32(in[3], shift); 514 in[4] = _mm_srai_epi32(in[4], shift); 515 in[5] = _mm_srai_epi32(in[5], shift); 516 in[6] = _mm_srai_epi32(in[6], shift); 517 in[7] = _mm_srai_epi32(in[7], shift); 518 } 519 520 static inline void write_buffer_8x8(const __m128i *res, int32_t *output) { 521 _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); 522 _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); 523 _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); 524 _mm_store_si128((__m128i *)(output + 3 * 4), res[3]); 525 526 _mm_store_si128((__m128i *)(output + 4 * 4), res[4]); 527 _mm_store_si128((__m128i *)(output + 5 * 4), res[5]); 528 _mm_store_si128((__m128i *)(output + 6 * 4), res[6]); 529 _mm_store_si128((__m128i *)(output + 7 * 4), res[7]); 530 531 _mm_store_si128((__m128i *)(output + 8 * 4), res[8]); 532 _mm_store_si128((__m128i *)(output + 9 * 4), res[9]); 533 _mm_store_si128((__m128i *)(output + 10 * 4), res[10]); 534 _mm_store_si128((__m128i *)(output + 11 * 4), res[11]); 535 536 _mm_store_si128((__m128i *)(output + 12 * 4), res[12]); 537 _mm_store_si128((__m128i *)(output + 13 * 4), res[13]); 538 _mm_store_si128((__m128i *)(output + 14 * 4), res[14]); 539 _mm_store_si128((__m128i *)(output + 15 * 4), res[15]); 540 } 541 542 static inline void write_buffer_16x8(const __m128i *res, int32_t *output, 543 const int stride) { 544 _mm_storeu_si128((__m128i *)(output), res[0]); 545 _mm_storeu_si128((__m128i *)(output + 4), res[1]); 546 _mm_storeu_si128((__m128i *)(output + stride), res[2]); 547 _mm_storeu_si128((__m128i *)(output + stride + 4), res[3]); 548 549 _mm_storeu_si128((__m128i *)(output + (stride * 2)), res[4]); 550 _mm_storeu_si128((__m128i *)(output + (stride * 2) + 4), res[5]); 551 _mm_storeu_si128((__m128i *)(output + (stride * 3)), res[6]); 552 _mm_storeu_si128((__m128i *)(output + (stride * 3) + 4), res[7]); 553 554 _mm_storeu_si128((__m128i *)(output + (stride * 4)), res[8]); 555 _mm_storeu_si128((__m128i *)(output + (stride * 4) + 4), res[9]); 556 _mm_storeu_si128((__m128i *)(output + (stride * 5)), res[10]); 557 _mm_storeu_si128((__m128i *)(output + (stride * 5) + 4), res[11]); 558 559 _mm_storeu_si128((__m128i *)(output + (stride * 6)), res[12]); 560 _mm_storeu_si128((__m128i *)(output + (stride * 6) + 4), res[13]); 561 _mm_storeu_si128((__m128i *)(output + (stride * 7)), res[14]); 562 _mm_storeu_si128((__m128i *)(output + (stride * 7) + 4), res[15]); 563 } 564 565 static void fdct4x8_sse4_1(__m128i *in, __m128i *out, int bit, 566 const int col_num) { 567 const int32_t *cospi = cospi_arr(bit); 568 const __m128i cospi32 = _mm_set1_epi32(cospi[32]); 569 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); 570 const __m128i cospi48 = _mm_set1_epi32(cospi[48]); 571 const __m128i cospi16 = _mm_set1_epi32(cospi[16]); 572 const __m128i cospi56 = _mm_set1_epi32(cospi[56]); 573 const __m128i cospi8 = _mm_set1_epi32(cospi[8]); 574 const __m128i cospi24 = _mm_set1_epi32(cospi[24]); 575 const __m128i cospi40 = _mm_set1_epi32(cospi[40]); 576 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); 577 __m128i u[8], v[8]; 578 579 int startidx = 0 * col_num; 580 int endidx = 7 * col_num; 581 // Even 8 points 0, 2, ..., 14 582 // stage 0 583 // stage 1 584 u[0] = _mm_add_epi32(in[startidx], in[endidx]); 585 v[7] = _mm_sub_epi32(in[startidx], in[endidx]); // v[7] 586 startidx += col_num; 587 endidx -= col_num; 588 u[1] = _mm_add_epi32(in[startidx], in[endidx]); 589 u[6] = _mm_sub_epi32(in[startidx], in[endidx]); 590 startidx += col_num; 591 endidx -= col_num; 592 u[2] = _mm_add_epi32(in[startidx], in[endidx]); 593 u[5] = _mm_sub_epi32(in[startidx], in[endidx]); 594 startidx += col_num; 595 endidx -= col_num; 596 u[3] = _mm_add_epi32(in[startidx], in[endidx]); 597 v[4] = _mm_sub_epi32(in[startidx], in[endidx]); // v[4] 598 599 // stage 2 600 v[0] = _mm_add_epi32(u[0], u[3]); 601 v[3] = _mm_sub_epi32(u[0], u[3]); 602 v[1] = _mm_add_epi32(u[1], u[2]); 603 v[2] = _mm_sub_epi32(u[1], u[2]); 604 605 v[5] = _mm_mullo_epi32(u[5], cospim32); 606 v[6] = _mm_mullo_epi32(u[6], cospi32); 607 v[5] = _mm_add_epi32(v[5], v[6]); 608 v[5] = _mm_add_epi32(v[5], rnding); 609 v[5] = _mm_srai_epi32(v[5], bit); 610 611 u[0] = _mm_mullo_epi32(u[5], cospi32); 612 v[6] = _mm_mullo_epi32(u[6], cospim32); 613 v[6] = _mm_sub_epi32(u[0], v[6]); 614 v[6] = _mm_add_epi32(v[6], rnding); 615 v[6] = _mm_srai_epi32(v[6], bit); 616 617 // stage 3 618 // type 0 619 v[0] = _mm_mullo_epi32(v[0], cospi32); 620 v[1] = _mm_mullo_epi32(v[1], cospi32); 621 u[0] = _mm_add_epi32(v[0], v[1]); 622 u[0] = _mm_add_epi32(u[0], rnding); 623 u[0] = _mm_srai_epi32(u[0], bit); 624 625 u[1] = _mm_sub_epi32(v[0], v[1]); 626 u[1] = _mm_add_epi32(u[1], rnding); 627 u[1] = _mm_srai_epi32(u[1], bit); 628 629 // type 1 630 v[0] = _mm_mullo_epi32(v[2], cospi48); 631 v[1] = _mm_mullo_epi32(v[3], cospi16); 632 u[2] = _mm_add_epi32(v[0], v[1]); 633 u[2] = _mm_add_epi32(u[2], rnding); 634 u[2] = _mm_srai_epi32(u[2], bit); 635 636 v[0] = _mm_mullo_epi32(v[2], cospi16); 637 v[1] = _mm_mullo_epi32(v[3], cospi48); 638 u[3] = _mm_sub_epi32(v[1], v[0]); 639 u[3] = _mm_add_epi32(u[3], rnding); 640 u[3] = _mm_srai_epi32(u[3], bit); 641 642 u[4] = _mm_add_epi32(v[4], v[5]); 643 u[5] = _mm_sub_epi32(v[4], v[5]); 644 u[6] = _mm_sub_epi32(v[7], v[6]); 645 u[7] = _mm_add_epi32(v[7], v[6]); 646 647 // stage 4 648 // stage 5 649 v[0] = _mm_mullo_epi32(u[4], cospi56); 650 v[1] = _mm_mullo_epi32(u[7], cospi8); 651 v[0] = _mm_add_epi32(v[0], v[1]); 652 v[0] = _mm_add_epi32(v[0], rnding); 653 out[1 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[4] 654 655 v[0] = _mm_mullo_epi32(u[4], cospi8); 656 v[1] = _mm_mullo_epi32(u[7], cospi56); 657 v[0] = _mm_sub_epi32(v[1], v[0]); 658 v[0] = _mm_add_epi32(v[0], rnding); 659 out[7 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[7] 660 661 v[0] = _mm_mullo_epi32(u[5], cospi24); 662 v[1] = _mm_mullo_epi32(u[6], cospi40); 663 v[0] = _mm_add_epi32(v[0], v[1]); 664 v[0] = _mm_add_epi32(v[0], rnding); 665 out[5 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[5] 666 667 v[0] = _mm_mullo_epi32(u[5], cospi40); 668 v[1] = _mm_mullo_epi32(u[6], cospi24); 669 v[0] = _mm_sub_epi32(v[1], v[0]); 670 v[0] = _mm_add_epi32(v[0], rnding); 671 out[3 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[6] 672 673 out[0 * col_num] = u[0]; // buf0[0] 674 out[4 * col_num] = u[1]; // buf0[1] 675 out[2 * col_num] = u[2]; // buf0[2] 676 out[6 * col_num] = u[3]; // buf0[3] 677 } 678 679 static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit, 680 const int col_num) { 681 fdct4x8_sse4_1(in, out, bit, col_num); 682 fdct4x8_sse4_1(in + 1, out + 1, bit, col_num); 683 } 684 685 static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, 686 const int col_num) { 687 const int32_t *cospi = cospi_arr(bit); 688 const __m128i cospi32 = _mm_set1_epi32(cospi[32]); 689 const __m128i cospi16 = _mm_set1_epi32(cospi[16]); 690 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); 691 const __m128i cospi48 = _mm_set1_epi32(cospi[48]); 692 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); 693 const __m128i cospi4 = _mm_set1_epi32(cospi[4]); 694 const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); 695 const __m128i cospi60 = _mm_set1_epi32(cospi[60]); 696 const __m128i cospi20 = _mm_set1_epi32(cospi[20]); 697 const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); 698 const __m128i cospi44 = _mm_set1_epi32(cospi[44]); 699 const __m128i cospi28 = _mm_set1_epi32(cospi[28]); 700 const __m128i cospi36 = _mm_set1_epi32(cospi[36]); 701 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); 702 const __m128i cospi52 = _mm_set1_epi32(cospi[52]); 703 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); 704 const __m128i cospi12 = _mm_set1_epi32(cospi[12]); 705 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); 706 const __m128i zero = _mm_setzero_si128(); 707 __m128i u0, u1, u2, u3, u4, u5, u6, u7; 708 __m128i v0, v1, v2, v3, v4, v5, v6, v7; 709 __m128i x, y; 710 int col; 711 712 // Note: 713 // Even column: 0, 2, ..., 14 714 // Odd column: 1, 3, ..., 15 715 // one even column plus one odd column constructs one row (8 coeffs) 716 // total we have 8 rows (8x8). 717 for (col = 0; col < col_num; ++col) { 718 // stage 0 719 // stage 1 720 u0 = in[col_num * 0 + col]; 721 u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]); 722 u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]); 723 u3 = in[col_num * 4 + col]; 724 u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]); 725 u5 = in[col_num * 6 + col]; 726 u6 = in[col_num * 2 + col]; 727 u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]); 728 729 // stage 2 730 v0 = u0; 731 v1 = u1; 732 733 x = _mm_mullo_epi32(u2, cospi32); 734 y = _mm_mullo_epi32(u3, cospi32); 735 v2 = _mm_add_epi32(x, y); 736 v2 = _mm_add_epi32(v2, rnding); 737 v2 = _mm_srai_epi32(v2, bit); 738 739 v3 = _mm_sub_epi32(x, y); 740 v3 = _mm_add_epi32(v3, rnding); 741 v3 = _mm_srai_epi32(v3, bit); 742 743 v4 = u4; 744 v5 = u5; 745 746 x = _mm_mullo_epi32(u6, cospi32); 747 y = _mm_mullo_epi32(u7, cospi32); 748 v6 = _mm_add_epi32(x, y); 749 v6 = _mm_add_epi32(v6, rnding); 750 v6 = _mm_srai_epi32(v6, bit); 751 752 v7 = _mm_sub_epi32(x, y); 753 v7 = _mm_add_epi32(v7, rnding); 754 v7 = _mm_srai_epi32(v7, bit); 755 756 // stage 3 757 u0 = _mm_add_epi32(v0, v2); 758 u1 = _mm_add_epi32(v1, v3); 759 u2 = _mm_sub_epi32(v0, v2); 760 u3 = _mm_sub_epi32(v1, v3); 761 u4 = _mm_add_epi32(v4, v6); 762 u5 = _mm_add_epi32(v5, v7); 763 u6 = _mm_sub_epi32(v4, v6); 764 u7 = _mm_sub_epi32(v5, v7); 765 766 // stage 4 767 v0 = u0; 768 v1 = u1; 769 v2 = u2; 770 v3 = u3; 771 772 x = _mm_mullo_epi32(u4, cospi16); 773 y = _mm_mullo_epi32(u5, cospi48); 774 v4 = _mm_add_epi32(x, y); 775 v4 = _mm_add_epi32(v4, rnding); 776 v4 = _mm_srai_epi32(v4, bit); 777 778 x = _mm_mullo_epi32(u4, cospi48); 779 y = _mm_mullo_epi32(u5, cospim16); 780 v5 = _mm_add_epi32(x, y); 781 v5 = _mm_add_epi32(v5, rnding); 782 v5 = _mm_srai_epi32(v5, bit); 783 784 x = _mm_mullo_epi32(u6, cospim48); 785 y = _mm_mullo_epi32(u7, cospi16); 786 v6 = _mm_add_epi32(x, y); 787 v6 = _mm_add_epi32(v6, rnding); 788 v6 = _mm_srai_epi32(v6, bit); 789 790 x = _mm_mullo_epi32(u6, cospi16); 791 y = _mm_mullo_epi32(u7, cospi48); 792 v7 = _mm_add_epi32(x, y); 793 v7 = _mm_add_epi32(v7, rnding); 794 v7 = _mm_srai_epi32(v7, bit); 795 796 // stage 5 797 u0 = _mm_add_epi32(v0, v4); 798 u1 = _mm_add_epi32(v1, v5); 799 u2 = _mm_add_epi32(v2, v6); 800 u3 = _mm_add_epi32(v3, v7); 801 u4 = _mm_sub_epi32(v0, v4); 802 u5 = _mm_sub_epi32(v1, v5); 803 u6 = _mm_sub_epi32(v2, v6); 804 u7 = _mm_sub_epi32(v3, v7); 805 806 // stage 6 807 x = _mm_mullo_epi32(u0, cospi4); 808 y = _mm_mullo_epi32(u1, cospi60); 809 v0 = _mm_add_epi32(x, y); 810 v0 = _mm_add_epi32(v0, rnding); 811 v0 = _mm_srai_epi32(v0, bit); 812 813 x = _mm_mullo_epi32(u0, cospi60); 814 y = _mm_mullo_epi32(u1, cospim4); 815 v1 = _mm_add_epi32(x, y); 816 v1 = _mm_add_epi32(v1, rnding); 817 v1 = _mm_srai_epi32(v1, bit); 818 819 x = _mm_mullo_epi32(u2, cospi20); 820 y = _mm_mullo_epi32(u3, cospi44); 821 v2 = _mm_add_epi32(x, y); 822 v2 = _mm_add_epi32(v2, rnding); 823 v2 = _mm_srai_epi32(v2, bit); 824 825 x = _mm_mullo_epi32(u2, cospi44); 826 y = _mm_mullo_epi32(u3, cospim20); 827 v3 = _mm_add_epi32(x, y); 828 v3 = _mm_add_epi32(v3, rnding); 829 v3 = _mm_srai_epi32(v3, bit); 830 831 x = _mm_mullo_epi32(u4, cospi36); 832 y = _mm_mullo_epi32(u5, cospi28); 833 v4 = _mm_add_epi32(x, y); 834 v4 = _mm_add_epi32(v4, rnding); 835 v4 = _mm_srai_epi32(v4, bit); 836 837 x = _mm_mullo_epi32(u4, cospi28); 838 y = _mm_mullo_epi32(u5, cospim36); 839 v5 = _mm_add_epi32(x, y); 840 v5 = _mm_add_epi32(v5, rnding); 841 v5 = _mm_srai_epi32(v5, bit); 842 843 x = _mm_mullo_epi32(u6, cospi52); 844 y = _mm_mullo_epi32(u7, cospi12); 845 v6 = _mm_add_epi32(x, y); 846 v6 = _mm_add_epi32(v6, rnding); 847 v6 = _mm_srai_epi32(v6, bit); 848 849 x = _mm_mullo_epi32(u6, cospi12); 850 y = _mm_mullo_epi32(u7, cospim52); 851 v7 = _mm_add_epi32(x, y); 852 v7 = _mm_add_epi32(v7, rnding); 853 v7 = _mm_srai_epi32(v7, bit); 854 855 // stage 7 856 out[col_num * 0 + col] = v1; 857 out[col_num * 1 + col] = v6; 858 out[col_num * 2 + col] = v3; 859 out[col_num * 3 + col] = v4; 860 out[col_num * 4 + col] = v5; 861 out[col_num * 5 + col] = v2; 862 out[col_num * 6 + col] = v7; 863 out[col_num * 7 + col] = v0; 864 } 865 } 866 static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { 867 (void)bit; 868 869 for (int i = 0; i < col_num; i += 1) { 870 out[0 + 8 * i] = _mm_add_epi32(in[0 + 8 * i], in[0 + 8 * i]); 871 out[1 + 8 * i] = _mm_add_epi32(in[1 + 8 * i], in[1 + 8 * i]); 872 out[2 + 8 * i] = _mm_add_epi32(in[2 + 8 * i], in[2 + 8 * i]); 873 out[3 + 8 * i] = _mm_add_epi32(in[3 + 8 * i], in[3 + 8 * i]); 874 out[4 + 8 * i] = _mm_add_epi32(in[4 + 8 * i], in[4 + 8 * i]); 875 out[5 + 8 * i] = _mm_add_epi32(in[5 + 8 * i], in[5 + 8 * i]); 876 out[6 + 8 * i] = _mm_add_epi32(in[6 + 8 * i], in[6 + 8 * i]); 877 out[7 + 8 * i] = _mm_add_epi32(in[7 + 8 * i], in[7 + 8 * i]); 878 } 879 } 880 #if !CONFIG_REALTIME_ONLY 881 static void idtx32x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { 882 (void)bit; 883 (void)col_num; 884 for (int j = 0; j < 2; j++) { 885 out[j + 8 * 0] = _mm_add_epi32(in[j + 8 * 0], in[j + 8 * 0]); 886 out[j + 8 * 1] = _mm_add_epi32(in[j + 8 * 1], in[j + 8 * 1]); 887 out[j + 8 * 2] = _mm_add_epi32(in[j + 8 * 2], in[j + 8 * 2]); 888 out[j + 8 * 3] = _mm_add_epi32(in[j + 8 * 3], in[j + 8 * 3]); 889 out[j + 8 * 4] = _mm_add_epi32(in[j + 8 * 4], in[j + 8 * 4]); 890 out[j + 8 * 5] = _mm_add_epi32(in[j + 8 * 5], in[j + 8 * 5]); 891 out[j + 8 * 6] = _mm_add_epi32(in[j + 8 * 6], in[j + 8 * 6]); 892 out[j + 8 * 7] = _mm_add_epi32(in[j + 8 * 7], in[j + 8 * 7]); 893 } 894 } 895 #endif 896 void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, 897 TX_TYPE tx_type, int bd) { 898 __m128i in[16], out[16]; 899 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8]; 900 const int txw_idx = get_txw_idx(TX_8X8); 901 const int txh_idx = get_txh_idx(TX_8X8); 902 903 switch (tx_type) { 904 case DCT_DCT: 905 load_buffer_8x8(input, in, stride, 0, 0, shift[0]); 906 fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 907 col_txfm_8x8_rounding(out, -shift[1]); 908 transpose_8x8(out, in); 909 fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); 910 write_buffer_8x8(out, coeff); 911 break; 912 case ADST_DCT: 913 load_buffer_8x8(input, in, stride, 0, 0, shift[0]); 914 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 915 col_txfm_8x8_rounding(out, -shift[1]); 916 transpose_8x8(out, in); 917 fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); 918 write_buffer_8x8(out, coeff); 919 break; 920 case DCT_ADST: 921 load_buffer_8x8(input, in, stride, 0, 0, shift[0]); 922 fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 923 col_txfm_8x8_rounding(out, -shift[1]); 924 transpose_8x8(out, in); 925 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); 926 write_buffer_8x8(out, coeff); 927 break; 928 case ADST_ADST: 929 load_buffer_8x8(input, in, stride, 0, 0, shift[0]); 930 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 931 col_txfm_8x8_rounding(out, -shift[1]); 932 transpose_8x8(out, in); 933 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); 934 write_buffer_8x8(out, coeff); 935 break; 936 case FLIPADST_DCT: 937 load_buffer_8x8(input, in, stride, 1, 0, shift[0]); 938 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 939 col_txfm_8x8_rounding(out, -shift[1]); 940 transpose_8x8(out, in); 941 fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); 942 write_buffer_8x8(out, coeff); 943 break; 944 case DCT_FLIPADST: 945 load_buffer_8x8(input, in, stride, 0, 1, shift[0]); 946 fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 947 col_txfm_8x8_rounding(out, -shift[1]); 948 transpose_8x8(out, in); 949 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); 950 write_buffer_8x8(out, coeff); 951 break; 952 case FLIPADST_FLIPADST: 953 load_buffer_8x8(input, in, stride, 1, 1, shift[0]); 954 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 955 col_txfm_8x8_rounding(out, -shift[1]); 956 transpose_8x8(out, in); 957 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); 958 write_buffer_8x8(out, coeff); 959 break; 960 case ADST_FLIPADST: 961 load_buffer_8x8(input, in, stride, 0, 1, shift[0]); 962 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 963 col_txfm_8x8_rounding(out, -shift[1]); 964 transpose_8x8(out, in); 965 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); 966 write_buffer_8x8(out, coeff); 967 break; 968 case FLIPADST_ADST: 969 load_buffer_8x8(input, in, stride, 1, 0, shift[0]); 970 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 971 col_txfm_8x8_rounding(out, -shift[1]); 972 transpose_8x8(out, in); 973 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); 974 write_buffer_8x8(out, coeff); 975 break; 976 case IDTX: 977 load_buffer_8x8(input, in, stride, 0, 0, shift[0]); 978 idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 979 col_txfm_8x8_rounding(out, -shift[1]); 980 transpose_8x8(out, in); 981 idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 982 write_buffer_8x8(out, coeff); 983 break; 984 case V_DCT: 985 load_buffer_8x8(input, in, stride, 0, 0, shift[0]); 986 fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 987 col_txfm_8x8_rounding(out, -shift[1]); 988 transpose_8x8(out, in); 989 idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 990 write_buffer_8x8(out, coeff); 991 break; 992 case H_DCT: 993 load_buffer_8x8(input, in, stride, 0, 0, shift[0]); 994 idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 995 col_txfm_8x8_rounding(out, -shift[1]); 996 transpose_8x8(out, in); 997 fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 998 write_buffer_8x8(out, coeff); 999 break; 1000 case V_ADST: 1001 load_buffer_8x8(input, in, stride, 0, 0, shift[0]); 1002 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 1003 col_txfm_8x8_rounding(out, -shift[1]); 1004 transpose_8x8(out, in); 1005 idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 1006 write_buffer_8x8(out, coeff); 1007 break; 1008 case H_ADST: 1009 load_buffer_8x8(input, in, stride, 0, 0, shift[0]); 1010 idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 1011 col_txfm_8x8_rounding(out, -shift[1]); 1012 transpose_8x8(out, in); 1013 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 1014 write_buffer_8x8(out, coeff); 1015 break; 1016 case V_FLIPADST: 1017 load_buffer_8x8(input, in, stride, 1, 0, shift[0]); 1018 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 1019 col_txfm_8x8_rounding(out, -shift[1]); 1020 transpose_8x8(out, in); 1021 idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 1022 write_buffer_8x8(out, coeff); 1023 break; 1024 case H_FLIPADST: 1025 load_buffer_8x8(input, in, stride, 0, 1, shift[0]); 1026 idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 1027 col_txfm_8x8_rounding(out, -shift[1]); 1028 transpose_8x8(out, in); 1029 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 1030 write_buffer_8x8(out, coeff); 1031 break; 1032 default: assert(0); 1033 } 1034 (void)bd; 1035 } 1036 1037 // Hybrid Transform 16x16 1038 1039 static inline void convert_8x8_to_16x16(const __m128i *in, __m128i *out) { 1040 int row_index = 0; 1041 int dst_index = 0; 1042 int src_index = 0; 1043 1044 // row 0, 1, .., 7 1045 do { 1046 out[dst_index] = in[src_index]; 1047 out[dst_index + 1] = in[src_index + 1]; 1048 out[dst_index + 2] = in[src_index + 16]; 1049 out[dst_index + 3] = in[src_index + 17]; 1050 dst_index += 4; 1051 src_index += 2; 1052 row_index += 1; 1053 } while (row_index < 8); 1054 1055 // row 8, 9, ..., 15 1056 src_index += 16; 1057 do { 1058 out[dst_index] = in[src_index]; 1059 out[dst_index + 1] = in[src_index + 1]; 1060 out[dst_index + 2] = in[src_index + 16]; 1061 out[dst_index + 3] = in[src_index + 17]; 1062 dst_index += 4; 1063 src_index += 2; 1064 row_index += 1; 1065 } while (row_index < 16); 1066 } 1067 1068 static inline void load_buffer_16x16(const int16_t *input, __m128i *out, 1069 int stride, int flipud, int fliplr, 1070 int shift) { 1071 __m128i in[64]; 1072 // Load 4 8x8 blocks 1073 const int16_t *topL = input; 1074 const int16_t *topR = input + 8; 1075 const int16_t *botL = input + 8 * stride; 1076 const int16_t *botR = input + 8 * stride + 8; 1077 1078 const int16_t *tmp; 1079 1080 if (flipud) { 1081 // Swap left columns 1082 tmp = topL; 1083 topL = botL; 1084 botL = tmp; 1085 // Swap right columns 1086 tmp = topR; 1087 topR = botR; 1088 botR = tmp; 1089 } 1090 1091 if (fliplr) { 1092 // Swap top rows 1093 tmp = topL; 1094 topL = topR; 1095 topR = tmp; 1096 // Swap bottom rows 1097 tmp = botL; 1098 botL = botR; 1099 botR = tmp; 1100 } 1101 1102 // load first 8 columns 1103 load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift); 1104 load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift); 1105 1106 // load second 8 columns 1107 load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift); 1108 load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift); 1109 1110 convert_8x8_to_16x16(in, out); 1111 } 1112 1113 static inline void load_buffer_8x16(const int16_t *input, __m128i *out, 1114 int stride, int flipud, int fliplr, 1115 int shift) { 1116 const int16_t *topL = input; 1117 const int16_t *botL = input + 8 * stride; 1118 1119 const int16_t *tmp; 1120 1121 if (flipud) { 1122 tmp = topL; 1123 topL = botL; 1124 botL = tmp; 1125 } 1126 1127 load_buffer_8x8(topL, out, stride, flipud, fliplr, shift); 1128 load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift); 1129 } 1130 1131 static inline void load_buffer_8x4(const int16_t *input, __m128i *out, 1132 int stride, int flipud, int fliplr, 1133 int shift) { 1134 const int16_t *topL = input; 1135 const int16_t *topR = input + 4; 1136 1137 const int16_t *tmp; 1138 1139 if (fliplr) { 1140 tmp = topL; 1141 topL = topR; 1142 topR = tmp; 1143 } 1144 1145 load_buffer_4x4(topL, out, stride, flipud, fliplr, shift); 1146 load_buffer_4x4(topR, out + 4, stride, flipud, fliplr, shift); 1147 } 1148 1149 static inline void load_buffer_16x4(const int16_t *input, __m128i *out, 1150 int stride, int flipud, int fliplr, 1151 int shift) { 1152 const int16_t *topL = input; 1153 const int16_t *topR = input + 8; 1154 1155 const int16_t *tmp; 1156 1157 if (fliplr) { 1158 tmp = topL; 1159 topL = topR; 1160 topR = tmp; 1161 } 1162 1163 load_buffer_8x4(topL, out, stride, flipud, fliplr, shift); 1164 load_buffer_8x4(topR, out + 8, stride, flipud, fliplr, shift); 1165 } 1166 1167 static inline void load_buffer_4x8(const int16_t *input, __m128i *out, 1168 int stride, int flipud, int fliplr, 1169 int shift) { 1170 const int16_t *topL = input; 1171 const int16_t *botL = input + 4 * stride; 1172 1173 const int16_t *tmp; 1174 1175 if (flipud) { 1176 tmp = topL; 1177 topL = botL; 1178 botL = tmp; 1179 } 1180 1181 load_buffer_4x4(topL, out, stride, flipud, fliplr, shift); 1182 load_buffer_4x4(botL, out + 4, stride, flipud, fliplr, shift); 1183 } 1184 1185 #if !CONFIG_REALTIME_ONLY 1186 static inline void load_buffer_4x16(const int16_t *input, __m128i *out, 1187 const int stride, const int flipud, 1188 const int fliplr, const int shift) { 1189 const int16_t *topL = input; 1190 const int16_t *botL = input + 8 * stride; 1191 1192 const int16_t *tmp; 1193 1194 if (flipud) { 1195 tmp = topL; 1196 topL = botL; 1197 botL = tmp; 1198 } 1199 load_buffer_4x8(topL, out, stride, flipud, fliplr, shift); 1200 load_buffer_4x8(botL, out + 8, stride, flipud, fliplr, shift); 1201 } 1202 #endif 1203 1204 static inline void load_buffer_32x8n(const int16_t *input, __m128i *out, 1205 int stride, int flipud, int fliplr, 1206 int shift, const int height) { 1207 const int16_t *in = input; 1208 __m128i *output = out; 1209 for (int col = 0; col < height; col++) { 1210 in = input + col * stride; 1211 output = out + col * 8; 1212 load_buffer_4x4(in, output, 4, flipud, fliplr, shift); 1213 load_buffer_4x4((in + 16), (output + 4), 4, flipud, fliplr, shift); 1214 } 1215 } 1216 1217 static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit, 1218 const int col_num) { 1219 const int32_t *cospi = cospi_arr(bit); 1220 const __m128i cospi32 = _mm_set1_epi32(cospi[32]); 1221 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); 1222 const __m128i cospi48 = _mm_set1_epi32(cospi[48]); 1223 const __m128i cospi16 = _mm_set1_epi32(cospi[16]); 1224 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); 1225 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); 1226 const __m128i cospi56 = _mm_set1_epi32(cospi[56]); 1227 const __m128i cospi8 = _mm_set1_epi32(cospi[8]); 1228 const __m128i cospi24 = _mm_set1_epi32(cospi[24]); 1229 const __m128i cospi40 = _mm_set1_epi32(cospi[40]); 1230 const __m128i cospi60 = _mm_set1_epi32(cospi[60]); 1231 const __m128i cospi4 = _mm_set1_epi32(cospi[4]); 1232 const __m128i cospi28 = _mm_set1_epi32(cospi[28]); 1233 const __m128i cospi36 = _mm_set1_epi32(cospi[36]); 1234 const __m128i cospi44 = _mm_set1_epi32(cospi[44]); 1235 const __m128i cospi20 = _mm_set1_epi32(cospi[20]); 1236 const __m128i cospi12 = _mm_set1_epi32(cospi[12]); 1237 const __m128i cospi52 = _mm_set1_epi32(cospi[52]); 1238 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); 1239 __m128i u[16], v[16], x; 1240 int col; 1241 1242 // Calculate the column 0, 1, 2, 3 1243 for (col = 0; col < col_num; ++col) { 1244 // stage 0 1245 // stage 1 1246 u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]); 1247 u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]); 1248 u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]); 1249 u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]); 1250 u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]); 1251 u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]); 1252 u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]); 1253 u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]); 1254 u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]); 1255 u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]); 1256 u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]); 1257 u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]); 1258 u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]); 1259 u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]); 1260 u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]); 1261 u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]); 1262 1263 // stage 2 1264 v[0] = _mm_add_epi32(u[0], u[7]); 1265 v[7] = _mm_sub_epi32(u[0], u[7]); 1266 v[1] = _mm_add_epi32(u[1], u[6]); 1267 v[6] = _mm_sub_epi32(u[1], u[6]); 1268 v[2] = _mm_add_epi32(u[2], u[5]); 1269 v[5] = _mm_sub_epi32(u[2], u[5]); 1270 v[3] = _mm_add_epi32(u[3], u[4]); 1271 v[4] = _mm_sub_epi32(u[3], u[4]); 1272 v[8] = u[8]; 1273 v[9] = u[9]; 1274 1275 v[10] = _mm_mullo_epi32(u[10], cospim32); 1276 x = _mm_mullo_epi32(u[13], cospi32); 1277 v[10] = _mm_add_epi32(v[10], x); 1278 v[10] = _mm_add_epi32(v[10], rnding); 1279 v[10] = _mm_srai_epi32(v[10], bit); 1280 1281 v[13] = _mm_mullo_epi32(u[10], cospi32); 1282 x = _mm_mullo_epi32(u[13], cospim32); 1283 v[13] = _mm_sub_epi32(v[13], x); 1284 v[13] = _mm_add_epi32(v[13], rnding); 1285 v[13] = _mm_srai_epi32(v[13], bit); 1286 1287 v[11] = _mm_mullo_epi32(u[11], cospim32); 1288 x = _mm_mullo_epi32(u[12], cospi32); 1289 v[11] = _mm_add_epi32(v[11], x); 1290 v[11] = _mm_add_epi32(v[11], rnding); 1291 v[11] = _mm_srai_epi32(v[11], bit); 1292 1293 v[12] = _mm_mullo_epi32(u[11], cospi32); 1294 x = _mm_mullo_epi32(u[12], cospim32); 1295 v[12] = _mm_sub_epi32(v[12], x); 1296 v[12] = _mm_add_epi32(v[12], rnding); 1297 v[12] = _mm_srai_epi32(v[12], bit); 1298 v[14] = u[14]; 1299 v[15] = u[15]; 1300 1301 // stage 3 1302 u[0] = _mm_add_epi32(v[0], v[3]); 1303 u[3] = _mm_sub_epi32(v[0], v[3]); 1304 u[1] = _mm_add_epi32(v[1], v[2]); 1305 u[2] = _mm_sub_epi32(v[1], v[2]); 1306 u[4] = v[4]; 1307 1308 u[5] = _mm_mullo_epi32(v[5], cospim32); 1309 x = _mm_mullo_epi32(v[6], cospi32); 1310 u[5] = _mm_add_epi32(u[5], x); 1311 u[5] = _mm_add_epi32(u[5], rnding); 1312 u[5] = _mm_srai_epi32(u[5], bit); 1313 1314 u[6] = _mm_mullo_epi32(v[5], cospi32); 1315 x = _mm_mullo_epi32(v[6], cospim32); 1316 u[6] = _mm_sub_epi32(u[6], x); 1317 u[6] = _mm_add_epi32(u[6], rnding); 1318 u[6] = _mm_srai_epi32(u[6], bit); 1319 1320 u[7] = v[7]; 1321 u[8] = _mm_add_epi32(v[8], v[11]); 1322 u[11] = _mm_sub_epi32(v[8], v[11]); 1323 u[9] = _mm_add_epi32(v[9], v[10]); 1324 u[10] = _mm_sub_epi32(v[9], v[10]); 1325 u[12] = _mm_sub_epi32(v[15], v[12]); 1326 u[15] = _mm_add_epi32(v[15], v[12]); 1327 u[13] = _mm_sub_epi32(v[14], v[13]); 1328 u[14] = _mm_add_epi32(v[14], v[13]); 1329 1330 // stage 4 1331 u[0] = _mm_mullo_epi32(u[0], cospi32); 1332 u[1] = _mm_mullo_epi32(u[1], cospi32); 1333 v[0] = _mm_add_epi32(u[0], u[1]); 1334 v[0] = _mm_add_epi32(v[0], rnding); 1335 v[0] = _mm_srai_epi32(v[0], bit); 1336 1337 v[1] = _mm_sub_epi32(u[0], u[1]); 1338 v[1] = _mm_add_epi32(v[1], rnding); 1339 v[1] = _mm_srai_epi32(v[1], bit); 1340 1341 v[2] = _mm_mullo_epi32(u[2], cospi48); 1342 x = _mm_mullo_epi32(u[3], cospi16); 1343 v[2] = _mm_add_epi32(v[2], x); 1344 v[2] = _mm_add_epi32(v[2], rnding); 1345 v[2] = _mm_srai_epi32(v[2], bit); 1346 1347 v[3] = _mm_mullo_epi32(u[2], cospi16); 1348 x = _mm_mullo_epi32(u[3], cospi48); 1349 v[3] = _mm_sub_epi32(x, v[3]); 1350 v[3] = _mm_add_epi32(v[3], rnding); 1351 v[3] = _mm_srai_epi32(v[3], bit); 1352 1353 v[4] = _mm_add_epi32(u[4], u[5]); 1354 v[5] = _mm_sub_epi32(u[4], u[5]); 1355 v[6] = _mm_sub_epi32(u[7], u[6]); 1356 v[7] = _mm_add_epi32(u[7], u[6]); 1357 v[8] = u[8]; 1358 1359 v[9] = _mm_mullo_epi32(u[9], cospim16); 1360 x = _mm_mullo_epi32(u[14], cospi48); 1361 v[9] = _mm_add_epi32(v[9], x); 1362 v[9] = _mm_add_epi32(v[9], rnding); 1363 v[9] = _mm_srai_epi32(v[9], bit); 1364 1365 v[14] = _mm_mullo_epi32(u[9], cospi48); 1366 x = _mm_mullo_epi32(u[14], cospim16); 1367 v[14] = _mm_sub_epi32(v[14], x); 1368 v[14] = _mm_add_epi32(v[14], rnding); 1369 v[14] = _mm_srai_epi32(v[14], bit); 1370 1371 v[10] = _mm_mullo_epi32(u[10], cospim48); 1372 x = _mm_mullo_epi32(u[13], cospim16); 1373 v[10] = _mm_add_epi32(v[10], x); 1374 v[10] = _mm_add_epi32(v[10], rnding); 1375 v[10] = _mm_srai_epi32(v[10], bit); 1376 1377 v[13] = _mm_mullo_epi32(u[10], cospim16); 1378 x = _mm_mullo_epi32(u[13], cospim48); 1379 v[13] = _mm_sub_epi32(v[13], x); 1380 v[13] = _mm_add_epi32(v[13], rnding); 1381 v[13] = _mm_srai_epi32(v[13], bit); 1382 1383 v[11] = u[11]; 1384 v[12] = u[12]; 1385 v[15] = u[15]; 1386 1387 // stage 5 1388 u[0] = v[0]; 1389 u[1] = v[1]; 1390 u[2] = v[2]; 1391 u[3] = v[3]; 1392 1393 u[4] = _mm_mullo_epi32(v[4], cospi56); 1394 x = _mm_mullo_epi32(v[7], cospi8); 1395 u[4] = _mm_add_epi32(u[4], x); 1396 u[4] = _mm_add_epi32(u[4], rnding); 1397 u[4] = _mm_srai_epi32(u[4], bit); 1398 1399 u[7] = _mm_mullo_epi32(v[4], cospi8); 1400 x = _mm_mullo_epi32(v[7], cospi56); 1401 u[7] = _mm_sub_epi32(x, u[7]); 1402 u[7] = _mm_add_epi32(u[7], rnding); 1403 u[7] = _mm_srai_epi32(u[7], bit); 1404 1405 u[5] = _mm_mullo_epi32(v[5], cospi24); 1406 x = _mm_mullo_epi32(v[6], cospi40); 1407 u[5] = _mm_add_epi32(u[5], x); 1408 u[5] = _mm_add_epi32(u[5], rnding); 1409 u[5] = _mm_srai_epi32(u[5], bit); 1410 1411 u[6] = _mm_mullo_epi32(v[5], cospi40); 1412 x = _mm_mullo_epi32(v[6], cospi24); 1413 u[6] = _mm_sub_epi32(x, u[6]); 1414 u[6] = _mm_add_epi32(u[6], rnding); 1415 u[6] = _mm_srai_epi32(u[6], bit); 1416 1417 u[8] = _mm_add_epi32(v[8], v[9]); 1418 u[9] = _mm_sub_epi32(v[8], v[9]); 1419 u[10] = _mm_sub_epi32(v[11], v[10]); 1420 u[11] = _mm_add_epi32(v[11], v[10]); 1421 u[12] = _mm_add_epi32(v[12], v[13]); 1422 u[13] = _mm_sub_epi32(v[12], v[13]); 1423 u[14] = _mm_sub_epi32(v[15], v[14]); 1424 u[15] = _mm_add_epi32(v[15], v[14]); 1425 1426 // stage 6 1427 v[0] = u[0]; 1428 v[1] = u[1]; 1429 v[2] = u[2]; 1430 v[3] = u[3]; 1431 v[4] = u[4]; 1432 v[5] = u[5]; 1433 v[6] = u[6]; 1434 v[7] = u[7]; 1435 1436 v[8] = _mm_mullo_epi32(u[8], cospi60); 1437 x = _mm_mullo_epi32(u[15], cospi4); 1438 v[8] = _mm_add_epi32(v[8], x); 1439 v[8] = _mm_add_epi32(v[8], rnding); 1440 v[8] = _mm_srai_epi32(v[8], bit); 1441 1442 v[15] = _mm_mullo_epi32(u[8], cospi4); 1443 x = _mm_mullo_epi32(u[15], cospi60); 1444 v[15] = _mm_sub_epi32(x, v[15]); 1445 v[15] = _mm_add_epi32(v[15], rnding); 1446 v[15] = _mm_srai_epi32(v[15], bit); 1447 1448 v[9] = _mm_mullo_epi32(u[9], cospi28); 1449 x = _mm_mullo_epi32(u[14], cospi36); 1450 v[9] = _mm_add_epi32(v[9], x); 1451 v[9] = _mm_add_epi32(v[9], rnding); 1452 v[9] = _mm_srai_epi32(v[9], bit); 1453 1454 v[14] = _mm_mullo_epi32(u[9], cospi36); 1455 x = _mm_mullo_epi32(u[14], cospi28); 1456 v[14] = _mm_sub_epi32(x, v[14]); 1457 v[14] = _mm_add_epi32(v[14], rnding); 1458 v[14] = _mm_srai_epi32(v[14], bit); 1459 1460 v[10] = _mm_mullo_epi32(u[10], cospi44); 1461 x = _mm_mullo_epi32(u[13], cospi20); 1462 v[10] = _mm_add_epi32(v[10], x); 1463 v[10] = _mm_add_epi32(v[10], rnding); 1464 v[10] = _mm_srai_epi32(v[10], bit); 1465 1466 v[13] = _mm_mullo_epi32(u[10], cospi20); 1467 x = _mm_mullo_epi32(u[13], cospi44); 1468 v[13] = _mm_sub_epi32(x, v[13]); 1469 v[13] = _mm_add_epi32(v[13], rnding); 1470 v[13] = _mm_srai_epi32(v[13], bit); 1471 1472 v[11] = _mm_mullo_epi32(u[11], cospi12); 1473 x = _mm_mullo_epi32(u[12], cospi52); 1474 v[11] = _mm_add_epi32(v[11], x); 1475 v[11] = _mm_add_epi32(v[11], rnding); 1476 v[11] = _mm_srai_epi32(v[11], bit); 1477 1478 v[12] = _mm_mullo_epi32(u[11], cospi52); 1479 x = _mm_mullo_epi32(u[12], cospi12); 1480 v[12] = _mm_sub_epi32(x, v[12]); 1481 v[12] = _mm_add_epi32(v[12], rnding); 1482 v[12] = _mm_srai_epi32(v[12], bit); 1483 1484 out[0 * col_num + col] = v[0]; 1485 out[1 * col_num + col] = v[8]; 1486 out[2 * col_num + col] = v[4]; 1487 out[3 * col_num + col] = v[12]; 1488 out[4 * col_num + col] = v[2]; 1489 out[5 * col_num + col] = v[10]; 1490 out[6 * col_num + col] = v[6]; 1491 out[7 * col_num + col] = v[14]; 1492 out[8 * col_num + col] = v[1]; 1493 out[9 * col_num + col] = v[9]; 1494 out[10 * col_num + col] = v[5]; 1495 out[11 * col_num + col] = v[13]; 1496 out[12 * col_num + col] = v[3]; 1497 out[13 * col_num + col] = v[11]; 1498 out[14 * col_num + col] = v[7]; 1499 out[15 * col_num + col] = v[15]; 1500 } 1501 } 1502 1503 static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, 1504 const int num_cols) { 1505 const int32_t *cospi = cospi_arr(bit); 1506 const __m128i cospi32 = _mm_set1_epi32(cospi[32]); 1507 const __m128i cospi48 = _mm_set1_epi32(cospi[48]); 1508 const __m128i cospi16 = _mm_set1_epi32(cospi[16]); 1509 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); 1510 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); 1511 const __m128i cospi8 = _mm_set1_epi32(cospi[8]); 1512 const __m128i cospi56 = _mm_set1_epi32(cospi[56]); 1513 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); 1514 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); 1515 const __m128i cospi24 = _mm_set1_epi32(cospi[24]); 1516 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); 1517 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); 1518 const __m128i cospi40 = _mm_set1_epi32(cospi[40]); 1519 const __m128i cospi2 = _mm_set1_epi32(cospi[2]); 1520 const __m128i cospi62 = _mm_set1_epi32(cospi[62]); 1521 const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); 1522 const __m128i cospi10 = _mm_set1_epi32(cospi[10]); 1523 const __m128i cospi54 = _mm_set1_epi32(cospi[54]); 1524 const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); 1525 const __m128i cospi18 = _mm_set1_epi32(cospi[18]); 1526 const __m128i cospi46 = _mm_set1_epi32(cospi[46]); 1527 const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); 1528 const __m128i cospi26 = _mm_set1_epi32(cospi[26]); 1529 const __m128i cospi38 = _mm_set1_epi32(cospi[38]); 1530 const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); 1531 const __m128i cospi34 = _mm_set1_epi32(cospi[34]); 1532 const __m128i cospi30 = _mm_set1_epi32(cospi[30]); 1533 const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); 1534 const __m128i cospi42 = _mm_set1_epi32(cospi[42]); 1535 const __m128i cospi22 = _mm_set1_epi32(cospi[22]); 1536 const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); 1537 const __m128i cospi50 = _mm_set1_epi32(cospi[50]); 1538 const __m128i cospi14 = _mm_set1_epi32(cospi[14]); 1539 const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); 1540 const __m128i cospi58 = _mm_set1_epi32(cospi[58]); 1541 const __m128i cospi6 = _mm_set1_epi32(cospi[6]); 1542 const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); 1543 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); 1544 const __m128i zero = _mm_setzero_si128(); 1545 1546 __m128i u[16], v[16], x, y; 1547 int col; 1548 1549 for (col = 0; col < num_cols; ++col) { 1550 // stage 0 1551 // stage 1 1552 u[0] = in[0 * num_cols + col]; 1553 u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]); 1554 u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]); 1555 u[3] = in[8 * num_cols + col]; 1556 u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]); 1557 u[5] = in[12 * num_cols + col]; 1558 u[6] = in[4 * num_cols + col]; 1559 u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]); 1560 u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]); 1561 u[9] = in[14 * num_cols + col]; 1562 u[10] = in[6 * num_cols + col]; 1563 u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]); 1564 u[12] = in[2 * num_cols + col]; 1565 u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]); 1566 u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]); 1567 u[15] = in[10 * num_cols + col]; 1568 1569 // stage 2 1570 v[0] = u[0]; 1571 v[1] = u[1]; 1572 1573 x = _mm_mullo_epi32(u[2], cospi32); 1574 y = _mm_mullo_epi32(u[3], cospi32); 1575 v[2] = _mm_add_epi32(x, y); 1576 v[2] = _mm_add_epi32(v[2], rnding); 1577 v[2] = _mm_srai_epi32(v[2], bit); 1578 1579 v[3] = _mm_sub_epi32(x, y); 1580 v[3] = _mm_add_epi32(v[3], rnding); 1581 v[3] = _mm_srai_epi32(v[3], bit); 1582 1583 v[4] = u[4]; 1584 v[5] = u[5]; 1585 1586 x = _mm_mullo_epi32(u[6], cospi32); 1587 y = _mm_mullo_epi32(u[7], cospi32); 1588 v[6] = _mm_add_epi32(x, y); 1589 v[6] = _mm_add_epi32(v[6], rnding); 1590 v[6] = _mm_srai_epi32(v[6], bit); 1591 1592 v[7] = _mm_sub_epi32(x, y); 1593 v[7] = _mm_add_epi32(v[7], rnding); 1594 v[7] = _mm_srai_epi32(v[7], bit); 1595 1596 v[8] = u[8]; 1597 v[9] = u[9]; 1598 1599 x = _mm_mullo_epi32(u[10], cospi32); 1600 y = _mm_mullo_epi32(u[11], cospi32); 1601 v[10] = _mm_add_epi32(x, y); 1602 v[10] = _mm_add_epi32(v[10], rnding); 1603 v[10] = _mm_srai_epi32(v[10], bit); 1604 1605 v[11] = _mm_sub_epi32(x, y); 1606 v[11] = _mm_add_epi32(v[11], rnding); 1607 v[11] = _mm_srai_epi32(v[11], bit); 1608 1609 v[12] = u[12]; 1610 v[13] = u[13]; 1611 1612 x = _mm_mullo_epi32(u[14], cospi32); 1613 y = _mm_mullo_epi32(u[15], cospi32); 1614 v[14] = _mm_add_epi32(x, y); 1615 v[14] = _mm_add_epi32(v[14], rnding); 1616 v[14] = _mm_srai_epi32(v[14], bit); 1617 1618 v[15] = _mm_sub_epi32(x, y); 1619 v[15] = _mm_add_epi32(v[15], rnding); 1620 v[15] = _mm_srai_epi32(v[15], bit); 1621 1622 // stage 3 1623 u[0] = _mm_add_epi32(v[0], v[2]); 1624 u[1] = _mm_add_epi32(v[1], v[3]); 1625 u[2] = _mm_sub_epi32(v[0], v[2]); 1626 u[3] = _mm_sub_epi32(v[1], v[3]); 1627 u[4] = _mm_add_epi32(v[4], v[6]); 1628 u[5] = _mm_add_epi32(v[5], v[7]); 1629 u[6] = _mm_sub_epi32(v[4], v[6]); 1630 u[7] = _mm_sub_epi32(v[5], v[7]); 1631 u[8] = _mm_add_epi32(v[8], v[10]); 1632 u[9] = _mm_add_epi32(v[9], v[11]); 1633 u[10] = _mm_sub_epi32(v[8], v[10]); 1634 u[11] = _mm_sub_epi32(v[9], v[11]); 1635 u[12] = _mm_add_epi32(v[12], v[14]); 1636 u[13] = _mm_add_epi32(v[13], v[15]); 1637 u[14] = _mm_sub_epi32(v[12], v[14]); 1638 u[15] = _mm_sub_epi32(v[13], v[15]); 1639 1640 // stage 4 1641 v[0] = u[0]; 1642 v[1] = u[1]; 1643 v[2] = u[2]; 1644 v[3] = u[3]; 1645 v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); 1646 v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); 1647 v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); 1648 v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); 1649 v[8] = u[8]; 1650 v[9] = u[9]; 1651 v[10] = u[10]; 1652 v[11] = u[11]; 1653 v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); 1654 v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); 1655 v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); 1656 v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); 1657 1658 // stage 5 1659 u[0] = _mm_add_epi32(v[0], v[4]); 1660 u[1] = _mm_add_epi32(v[1], v[5]); 1661 u[2] = _mm_add_epi32(v[2], v[6]); 1662 u[3] = _mm_add_epi32(v[3], v[7]); 1663 u[4] = _mm_sub_epi32(v[0], v[4]); 1664 u[5] = _mm_sub_epi32(v[1], v[5]); 1665 u[6] = _mm_sub_epi32(v[2], v[6]); 1666 u[7] = _mm_sub_epi32(v[3], v[7]); 1667 u[8] = _mm_add_epi32(v[8], v[12]); 1668 u[9] = _mm_add_epi32(v[9], v[13]); 1669 u[10] = _mm_add_epi32(v[10], v[14]); 1670 u[11] = _mm_add_epi32(v[11], v[15]); 1671 u[12] = _mm_sub_epi32(v[8], v[12]); 1672 u[13] = _mm_sub_epi32(v[9], v[13]); 1673 u[14] = _mm_sub_epi32(v[10], v[14]); 1674 u[15] = _mm_sub_epi32(v[11], v[15]); 1675 1676 // stage 6 1677 v[0] = u[0]; 1678 v[1] = u[1]; 1679 v[2] = u[2]; 1680 v[3] = u[3]; 1681 v[4] = u[4]; 1682 v[5] = u[5]; 1683 v[6] = u[6]; 1684 v[7] = u[7]; 1685 v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); 1686 v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); 1687 v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); 1688 v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); 1689 v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); 1690 v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); 1691 v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); 1692 v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); 1693 1694 // stage 7 1695 u[0] = _mm_add_epi32(v[0], v[8]); 1696 u[1] = _mm_add_epi32(v[1], v[9]); 1697 u[2] = _mm_add_epi32(v[2], v[10]); 1698 u[3] = _mm_add_epi32(v[3], v[11]); 1699 u[4] = _mm_add_epi32(v[4], v[12]); 1700 u[5] = _mm_add_epi32(v[5], v[13]); 1701 u[6] = _mm_add_epi32(v[6], v[14]); 1702 u[7] = _mm_add_epi32(v[7], v[15]); 1703 u[8] = _mm_sub_epi32(v[0], v[8]); 1704 u[9] = _mm_sub_epi32(v[1], v[9]); 1705 u[10] = _mm_sub_epi32(v[2], v[10]); 1706 u[11] = _mm_sub_epi32(v[3], v[11]); 1707 u[12] = _mm_sub_epi32(v[4], v[12]); 1708 u[13] = _mm_sub_epi32(v[5], v[13]); 1709 u[14] = _mm_sub_epi32(v[6], v[14]); 1710 u[15] = _mm_sub_epi32(v[7], v[15]); 1711 1712 // stage 8 1713 v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); 1714 v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); 1715 v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); 1716 v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); 1717 v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); 1718 v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); 1719 v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); 1720 v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); 1721 v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); 1722 v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); 1723 v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); 1724 v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); 1725 v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); 1726 v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); 1727 v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); 1728 v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); 1729 1730 // stage 9 1731 out[0 * num_cols + col] = v[1]; 1732 out[1 * num_cols + col] = v[14]; 1733 out[2 * num_cols + col] = v[3]; 1734 out[3 * num_cols + col] = v[12]; 1735 out[4 * num_cols + col] = v[5]; 1736 out[5 * num_cols + col] = v[10]; 1737 out[6 * num_cols + col] = v[7]; 1738 out[7 * num_cols + col] = v[8]; 1739 out[8 * num_cols + col] = v[9]; 1740 out[9 * num_cols + col] = v[6]; 1741 out[10 * num_cols + col] = v[11]; 1742 out[11 * num_cols + col] = v[4]; 1743 out[12 * num_cols + col] = v[13]; 1744 out[13 * num_cols + col] = v[2]; 1745 out[14 * num_cols + col] = v[15]; 1746 out[15 * num_cols + col] = v[0]; 1747 } 1748 } 1749 1750 static void col_txfm_16x16_rounding(__m128i *in, int shift) { 1751 // Note: 1752 // We split 16x16 rounding into 4 sections of 8x8 rounding, 1753 // instead of 4 columns 1754 col_txfm_8x8_rounding(&in[0], shift); 1755 col_txfm_8x8_rounding(&in[16], shift); 1756 col_txfm_8x8_rounding(&in[32], shift); 1757 col_txfm_8x8_rounding(&in[48], shift); 1758 } 1759 1760 static void col_txfm_8x16_rounding(__m128i *in, int shift) { 1761 col_txfm_8x8_rounding(&in[0], shift); 1762 col_txfm_8x8_rounding(&in[16], shift); 1763 } 1764 1765 static void write_buffer_16x16(const __m128i *in, int32_t *output) { 1766 const int size_8x8 = 16 * 4; 1767 write_buffer_8x8(&in[0], output); 1768 output += size_8x8; 1769 write_buffer_8x8(&in[16], output); 1770 output += size_8x8; 1771 write_buffer_8x8(&in[32], output); 1772 output += size_8x8; 1773 write_buffer_8x8(&in[48], output); 1774 } 1775 static void idtx16x16_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { 1776 (void)bit; 1777 __m128i fact = _mm_set1_epi32(2 * NewSqrt2); 1778 __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); 1779 __m128i a_low; 1780 1781 int num_iters = 16 * col_num; 1782 for (int i = 0; i < num_iters; i++) { 1783 a_low = _mm_mullo_epi32(in[i], fact); 1784 a_low = _mm_add_epi32(a_low, offset); 1785 out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits); 1786 } 1787 } 1788 void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, 1789 int stride, TX_TYPE tx_type, int bd) { 1790 __m128i in[64], out[64]; 1791 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16]; 1792 const int txw_idx = get_txw_idx(TX_16X16); 1793 const int txh_idx = get_txh_idx(TX_16X16); 1794 const int col_num = 4; 1795 switch (tx_type) { 1796 case DCT_DCT: 1797 load_buffer_16x16(input, in, stride, 0, 0, shift[0]); 1798 fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); 1799 col_txfm_16x16_rounding(out, -shift[1]); 1800 transpose_16x16(out, in); 1801 fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); 1802 write_buffer_16x16(out, coeff); 1803 break; 1804 case ADST_DCT: 1805 load_buffer_16x16(input, in, stride, 0, 0, shift[0]); 1806 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1807 col_num); 1808 col_txfm_16x16_rounding(out, -shift[1]); 1809 transpose_16x16(out, in); 1810 fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); 1811 write_buffer_16x16(out, coeff); 1812 break; 1813 case DCT_ADST: 1814 load_buffer_16x16(input, in, stride, 0, 0, shift[0]); 1815 fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); 1816 col_txfm_16x16_rounding(out, -shift[1]); 1817 transpose_16x16(out, in); 1818 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1819 col_num); 1820 write_buffer_16x16(out, coeff); 1821 break; 1822 case ADST_ADST: 1823 load_buffer_16x16(input, in, stride, 0, 0, shift[0]); 1824 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1825 col_num); 1826 col_txfm_16x16_rounding(out, -shift[1]); 1827 transpose_16x16(out, in); 1828 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1829 col_num); 1830 write_buffer_16x16(out, coeff); 1831 break; 1832 case FLIPADST_DCT: 1833 load_buffer_16x16(input, in, stride, 1, 0, shift[0]); 1834 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1835 col_num); 1836 col_txfm_16x16_rounding(out, -shift[1]); 1837 transpose_16x16(out, in); 1838 fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); 1839 write_buffer_16x16(out, coeff); 1840 break; 1841 case DCT_FLIPADST: 1842 load_buffer_16x16(input, in, stride, 0, 1, shift[0]); 1843 fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); 1844 col_txfm_16x16_rounding(out, -shift[1]); 1845 transpose_16x16(out, in); 1846 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1847 col_num); 1848 write_buffer_16x16(out, coeff); 1849 break; 1850 case FLIPADST_FLIPADST: 1851 load_buffer_16x16(input, in, stride, 1, 1, shift[0]); 1852 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1853 col_num); 1854 col_txfm_16x16_rounding(out, -shift[1]); 1855 transpose_16x16(out, in); 1856 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1857 col_num); 1858 write_buffer_16x16(out, coeff); 1859 break; 1860 case ADST_FLIPADST: 1861 load_buffer_16x16(input, in, stride, 0, 1, shift[0]); 1862 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1863 col_num); 1864 col_txfm_16x16_rounding(out, -shift[1]); 1865 transpose_16x16(out, in); 1866 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1867 col_num); 1868 write_buffer_16x16(out, coeff); 1869 break; 1870 case FLIPADST_ADST: 1871 load_buffer_16x16(input, in, stride, 1, 0, shift[0]); 1872 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1873 col_num); 1874 col_txfm_16x16_rounding(out, -shift[1]); 1875 transpose_16x16(out, in); 1876 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1877 col_num); 1878 write_buffer_16x16(out, coeff); 1879 break; 1880 case IDTX: 1881 load_buffer_16x16(input, in, stride, 0, 0, shift[0]); 1882 idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); 1883 col_txfm_16x16_rounding(out, -shift[1]); 1884 transpose_16x16(out, in); 1885 idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); 1886 write_buffer_16x16(out, coeff); 1887 break; 1888 case V_DCT: 1889 load_buffer_16x16(input, in, stride, 0, 0, shift[0]); 1890 fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); 1891 col_txfm_16x16_rounding(out, -shift[1]); 1892 transpose_16x16(out, in); 1893 idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); 1894 write_buffer_16x16(out, coeff); 1895 break; 1896 case H_DCT: 1897 load_buffer_16x16(input, in, stride, 0, 0, shift[0]); 1898 idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); 1899 col_txfm_16x16_rounding(out, -shift[1]); 1900 transpose_16x16(out, in); 1901 fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); 1902 write_buffer_16x16(out, coeff); 1903 break; 1904 case V_ADST: 1905 load_buffer_16x16(input, in, stride, 0, 0, shift[0]); 1906 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1907 col_num); 1908 col_txfm_16x16_rounding(out, -shift[1]); 1909 transpose_16x16(out, in); 1910 idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); 1911 write_buffer_16x16(out, coeff); 1912 break; 1913 case H_ADST: 1914 load_buffer_16x16(input, in, stride, 0, 0, shift[0]); 1915 idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); 1916 col_txfm_16x16_rounding(out, -shift[1]); 1917 transpose_16x16(out, in); 1918 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1919 col_num); 1920 write_buffer_16x16(out, coeff); 1921 break; 1922 case V_FLIPADST: 1923 load_buffer_16x16(input, in, stride, 1, 0, shift[0]); 1924 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1925 col_num); 1926 col_txfm_16x16_rounding(out, -shift[1]); 1927 transpose_16x16(out, in); 1928 idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); 1929 write_buffer_16x16(out, coeff); 1930 break; 1931 case H_FLIPADST: 1932 load_buffer_16x16(input, in, stride, 0, 1, shift[0]); 1933 idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); 1934 col_txfm_16x16_rounding(out, -shift[1]); 1935 transpose_16x16(out, in); 1936 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1937 col_num); 1938 write_buffer_16x16(out, coeff); 1939 break; 1940 default: assert(0); 1941 } 1942 (void)bd; 1943 } 1944 1945 static inline void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) { 1946 for (int i = 0; i < size; i += 2) in[30 - i] = out[i]; 1947 for (int i = 1; i < size; i += 2) in[size - i] = out[i]; 1948 } 1949 1950 static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = { 1951 fdct8x8_sse4_1, // DCT_DCT 1952 fadst8x8_sse4_1, // ADST_DCT 1953 fdct8x8_sse4_1, // DCT_ADST 1954 fadst8x8_sse4_1, // ADST_ADST 1955 fadst8x8_sse4_1, // FLIPADST_DCT 1956 fdct8x8_sse4_1, // DCT_FLIPADST 1957 fadst8x8_sse4_1, // FLIPADST_FLIPADST 1958 fadst8x8_sse4_1, // ADST_FLIPADST 1959 fadst8x8_sse4_1, // FLIPADST_ADST 1960 idtx8x8_sse4_1, // IDTX 1961 fdct8x8_sse4_1, // V_DCT 1962 idtx8x8_sse4_1, // H_DCT 1963 fadst8x8_sse4_1, // V_ADST 1964 idtx8x8_sse4_1, // H_ADST 1965 fadst8x8_sse4_1, // V_FLIPADST 1966 idtx8x8_sse4_1 // H_FLIPADST 1967 }; 1968 #if !CONFIG_REALTIME_ONLY 1969 static const fwd_transform_1d_sse4_1 row_highbd_txfm32x8_arr[TX_TYPES] = { 1970 fdct8x8_sse4_1, // DCT_DCT 1971 NULL, // ADST_DCT 1972 NULL, // DCT_ADST 1973 NULL, // ADST_ADST 1974 NULL, // FLIPADST_DCT 1975 NULL, // DCT_FLIPADST 1976 NULL, // FLIPADST_FLIPADST 1977 NULL, // ADST_FLIPADST 1978 NULL, // FLIPADST-ADST 1979 idtx32x8_sse4_1, // IDTX 1980 NULL, // V_DCT 1981 NULL, // H_DCT 1982 NULL, // V_ADST 1983 NULL, // H_ADST 1984 NULL, // V_FLIPADST 1985 NULL, // H_FLIPADST 1986 }; 1987 #endif 1988 static const fwd_transform_1d_sse4_1 col_highbd_txfm4x8_arr[TX_TYPES] = { 1989 fdct4x8_sse4_1, // DCT_DCT 1990 fadst8x8_sse4_1, // ADST_DCT 1991 fdct4x8_sse4_1, // DCT_ADST 1992 fadst8x8_sse4_1, // ADST_ADST 1993 fadst8x8_sse4_1, // FLIPADST_DCT 1994 fdct4x8_sse4_1, // DCT_FLIPADST 1995 fadst8x8_sse4_1, // FLIPADST_FLIPADST 1996 fadst8x8_sse4_1, // ADST_FLIPADST 1997 fadst8x8_sse4_1, // FLIPADST_ADST 1998 idtx8x8_sse4_1, // IDTX 1999 fdct4x8_sse4_1, // V_DCT 2000 idtx8x8_sse4_1, // H_DCT 2001 fadst8x8_sse4_1, // V_ADST 2002 idtx8x8_sse4_1, // H_ADST 2003 fadst8x8_sse4_1, // V_FLIPADST 2004 idtx8x8_sse4_1 // H_FLIPADST 2005 }; 2006 2007 static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = { 2008 fdct16x16_sse4_1, // DCT_DCT 2009 fdct16x16_sse4_1, // ADST_DCT 2010 fadst16x16_sse4_1, // DCT_ADST 2011 fadst16x16_sse4_1, // ADST_ADST 2012 fdct16x16_sse4_1, // FLIPADST_DCT 2013 fadst16x16_sse4_1, // DCT_FLIPADST 2014 fadst16x16_sse4_1, // FLIPADST_FLIPADST 2015 fadst16x16_sse4_1, // ADST_FLIPADST 2016 fadst16x16_sse4_1, // FLIPADST_ADST 2017 idtx16x16_sse4_1, // IDTX 2018 idtx16x16_sse4_1, // V_DCT 2019 fdct16x16_sse4_1, // H_DCT 2020 idtx16x16_sse4_1, // V_ADST 2021 fadst16x16_sse4_1, // H_ADST 2022 idtx16x16_sse4_1, // V_FLIPADST 2023 fadst16x16_sse4_1 // H_FLIPADST 2024 }; 2025 2026 static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = { 2027 fdct16x16_sse4_1, // DCT_DCT 2028 fadst16x16_sse4_1, // ADST_DCT 2029 fdct16x16_sse4_1, // DCT_ADST 2030 fadst16x16_sse4_1, // ADST_ADST 2031 fadst16x16_sse4_1, // FLIPADST_DCT 2032 fdct16x16_sse4_1, // DCT_FLIPADST 2033 fadst16x16_sse4_1, // FLIPADST_FLIPADST 2034 fadst16x16_sse4_1, // ADST_FLIPADST 2035 fadst16x16_sse4_1, // FLIPADST_ADST 2036 idtx16x16_sse4_1, // IDTX 2037 fdct16x16_sse4_1, // V_DCT 2038 idtx16x16_sse4_1, // H_DCT 2039 fadst16x16_sse4_1, // V_ADST 2040 idtx16x16_sse4_1, // H_ADST 2041 fadst16x16_sse4_1, // V_FLIPADST 2042 idtx16x16_sse4_1 // H_FLIPADST 2043 }; 2044 static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = { 2045 fdct8x8_sse4_1, // DCT_DCT 2046 fdct8x8_sse4_1, // ADST_DCT 2047 fadst8x8_sse4_1, // DCT_ADST 2048 fadst8x8_sse4_1, // ADST_ADST 2049 fdct8x8_sse4_1, // FLIPADST_DCT 2050 fadst8x8_sse4_1, // DCT_FLIPADST 2051 fadst8x8_sse4_1, // FLIPADST_FLIPADST 2052 fadst8x8_sse4_1, // ADST_FLIPADST 2053 fadst8x8_sse4_1, // FLIPADST_ADST 2054 idtx8x8_sse4_1, // IDTX 2055 idtx8x8_sse4_1, // V_DCT 2056 fdct8x8_sse4_1, // H_DCT 2057 idtx8x8_sse4_1, // V_ADST 2058 fadst8x8_sse4_1, // H_ADST 2059 idtx8x8_sse4_1, // V_FLIPADST 2060 fadst8x8_sse4_1 // H_FLIPADST 2061 }; 2062 2063 static const fwd_transform_1d_sse4_1 row_highbd_txfm4x8_arr[TX_TYPES] = { 2064 fdct4x8_sse4_1, // DCT_DCT 2065 fdct4x8_sse4_1, // ADST_DCT 2066 fadst8x8_sse4_1, // DCT_ADST 2067 fadst8x8_sse4_1, // ADST_ADST 2068 fdct4x8_sse4_1, // FLIPADST_DCT 2069 fadst8x8_sse4_1, // DCT_FLIPADST 2070 fadst8x8_sse4_1, // FLIPADST_FLIPADST 2071 fadst8x8_sse4_1, // ADST_FLIPADST 2072 fadst8x8_sse4_1, // FLIPADST_ADST 2073 idtx8x8_sse4_1, // IDTX 2074 idtx8x8_sse4_1, // V_DCT 2075 fdct4x8_sse4_1, // H_DCT 2076 idtx8x8_sse4_1, // V_ADST 2077 fadst8x8_sse4_1, // H_ADST 2078 idtx8x8_sse4_1, // V_FLIPADST 2079 fadst8x8_sse4_1 // H_FLIPADST 2080 }; 2081 2082 static const fwd_transform_1d_sse4_1 row_highbd_txfm4x4_arr[TX_TYPES] = { 2083 fdct4x4_sse4_1, // DCT_DCT 2084 fdct4x4_sse4_1, // ADST_DCT 2085 fadst4x4_sse4_1, // DCT_ADST 2086 fadst4x4_sse4_1, // ADST_ADST 2087 fdct4x4_sse4_1, // FLIPADST_DCT 2088 fadst4x4_sse4_1, // DCT_FLIPADST 2089 fadst4x4_sse4_1, // FLIPADST_FLIPADST 2090 fadst4x4_sse4_1, // ADST_FLIPADST 2091 fadst4x4_sse4_1, // FLIPADST_ADST 2092 idtx4x4_sse4_1, // IDTX 2093 idtx4x4_sse4_1, // V_DCT 2094 fdct4x4_sse4_1, // H_DCT 2095 idtx4x4_sse4_1, // V_ADST 2096 fadst4x4_sse4_1, // H_ADST 2097 idtx4x4_sse4_1, // V_FLIPADST 2098 fadst4x4_sse4_1 // H_FLIPADST 2099 }; 2100 2101 static const fwd_transform_1d_sse4_1 col_highbd_txfm4x4_arr[TX_TYPES] = { 2102 fdct4x4_sse4_1, // DCT_DCT 2103 fadst4x4_sse4_1, // ADST_DCT 2104 fdct4x4_sse4_1, // DCT_ADST 2105 fadst4x4_sse4_1, // ADST_ADST 2106 fadst4x4_sse4_1, // FLIPADST_DCT 2107 fdct4x4_sse4_1, // DCT_FLIPADST 2108 fadst4x4_sse4_1, // FLIPADST_FLIPADST 2109 fadst4x4_sse4_1, // ADST_FLIPADST 2110 fadst4x4_sse4_1, // FLIPADST_ADST 2111 idtx4x4_sse4_1, // IDTX 2112 fdct4x4_sse4_1, // V_DCT 2113 idtx4x4_sse4_1, // H_DCT 2114 fadst4x4_sse4_1, // V_ADST 2115 idtx4x4_sse4_1, // H_ADST 2116 fadst4x4_sse4_1, // V_FLIPADST 2117 idtx4x4_sse4_1 // H_FLIPADST 2118 }; 2119 2120 static const fwd_transform_1d_sse4_1 col_highbd_txfm8x32_arr[TX_TYPES] = { 2121 av1_fdct32_sse4_1, // DCT_DCT 2122 NULL, // ADST_DCT 2123 NULL, // DCT_ADST 2124 NULL, // ADST_ADST 2125 NULL, // FLIPADST_DCT 2126 NULL, // DCT_FLIPADST 2127 NULL, // FLIPADST_FLIPADST 2128 NULL, // ADST_FLIPADST 2129 NULL, // FLIPADST_ADST 2130 av1_idtx32_sse4_1, // IDTX 2131 NULL, // V_DCT 2132 NULL, // H_DCT 2133 NULL, // V_ADST 2134 NULL, // H_ADST 2135 NULL, // V_FLIPADST 2136 NULL // H_FLIPADST 2137 }; 2138 2139 static const fwd_transform_1d_sse4_1 row_highbd_txfm8x32_arr[TX_TYPES] = { 2140 fdct16x16_sse4_1, // DCT_DCT 2141 NULL, // ADST_DCT 2142 NULL, // DCT_ADST 2143 NULL, // ADST_ADST 2144 NULL, // FLIPADST_DCT 2145 NULL, // DCT_FLIPADST 2146 NULL, // FLIPADST_FLIPADST 2147 NULL, // ADST_FLIPADST 2148 NULL, // FLIPADST_ADST 2149 idtx16x16_sse4_1, // IDTX 2150 NULL, // V_DCT 2151 NULL, // H_DCT 2152 NULL, // V_ADST 2153 NULL, // H_ADST 2154 NULL, // V_FLIPADST 2155 NULL // H_FLIPADST 2156 }; 2157 2158 void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff, 2159 int stride, TX_TYPE tx_type, int bd) { 2160 __m128i in[32], out[32]; 2161 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; 2162 const int txw_idx = get_txw_idx(TX_16X8); 2163 const int txh_idx = get_txh_idx(TX_16X8); 2164 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type]; 2165 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type]; 2166 int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 2167 int ud_flip, lr_flip; 2168 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 2169 2170 for (int i = 0; i < 2; i++) { 2171 load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]); 2172 col_txfm(in, in, bit, 2); 2173 col_txfm_8x8_rounding(in, -shift[1]); 2174 transpose_8x8(in, out + i * 16); 2175 } 2176 2177 if (lr_flip) { 2178 flip_buf_sse4_1(in, out, 32); 2179 row_txfm(in, out, bit, 2); 2180 } else { 2181 row_txfm(out, out, bit, 2); 2182 } 2183 2184 for (int i = 0; i < 2; i++) { 2185 av1_round_shift_rect_array_32_sse4_1(out + i * 16, in, 16, -shift[2], 2186 NewSqrt2); 2187 write_buffer_8x8(in, coeff + i * 64); 2188 } 2189 (void)bd; 2190 } 2191 2192 void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff, 2193 int stride, TX_TYPE tx_type, int bd) { 2194 __m128i in[32], out[32]; 2195 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; 2196 const int txw_idx = get_txw_idx(TX_8X16); 2197 const int txh_idx = get_txh_idx(TX_8X16); 2198 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type]; 2199 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type]; 2200 int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 2201 int ud_flip, lr_flip; 2202 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 2203 2204 load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]); 2205 col_txfm(in, in, bit, 2); 2206 col_txfm_8x16_rounding(in, -shift[1]); 2207 transpose_8x8(in, out); 2208 transpose_8x8(in + 16, out + 16); 2209 2210 for (int i = 0; i < 2; i++) { 2211 row_txfm(out + i * 16, out, bit, 2); 2212 av1_round_shift_rect_array_32_sse4_1(out, out, 16, -shift[2], NewSqrt2); 2213 write_buffer_16x8(out, coeff + i * 8, 16); 2214 } 2215 (void)bd; 2216 } 2217 2218 #if !CONFIG_REALTIME_ONLY 2219 void av1_fwd_txfm2d_4x16_sse4_1(const int16_t *input, int32_t *coeff, 2220 int stride, TX_TYPE tx_type, int bd) { 2221 __m128i in[16]; 2222 __m128i *outcoeff128 = (__m128i *)coeff; 2223 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16]; 2224 const int txw_idx = get_txw_idx(TX_4X16); 2225 const int txh_idx = get_txh_idx(TX_4X16); 2226 const int txfm_size_col = tx_size_wide[TX_4X16]; 2227 const int txfm_size_row = tx_size_high[TX_4X16]; 2228 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 2229 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; 2230 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type]; 2231 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type]; 2232 2233 int ud_flip, lr_flip; 2234 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 2235 // col transform 2236 load_buffer_4x16(input, in, stride, ud_flip, lr_flip, shift[0]); 2237 col_txfm(in, outcoeff128, bitcol, 1); 2238 col_txfm_8x8_rounding(outcoeff128, -shift[1]); 2239 transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row); 2240 2241 // row transform 2242 for (int i = 0; i < 4; i++) { 2243 __m128i tmp[4]; 2244 row_txfm(in + i, tmp, bitrow, txfm_size_row >> 2); 2245 store_output_w4(coeff + i * 4, tmp, txfm_size_row, txfm_size_col); 2246 } 2247 (void)bd; 2248 } 2249 #endif 2250 2251 void av1_fwd_txfm2d_16x4_sse4_1(const int16_t *input, int32_t *coeff, 2252 int stride, TX_TYPE tx_type, int bd) { 2253 __m128i in[16]; 2254 __m128i *outcoeff128 = (__m128i *)coeff; 2255 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4]; 2256 const int txw_idx = get_txw_idx(TX_16X4); 2257 const int txh_idx = get_txh_idx(TX_16X4); 2258 const int txfm_size_col = tx_size_wide[TX_16X4]; 2259 const int txfm_size_row = tx_size_high[TX_16X4]; 2260 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 2261 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; 2262 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type]; 2263 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type]; 2264 int ud_flip, lr_flip; 2265 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 2266 2267 // col transform 2268 load_buffer_16x4(input, in, stride, ud_flip, lr_flip, shift[0]); 2269 2270 for (int i = 0; i < (txfm_size_col >> 2); i++) { 2271 __m128i *cur_in = &in[i * txfm_size_row]; 2272 col_txfm(cur_in, cur_in, bitcol, 1); 2273 transpose_32bit_4x4(cur_in, cur_in); 2274 } 2275 col_txfm_8x8_rounding(in, -shift[1]); 2276 2277 // row transform 2278 row_txfm(in, outcoeff128, bitrow, 1); 2279 (void)bd; 2280 } 2281 2282 void av1_fwd_txfm2d_16x32_sse4_1(const int16_t *input, int32_t *coeff, 2283 int stride, TX_TYPE tx_type, int bd) { 2284 __m128i in[128]; 2285 __m128i *outcoef128 = (__m128i *)coeff; 2286 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32]; 2287 const int txw_idx = get_txw_idx(TX_16X32); 2288 const int txh_idx = get_txh_idx(TX_16X32); 2289 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type]; 2290 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x32_arr[tx_type]; 2291 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 2292 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; 2293 2294 // column transform 2295 load_buffer_16x16(input, in, stride, 0, 0, shift[0]); 2296 load_buffer_16x16(input + 16 * stride, in + 64, stride, 0, 0, shift[0]); 2297 2298 for (int i = 0; i < 4; i++) { 2299 col_txfm((in + i), (in + i), bitcol, 4); 2300 } 2301 col_txfm_16x16_rounding(&in[0], -shift[1]); 2302 col_txfm_16x16_rounding(&in[64], -shift[1]); 2303 transpose_8nx8n(in, outcoef128, 16, 32); 2304 2305 // row transform 2306 row_txfm(outcoef128, in, bitrow, 8); 2307 av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 128, -shift[2], 2308 NewSqrt2); 2309 (void)bd; 2310 } 2311 2312 void av1_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *coeff, 2313 int stride, TX_TYPE tx_type, int bd) { 2314 (void)tx_type; 2315 __m128i in[512]; 2316 __m128i *outcoef128 = (__m128i *)coeff; 2317 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64]; 2318 const int txw_idx = get_txw_idx(TX_32X64); 2319 const int txh_idx = get_txh_idx(TX_32X64); 2320 const int txfm_size_col = tx_size_wide[TX_32X64]; 2321 const int txfm_size_row = tx_size_high[TX_32X64]; 2322 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 2323 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; 2324 const int num_row = txfm_size_row >> 2; 2325 const int num_col = txfm_size_col >> 2; 2326 2327 // column transform 2328 load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row); 2329 for (int i = 0; i < num_col; i++) { 2330 av1_fdct64_sse4_1((in + i), (in + i), bitcol, num_col, num_col); 2331 } 2332 for (int i = 0; i < num_col; i++) { 2333 col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]); 2334 } 2335 transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); 2336 2337 // row transform 2338 for (int i = 0; i < num_row; i++) { 2339 av1_fdct32_sse4_1((outcoef128 + i), (in + i), bitrow, num_row); 2340 } 2341 for (int i = 0; i < txfm_size_col; i++) { 2342 av1_round_shift_rect_array_32_sse4_1(in + i * 16, outcoef128 + i * 8, 8, 2343 -shift[2], NewSqrt2); 2344 } 2345 (void)bd; 2346 } 2347 2348 void av1_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *coeff, 2349 int stride, TX_TYPE tx_type, int bd) { 2350 (void)tx_type; 2351 __m128i in[512]; 2352 __m128i *outcoef128 = (__m128i *)coeff; 2353 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32]; 2354 const int txw_idx = get_txw_idx(TX_64X32); 2355 const int txh_idx = get_txh_idx(TX_64X32); 2356 const int txfm_size_col = tx_size_wide[TX_64X32]; 2357 const int txfm_size_row = tx_size_high[TX_64X32]; 2358 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 2359 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; 2360 const int num_row = txfm_size_row >> 2; 2361 const int num_col = txfm_size_col >> 2; 2362 2363 // column transform 2364 for (int i = 0; i < 32; i++) { 2365 load_buffer_4x4(input + 0 + i * stride, in + 0 + i * 16, 4, 0, 0, shift[0]); 2366 load_buffer_4x4(input + 16 + i * stride, in + 4 + i * 16, 4, 0, 0, 2367 shift[0]); 2368 load_buffer_4x4(input + 32 + i * stride, in + 8 + i * 16, 4, 0, 0, 2369 shift[0]); 2370 load_buffer_4x4(input + 48 + i * stride, in + 12 + i * 16, 4, 0, 0, 2371 shift[0]); 2372 } 2373 2374 for (int i = 0; i < num_col; i++) { 2375 av1_fdct32_sse4_1((in + i), (in + i), bitcol, num_col); 2376 } 2377 2378 for (int i = 0; i < num_row; i++) { 2379 col_txfm_16x16_rounding((in + i * txfm_size_col), -shift[1]); 2380 } 2381 transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); 2382 2383 // row transform 2384 for (int i = 0; i < num_row; i++) { 2385 av1_fdct64_sse4_1((outcoef128 + i), (in + i), bitrow, num_row, num_row); 2386 } 2387 av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 512, -shift[2], 2388 NewSqrt2); 2389 (void)bd; 2390 } 2391 2392 void av1_fwd_txfm2d_32x16_sse4_1(const int16_t *input, int32_t *coeff, 2393 int stride, TX_TYPE tx_type, int bd) { 2394 __m128i in[128]; 2395 __m128i *outcoef128 = (__m128i *)coeff; 2396 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16]; 2397 const int txw_idx = get_txw_idx(TX_32X16); 2398 const int txh_idx = get_txh_idx(TX_32X16); 2399 const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm8x32_arr[tx_type]; 2400 const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type]; 2401 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 2402 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; 2403 2404 // column transform 2405 load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 16); 2406 col_txfm(in, in, bitcol, 8); 2407 col_txfm_16x16_rounding(&in[0], -shift[1]); 2408 col_txfm_16x16_rounding(&in[64], -shift[1]); 2409 transpose_8nx8n(in, outcoef128, 32, 16); 2410 2411 // row transform 2412 for (int i = 0; i < 4; i++) { 2413 row_txfm((outcoef128 + i), (in + i), bitrow, 4); 2414 } 2415 av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 128, -shift[2], 2416 NewSqrt2); 2417 (void)bd; 2418 } 2419 2420 #if !CONFIG_REALTIME_ONLY 2421 void av1_fwd_txfm2d_8x32_sse4_1(const int16_t *input, int32_t *coeff, 2422 int stride, TX_TYPE tx_type, int bd) { 2423 __m128i in[64]; 2424 __m128i *outcoef128 = (__m128i *)coeff; 2425 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32]; 2426 const int txw_idx = get_txw_idx(TX_8X32); 2427 const int txh_idx = get_txh_idx(TX_8X32); 2428 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type]; 2429 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm32x8_arr[tx_type]; 2430 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 2431 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; 2432 2433 const int txfm_size_col = tx_size_wide[TX_8X32]; 2434 const int txfm_size_row = tx_size_high[TX_8X32]; 2435 const int num_col = txfm_size_col >> 2; 2436 2437 // column transform 2438 load_buffer_8x16(input, in, stride, 0, 0, shift[0]); 2439 load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row, 2440 stride, 0, 0, shift[0]); 2441 2442 for (int i = 0; i < num_col; i++) { 2443 col_txfm((in + i), (in + i), bitcol, num_col); 2444 } 2445 col_txfm_16x16_rounding(in, -shift[1]); 2446 transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); 2447 2448 // row transform 2449 for (int i = 0; i < txfm_size_col; i += 2) { 2450 row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, txfm_size_col); 2451 } 2452 (void)bd; 2453 } 2454 2455 void av1_fwd_txfm2d_32x8_sse4_1(const int16_t *input, int32_t *coeff, 2456 int stride, TX_TYPE tx_type, int bd) { 2457 __m128i in[64]; 2458 __m128i *outcoef128 = (__m128i *)coeff; 2459 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8]; 2460 const int txw_idx = get_txw_idx(TX_32X8); 2461 const int txh_idx = get_txh_idx(TX_32X8); 2462 const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm32x8_arr[tx_type]; 2463 const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type]; 2464 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 2465 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; 2466 2467 const int txfm_size_col = tx_size_wide[TX_32X8]; 2468 const int txfm_size_row = tx_size_high[TX_32X8]; 2469 const int num_col = txfm_size_row >> 2; 2470 2471 // column transform 2472 load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8); 2473 for (int i = 0; i < txfm_size_row; i += 2) { 2474 col_txfm((in + i), (in + i), bitcol, txfm_size_row); 2475 } 2476 2477 col_txfm_16x16_rounding(&in[0], -shift[1]); 2478 transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); 2479 2480 // row transform 2481 for (int i = 0; i < num_col; i++) { 2482 row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, num_col); 2483 } 2484 (void)bd; 2485 } 2486 #endif 2487 2488 void av1_fwd_txfm2d_4x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, 2489 TX_TYPE tx_type, int bd) { 2490 __m128i in[8]; 2491 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8]; 2492 const int txw_idx = get_txw_idx(TX_4X8); 2493 const int txh_idx = get_txh_idx(TX_4X8); 2494 const int txfm_size_col = tx_size_wide[TX_4X8]; 2495 const int txfm_size_row = tx_size_high[TX_4X8]; 2496 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 2497 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; 2498 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x8_arr[tx_type]; 2499 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type]; 2500 2501 int ud_flip, lr_flip; 2502 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 2503 2504 load_buffer_4x8(input, in, stride, ud_flip, lr_flip, shift[0]); 2505 col_txfm(in, in, bitcol, 1); 2506 col_txfm_4x8_rounding(in, -shift[1]); 2507 2508 for (int i = 0; i < 2; i++) { 2509 __m128i *cur_in = &in[i * 4]; 2510 transpose_32bit_4x4(cur_in, cur_in); 2511 row_txfm(cur_in, cur_in, bitrow, 1); 2512 av1_round_shift_rect_array_32_sse4_1(cur_in, cur_in, txfm_size_col, 2513 -shift[2], NewSqrt2); 2514 store_output_w4(coeff + i * 4, cur_in, txfm_size_row, 4); 2515 } 2516 (void)bd; 2517 } 2518 2519 void av1_fwd_txfm2d_8x4_sse4_1(const int16_t *input, int32_t *coeff, int stride, 2520 TX_TYPE tx_type, int bd) { 2521 __m128i in[8]; 2522 __m128i *outcoeff128 = (__m128i *)coeff; 2523 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4]; 2524 const int txw_idx = get_txw_idx(TX_8X4); 2525 const int txh_idx = get_txh_idx(TX_8X4); 2526 const int txfm_size_col = tx_size_wide[TX_8X4]; 2527 const int txfm_size_row = tx_size_high[TX_8X4]; 2528 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 2529 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; 2530 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type]; 2531 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x8_arr[tx_type]; 2532 int ud_flip, lr_flip; 2533 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 2534 // col tranform 2535 load_buffer_8x4(input, in, stride, ud_flip, lr_flip, shift[0]); 2536 for (int i = 0; i < 2; i++) { 2537 __m128i *cur_in = &in[i * txfm_size_row]; 2538 col_txfm(cur_in, cur_in, bitcol, 1); 2539 transpose_32bit_4x4(cur_in, cur_in); 2540 } 2541 col_txfm_4x8_rounding(in, -shift[1]); 2542 2543 // row tranform 2544 row_txfm(in, outcoeff128, bitrow, 1); 2545 av1_round_shift_rect_array_32_sse4_1(outcoeff128, outcoeff128, txfm_size_col, 2546 -shift[2], NewSqrt2); 2547 (void)bd; 2548 } 2549 2550 #if !CONFIG_REALTIME_ONLY 2551 void av1_fwd_txfm2d_16x64_sse4_1(const int16_t *input, int32_t *coeff, 2552 int stride, TX_TYPE tx_type, int bd) { 2553 __m128i in[256]; 2554 __m128i *outcoeff128 = (__m128i *)coeff; 2555 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64]; 2556 const int txw_idx = get_txw_idx(TX_16X64); 2557 const int txh_idx = get_txh_idx(TX_16X64); 2558 const int txfm_size_col = tx_size_wide[TX_16X64]; 2559 const int txfm_size_row = tx_size_high[TX_16X64]; 2560 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 2561 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; 2562 int ud_flip, lr_flip; 2563 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 2564 const int num_col = txfm_size_col >> 2; 2565 // col tranform 2566 for (int i = 0; i < txfm_size_row; i += num_col) { 2567 load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col, 2568 ud_flip, lr_flip, shift[0]); 2569 load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col, 2570 ud_flip, lr_flip, shift[0]); 2571 load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col, 2572 ud_flip, lr_flip, shift[0]); 2573 load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col, 2574 ud_flip, lr_flip, shift[0]); 2575 } 2576 2577 for (int i = 0; i < num_col; i++) { 2578 av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitcol, num_col, num_col); 2579 } 2580 2581 col_txfm_16x16_rounding(outcoeff128, -shift[1]); 2582 col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]); 2583 col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]); 2584 col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]); 2585 2586 transpose_8nx8n(outcoeff128, in, txfm_size_col, 32); 2587 fdct16x16_sse4_1(in, outcoeff128, bitrow, 8); 2588 (void)bd; 2589 } 2590 2591 void av1_fwd_txfm2d_64x16_sse4_1(const int16_t *input, int32_t *coeff, 2592 int stride, TX_TYPE tx_type, int bd) { 2593 __m128i in[256]; 2594 __m128i *outcoeff128 = (__m128i *)coeff; 2595 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16]; 2596 const int txw_idx = get_txw_idx(TX_64X16); 2597 const int txh_idx = get_txh_idx(TX_64X16); 2598 const int txfm_size_col = tx_size_wide[TX_64X16]; 2599 const int txfm_size_row = tx_size_high[TX_64X16]; 2600 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 2601 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; 2602 int ud_flip, lr_flip; 2603 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 2604 // col tranform 2605 for (int i = 0; i < txfm_size_row; i++) { 2606 load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4, 2607 ud_flip, lr_flip, shift[0]); 2608 load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4, 2609 ud_flip, lr_flip, shift[0]); 2610 load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4, 2611 ud_flip, lr_flip, shift[0]); 2612 load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4, 2613 ud_flip, lr_flip, shift[0]); 2614 } 2615 2616 fdct16x16_sse4_1(in, outcoeff128, bitcol, txfm_size_row); 2617 col_txfm_16x16_rounding(outcoeff128, -shift[1]); 2618 col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]); 2619 col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]); 2620 col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]); 2621 2622 transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row); 2623 for (int i = 0; i < 4; i++) { 2624 av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitrow, 4, 4); 2625 } 2626 memset(coeff + txfm_size_row * 32, 0, txfm_size_row * 32 * sizeof(*coeff)); 2627 (void)bd; 2628 } 2629 #endif