adaptive_quantize_sse2.c (23626B)
/*
 * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <emmintrin.h>
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/quantize.h"
#include "aom_dsp/x86/quantize_x86.h"

// SSE2 adaptive quantization of a block of transform coefficients.
//
// Coefficients are processed 16 at a time (two 8-lane __m128i vectors).
// In addition to the regular zbin dead-zone test, each group is checked
// against a "prescan" threshold (zbin weighted by the flat quant matrix
// plus a dequant-scaled EOB_FACTOR margin).  update_mask0()/update_mask1()
// (helpers from quantize_x86.h) fold the iscan positions of passing lanes
// into mask0/mask1; after the main loop calculate_non_zero_count() turns
// those masks into scan-order counts so coefficients that cleared zbin but
// not the prescan threshold can be zeroed and the eob trimmed.
//
// Parameters follow the standard aom quantize_b contract: per-plane
// DC/AC table pointers (zbin/round/quant/quant_shift/dequant, lane 0 = DC,
// lane 1 = AC), qcoeff/dqcoeff outputs, eob output, and scan/iscan tables.
void aom_quantize_b_adaptive_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  int index = 16;  // First 16 coefficients are handled before the loop.
  int non_zero_count = 0;
  int non_zero_count_prescan_add_zero = 0;
  int is_found0 = 0, is_found1 = 0;
  int eob = -1;
  const __m128i zero = _mm_setzero_si128();
  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1;
  __m128i all_zero;
  __m128i mask0 = zero, mask1 = zero;

  int prescan_add[2];
  int thresh[4];
  const qm_val_t wt = (1 << AOM_QM_BITS);
  // Per-plane prescan thresholds: flat-matrix-weighted zbin plus a
  // dequant-scaled margin, minus 1 so a strict > compare implements >=.
  // i == 0 is DC, i == 1 is AC.
  for (int i = 0; i < 2; ++i) {
    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
  }
  // Replicate the AC threshold so threshold[0] = {DC, AC, AC, AC} (32-bit
  // lanes) and threshold[1] is all-AC for the remaining groups.
  thresh[2] = thresh[3] = thresh[1];
  __m128i threshold[2];
  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);

#if SKIP_EOB_FACTOR_ADJUST
  int first = -1;  // Scan index of the first nonzero quantized coefficient.
#endif
  // Setup global values.
  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
                dequant_ptr, &dequant, quant_shift_ptr, &shift);

  // Do DC and first 15 AC.
  coeff0 = load_coefficients(coeff_ptr);
  coeff1 = load_coefficients(coeff_ptr + 8);

  // Poor man's abs().
  coeff0_sign = _mm_srai_epi16(coeff0, 15);
  coeff1_sign = _mm_srai_epi16(coeff1, 15);
  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);

  // Record which iscan positions pass the prescan threshold.
  update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);

  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

  // Record which iscan positions pass the zbin test.
  update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);

  // Only the AC threshold applies from the second group onwards.
  threshold[0] = threshold[1];
  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
  if (_mm_movemask_epi8(all_zero) == 0) {
    // Whole group below zbin: emit zeros and still advance the per-plane
    // tables from their DC lane to the AC lane for the remaining groups.
    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
    calculate_qcoeff(&qcoeff0, round, quant, shift);

    // Switch round/quant/shift from DC to AC before the second vector.
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);

    calculate_qcoeff(&qcoeff1, round, quant, shift);

    // Reinsert signs
    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);

    // Mask out zbin threshold coeffs
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_coefficients(qcoeff0, qcoeff_ptr);
    store_coefficients(qcoeff1, qcoeff_ptr + 8);

    coeff0 = calculate_dqcoeff(qcoeff0, dequant);
    dequant = _mm_unpackhi_epi64(dequant, dequant);  // DC done; switch to AC.
    coeff1 = calculate_dqcoeff(qcoeff1, dequant);

    store_coefficients(coeff0, dqcoeff_ptr);
    store_coefficients(coeff1, dqcoeff_ptr + 8);
  }

  // AC only loop.
  while (index < n_coeffs) {
    coeff0 = load_coefficients(coeff_ptr + index);
    coeff1 = load_coefficients(coeff_ptr + index + 8);

    // Poor man's abs(), as above.
    coeff0_sign = _mm_srai_epi16(coeff0, 15);
    coeff1_sign = _mm_srai_epi16(coeff1, 15);
    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);

    update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
                 &mask0);

    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

    update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);

    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_movemask_epi8(all_zero) == 0) {
      // Whole group below zbin: emit zeros and skip the quantize math.
      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
      index += 16;
      continue;
    }
    calculate_qcoeff(&qcoeff0, round, quant, shift);
    calculate_qcoeff(&qcoeff1, round, quant, shift);

    // Reinsert signs.
    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);

    // Mask out zbin threshold coeffs.
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_coefficients(qcoeff0, qcoeff_ptr + index);
    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);

    coeff0 = calculate_dqcoeff(qcoeff0, dequant);
    coeff1 = calculate_dqcoeff(qcoeff1, dequant);

    store_coefficients(coeff0, dqcoeff_ptr + index);
    store_coefficients(coeff1, dqcoeff_ptr + index + 8);

    index += 16;
  }
  // non_zero_count: coefficients passing the prescan threshold;
  // non_zero_count_prescan_add_zero: coefficients passing only zbin.
  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
  if (is_found1)
    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);

  // Zero out coefficients that cleared zbin but not the prescan threshold.
  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  // Standard eob search over the surviving coefficients, in scan order.
  for (int i = non_zero_count - 1; i >= 0; i--) {
    const int rc = scan[i];
    if (qcoeff_ptr[rc]) {
      eob = i;
      break;
    }
  }

  *eob_ptr = eob + 1;
#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment the following loop with intrinsic by combining
  // with the quantization loop above
  // Find the first nonzero quantized coefficient in scan order.
  for (int i = 0; i < non_zero_count; i++) {
    const int rc = scan[i];
    const int qcoeff = qcoeff_ptr[rc];
    if (qcoeff) {
      first = i;
      break;
    }
  }
  // If exactly one coefficient survived (first == last) and it quantized to
  // +/-1, re-test its unquantized magnitude against a stricter threshold
  // (EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST) and drop the whole block if it
  // fails.
  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    const int rc = scan[(*eob_ptr - 1)];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      if (abs_coeff <
          (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}

// Same algorithm as aom_quantize_b_adaptive_sse2, specialized for
// log_scale = 1 (32x32 transforms): zbin/round are shifted down with
// rounding by log_scale, the prescan thresholds are built from the rounded
// zbins, and the *_log_scale quantize/dequantize helpers are used.
void aom_quantize_b_32x32_adaptive_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  int index = 16;  // First 16 coefficients are handled before the loop.
  const int log_scale = 1;
  int non_zero_count = 0;
  int non_zero_count_prescan_add_zero = 0;
  int is_found0 = 0, is_found1 = 0;
  int eob = -1;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i log_scale_vec = _mm_set1_epi16(log_scale);
  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1;
  __m128i all_zero;
  __m128i mask0 = zero, mask1 = zero;

  // Scalar copies of the rounded-down zbins; also reused by the
  // SKIP_EOB_FACTOR_ADJUST tail below.
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
  int prescan_add[2];
  int thresh[4];
  const qm_val_t wt = (1 << AOM_QM_BITS);
  // Per-plane prescan thresholds (DC then AC), minus 1 for strict compares.
  for (int i = 0; i < 2; ++i) {
    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
  }
  // threshold[0] = {DC, AC, AC, AC}; threshold[1] = all-AC.
  thresh[2] = thresh[3] = thresh[1];
  __m128i threshold[2];
  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);

#if SKIP_EOB_FACTOR_ADJUST
  int first = -1;  // Scan index of the first nonzero quantized coefficient.
#endif
  // Setup global values.
  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
  round = _mm_load_si128((const __m128i *)round_ptr);
  quant = _mm_load_si128((const __m128i *)quant_ptr);
  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);

  // Shift with rounding.
  zbin = _mm_add_epi16(zbin, log_scale_vec);
  round = _mm_add_epi16(round, log_scale_vec);
  zbin = _mm_srli_epi16(zbin, log_scale);
  round = _mm_srli_epi16(round, log_scale);
  // Subtract 1 so the strict _mm_cmpgt_epi16 below implements >= zbin.
  zbin = _mm_sub_epi16(zbin, one);

  // Do DC and first 15 AC.
  coeff0 = load_coefficients(coeff_ptr);
  coeff1 = load_coefficients(coeff_ptr + 8);

  // Poor man's abs().
  coeff0_sign = _mm_srai_epi16(coeff0, 15);
  coeff1_sign = _mm_srai_epi16(coeff1, 15);
  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);

  // Record which iscan positions pass the prescan threshold.
  update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);

  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

  // Record which iscan positions pass the zbin test.
  update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);

  // Only the AC threshold applies from the second group onwards.
  threshold[0] = threshold[1];
  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
  if (_mm_movemask_epi8(all_zero) == 0) {
    // Whole group below zbin: emit zeros and still advance the per-plane
    // tables from their DC lane to the AC lane for the remaining groups.
    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
    calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
    // Switch round/quant/shift from DC to AC before the second vector.
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);

    // Reinsert signs
    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);

    // Mask out zbin threshold coeffs
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_coefficients(qcoeff0, qcoeff_ptr);
    store_coefficients(qcoeff1, qcoeff_ptr + 8);

    calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
                                          &log_scale);
    dequant = _mm_unpackhi_epi64(dequant, dequant);  // DC done; switch to AC.
    calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
                                          dqcoeff_ptr + 8, &log_scale);
  }

  // AC only loop.
  while (index < n_coeffs) {
    coeff0 = load_coefficients(coeff_ptr + index);
    coeff1 = load_coefficients(coeff_ptr + index + 8);

    // Poor man's abs(), as above.
    coeff0_sign = _mm_srai_epi16(coeff0, 15);
    coeff1_sign = _mm_srai_epi16(coeff1, 15);
    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);

    update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
                 &mask0);

    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

    update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);

    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_movemask_epi8(all_zero) == 0) {
      // Whole group below zbin: emit zeros and skip the quantize math.
      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
      index += 16;
      continue;
    }
    calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
    calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);

    // Reinsert signs.
    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);

    // Mask out zbin threshold coeffs.
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_coefficients(qcoeff0, qcoeff_ptr + index);
    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);

    calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
                                          dqcoeff_ptr + index, &log_scale);
    calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
                                          dqcoeff_ptr + index + 8, &log_scale);
    index += 16;
  }
  // non_zero_count: coefficients passing the prescan threshold;
  // non_zero_count_prescan_add_zero: coefficients passing only zbin.
  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
  if (is_found1)
    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);

  // Zero out coefficients that cleared zbin but not the prescan threshold.
  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  // Standard eob search over the surviving coefficients, in scan order.
  for (int i = non_zero_count - 1; i >= 0; i--) {
    const int rc = scan[i];
    if (qcoeff_ptr[rc]) {
      eob = i;
      break;
    }
  }

  *eob_ptr = eob + 1;
#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment the following loop with intrinsic by combining
  // with the quantization loop above
  // Find the first nonzero quantized coefficient in scan order.
  for (int i = 0; i < non_zero_count; i++) {
    const int rc = scan[i];
    const int qcoeff = qcoeff_ptr[rc];
    if (qcoeff) {
      first = i;
      break;
    }
  }
  // If exactly one coefficient survived and it quantized to +/-1, re-test
  // it against a stricter threshold and drop the whole block if it fails.
  // Note the scaled zbins[] (not zbin_ptr[]) are used here.
  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    const int rc = scan[(*eob_ptr - 1)];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}

// Same algorithm as aom_quantize_b_adaptive_sse2, specialized for
// log_scale = 2 (64x64 transforms).  Identical in structure to the 32x32
// variant above apart from the scale constant.
void aom_quantize_b_64x64_adaptive_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  int index = 16;  // First 16 coefficients are handled before the loop.
  const int log_scale = 2;
  int non_zero_count = 0;
  int non_zero_count_prescan_add_zero = 0;
  int is_found0 = 0, is_found1 = 0;
  int eob = -1;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i log_scale_vec = _mm_set1_epi16(log_scale);
  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1;
  __m128i all_zero;
  __m128i mask0 = zero, mask1 = zero;

  // Scalar copies of the rounded-down zbins; also reused by the
  // SKIP_EOB_FACTOR_ADJUST tail below.
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
  int prescan_add[2];
  int thresh[4];
  const qm_val_t wt = (1 << AOM_QM_BITS);
  // Per-plane prescan thresholds (DC then AC), minus 1 for strict compares.
  for (int i = 0; i < 2; ++i) {
    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
  }
  // threshold[0] = {DC, AC, AC, AC}; threshold[1] = all-AC.
  thresh[2] = thresh[3] = thresh[1];
  __m128i threshold[2];
  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);

#if SKIP_EOB_FACTOR_ADJUST
  int first = -1;  // Scan index of the first nonzero quantized coefficient.
#endif
  // Setup global values.
  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
  round = _mm_load_si128((const __m128i *)round_ptr);
  quant = _mm_load_si128((const __m128i *)quant_ptr);
  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);

  // Shift with rounding.
  zbin = _mm_add_epi16(zbin, log_scale_vec);
  round = _mm_add_epi16(round, log_scale_vec);
  zbin = _mm_srli_epi16(zbin, log_scale);
  round = _mm_srli_epi16(round, log_scale);
  // Subtract 1 so the strict _mm_cmpgt_epi16 below implements >= zbin.
  zbin = _mm_sub_epi16(zbin, one);

  // Do DC and first 15 AC.
  coeff0 = load_coefficients(coeff_ptr);
  coeff1 = load_coefficients(coeff_ptr + 8);

  // Poor man's abs().
  coeff0_sign = _mm_srai_epi16(coeff0, 15);
  coeff1_sign = _mm_srai_epi16(coeff1, 15);
  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);

  // Record which iscan positions pass the prescan threshold.
  update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);

  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

  // Record which iscan positions pass the zbin test.
  update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);

  // Only the AC threshold applies from the second group onwards.
  threshold[0] = threshold[1];
  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
  if (_mm_movemask_epi8(all_zero) == 0) {
    // Whole group below zbin: emit zeros and still advance the per-plane
    // tables from their DC lane to the AC lane for the remaining groups.
    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
    calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
    // Switch round/quant/shift from DC to AC before the second vector.
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);

    // Reinsert signs
    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);

    // Mask out zbin threshold coeffs
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_coefficients(qcoeff0, qcoeff_ptr);
    store_coefficients(qcoeff1, qcoeff_ptr + 8);

    calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
                                          &log_scale);
    dequant = _mm_unpackhi_epi64(dequant, dequant);  // DC done; switch to AC.
    calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
                                          dqcoeff_ptr + 8, &log_scale);
  }

  // AC only loop.
  while (index < n_coeffs) {
    coeff0 = load_coefficients(coeff_ptr + index);
    coeff1 = load_coefficients(coeff_ptr + index + 8);

    // Poor man's abs(), as above.
    coeff0_sign = _mm_srai_epi16(coeff0, 15);
    coeff1_sign = _mm_srai_epi16(coeff1, 15);
    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);

    update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
                 &mask0);

    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

    update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);

    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_movemask_epi8(all_zero) == 0) {
      // Whole group below zbin: emit zeros and skip the quantize math.
      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
      index += 16;
      continue;
    }
    calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
    calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);

    // Reinsert signs.
    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);

    // Mask out zbin threshold coeffs.
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_coefficients(qcoeff0, qcoeff_ptr + index);
    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);

    calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
                                          dqcoeff_ptr + index, &log_scale);
    calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
                                          dqcoeff_ptr + index + 8, &log_scale);
    index += 16;
  }
  // non_zero_count: coefficients passing the prescan threshold;
  // non_zero_count_prescan_add_zero: coefficients passing only zbin.
  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
  if (is_found1)
    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);

  // Zero out coefficients that cleared zbin but not the prescan threshold.
  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  // Standard eob search over the surviving coefficients, in scan order.
  for (int i = non_zero_count - 1; i >= 0; i--) {
    const int rc = scan[i];
    if (qcoeff_ptr[rc]) {
      eob = i;
      break;
    }
  }

  *eob_ptr = eob + 1;
#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment the following loop with intrinsic by combining
  // with the quantization loop above
  // Find the first nonzero quantized coefficient in scan order.
  for (int i = 0; i < non_zero_count; i++) {
    const int rc = scan[i];
    const int qcoeff = qcoeff_ptr[rc];
    if (qcoeff) {
      first = i;
      break;
    }
  }
  // If exactly one coefficient survived and it quantized to +/-1, re-test
  // it against a stricter threshold and drop the whole block if it fails.
  // Note the scaled zbins[] (not zbin_ptr[]) are used here.
  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    const int rc = scan[(*eob_ptr - 1)];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}