/* highbd_adaptive_quantize_sse2.c */
1 /* 2 * Copyright (c) 2019, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <emmintrin.h> 13 #include "config/aom_dsp_rtcd.h" 14 15 #include "aom/aom_integer.h" 16 #include "aom_dsp/quantize.h" 17 #include "aom_dsp/x86/quantize_x86.h" 18 19 static inline __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) { 20 a = _mm_xor_si128(a, sign); 21 return _mm_sub_epi64(a, sign); 22 } 23 24 static inline void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y, 25 __m128i *p, const int shift) { 26 __m128i sign = _mm_srai_epi32(*y, 31); 27 __m128i sign_lo = _mm_unpacklo_epi32(sign, sign); 28 __m128i sign_hi = _mm_unpackhi_epi32(sign, sign); 29 __m128i abs_y = invert_sign_32_sse2(*y, sign); 30 __m128i prod_lo = _mm_mul_epu32(*x, abs_y); 31 __m128i prod_hi = _mm_srli_epi64(*x, 32); 32 const __m128i mult_hi = _mm_srli_epi64(abs_y, 32); 33 prod_hi = _mm_mul_epu32(prod_hi, mult_hi); 34 prod_lo = highbd_invert_sign_64bit_sse2(prod_lo, sign_lo); 35 prod_hi = highbd_invert_sign_64bit_sse2(prod_hi, sign_hi); 36 37 prod_lo = _mm_srli_epi64(prod_lo, shift); 38 const __m128i mask = _mm_set_epi32(0, -1, 0, -1); 39 prod_lo = _mm_and_si128(prod_lo, mask); 40 prod_hi = _mm_srli_epi64(prod_hi, shift); 41 42 prod_hi = _mm_slli_epi64(prod_hi, 32); 43 *p = _mm_or_si128(prod_lo, prod_hi); 44 } 45 46 static inline void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round, 47 const __m128i *quant, 48 const __m128i *shift, 49 const int *log_scale) { 50 __m128i tmp, qcoeff; 51 qcoeff = 
_mm_add_epi32(*coeff, *round); 52 highbd_mul_shift_sse2(&qcoeff, quant, &tmp, 16); 53 qcoeff = _mm_add_epi32(tmp, qcoeff); 54 highbd_mul_shift_sse2(&qcoeff, shift, coeff, 16 - *log_scale); 55 } 56 57 static inline void highbd_update_mask1(__m128i *cmp_mask0, 58 const int16_t *iscan_ptr, int *is_found, 59 __m128i *mask) { 60 __m128i temp_mask = _mm_setzero_si128(); 61 if (_mm_movemask_epi8(*cmp_mask0)) { 62 __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr)); 63 __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0); 64 temp_mask = mask0; 65 *is_found = 1; 66 } 67 *mask = _mm_max_epi16(temp_mask, *mask); 68 } 69 70 static inline void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, 71 __m128i *threshold, 72 const int16_t *iscan_ptr, int *is_found, 73 __m128i *mask) { 74 __m128i coeff[2], cmp_mask0, cmp_mask1; 75 76 coeff[0] = _mm_slli_epi32(*qcoeff0, AOM_QM_BITS); 77 cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]); 78 coeff[1] = _mm_slli_epi32(*qcoeff1, AOM_QM_BITS); 79 cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]); 80 81 cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1); 82 83 highbd_update_mask1(&cmp_mask0, iscan_ptr, is_found, mask); 84 } 85 86 static inline __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant, 87 const int log_scale) { 88 __m128i coeff_sign = _mm_srai_epi32(qcoeff, 31); 89 __m128i abs_coeff = invert_sign_32_sse2(qcoeff, coeff_sign); 90 highbd_mul_shift_sse2(&abs_coeff, &dequant, &abs_coeff, log_scale); 91 return invert_sign_32_sse2(abs_coeff, coeff_sign); 92 } 93 94 void aom_highbd_quantize_b_adaptive_sse2( 95 const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, 96 const int16_t *round_ptr, const int16_t *quant_ptr, 97 const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 98 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, 99 const int16_t *scan, const int16_t *iscan) { 100 int index = 8; 101 const int log_scale = 0; 102 int non_zero_count = 0; 103 int 
non_zero_count_prescan_add_zero = 0; 104 int is_found0 = 0, is_found1 = 0; 105 int eob = -1; 106 const __m128i zero = _mm_setzero_si128(); 107 const __m128i one = _mm_set1_epi32(1); 108 __m128i zbin, round, quant, dequant, shift; 109 __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; 110 __m128i qcoeff0, qcoeff1; 111 __m128i cmp_mask0, cmp_mask1, cmp_mask; 112 __m128i all_zero; 113 __m128i mask0 = zero, mask1 = zero; 114 115 int prescan_add[2]; 116 int thresh[4]; 117 const qm_val_t wt = (1 << AOM_QM_BITS); 118 for (int i = 0; i < 2; ++i) { 119 prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); 120 thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; 121 } 122 thresh[2] = thresh[3] = thresh[1]; 123 __m128i threshold[2]; 124 threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); 125 threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); 126 127 #if SKIP_EOB_FACTOR_ADJUST 128 int first = -1; 129 #endif 130 // Setup global values. 131 zbin = _mm_load_si128((const __m128i *)zbin_ptr); 132 round = _mm_load_si128((const __m128i *)round_ptr); 133 quant = _mm_load_si128((const __m128i *)quant_ptr); 134 dequant = _mm_load_si128((const __m128i *)dequant_ptr); 135 shift = _mm_load_si128((const __m128i *)quant_shift_ptr); 136 137 __m128i zbin_sign = _mm_srai_epi16(zbin, 15); 138 __m128i round_sign = _mm_srai_epi16(round, 15); 139 __m128i quant_sign = _mm_srai_epi16(quant, 15); 140 __m128i dequant_sign = _mm_srai_epi16(dequant, 15); 141 __m128i shift_sign = _mm_srai_epi16(shift, 15); 142 143 zbin = _mm_unpacklo_epi16(zbin, zbin_sign); 144 round = _mm_unpacklo_epi16(round, round_sign); 145 quant = _mm_unpacklo_epi16(quant, quant_sign); 146 dequant = _mm_unpacklo_epi16(dequant, dequant_sign); 147 shift = _mm_unpacklo_epi16(shift, shift_sign); 148 zbin = _mm_sub_epi32(zbin, one); 149 150 // Do DC and first 15 AC. 
151 coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); 152 coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); 153 154 coeff0_sign = _mm_srai_epi32(coeff0, 31); 155 coeff1_sign = _mm_srai_epi32(coeff1, 31); 156 qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); 157 qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); 158 159 highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); 160 161 cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); 162 zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC 163 cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); 164 cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); 165 highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); 166 167 threshold[0] = threshold[1]; 168 all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); 169 if (_mm_movemask_epi8(all_zero) == 0) { 170 _mm_store_si128((__m128i *)(qcoeff_ptr), zero); 171 _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); 172 _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); 173 _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); 174 175 round = _mm_unpackhi_epi64(round, round); 176 quant = _mm_unpackhi_epi64(quant, quant); 177 shift = _mm_unpackhi_epi64(shift, shift); 178 dequant = _mm_unpackhi_epi64(dequant, dequant); 179 } else { 180 highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); 181 182 round = _mm_unpackhi_epi64(round, round); 183 quant = _mm_unpackhi_epi64(quant, quant); 184 shift = _mm_unpackhi_epi64(shift, shift); 185 highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); 186 187 // Reinsert signs 188 qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); 189 qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); 190 191 // Mask out zbin threshold coeffs 192 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 193 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 194 195 _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); 196 _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); 197 198 coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, 
log_scale); 199 dequant = _mm_unpackhi_epi64(dequant, dequant); 200 coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); 201 _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); 202 _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); 203 } 204 205 // AC only loop. 206 while (index < n_coeffs) { 207 coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); 208 coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); 209 210 coeff0_sign = _mm_srai_epi32(coeff0, 31); 211 coeff1_sign = _mm_srai_epi32(coeff1, 31); 212 qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); 213 qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); 214 215 highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, 216 &is_found0, &mask0); 217 218 cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); 219 cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); 220 cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); 221 highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); 222 223 all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); 224 if (_mm_movemask_epi8(all_zero) == 0) { 225 _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); 226 _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); 227 _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); 228 _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); 229 index += 8; 230 continue; 231 } 232 highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); 233 highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); 234 235 qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); 236 qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); 237 238 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 239 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 240 241 _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); 242 _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); 243 244 coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); 245 coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); 246 
247 _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); 248 _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); 249 250 index += 8; 251 } 252 if (is_found0) non_zero_count = calculate_non_zero_count(mask0); 253 if (is_found1) 254 non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); 255 256 for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { 257 const int rc = scan[i]; 258 qcoeff_ptr[rc] = 0; 259 dqcoeff_ptr[rc] = 0; 260 } 261 262 for (int i = non_zero_count - 1; i >= 0; i--) { 263 const int rc = scan[i]; 264 if (qcoeff_ptr[rc]) { 265 eob = i; 266 break; 267 } 268 } 269 270 *eob_ptr = eob + 1; 271 #if SKIP_EOB_FACTOR_ADJUST 272 // TODO(Aniket): Experiment the following loop with intrinsic by combining 273 // with the quantization loop above 274 for (int i = 0; i < non_zero_count; i++) { 275 const int rc = scan[i]; 276 const int qcoeff = qcoeff_ptr[rc]; 277 if (qcoeff) { 278 first = i; 279 break; 280 } 281 } 282 if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { 283 const int rc = scan[(*eob_ptr - 1)]; 284 if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { 285 const int coeff = coeff_ptr[rc] * wt; 286 const int coeff_sign = AOMSIGN(coeff); 287 const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; 288 const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; 289 const int prescan_add_val = 290 ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); 291 if (abs_coeff < 292 (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { 293 qcoeff_ptr[rc] = 0; 294 dqcoeff_ptr[rc] = 0; 295 *eob_ptr = 0; 296 } 297 } 298 } 299 #endif 300 } 301 302 void aom_highbd_quantize_b_32x32_adaptive_sse2( 303 const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, 304 const int16_t *round_ptr, const int16_t *quant_ptr, 305 const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 306 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, 307 const int16_t *scan, const int16_t *iscan) { 308 int 
index = 8; 309 const int log_scale = 1; 310 int non_zero_count = 0; 311 int non_zero_count_prescan_add_zero = 0; 312 int is_found0 = 0, is_found1 = 0; 313 int eob = -1; 314 const __m128i zero = _mm_setzero_si128(); 315 const __m128i one = _mm_set1_epi32(1); 316 const __m128i log_scale_vec = _mm_set1_epi32(log_scale); 317 __m128i zbin, round, quant, dequant, shift; 318 __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; 319 __m128i qcoeff0, qcoeff1; 320 __m128i cmp_mask0, cmp_mask1, cmp_mask; 321 __m128i all_zero; 322 __m128i mask0 = zero, mask1 = zero; 323 324 const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), 325 ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; 326 int prescan_add[2]; 327 int thresh[4]; 328 const qm_val_t wt = (1 << AOM_QM_BITS); 329 for (int i = 0; i < 2; ++i) { 330 prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); 331 thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; 332 } 333 thresh[2] = thresh[3] = thresh[1]; 334 __m128i threshold[2]; 335 threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); 336 threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); 337 338 #if SKIP_EOB_FACTOR_ADJUST 339 int first = -1; 340 #endif 341 // Setup global values. 
342 zbin = _mm_load_si128((const __m128i *)zbin_ptr); 343 round = _mm_load_si128((const __m128i *)round_ptr); 344 quant = _mm_load_si128((const __m128i *)quant_ptr); 345 dequant = _mm_load_si128((const __m128i *)dequant_ptr); 346 shift = _mm_load_si128((const __m128i *)quant_shift_ptr); 347 348 __m128i zbin_sign = _mm_srai_epi16(zbin, 15); 349 __m128i round_sign = _mm_srai_epi16(round, 15); 350 __m128i quant_sign = _mm_srai_epi16(quant, 15); 351 __m128i dequant_sign = _mm_srai_epi16(dequant, 15); 352 __m128i shift_sign = _mm_srai_epi16(shift, 15); 353 354 zbin = _mm_unpacklo_epi16(zbin, zbin_sign); 355 round = _mm_unpacklo_epi16(round, round_sign); 356 quant = _mm_unpacklo_epi16(quant, quant_sign); 357 dequant = _mm_unpacklo_epi16(dequant, dequant_sign); 358 shift = _mm_unpacklo_epi16(shift, shift_sign); 359 360 // Shift with rounding. 361 zbin = _mm_add_epi32(zbin, log_scale_vec); 362 round = _mm_add_epi32(round, log_scale_vec); 363 zbin = _mm_srli_epi32(zbin, log_scale); 364 round = _mm_srli_epi32(round, log_scale); 365 zbin = _mm_sub_epi32(zbin, one); 366 367 // Do DC and first 15 AC. 
368 coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); 369 coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); 370 371 coeff0_sign = _mm_srai_epi32(coeff0, 31); 372 coeff1_sign = _mm_srai_epi32(coeff1, 31); 373 qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); 374 qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); 375 376 highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); 377 378 cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); 379 zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC 380 cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); 381 cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); 382 highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); 383 384 threshold[0] = threshold[1]; 385 all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); 386 if (_mm_movemask_epi8(all_zero) == 0) { 387 _mm_store_si128((__m128i *)(qcoeff_ptr), zero); 388 _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); 389 _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); 390 _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); 391 392 round = _mm_unpackhi_epi64(round, round); 393 quant = _mm_unpackhi_epi64(quant, quant); 394 shift = _mm_unpackhi_epi64(shift, shift); 395 dequant = _mm_unpackhi_epi64(dequant, dequant); 396 } else { 397 highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); 398 399 round = _mm_unpackhi_epi64(round, round); 400 quant = _mm_unpackhi_epi64(quant, quant); 401 shift = _mm_unpackhi_epi64(shift, shift); 402 highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); 403 404 // Reinsert signs 405 qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); 406 qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); 407 408 // Mask out zbin threshold coeffs 409 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 410 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 411 412 _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); 413 _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); 414 415 coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, 
log_scale); 416 dequant = _mm_unpackhi_epi64(dequant, dequant); 417 coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); 418 _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); 419 _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); 420 } 421 422 // AC only loop. 423 while (index < n_coeffs) { 424 coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); 425 coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); 426 427 coeff0_sign = _mm_srai_epi32(coeff0, 31); 428 coeff1_sign = _mm_srai_epi32(coeff1, 31); 429 qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); 430 qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); 431 432 highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, 433 &is_found0, &mask0); 434 435 cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); 436 cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); 437 cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); 438 highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); 439 440 all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); 441 if (_mm_movemask_epi8(all_zero) == 0) { 442 _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); 443 _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); 444 _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); 445 _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); 446 index += 8; 447 continue; 448 } 449 highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); 450 highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); 451 452 qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); 453 qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); 454 455 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 456 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 457 458 _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); 459 _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); 460 461 coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); 462 coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); 463 
464 _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); 465 _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); 466 467 index += 8; 468 } 469 if (is_found0) non_zero_count = calculate_non_zero_count(mask0); 470 if (is_found1) 471 non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); 472 473 for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { 474 const int rc = scan[i]; 475 qcoeff_ptr[rc] = 0; 476 dqcoeff_ptr[rc] = 0; 477 } 478 479 for (int i = non_zero_count - 1; i >= 0; i--) { 480 const int rc = scan[i]; 481 if (qcoeff_ptr[rc]) { 482 eob = i; 483 break; 484 } 485 } 486 487 *eob_ptr = eob + 1; 488 #if SKIP_EOB_FACTOR_ADJUST 489 // TODO(Aniket): Experiment the following loop with intrinsic by combining 490 // with the quantization loop above 491 for (int i = 0; i < non_zero_count; i++) { 492 const int rc = scan[i]; 493 const int qcoeff = qcoeff_ptr[rc]; 494 if (qcoeff) { 495 first = i; 496 break; 497 } 498 } 499 if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { 500 const int rc = scan[(*eob_ptr - 1)]; 501 if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { 502 const int coeff = coeff_ptr[rc] * wt; 503 const int coeff_sign = AOMSIGN(coeff); 504 const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; 505 const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; 506 const int prescan_add_val = 507 ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); 508 if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { 509 qcoeff_ptr[rc] = 0; 510 dqcoeff_ptr[rc] = 0; 511 *eob_ptr = 0; 512 } 513 } 514 } 515 #endif 516 } 517 518 void aom_highbd_quantize_b_64x64_adaptive_sse2( 519 const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, 520 const int16_t *round_ptr, const int16_t *quant_ptr, 521 const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 522 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, 523 const int16_t *scan, const int16_t *iscan) { 524 int index = 
8; 525 const int log_scale = 2; 526 int non_zero_count = 0; 527 int non_zero_count_prescan_add_zero = 0; 528 int is_found0 = 0, is_found1 = 0; 529 int eob = -1; 530 const __m128i zero = _mm_setzero_si128(); 531 const __m128i one = _mm_set1_epi32(1); 532 const __m128i log_scale_vec = _mm_set1_epi32(log_scale); 533 __m128i zbin, round, quant, dequant, shift; 534 __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; 535 __m128i qcoeff0, qcoeff1; 536 __m128i cmp_mask0, cmp_mask1, cmp_mask; 537 __m128i all_zero; 538 __m128i mask0 = zero, mask1 = zero; 539 540 const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), 541 ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; 542 int prescan_add[2]; 543 int thresh[4]; 544 const qm_val_t wt = (1 << AOM_QM_BITS); 545 for (int i = 0; i < 2; ++i) { 546 prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); 547 thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; 548 } 549 thresh[2] = thresh[3] = thresh[1]; 550 __m128i threshold[2]; 551 threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); 552 threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); 553 554 #if SKIP_EOB_FACTOR_ADJUST 555 int first = -1; 556 #endif 557 // Setup global values. 
558 zbin = _mm_load_si128((const __m128i *)zbin_ptr); 559 round = _mm_load_si128((const __m128i *)round_ptr); 560 quant = _mm_load_si128((const __m128i *)quant_ptr); 561 dequant = _mm_load_si128((const __m128i *)dequant_ptr); 562 shift = _mm_load_si128((const __m128i *)quant_shift_ptr); 563 564 __m128i zbin_sign = _mm_srai_epi16(zbin, 15); 565 __m128i round_sign = _mm_srai_epi16(round, 15); 566 __m128i quant_sign = _mm_srai_epi16(quant, 15); 567 __m128i dequant_sign = _mm_srai_epi16(dequant, 15); 568 __m128i shift_sign = _mm_srai_epi16(shift, 15); 569 570 zbin = _mm_unpacklo_epi16(zbin, zbin_sign); 571 round = _mm_unpacklo_epi16(round, round_sign); 572 quant = _mm_unpacklo_epi16(quant, quant_sign); 573 dequant = _mm_unpacklo_epi16(dequant, dequant_sign); 574 shift = _mm_unpacklo_epi16(shift, shift_sign); 575 576 // Shift with rounding. 577 zbin = _mm_add_epi32(zbin, log_scale_vec); 578 round = _mm_add_epi32(round, log_scale_vec); 579 zbin = _mm_srli_epi32(zbin, log_scale); 580 round = _mm_srli_epi32(round, log_scale); 581 zbin = _mm_sub_epi32(zbin, one); 582 583 // Do DC and first 15 AC. 
584 coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); 585 coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); 586 587 coeff0_sign = _mm_srai_epi32(coeff0, 31); 588 coeff1_sign = _mm_srai_epi32(coeff1, 31); 589 qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); 590 qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); 591 592 highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); 593 594 cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); 595 zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC 596 cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); 597 cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); 598 highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); 599 600 threshold[0] = threshold[1]; 601 all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); 602 if (_mm_movemask_epi8(all_zero) == 0) { 603 _mm_store_si128((__m128i *)(qcoeff_ptr), zero); 604 _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); 605 _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); 606 _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); 607 608 round = _mm_unpackhi_epi64(round, round); 609 quant = _mm_unpackhi_epi64(quant, quant); 610 shift = _mm_unpackhi_epi64(shift, shift); 611 dequant = _mm_unpackhi_epi64(dequant, dequant); 612 } else { 613 highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); 614 615 round = _mm_unpackhi_epi64(round, round); 616 quant = _mm_unpackhi_epi64(quant, quant); 617 shift = _mm_unpackhi_epi64(shift, shift); 618 highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); 619 620 // Reinsert signs 621 qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); 622 qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); 623 624 // Mask out zbin threshold coeffs 625 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 626 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 627 628 _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); 629 _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); 630 631 coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, 
log_scale); 632 dequant = _mm_unpackhi_epi64(dequant, dequant); 633 coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); 634 _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); 635 _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); 636 } 637 638 // AC only loop. 639 while (index < n_coeffs) { 640 coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); 641 coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); 642 643 coeff0_sign = _mm_srai_epi32(coeff0, 31); 644 coeff1_sign = _mm_srai_epi32(coeff1, 31); 645 qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); 646 qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); 647 648 highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, 649 &is_found0, &mask0); 650 651 cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); 652 cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); 653 cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); 654 highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); 655 656 all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); 657 if (_mm_movemask_epi8(all_zero) == 0) { 658 _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); 659 _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); 660 _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); 661 _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); 662 index += 8; 663 continue; 664 } 665 highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); 666 highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); 667 668 qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); 669 qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); 670 671 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 672 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 673 674 _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); 675 _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); 676 677 coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); 678 coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); 679 
680 _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); 681 _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); 682 683 index += 8; 684 } 685 if (is_found0) non_zero_count = calculate_non_zero_count(mask0); 686 if (is_found1) 687 non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); 688 689 for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { 690 const int rc = scan[i]; 691 qcoeff_ptr[rc] = 0; 692 dqcoeff_ptr[rc] = 0; 693 } 694 695 for (int i = non_zero_count - 1; i >= 0; i--) { 696 const int rc = scan[i]; 697 if (qcoeff_ptr[rc]) { 698 eob = i; 699 break; 700 } 701 } 702 703 *eob_ptr = eob + 1; 704 #if SKIP_EOB_FACTOR_ADJUST 705 // TODO(Aniket): Experiment the following loop with intrinsic by combining 706 // with the quantization loop above 707 for (int i = 0; i < non_zero_count; i++) { 708 const int rc = scan[i]; 709 const int qcoeff = qcoeff_ptr[rc]; 710 if (qcoeff) { 711 first = i; 712 break; 713 } 714 } 715 if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { 716 const int rc = scan[(*eob_ptr - 1)]; 717 if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { 718 const int coeff = coeff_ptr[rc] * wt; 719 const int coeff_sign = AOMSIGN(coeff); 720 const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; 721 const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; 722 const int prescan_add_val = 723 ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); 724 if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { 725 qcoeff_ptr[rc] = 0; 726 dqcoeff_ptr[rc] = 0; 727 *eob_ptr = 0; 728 } 729 } 730 } 731 #endif 732 }