tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_adaptive_quantize_sse2.c (27745B)


      1 /*
      2 * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <emmintrin.h>
     13 #include "config/aom_dsp_rtcd.h"
     14 
     15 #include "aom/aom_integer.h"
     16 #include "aom_dsp/quantize.h"
     17 #include "aom_dsp/x86/quantize_x86.h"
     18 
     19 static inline __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) {
     20  a = _mm_xor_si128(a, sign);
     21  return _mm_sub_epi64(a, sign);
     22 }
     23 
     24 static inline void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y,
     25                                         __m128i *p, const int shift) {
     26  __m128i sign = _mm_srai_epi32(*y, 31);
     27  __m128i sign_lo = _mm_unpacklo_epi32(sign, sign);
     28  __m128i sign_hi = _mm_unpackhi_epi32(sign, sign);
     29  __m128i abs_y = invert_sign_32_sse2(*y, sign);
     30  __m128i prod_lo = _mm_mul_epu32(*x, abs_y);
     31  __m128i prod_hi = _mm_srli_epi64(*x, 32);
     32  const __m128i mult_hi = _mm_srli_epi64(abs_y, 32);
     33  prod_hi = _mm_mul_epu32(prod_hi, mult_hi);
     34  prod_lo = highbd_invert_sign_64bit_sse2(prod_lo, sign_lo);
     35  prod_hi = highbd_invert_sign_64bit_sse2(prod_hi, sign_hi);
     36 
     37  prod_lo = _mm_srli_epi64(prod_lo, shift);
     38  const __m128i mask = _mm_set_epi32(0, -1, 0, -1);
     39  prod_lo = _mm_and_si128(prod_lo, mask);
     40  prod_hi = _mm_srli_epi64(prod_hi, shift);
     41 
     42  prod_hi = _mm_slli_epi64(prod_hi, 32);
     43  *p = _mm_or_si128(prod_lo, prod_hi);
     44 }
     45 
     46 static inline void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round,
     47                                           const __m128i *quant,
     48                                           const __m128i *shift,
     49                                           const int *log_scale) {
     50  __m128i tmp, qcoeff;
     51  qcoeff = _mm_add_epi32(*coeff, *round);
     52  highbd_mul_shift_sse2(&qcoeff, quant, &tmp, 16);
     53  qcoeff = _mm_add_epi32(tmp, qcoeff);
     54  highbd_mul_shift_sse2(&qcoeff, shift, coeff, 16 - *log_scale);
     55 }
     56 
     57 static inline void highbd_update_mask1(__m128i *cmp_mask0,
     58                                       const int16_t *iscan_ptr, int *is_found,
     59                                       __m128i *mask) {
     60  __m128i temp_mask = _mm_setzero_si128();
     61  if (_mm_movemask_epi8(*cmp_mask0)) {
     62    __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
     63    __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0);
     64    temp_mask = mask0;
     65    *is_found = 1;
     66  }
     67  *mask = _mm_max_epi16(temp_mask, *mask);
     68 }
     69 
     70 static inline void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1,
     71                                       __m128i *threshold,
     72                                       const int16_t *iscan_ptr, int *is_found,
     73                                       __m128i *mask) {
     74  __m128i coeff[2], cmp_mask0, cmp_mask1;
     75 
     76  coeff[0] = _mm_slli_epi32(*qcoeff0, AOM_QM_BITS);
     77  cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]);
     78  coeff[1] = _mm_slli_epi32(*qcoeff1, AOM_QM_BITS);
     79  cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]);
     80 
     81  cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1);
     82 
     83  highbd_update_mask1(&cmp_mask0, iscan_ptr, is_found, mask);
     84 }
     85 
     86 static inline __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant,
     87                                               const int log_scale) {
     88  __m128i coeff_sign = _mm_srai_epi32(qcoeff, 31);
     89  __m128i abs_coeff = invert_sign_32_sse2(qcoeff, coeff_sign);
     90  highbd_mul_shift_sse2(&abs_coeff, &dequant, &abs_coeff, log_scale);
     91  return invert_sign_32_sse2(abs_coeff, coeff_sign);
     92 }
     93 
     94 void aom_highbd_quantize_b_adaptive_sse2(
     95    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     96    const int16_t *round_ptr, const int16_t *quant_ptr,
     97    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     98    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     99    const int16_t *scan, const int16_t *iscan) {
    100  int index = 8;
    101  const int log_scale = 0;
    102  int non_zero_count = 0;
    103  int non_zero_count_prescan_add_zero = 0;
    104  int is_found0 = 0, is_found1 = 0;
    105  int eob = -1;
    106  const __m128i zero = _mm_setzero_si128();
    107  const __m128i one = _mm_set1_epi32(1);
    108  __m128i zbin, round, quant, dequant, shift;
    109  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
    110  __m128i qcoeff0, qcoeff1;
    111  __m128i cmp_mask0, cmp_mask1, cmp_mask;
    112  __m128i all_zero;
    113  __m128i mask0 = zero, mask1 = zero;
    114 
    115  int prescan_add[2];
    116  int thresh[4];
    117  const qm_val_t wt = (1 << AOM_QM_BITS);
    118  for (int i = 0; i < 2; ++i) {
    119    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    120    thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
    121  }
    122  thresh[2] = thresh[3] = thresh[1];
    123  __m128i threshold[2];
    124  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
    125  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
    126 
    127 #if SKIP_EOB_FACTOR_ADJUST
    128  int first = -1;
    129 #endif
    130  // Setup global values.
    131  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
    132  round = _mm_load_si128((const __m128i *)round_ptr);
    133  quant = _mm_load_si128((const __m128i *)quant_ptr);
    134  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
    135  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
    136 
    137  __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
    138  __m128i round_sign = _mm_srai_epi16(round, 15);
    139  __m128i quant_sign = _mm_srai_epi16(quant, 15);
    140  __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
    141  __m128i shift_sign = _mm_srai_epi16(shift, 15);
    142 
    143  zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
    144  round = _mm_unpacklo_epi16(round, round_sign);
    145  quant = _mm_unpacklo_epi16(quant, quant_sign);
    146  dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
    147  shift = _mm_unpacklo_epi16(shift, shift_sign);
    148  zbin = _mm_sub_epi32(zbin, one);
    149 
    150  // Do DC and first 15 AC.
    151  coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
    152  coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
    153 
    154  coeff0_sign = _mm_srai_epi32(coeff0, 31);
    155  coeff1_sign = _mm_srai_epi32(coeff1, 31);
    156  qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
    157  qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
    158 
    159  highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
    160 
    161  cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
    162  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
    163  cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
    164  cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
    165  highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
    166 
    167  threshold[0] = threshold[1];
    168  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    169  if (_mm_movemask_epi8(all_zero) == 0) {
    170    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
    171    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
    172    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
    173    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
    174 
    175    round = _mm_unpackhi_epi64(round, round);
    176    quant = _mm_unpackhi_epi64(quant, quant);
    177    shift = _mm_unpackhi_epi64(shift, shift);
    178    dequant = _mm_unpackhi_epi64(dequant, dequant);
    179  } else {
    180    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
    181 
    182    round = _mm_unpackhi_epi64(round, round);
    183    quant = _mm_unpackhi_epi64(quant, quant);
    184    shift = _mm_unpackhi_epi64(shift, shift);
    185    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
    186 
    187    // Reinsert signs
    188    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    189    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
    190 
    191    // Mask out zbin threshold coeffs
    192    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    193    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
    194 
    195    _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
    196    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
    197 
    198    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    199    dequant = _mm_unpackhi_epi64(dequant, dequant);
    200    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
    201    _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
    202    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
    203  }
    204 
    205  // AC only loop.
    206  while (index < n_coeffs) {
    207    coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
    208    coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
    209 
    210    coeff0_sign = _mm_srai_epi32(coeff0, 31);
    211    coeff1_sign = _mm_srai_epi32(coeff1, 31);
    212    qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
    213    qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
    214 
    215    highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
    216                        &is_found0, &mask0);
    217 
    218    cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
    219    cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
    220    cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
    221    highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
    222 
    223    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    224    if (_mm_movemask_epi8(all_zero) == 0) {
    225      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
    226      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
    227      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
    228      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
    229      index += 8;
    230      continue;
    231    }
    232    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
    233    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
    234 
    235    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    236    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
    237 
    238    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    239    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
    240 
    241    _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
    242    _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
    243 
    244    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    245    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
    246 
    247    _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
    248    _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
    249 
    250    index += 8;
    251  }
    252  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
    253  if (is_found1)
    254    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
    255 
    256  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    257    const int rc = scan[i];
    258    qcoeff_ptr[rc] = 0;
    259    dqcoeff_ptr[rc] = 0;
    260  }
    261 
    262  for (int i = non_zero_count - 1; i >= 0; i--) {
    263    const int rc = scan[i];
    264    if (qcoeff_ptr[rc]) {
    265      eob = i;
    266      break;
    267    }
    268  }
    269 
    270  *eob_ptr = eob + 1;
    271 #if SKIP_EOB_FACTOR_ADJUST
    272  // TODO(Aniket): Experiment the following loop with intrinsic by combining
    273  // with the quantization loop above
    274  for (int i = 0; i < non_zero_count; i++) {
    275    const int rc = scan[i];
    276    const int qcoeff = qcoeff_ptr[rc];
    277    if (qcoeff) {
    278      first = i;
    279      break;
    280    }
    281  }
    282  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    283    const int rc = scan[(*eob_ptr - 1)];
    284    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
    285      const int coeff = coeff_ptr[rc] * wt;
    286      const int coeff_sign = AOMSIGN(coeff);
    287      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
    288      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
    289      const int prescan_add_val =
    290          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
    291      if (abs_coeff <
    292          (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
    293        qcoeff_ptr[rc] = 0;
    294        dqcoeff_ptr[rc] = 0;
    295        *eob_ptr = 0;
    296      }
    297    }
    298  }
    299 #endif
    300 }
    301 
    302 void aom_highbd_quantize_b_32x32_adaptive_sse2(
    303    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    304    const int16_t *round_ptr, const int16_t *quant_ptr,
    305    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    306    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    307    const int16_t *scan, const int16_t *iscan) {
    308  int index = 8;
    309  const int log_scale = 1;
    310  int non_zero_count = 0;
    311  int non_zero_count_prescan_add_zero = 0;
    312  int is_found0 = 0, is_found1 = 0;
    313  int eob = -1;
    314  const __m128i zero = _mm_setzero_si128();
    315  const __m128i one = _mm_set1_epi32(1);
    316  const __m128i log_scale_vec = _mm_set1_epi32(log_scale);
    317  __m128i zbin, round, quant, dequant, shift;
    318  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
    319  __m128i qcoeff0, qcoeff1;
    320  __m128i cmp_mask0, cmp_mask1, cmp_mask;
    321  __m128i all_zero;
    322  __m128i mask0 = zero, mask1 = zero;
    323 
    324  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
    325                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
    326  int prescan_add[2];
    327  int thresh[4];
    328  const qm_val_t wt = (1 << AOM_QM_BITS);
    329  for (int i = 0; i < 2; ++i) {
    330    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    331    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
    332  }
    333  thresh[2] = thresh[3] = thresh[1];
    334  __m128i threshold[2];
    335  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
    336  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
    337 
    338 #if SKIP_EOB_FACTOR_ADJUST
    339  int first = -1;
    340 #endif
    341  // Setup global values.
    342  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
    343  round = _mm_load_si128((const __m128i *)round_ptr);
    344  quant = _mm_load_si128((const __m128i *)quant_ptr);
    345  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
    346  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
    347 
    348  __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
    349  __m128i round_sign = _mm_srai_epi16(round, 15);
    350  __m128i quant_sign = _mm_srai_epi16(quant, 15);
    351  __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
    352  __m128i shift_sign = _mm_srai_epi16(shift, 15);
    353 
    354  zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
    355  round = _mm_unpacklo_epi16(round, round_sign);
    356  quant = _mm_unpacklo_epi16(quant, quant_sign);
    357  dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
    358  shift = _mm_unpacklo_epi16(shift, shift_sign);
    359 
    360  // Shift with rounding.
    361  zbin = _mm_add_epi32(zbin, log_scale_vec);
    362  round = _mm_add_epi32(round, log_scale_vec);
    363  zbin = _mm_srli_epi32(zbin, log_scale);
    364  round = _mm_srli_epi32(round, log_scale);
    365  zbin = _mm_sub_epi32(zbin, one);
    366 
    367  // Do DC and first 15 AC.
    368  coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
    369  coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
    370 
    371  coeff0_sign = _mm_srai_epi32(coeff0, 31);
    372  coeff1_sign = _mm_srai_epi32(coeff1, 31);
    373  qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
    374  qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
    375 
    376  highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
    377 
    378  cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
    379  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
    380  cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
    381  cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
    382  highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
    383 
    384  threshold[0] = threshold[1];
    385  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    386  if (_mm_movemask_epi8(all_zero) == 0) {
    387    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
    388    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
    389    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
    390    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
    391 
    392    round = _mm_unpackhi_epi64(round, round);
    393    quant = _mm_unpackhi_epi64(quant, quant);
    394    shift = _mm_unpackhi_epi64(shift, shift);
    395    dequant = _mm_unpackhi_epi64(dequant, dequant);
    396  } else {
    397    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
    398 
    399    round = _mm_unpackhi_epi64(round, round);
    400    quant = _mm_unpackhi_epi64(quant, quant);
    401    shift = _mm_unpackhi_epi64(shift, shift);
    402    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
    403 
    404    // Reinsert signs
    405    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    406    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
    407 
    408    // Mask out zbin threshold coeffs
    409    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    410    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
    411 
    412    _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
    413    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
    414 
    415    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    416    dequant = _mm_unpackhi_epi64(dequant, dequant);
    417    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
    418    _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
    419    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
    420  }
    421 
    422  // AC only loop.
    423  while (index < n_coeffs) {
    424    coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
    425    coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
    426 
    427    coeff0_sign = _mm_srai_epi32(coeff0, 31);
    428    coeff1_sign = _mm_srai_epi32(coeff1, 31);
    429    qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
    430    qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
    431 
    432    highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
    433                        &is_found0, &mask0);
    434 
    435    cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
    436    cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
    437    cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
    438    highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
    439 
    440    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    441    if (_mm_movemask_epi8(all_zero) == 0) {
    442      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
    443      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
    444      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
    445      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
    446      index += 8;
    447      continue;
    448    }
    449    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
    450    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
    451 
    452    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    453    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
    454 
    455    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    456    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
    457 
    458    _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
    459    _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
    460 
    461    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    462    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
    463 
    464    _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
    465    _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
    466 
    467    index += 8;
    468  }
    469  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
    470  if (is_found1)
    471    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
    472 
    473  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    474    const int rc = scan[i];
    475    qcoeff_ptr[rc] = 0;
    476    dqcoeff_ptr[rc] = 0;
    477  }
    478 
    479  for (int i = non_zero_count - 1; i >= 0; i--) {
    480    const int rc = scan[i];
    481    if (qcoeff_ptr[rc]) {
    482      eob = i;
    483      break;
    484    }
    485  }
    486 
    487  *eob_ptr = eob + 1;
    488 #if SKIP_EOB_FACTOR_ADJUST
    489  // TODO(Aniket): Experiment the following loop with intrinsic by combining
    490  // with the quantization loop above
    491  for (int i = 0; i < non_zero_count; i++) {
    492    const int rc = scan[i];
    493    const int qcoeff = qcoeff_ptr[rc];
    494    if (qcoeff) {
    495      first = i;
    496      break;
    497    }
    498  }
    499  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    500    const int rc = scan[(*eob_ptr - 1)];
    501    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
    502      const int coeff = coeff_ptr[rc] * wt;
    503      const int coeff_sign = AOMSIGN(coeff);
    504      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
    505      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
    506      const int prescan_add_val =
    507          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
    508      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
    509        qcoeff_ptr[rc] = 0;
    510        dqcoeff_ptr[rc] = 0;
    511        *eob_ptr = 0;
    512      }
    513    }
    514  }
    515 #endif
    516 }
    517 
    518 void aom_highbd_quantize_b_64x64_adaptive_sse2(
    519    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    520    const int16_t *round_ptr, const int16_t *quant_ptr,
    521    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    522    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    523    const int16_t *scan, const int16_t *iscan) {
    524  int index = 8;
    525  const int log_scale = 2;
    526  int non_zero_count = 0;
    527  int non_zero_count_prescan_add_zero = 0;
    528  int is_found0 = 0, is_found1 = 0;
    529  int eob = -1;
    530  const __m128i zero = _mm_setzero_si128();
    531  const __m128i one = _mm_set1_epi32(1);
    532  const __m128i log_scale_vec = _mm_set1_epi32(log_scale);
    533  __m128i zbin, round, quant, dequant, shift;
    534  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
    535  __m128i qcoeff0, qcoeff1;
    536  __m128i cmp_mask0, cmp_mask1, cmp_mask;
    537  __m128i all_zero;
    538  __m128i mask0 = zero, mask1 = zero;
    539 
    540  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
    541                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
    542  int prescan_add[2];
    543  int thresh[4];
    544  const qm_val_t wt = (1 << AOM_QM_BITS);
    545  for (int i = 0; i < 2; ++i) {
    546    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    547    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
    548  }
    549  thresh[2] = thresh[3] = thresh[1];
    550  __m128i threshold[2];
    551  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
    552  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
    553 
    554 #if SKIP_EOB_FACTOR_ADJUST
    555  int first = -1;
    556 #endif
    557  // Setup global values.
    558  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
    559  round = _mm_load_si128((const __m128i *)round_ptr);
    560  quant = _mm_load_si128((const __m128i *)quant_ptr);
    561  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
    562  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
    563 
    564  __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
    565  __m128i round_sign = _mm_srai_epi16(round, 15);
    566  __m128i quant_sign = _mm_srai_epi16(quant, 15);
    567  __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
    568  __m128i shift_sign = _mm_srai_epi16(shift, 15);
    569 
    570  zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
    571  round = _mm_unpacklo_epi16(round, round_sign);
    572  quant = _mm_unpacklo_epi16(quant, quant_sign);
    573  dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
    574  shift = _mm_unpacklo_epi16(shift, shift_sign);
    575 
    576  // Shift with rounding.
    577  zbin = _mm_add_epi32(zbin, log_scale_vec);
    578  round = _mm_add_epi32(round, log_scale_vec);
    579  zbin = _mm_srli_epi32(zbin, log_scale);
    580  round = _mm_srli_epi32(round, log_scale);
    581  zbin = _mm_sub_epi32(zbin, one);
    582 
    583  // Do DC and first 15 AC.
    584  coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
    585  coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
    586 
    587  coeff0_sign = _mm_srai_epi32(coeff0, 31);
    588  coeff1_sign = _mm_srai_epi32(coeff1, 31);
    589  qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
    590  qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
    591 
    592  highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
    593 
    594  cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
    595  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
    596  cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
    597  cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
    598  highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
    599 
    600  threshold[0] = threshold[1];
    601  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    602  if (_mm_movemask_epi8(all_zero) == 0) {
    603    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
    604    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
    605    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
    606    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
    607 
    608    round = _mm_unpackhi_epi64(round, round);
    609    quant = _mm_unpackhi_epi64(quant, quant);
    610    shift = _mm_unpackhi_epi64(shift, shift);
    611    dequant = _mm_unpackhi_epi64(dequant, dequant);
    612  } else {
    613    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
    614 
    615    round = _mm_unpackhi_epi64(round, round);
    616    quant = _mm_unpackhi_epi64(quant, quant);
    617    shift = _mm_unpackhi_epi64(shift, shift);
    618    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
    619 
    620    // Reinsert signs
    621    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    622    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
    623 
    624    // Mask out zbin threshold coeffs
    625    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    626    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
    627 
    628    _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
    629    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
    630 
    631    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    632    dequant = _mm_unpackhi_epi64(dequant, dequant);
    633    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
    634    _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
    635    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
    636  }
    637 
    638  // AC only loop.
    639  while (index < n_coeffs) {
    640    coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
    641    coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
    642 
    643    coeff0_sign = _mm_srai_epi32(coeff0, 31);
    644    coeff1_sign = _mm_srai_epi32(coeff1, 31);
    645    qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
    646    qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
    647 
    648    highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
    649                        &is_found0, &mask0);
    650 
    651    cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
    652    cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
    653    cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
    654    highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
    655 
    656    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    657    if (_mm_movemask_epi8(all_zero) == 0) {
    658      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
    659      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
    660      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
    661      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
    662      index += 8;
    663      continue;
    664    }
    665    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
    666    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
    667 
    668    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    669    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
    670 
    671    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    672    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
    673 
    674    _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
    675    _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
    676 
    677    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    678    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
    679 
    680    _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
    681    _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
    682 
    683    index += 8;
    684  }
    685  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
    686  if (is_found1)
    687    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
    688 
    689  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    690    const int rc = scan[i];
    691    qcoeff_ptr[rc] = 0;
    692    dqcoeff_ptr[rc] = 0;
    693  }
    694 
    695  for (int i = non_zero_count - 1; i >= 0; i--) {
    696    const int rc = scan[i];
    697    if (qcoeff_ptr[rc]) {
    698      eob = i;
    699      break;
    700    }
    701  }
    702 
    703  *eob_ptr = eob + 1;
    704 #if SKIP_EOB_FACTOR_ADJUST
    705  // TODO(Aniket): Experiment the following loop with intrinsic by combining
    706  // with the quantization loop above
    707  for (int i = 0; i < non_zero_count; i++) {
    708    const int rc = scan[i];
    709    const int qcoeff = qcoeff_ptr[rc];
    710    if (qcoeff) {
    711      first = i;
    712      break;
    713    }
    714  }
    715  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    716    const int rc = scan[(*eob_ptr - 1)];
    717    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
    718      const int coeff = coeff_ptr[rc] * wt;
    719      const int coeff_sign = AOMSIGN(coeff);
    720      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
    721      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
    722      const int prescan_add_val =
    723          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
    724      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
    725        qcoeff_ptr[rc] = 0;
    726        dqcoeff_ptr[rc] = 0;
    727        *eob_ptr = 0;
    728      }
    729    }
    730  }
    731 #endif
    732 }