tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

adaptive_quantize_sse2.c (23626B)


      1 /*
      2 * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <emmintrin.h>
     14 #include "config/aom_dsp_rtcd.h"
     15 #include "aom/aom_integer.h"
     16 #include "aom_dsp/quantize.h"
     17 #include "aom_dsp/x86/quantize_x86.h"
     18 
     19 void aom_quantize_b_adaptive_sse2(
     20    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     21    const int16_t *round_ptr, const int16_t *quant_ptr,
     22    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     23    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     24    const int16_t *scan, const int16_t *iscan) {
         // SSE2 adaptive quantizer. Processes coeff_ptr in groups of 16, storing
         // the results of calculate_qcoeff() to qcoeff_ptr, the results of
         // calculate_dqcoeff() to dqcoeff_ptr and the end-of-block count to
         // *eob_ptr.
         //
         // coeff_ptr   - input coefficients.
         // n_coeffs    - coefficient count. The first 16 are always processed and
         //               the loop advances by 16, so this appears to assume a
         //               non-zero multiple of 16 (TODO confirm with callers).
         // zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
         //             - quantizer tables handed to load_b_values(). The scalar
         //               code in this function reads only entries 0 and 1 of
         //               zbin_ptr and dequant_ptr.
         // qcoeff_ptr, dqcoeff_ptr
         //             - outputs. The all-zero paths use _mm_store_si128, so both
         //               must be 16-byte aligned.
         // eob_ptr     - receives eob + 1, which is 0 when no non-zero output is
         //               found.
         // scan        - maps a scan position to a coefficient index.
         // iscan       - forwarded to update_mask0()/update_mask1(); presumably
         //               the inverse of scan (TODO confirm).
         //
         // NOTE(review): load_b_values, load_coefficients, invert_sign_sse2,
         // update_mask0/1, calculate_qcoeff, calculate_dqcoeff, store_coefficients
         // and calculate_non_zero_count are defined outside this view; the
         // comments below describe only how they are used here.
         //
         // The loop starts at 16 because the first 16 coefficients are handled
         // separately before it.
     25  int index = 16;
     26  int non_zero_count = 0;
     27  int non_zero_count_prescan_add_zero = 0;
     28  int is_found0 = 0, is_found1 = 0;
     29  int eob = -1;
     30  const __m128i zero = _mm_setzero_si128();
     31  __m128i zbin, round, quant, dequant, shift;
     32  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
     33  __m128i qcoeff0, qcoeff1;
     34  __m128i cmp_mask0, cmp_mask1;
     35  __m128i all_zero;
     36  __m128i mask0 = zero, mask1 = zero;
     37 
         // Scalar thresholds for table entries 0 and 1:
         //   thresh = zbin * wt + ROUND_POWER_OF_TWO(dequant * EOB_FACTOR, 7) - 1
     38  int prescan_add[2];
     39  int thresh[4];
     40  const qm_val_t wt = (1 << AOM_QM_BITS);
     41  for (int i = 0; i < 2; ++i) {
     42    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
     43    thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
     44  }
         // thresh[] becomes {t0, t1, t1, t1}, loaded as four 32-bit lanes.
         // threshold[1] duplicates the upper 64 bits, i.e. {t1, t1, t1, t1}.
     45  thresh[2] = thresh[3] = thresh[1];
     46  __m128i threshold[2];
     47  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
     48  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
     49 
     50 #if SKIP_EOB_FACTOR_ADJUST
     51  int first = -1;
     52 #endif
     53  // Setup global values.
     54  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
     55                dequant_ptr, &dequant, quant_shift_ptr, &shift);
     56 
     57  // Do DC and first 15 AC.
     58  coeff0 = load_coefficients(coeff_ptr);
     59  coeff1 = load_coefficients(coeff_ptr + 8);
     60 
     61  // Poor man's abs().
     62  coeff0_sign = _mm_srai_epi16(coeff0, 15);
     63  coeff1_sign = _mm_srai_epi16(coeff1, 15);
     64  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
     65  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
     66 
         // NOTE(review): qcoeff0/qcoeff1 are passed by pointer; whether
         // update_mask0() modifies them is not visible from this file.
     67  update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
     68 
     69  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
     70  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
     71  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
     72 
     73  update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
     74 
         // Every later group uses only the entry-1 threshold in both vectors.
     75  threshold[0] = threshold[1];
     76  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
         // No lane of either compare mask is set: write zeros to all 16 outputs.
         // Four 128-bit stores at a stride of 4 elements cover 16 values, which
         // implies tran_low_t is 32 bits wide in this build.
     77  if (_mm_movemask_epi8(all_zero) == 0) {
     78    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
     79    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
     80    _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
     81    _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
     82    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
     83    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
     84    _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
     85    _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
           // Broadcast the upper 64 bits, as the else branch does, so the AC
           // loop sees the same round/quant/shift/dequant either way.
     86    round = _mm_unpackhi_epi64(round, round);
     87    quant = _mm_unpackhi_epi64(quant, quant);
     88    shift = _mm_unpackhi_epi64(shift, shift);
     89    dequant = _mm_unpackhi_epi64(dequant, dequant);
     90  } else {
           // The first 8 values use the as-loaded vectors; the next 8 use the
           // upper halves broadcast across the whole register.
     91    calculate_qcoeff(&qcoeff0, round, quant, shift);
     92 
     93    round = _mm_unpackhi_epi64(round, round);
     94    quant = _mm_unpackhi_epi64(quant, quant);
     95    shift = _mm_unpackhi_epi64(shift, shift);
     96 
     97    calculate_qcoeff(&qcoeff1, round, quant, shift);
     98 
     99    // Reinsert signs
    100    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
    101    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
    102 
    103    // Mask out zbin threshold coeffs
    104    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    105    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
    106 
    107    store_coefficients(qcoeff0, qcoeff_ptr);
    108    store_coefficients(qcoeff1, qcoeff_ptr + 8);
    109 
    110    coeff0 = calculate_dqcoeff(qcoeff0, dequant);
    111    dequant = _mm_unpackhi_epi64(dequant, dequant);
    112    coeff1 = calculate_dqcoeff(qcoeff1, dequant);
    113 
    114    store_coefficients(coeff0, dqcoeff_ptr);
    115    store_coefficients(coeff1, dqcoeff_ptr + 8);
    116  }
    117 
    118  // AC only loop.
         // Same steps as the first group, with zbin/round/quant/shift/dequant
         // already switched to their upper-half values above.
    119  while (index < n_coeffs) {
    120    coeff0 = load_coefficients(coeff_ptr + index);
    121    coeff1 = load_coefficients(coeff_ptr + index + 8);
    122 
    123    coeff0_sign = _mm_srai_epi16(coeff0, 15);
    124    coeff1_sign = _mm_srai_epi16(coeff1, 15);
    125    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
    126    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
    127 
    128    update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
    129                 &mask0);
    130 
    131    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    132    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
    133 
    134    update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
    135 
    136    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
           // Fast path: nothing in this group of 16 passed the compare, so zero
           // the outputs and skip the quantization arithmetic.
    137    if (_mm_movemask_epi8(all_zero) == 0) {
    138      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
    139      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
    140      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
    141      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
    142      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
    143      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
    144      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
    145      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
    146      index += 16;
    147      continue;
    148    }
    149    calculate_qcoeff(&qcoeff0, round, quant, shift);
    150    calculate_qcoeff(&qcoeff1, round, quant, shift);
    151 
           // Reinsert signs, then clear lanes that failed the zbin compare.
    152    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
    153    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
    154 
    155    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    156    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
    157 
    158    store_coefficients(qcoeff0, qcoeff_ptr + index);
    159    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
    160 
    161    coeff0 = calculate_dqcoeff(qcoeff0, dequant);
    162    coeff1 = calculate_dqcoeff(qcoeff1, dequant);
    163 
    164    store_coefficients(coeff0, dqcoeff_ptr + index);
    165    store_coefficients(coeff1, dqcoeff_ptr + index + 8);
    166 
    167    index += 16;
    168  }
         // Each count stays 0 unless the matching update_mask helper set its
         // is_found flag.
    169  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
    170  if (is_found1)
    171    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
    172 
         // Zero the outputs at scan positions
         // [non_zero_count, non_zero_count_prescan_add_zero).
    173  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    174    const int rc = scan[i];
    175    qcoeff_ptr[rc] = 0;
    176    dqcoeff_ptr[rc] = 0;
    177  }
    178 
         // Walk back from non_zero_count - 1 to the last scan position whose
         // qcoeff is non-zero; eob stays -1 if there is none.
    179  for (int i = non_zero_count - 1; i >= 0; i--) {
    180    const int rc = scan[i];
    181    if (qcoeff_ptr[rc]) {
    182      eob = i;
    183      break;
    184    }
    185  }
    186 
    187  *eob_ptr = eob + 1;
    188 #if SKIP_EOB_FACTOR_ADJUST
    189  // TODO(Aniket): Experiment the following loop with intrinsic by combining
    190  // with the quantization loop above
         // first = lowest scan position below non_zero_count with a non-zero
         // qcoeff.
    191  for (int i = 0; i < non_zero_count; i++) {
    192    const int rc = scan[i];
    193    const int qcoeff = qcoeff_ptr[rc];
    194    if (qcoeff) {
    195      first = i;
    196      break;
    197    }
    198  }
         // When the first and last non-zero positions coincide (a single non-zero
         // value among the first non_zero_count scan positions) and its magnitude
         // is 1, re-test the input against a threshold built with
         // EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST. If it falls below, drop it and
         // report an empty block.
    199  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    200    const int rc = scan[(*eob_ptr - 1)];
    201    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
    202      const int coeff = coeff_ptr[rc] * wt;
    203      const int coeff_sign = AOMSIGN(coeff);
             // Branch-free absolute value; assumes AOMSIGN() yields 0 or -1
             // (TODO confirm against its definition).
    204      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
    205      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
             // rc != 0 selects table entry 0 for coefficient index 0 and entry 1
             // for every other index.
    206      const int prescan_add_val =
    207          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
    208      if (abs_coeff <
    209          (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
    210        qcoeff_ptr[rc] = 0;
    211        dqcoeff_ptr[rc] = 0;
    212        *eob_ptr = 0;
    213      }
    214    }
    215  }
    216 #endif
    217 }
    218 
    219 void aom_quantize_b_32x32_adaptive_sse2(
    220    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    221    const int16_t *round_ptr, const int16_t *quant_ptr,
    222    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    223    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    224    const int16_t *scan, const int16_t *iscan) {
         // SSE2 adaptive quantizer, log_scale = 1 variant. Processes coeff_ptr in
         // groups of 16, storing the results of calculate_qcoeff_log_scale() to
         // qcoeff_ptr, letting calculate_dqcoeff_and_store_log_scale() write
         // dqcoeff_ptr, and storing the end-of-block count to *eob_ptr.
         //
         // coeff_ptr   - input coefficients.
         // n_coeffs    - coefficient count. The first 16 are always processed and
         //               the loop advances by 16, so this appears to assume a
         //               non-zero multiple of 16 (TODO confirm with callers).
         // zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
         //             - quantizer tables. Each is read with _mm_load_si128, so
         //               each must be 16-byte aligned with 8 readable int16_t
         //               values. The scalar code reads only entries 0 and 1 of
         //               zbin_ptr and dequant_ptr.
         // qcoeff_ptr, dqcoeff_ptr
         //             - outputs. The all-zero paths use _mm_store_si128, so both
         //               must be 16-byte aligned.
         // eob_ptr     - receives eob + 1, which is 0 when no non-zero output is
         //               found.
         // scan        - maps a scan position to a coefficient index.
         // iscan       - forwarded to update_mask0()/update_mask1(); presumably
         //               the inverse of scan (TODO confirm).
         //
         // NOTE(review): load_coefficients, invert_sign_sse2, update_mask0/1,
         // calculate_qcoeff_log_scale, calculate_dqcoeff_and_store_log_scale,
         // store_coefficients and calculate_non_zero_count are defined outside
         // this view; the comments below describe only how they are used here.
         //
         // The loop starts at 16 because the first 16 coefficients are handled
         // separately before it.
    225  int index = 16;
    226  const int log_scale = 1;
    227  int non_zero_count = 0;
    228  int non_zero_count_prescan_add_zero = 0;
    229  int is_found0 = 0, is_found1 = 0;
    230  int eob = -1;
    231  const __m128i zero = _mm_setzero_si128();
    232  const __m128i one = _mm_set1_epi16(1);
    233  const __m128i log_scale_vec = _mm_set1_epi16(log_scale);
    234  __m128i zbin, round, quant, dequant, shift;
    235  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
    236  __m128i qcoeff0, qcoeff1;
    237  __m128i cmp_mask0, cmp_mask1;
    238  __m128i all_zero;
    239  __m128i mask0 = zero, mask1 = zero;
    240 
         // Scalar copies of zbin entries 0 and 1, scaled down by log_scale.
    241  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
    242                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
         // Scalar thresholds for table entries 0 and 1:
         //   thresh = zbins * wt + ROUND_POWER_OF_TWO(dequant * EOB_FACTOR, 7) - 1
    243  int prescan_add[2];
    244  int thresh[4];
    245  const qm_val_t wt = (1 << AOM_QM_BITS);
    246  for (int i = 0; i < 2; ++i) {
    247    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    248    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
    249  }
         // thresh[] becomes {t0, t1, t1, t1}, loaded as four 32-bit lanes.
         // threshold[1] duplicates the upper 64 bits, i.e. {t1, t1, t1, t1}.
    250  thresh[2] = thresh[3] = thresh[1];
    251  __m128i threshold[2];
    252  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
    253  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
    254 
    255 #if SKIP_EOB_FACTOR_ADJUST
    256  int first = -1;
    257 #endif
    258  // Setup global values.
    259  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
    260  round = _mm_load_si128((const __m128i *)round_ptr);
    261  quant = _mm_load_si128((const __m128i *)quant_ptr);
    262  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
    263  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
    264 
    265  // Shift with rounding.
         // Adding log_scale (1) equals the rounding bias 1 << (log_scale - 1)
         // for this variant.
    266  zbin = _mm_add_epi16(zbin, log_scale_vec);
    267  round = _mm_add_epi16(round, log_scale_vec);
    268  zbin = _mm_srli_epi16(zbin, log_scale);
    269  round = _mm_srli_epi16(round, log_scale);
         // Subtract 1 so the strict > of _mm_cmpgt_epi16 below acts as >= zbin.
    270  zbin = _mm_sub_epi16(zbin, one);
    271 
    272  // Do DC and first 15 AC.
    273  coeff0 = load_coefficients(coeff_ptr);
    274  coeff1 = load_coefficients(coeff_ptr + 8);
    275 
         // Sign masks from an arithmetic shift by 15, handed to
         // invert_sign_sse2() here and again after quantization.
    276  coeff0_sign = _mm_srai_epi16(coeff0, 15);
    277  coeff1_sign = _mm_srai_epi16(coeff1, 15);
    278  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
    279  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
    280 
         // NOTE(review): qcoeff0/qcoeff1 are passed by pointer; whether
         // update_mask0() modifies them is not visible from this file.
    281  update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
    282 
    283  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    284  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
    285  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
    286 
    287  update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
    288 
         // Every later group uses only the entry-1 threshold in both vectors.
    289  threshold[0] = threshold[1];
    290  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
         // No lane of either compare mask is set: write zeros to all 16 outputs.
         // Four 128-bit stores at a stride of 4 elements cover 16 values, which
         // implies tran_low_t is 32 bits wide in this build.
    291  if (_mm_movemask_epi8(all_zero) == 0) {
    292    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
    293    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
    294    _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
    295    _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
    296    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
    297    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
    298    _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
    299    _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
           // Broadcast the upper 64 bits, as the else branch does, so the AC
           // loop sees the same round/quant/shift/dequant either way.
    300    round = _mm_unpackhi_epi64(round, round);
    301    quant = _mm_unpackhi_epi64(quant, quant);
    302    shift = _mm_unpackhi_epi64(shift, shift);
    303    dequant = _mm_unpackhi_epi64(dequant, dequant);
    304  } else {
           // The first 8 values use the as-loaded vectors; the next 8 use the
           // upper halves broadcast across the whole register.
    305    calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
    306    round = _mm_unpackhi_epi64(round, round);
    307    quant = _mm_unpackhi_epi64(quant, quant);
    308    shift = _mm_unpackhi_epi64(shift, shift);
    309    calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
    310 
    311    // Reinsert signs
    312    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
    313    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
    314 
    315    // Mask out zbin threshold coeffs
    316    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    317    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
    318 
    319    store_coefficients(qcoeff0, qcoeff_ptr);
    320    store_coefficients(qcoeff1, qcoeff_ptr + 8);
    321 
    322    calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
    323                                          &log_scale);
    324    dequant = _mm_unpackhi_epi64(dequant, dequant);
    325    calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
    326                                          dqcoeff_ptr + 8, &log_scale);
    327  }
    328 
    329  // AC only loop.
         // Same steps as the first group, with zbin/round/quant/shift/dequant
         // already switched to their upper-half values above.
    330  while (index < n_coeffs) {
    331    coeff0 = load_coefficients(coeff_ptr + index);
    332    coeff1 = load_coefficients(coeff_ptr + index + 8);
    333 
    334    coeff0_sign = _mm_srai_epi16(coeff0, 15);
    335    coeff1_sign = _mm_srai_epi16(coeff1, 15);
    336    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
    337    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
    338 
    339    update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
    340                 &mask0);
    341 
    342    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    343    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
    344 
    345    update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
    346 
    347    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
           // Fast path: nothing in this group of 16 passed the compare, so zero
           // the outputs and skip the quantization arithmetic.
    348    if (_mm_movemask_epi8(all_zero) == 0) {
    349      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
    350      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
    351      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
    352      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
    353      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
    354      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
    355      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
    356      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
    357      index += 16;
    358      continue;
    359    }
    360    calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
    361    calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
    362 
           // Reinsert signs, then clear lanes that failed the zbin compare.
    363    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
    364    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
    365 
    366    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    367    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
    368 
    369    store_coefficients(qcoeff0, qcoeff_ptr + index);
    370    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
    371 
    372    calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
    373                                          dqcoeff_ptr + index, &log_scale);
    374    calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
    375                                          dqcoeff_ptr + index + 8, &log_scale);
    376    index += 16;
    377  }
         // Each count stays 0 unless the matching update_mask helper set its
         // is_found flag.
    378  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
    379  if (is_found1)
    380    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
    381 
         // Zero the outputs at scan positions
         // [non_zero_count, non_zero_count_prescan_add_zero).
    382  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    383    const int rc = scan[i];
    384    qcoeff_ptr[rc] = 0;
    385    dqcoeff_ptr[rc] = 0;
    386  }
    387 
         // Walk back from non_zero_count - 1 to the last scan position whose
         // qcoeff is non-zero; eob stays -1 if there is none.
    388  for (int i = non_zero_count - 1; i >= 0; i--) {
    389    const int rc = scan[i];
    390    if (qcoeff_ptr[rc]) {
    391      eob = i;
    392      break;
    393    }
    394  }
    395 
    396  *eob_ptr = eob + 1;
    397 #if SKIP_EOB_FACTOR_ADJUST
    398  // TODO(Aniket): Experiment the following loop with intrinsic by combining
    399  // with the quantization loop above
         // first = lowest scan position below non_zero_count with a non-zero
         // qcoeff.
    400  for (int i = 0; i < non_zero_count; i++) {
    401    const int rc = scan[i];
    402    const int qcoeff = qcoeff_ptr[rc];
    403    if (qcoeff) {
    404      first = i;
    405      break;
    406    }
    407  }
         // When the first and last non-zero positions coincide (a single non-zero
         // value among the first non_zero_count scan positions) and its magnitude
         // is 1, re-test the input against a threshold built with
         // EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST. If it falls below, drop it and
         // report an empty block.
    408  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    409    const int rc = scan[(*eob_ptr - 1)];
    410    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
    411      const int coeff = coeff_ptr[rc] * wt;
    412      const int coeff_sign = AOMSIGN(coeff);
             // Branch-free absolute value; assumes AOMSIGN() yields 0 or -1
             // (TODO confirm against its definition).
    413      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
    414      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
             // rc != 0 selects table entry 0 for coefficient index 0 and entry 1
             // for every other index. The scaled zbins[] are used here, not the
             // raw zbin_ptr[] values.
    415      const int prescan_add_val =
    416          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
    417      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
    418        qcoeff_ptr[rc] = 0;
    419        dqcoeff_ptr[rc] = 0;
    420        *eob_ptr = 0;
    421      }
    422    }
    423  }
    424 #endif
    425 }
    426 
    427 void aom_quantize_b_64x64_adaptive_sse2(
    428    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    429    const int16_t *round_ptr, const int16_t *quant_ptr,
    430    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    431    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    432    const int16_t *scan, const int16_t *iscan) {
         // SSE2 adaptive quantizer, log_scale = 2 variant. Processes coeff_ptr in
         // groups of 16, storing the results of calculate_qcoeff_log_scale() to
         // qcoeff_ptr, letting calculate_dqcoeff_and_store_log_scale() write
         // dqcoeff_ptr, and storing the end-of-block count to *eob_ptr.
         //
         // coeff_ptr   - input coefficients.
         // n_coeffs    - coefficient count. The first 16 are always processed and
         //               the loop advances by 16, so this appears to assume a
         //               non-zero multiple of 16 (TODO confirm with callers).
         // zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
         //             - quantizer tables. Each is read with _mm_load_si128, so
         //               each must be 16-byte aligned with 8 readable int16_t
         //               values. The scalar code reads only entries 0 and 1 of
         //               zbin_ptr and dequant_ptr.
         // qcoeff_ptr, dqcoeff_ptr
         //             - outputs. The all-zero paths use _mm_store_si128, so both
         //               must be 16-byte aligned.
         // eob_ptr     - receives eob + 1, which is 0 when no non-zero output is
         //               found.
         // scan        - maps a scan position to a coefficient index.
         // iscan       - forwarded to update_mask0()/update_mask1(); presumably
         //               the inverse of scan (TODO confirm).
         //
         // NOTE(review): load_coefficients, invert_sign_sse2, update_mask0/1,
         // calculate_qcoeff_log_scale, calculate_dqcoeff_and_store_log_scale,
         // store_coefficients and calculate_non_zero_count are defined outside
         // this view; the comments below describe only how they are used here.
         //
         // The loop starts at 16 because the first 16 coefficients are handled
         // separately before it.
    433  int index = 16;
    434  const int log_scale = 2;
    435  int non_zero_count = 0;
    436  int non_zero_count_prescan_add_zero = 0;
    437  int is_found0 = 0, is_found1 = 0;
    438  int eob = -1;
    439  const __m128i zero = _mm_setzero_si128();
    440  const __m128i one = _mm_set1_epi16(1);
    441  const __m128i log_scale_vec = _mm_set1_epi16(log_scale);
    442  __m128i zbin, round, quant, dequant, shift;
    443  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
    444  __m128i qcoeff0, qcoeff1;
    445  __m128i cmp_mask0, cmp_mask1;
    446  __m128i all_zero;
    447  __m128i mask0 = zero, mask1 = zero;
    448 
         // Scalar copies of zbin entries 0 and 1, scaled down by log_scale.
    449  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
    450                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
         // Scalar thresholds for table entries 0 and 1:
         //   thresh = zbins * wt + ROUND_POWER_OF_TWO(dequant * EOB_FACTOR, 7) - 1
    451  int prescan_add[2];
    452  int thresh[4];
    453  const qm_val_t wt = (1 << AOM_QM_BITS);
    454  for (int i = 0; i < 2; ++i) {
    455    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    456    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
    457  }
         // thresh[] becomes {t0, t1, t1, t1}, loaded as four 32-bit lanes.
         // threshold[1] duplicates the upper 64 bits, i.e. {t1, t1, t1, t1}.
    458  thresh[2] = thresh[3] = thresh[1];
    459  __m128i threshold[2];
    460  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
    461  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
    462 
    463 #if SKIP_EOB_FACTOR_ADJUST
    464  int first = -1;
    465 #endif
    466  // Setup global values.
    467  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
    468  round = _mm_load_si128((const __m128i *)round_ptr);
    469  quant = _mm_load_si128((const __m128i *)quant_ptr);
    470  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
    471  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
    472 
    473  // Shift with rounding.
         // Adding log_scale (2) equals the rounding bias 1 << (log_scale - 1)
         // for this variant.
    474  zbin = _mm_add_epi16(zbin, log_scale_vec);
    475  round = _mm_add_epi16(round, log_scale_vec);
    476  zbin = _mm_srli_epi16(zbin, log_scale);
    477  round = _mm_srli_epi16(round, log_scale);
         // Subtract 1 so the strict > of _mm_cmpgt_epi16 below acts as >= zbin.
    478  zbin = _mm_sub_epi16(zbin, one);
    479 
    480  // Do DC and first 15 AC.
    481  coeff0 = load_coefficients(coeff_ptr);
    482  coeff1 = load_coefficients(coeff_ptr + 8);
    483 
         // Sign masks from an arithmetic shift by 15, handed to
         // invert_sign_sse2() here and again after quantization.
    484  coeff0_sign = _mm_srai_epi16(coeff0, 15);
    485  coeff1_sign = _mm_srai_epi16(coeff1, 15);
    486  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
    487  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
    488 
         // NOTE(review): qcoeff0/qcoeff1 are passed by pointer; whether
         // update_mask0() modifies them is not visible from this file.
    489  update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
    490 
    491  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    492  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
    493  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
    494 
    495  update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
    496 
         // Every later group uses only the entry-1 threshold in both vectors.
    497  threshold[0] = threshold[1];
    498  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
         // No lane of either compare mask is set: write zeros to all 16 outputs.
         // Four 128-bit stores at a stride of 4 elements cover 16 values, which
         // implies tran_low_t is 32 bits wide in this build.
    499  if (_mm_movemask_epi8(all_zero) == 0) {
    500    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
    501    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
    502    _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
    503    _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
    504    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
    505    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
    506    _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
    507    _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
           // Broadcast the upper 64 bits, as the else branch does, so the AC
           // loop sees the same round/quant/shift/dequant either way.
    508    round = _mm_unpackhi_epi64(round, round);
    509    quant = _mm_unpackhi_epi64(quant, quant);
    510    shift = _mm_unpackhi_epi64(shift, shift);
    511    dequant = _mm_unpackhi_epi64(dequant, dequant);
    512  } else {
           // The first 8 values use the as-loaded vectors; the next 8 use the
           // upper halves broadcast across the whole register.
    513    calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
    514    round = _mm_unpackhi_epi64(round, round);
    515    quant = _mm_unpackhi_epi64(quant, quant);
    516    shift = _mm_unpackhi_epi64(shift, shift);
    517    calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
    518 
    519    // Reinsert signs
    520    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
    521    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
    522 
    523    // Mask out zbin threshold coeffs
    524    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    525    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
    526 
    527    store_coefficients(qcoeff0, qcoeff_ptr);
    528    store_coefficients(qcoeff1, qcoeff_ptr + 8);
    529 
    530    calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
    531                                          &log_scale);
    532    dequant = _mm_unpackhi_epi64(dequant, dequant);
    533    calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
    534                                          dqcoeff_ptr + 8, &log_scale);
    535  }
    536 
    537  // AC only loop.
         // Same steps as the first group, with zbin/round/quant/shift/dequant
         // already switched to their upper-half values above.
    538  while (index < n_coeffs) {
    539    coeff0 = load_coefficients(coeff_ptr + index);
    540    coeff1 = load_coefficients(coeff_ptr + index + 8);
    541 
    542    coeff0_sign = _mm_srai_epi16(coeff0, 15);
    543    coeff1_sign = _mm_srai_epi16(coeff1, 15);
    544    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
    545    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
    546 
    547    update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
    548                 &mask0);
    549 
    550    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    551    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
    552 
    553    update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
    554 
    555    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
           // Fast path: nothing in this group of 16 passed the compare, so zero
           // the outputs and skip the quantization arithmetic.
    556    if (_mm_movemask_epi8(all_zero) == 0) {
    557      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
    558      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
    559      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
    560      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
    561      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
    562      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
    563      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
    564      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
    565      index += 16;
    566      continue;
    567    }
    568    calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
    569    calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
    570 
           // Reinsert signs, then clear lanes that failed the zbin compare.
    571    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
    572    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
    573 
    574    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    575    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
    576 
    577    store_coefficients(qcoeff0, qcoeff_ptr + index);
    578    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
    579 
    580    calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
    581                                          dqcoeff_ptr + index, &log_scale);
    582    calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
    583                                          dqcoeff_ptr + index + 8, &log_scale);
    584    index += 16;
    585  }
         // Each count stays 0 unless the matching update_mask helper set its
         // is_found flag.
    586  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
    587  if (is_found1)
    588    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
    589 
         // Zero the outputs at scan positions
         // [non_zero_count, non_zero_count_prescan_add_zero).
    590  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    591    const int rc = scan[i];
    592    qcoeff_ptr[rc] = 0;
    593    dqcoeff_ptr[rc] = 0;
    594  }
    595 
         // Walk back from non_zero_count - 1 to the last scan position whose
         // qcoeff is non-zero; eob stays -1 if there is none.
    596  for (int i = non_zero_count - 1; i >= 0; i--) {
    597    const int rc = scan[i];
    598    if (qcoeff_ptr[rc]) {
    599      eob = i;
    600      break;
    601    }
    602  }
    603 
    604  *eob_ptr = eob + 1;
    605 #if SKIP_EOB_FACTOR_ADJUST
    606  // TODO(Aniket): Experiment the following loop with intrinsic by combining
    607  // with the quantization loop above
         // first = lowest scan position below non_zero_count with a non-zero
         // qcoeff.
    608  for (int i = 0; i < non_zero_count; i++) {
    609    const int rc = scan[i];
    610    const int qcoeff = qcoeff_ptr[rc];
    611    if (qcoeff) {
    612      first = i;
    613      break;
    614    }
    615  }
         // When the first and last non-zero positions coincide (a single non-zero
         // value among the first non_zero_count scan positions) and its magnitude
         // is 1, re-test the input against a threshold built with
         // EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST. If it falls below, drop it and
         // report an empty block.
    616  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    617    const int rc = scan[(*eob_ptr - 1)];
    618    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
    619      const int coeff = coeff_ptr[rc] * wt;
    620      const int coeff_sign = AOMSIGN(coeff);
             // Branch-free absolute value; assumes AOMSIGN() yields 0 or -1
             // (TODO confirm against its definition).
    621      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
    622      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
             // rc != 0 selects table entry 0 for coefficient index 0 and entry 1
             // for every other index. The scaled zbins[] are used here, not the
             // raw zbin_ptr[] values.
    623      const int prescan_add_val =
    624          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
    625      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
    626        qcoeff_ptr[rc] = 0;
    627        dqcoeff_ptr[rc] = 0;
    628        *eob_ptr = 0;
    629      }
    630    }
    631  }
    632 #endif
    633 }