tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_quantize_intrin_sse2.c (8171B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <emmintrin.h>
     13 
     14 #include "aom_dsp/aom_dsp_common.h"
     15 #include "aom_mem/aom_mem.h"
     16 #include "aom_ports/mem.h"
     17 #include "config/aom_dsp_rtcd.h"
     18 
     19 void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
     20                                const int16_t *zbin_ptr,
     21                                const int16_t *round_ptr,
     22                                const int16_t *quant_ptr,
     23                                const int16_t *quant_shift_ptr,
     24                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
     25                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
     26                                const int16_t *scan, const int16_t *iscan) {
     27  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
     28  __m128i zbins[2];
     29  __m128i nzbins[2];
     30 
     31  zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
     32                           (int)zbin_ptr[0]);
     33  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
     34 
     35  nzbins[0] = _mm_setzero_si128();
     36  nzbins[1] = _mm_setzero_si128();
     37  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
     38  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
     39 
     40  (void)scan;
     41 
     42  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
     43  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
     44 
     45  // Pre-scan pass
     46  for (i = ((int)count / 4) - 1; i >= 0; i--) {
     47    __m128i coeffs, cmp1, cmp2;
     48    int test;
     49    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
     50    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
     51    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
     52    cmp1 = _mm_and_si128(cmp1, cmp2);
     53    test = _mm_movemask_epi8(cmp1);
     54    if (test == 0xffff)
     55      non_zero_regs--;
     56    else
     57      break;
     58  }
     59 
     60  // Quantization pass:
     61  for (i = 0; i < non_zero_regs; i++) {
     62    __m128i coeffs, coeffs_sign, tmp1, tmp2;
     63    int test;
     64    int abs_coeff[4];
     65    int coeff_sign[4];
     66 
     67    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
     68    coeffs_sign = _mm_srai_epi32(coeffs, 31);
     69    coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
     70    tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
     71    tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
     72    tmp1 = _mm_or_si128(tmp1, tmp2);
     73    test = _mm_movemask_epi8(tmp1);
     74    _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
     75    _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
     76 
     77    for (j = 0; j < 4; j++) {
     78      if (test & (1 << (4 * j))) {
     79        int k = 4 * i + j;
     80        const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
     81        const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
     82        const uint32_t abs_qcoeff =
     83            (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
     84        qcoeff_ptr[k] =
     85            (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j];
     86        dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
     87        if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
     88      }
     89    }
     90  }
     91  *eob_ptr = eob_i + 1;
     92 }
     93 
     94 void aom_highbd_quantize_b_32x32_sse2(
     95    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     96    const int16_t *round_ptr, const int16_t *quant_ptr,
     97    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     98    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     99    const int16_t *scan, const int16_t *iscan) {
    100  __m128i zbins[2];
    101  __m128i nzbins[2];
    102  int idx = 0;
    103  int idx_arr[1024];
    104  int i, eob = -1;
    105  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
    106  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
    107  (void)scan;
    108  zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
    109  zbins[1] = _mm_set1_epi32(zbin1_tmp);
    110 
    111  nzbins[0] = _mm_setzero_si128();
    112  nzbins[1] = _mm_setzero_si128();
    113  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
    114  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
    115 
    116  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
    117  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
    118 
    119  // Pre-scan pass
    120  for (i = 0; i < n_coeffs / 4; i++) {
    121    __m128i coeffs, cmp1, cmp2;
    122    int test;
    123    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
    124    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
    125    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
    126    cmp1 = _mm_and_si128(cmp1, cmp2);
    127    test = _mm_movemask_epi8(cmp1);
    128    if (!(test & 0xf)) idx_arr[idx++] = i * 4;
    129    if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
    130    if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
    131    if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
    132  }
    133 
    134  // Quantization pass: only process the coefficients selected in
    135  // pre-scan pass. Note: idx can be zero.
    136  for (i = 0; i < idx; i++) {
    137    const int rc = idx_arr[i];
    138    const int coeff = coeff_ptr[rc];
    139    const int coeff_sign = AOMSIGN(coeff);
    140    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
    141    const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
    142    const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
    143    const uint32_t abs_qcoeff =
    144        (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
    145    qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
    146    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
    147    if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
    148  }
    149  *eob_ptr = eob + 1;
    150 }
    151 
    152 void aom_highbd_quantize_b_64x64_sse2(
    153    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    154    const int16_t *round_ptr, const int16_t *quant_ptr,
    155    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    156    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    157    const int16_t *scan, const int16_t *iscan) {
    158  __m128i zbins[2];
    159  __m128i nzbins[2];
    160  int idx = 0;
    161  int idx_arr[1024];
    162  int i, eob = -1;
    163  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 2);
    164  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 2);
    165  (void)scan;
    166  zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
    167  zbins[1] = _mm_set1_epi32(zbin1_tmp);
    168 
    169  nzbins[0] = _mm_setzero_si128();
    170  nzbins[1] = _mm_setzero_si128();
    171  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
    172  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
    173 
    174  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
    175  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
    176 
    177  // Pre-scan pass
    178  for (i = 0; i < n_coeffs / 4; i++) {
    179    __m128i coeffs, cmp1, cmp2;
    180    int test;
    181    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
    182    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
    183    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
    184    cmp1 = _mm_and_si128(cmp1, cmp2);
    185    test = _mm_movemask_epi8(cmp1);
    186    if (!(test & 0xf)) idx_arr[idx++] = i * 4;
    187    if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
    188    if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
    189    if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
    190  }
    191 
    192  // Quantization pass: only process the coefficients selected in
    193  // pre-scan pass. Note: idx can be zero.
    194  for (i = 0; i < idx; i++) {
    195    const int rc = idx_arr[i];
    196    const int coeff = coeff_ptr[rc];
    197    const int coeff_sign = AOMSIGN(coeff);
    198    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
    199    const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
    200    const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
    201    const uint32_t abs_qcoeff =
    202        (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14);
    203    qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
    204    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4;
    205    if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
    206  }
    207  *eob_ptr = eob + 1;
    208 }