tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

av1_quantize_sse2.c (11870B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <emmintrin.h>
     13 #include <xmmintrin.h>
     14 
     15 #include "config/av1_rtcd.h"
     16 
     17 #include "aom/aom_integer.h"
     18 #include "aom_dsp/x86/quantize_x86.h"
     19 
     20 static inline void read_coeff(const tran_low_t *coeff, intptr_t offset,
     21                              __m128i *c0, __m128i *c1) {
     22  const tran_low_t *addr = coeff + offset;
     23  if (sizeof(tran_low_t) == 4) {
     24    const __m128i x0 = _mm_load_si128((const __m128i *)addr);
     25    const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1);
     26    const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2);
     27    const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3);
     28    *c0 = _mm_packs_epi32(x0, x1);
     29    *c1 = _mm_packs_epi32(x2, x3);
     30  } else {
     31    *c0 = _mm_load_si128((const __m128i *)addr);
     32    *c1 = _mm_load_si128((const __m128i *)addr + 1);
     33  }
     34 }
     35 
     36 static inline void write_qcoeff(const __m128i *qc0, const __m128i *qc1,
     37                                tran_low_t *qcoeff, intptr_t offset) {
     38  tran_low_t *addr = qcoeff + offset;
     39  if (sizeof(tran_low_t) == 4) {
     40    const __m128i zero = _mm_setzero_si128();
     41    __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero);
     42    __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits);
     43    __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits);
     44    _mm_store_si128((__m128i *)addr, y0);
     45    _mm_store_si128((__m128i *)addr + 1, y1);
     46 
     47    sign_bits = _mm_cmplt_epi16(*qc1, zero);
     48    y0 = _mm_unpacklo_epi16(*qc1, sign_bits);
     49    y1 = _mm_unpackhi_epi16(*qc1, sign_bits);
     50    _mm_store_si128((__m128i *)addr + 2, y0);
     51    _mm_store_si128((__m128i *)addr + 3, y1);
     52  } else {
     53    _mm_store_si128((__m128i *)addr, *qc0);
     54    _mm_store_si128((__m128i *)addr + 1, *qc1);
     55  }
     56 }
     57 
     58 static inline void write_zero(tran_low_t *qcoeff, intptr_t offset) {
     59  const __m128i zero = _mm_setzero_si128();
     60  tran_low_t *addr = qcoeff + offset;
     61  if (sizeof(tran_low_t) == 4) {
     62    _mm_store_si128((__m128i *)addr, zero);
     63    _mm_store_si128((__m128i *)addr + 1, zero);
     64    _mm_store_si128((__m128i *)addr + 2, zero);
     65    _mm_store_si128((__m128i *)addr + 3, zero);
     66  } else {
     67    _mm_store_si128((__m128i *)addr, zero);
     68    _mm_store_si128((__m128i *)addr + 1, zero);
     69  }
     70 }
     71 
     72 static inline void quantize(const int16_t *iscan_ptr,
     73                            const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     74                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
     75                            const __m128i *round0, const __m128i *round1,
     76                            const __m128i *quant0, const __m128i *quant1,
     77                            const __m128i *dequant0, const __m128i *dequant1,
     78                            const __m128i *thr0, const __m128i *thr1,
     79                            __m128i *eob) {
     80  __m128i coeff0, coeff1;
     81  // Do DC and first 15 AC
     82  read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
     83 
     84  // Poor man's sign extract
     85  const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15);
     86  const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15);
     87  __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
     88  __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
     89  qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
     90  qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
     91  const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0),
     92                                     _mm_cmpeq_epi16(qcoeff0, *thr0));
     93  const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1),
     94                                     _mm_cmpeq_epi16(qcoeff1, *thr1));
     95  const int nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1);
     96 
     97  if (nzflag) {
     98    qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
     99    qcoeff1 = _mm_adds_epi16(qcoeff1, *round1);
    100    const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0);
    101    const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1);
    102 
    103    // Reinsert signs
    104    qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
    105    qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
    106    qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
    107    qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
    108 
    109    write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs);
    110 
    111    coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0);
    112    coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1);
    113 
    114    write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs);
    115 
    116    const __m128i zero = _mm_setzero_si128();
    117    // Scan for eob
    118    const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
    119    const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
    120    const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
    121    const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
    122    const __m128i iscan0 =
    123        _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
    124    const __m128i iscan1 =
    125        _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
    126    // Add one to convert from indices to counts
    127    const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0);
    128    const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1);
    129    const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0);
    130    const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1);
    131    const __m128i eob2 = _mm_max_epi16(eob0, eob1);
    132    *eob = _mm_max_epi16(*eob, eob2);
    133  } else {
    134    write_zero(qcoeff_ptr, n_coeffs);
    135    write_zero(dqcoeff_ptr, n_coeffs);
    136  }
    137 }
    138 
    139 void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    140                          const int16_t *zbin_ptr, const int16_t *round_ptr,
    141                          const int16_t *quant_ptr,
    142                          const int16_t *quant_shift_ptr,
    143                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
    144                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
    145                          const int16_t *scan_ptr, const int16_t *iscan_ptr) {
    146  (void)scan_ptr;
    147  (void)zbin_ptr;
    148  (void)quant_shift_ptr;
    149 
    150  coeff_ptr += n_coeffs;
    151  iscan_ptr += n_coeffs;
    152  qcoeff_ptr += n_coeffs;
    153  dqcoeff_ptr += n_coeffs;
    154  n_coeffs = -n_coeffs;
    155 
    156  const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr);
    157  const __m128i round1 = _mm_unpackhi_epi64(round0, round0);
    158  const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr);
    159  const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0);
    160  const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr);
    161  const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0);
    162  const __m128i thr0 = _mm_srai_epi16(dequant0, 1);
    163  const __m128i thr1 = _mm_srai_epi16(dequant1, 1);
    164  __m128i eob = _mm_setzero_si128();
    165 
    166  quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0,
    167           &round1, &quant0, &quant1, &dequant0, &dequant1, &thr0, &thr1, &eob);
    168 
    169  n_coeffs += 8 * 2;
    170 
    171  // AC only loop
    172  while (n_coeffs < 0) {
    173    quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1,
    174             &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1,
    175             &eob);
    176    n_coeffs += 8 * 2;
    177  }
    178 
    179  // Accumulate EOB
    180  {
    181    __m128i eob_shuffled;
    182    eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
    183    eob = _mm_max_epi16(eob, eob_shuffled);
    184    eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
    185    eob = _mm_max_epi16(eob, eob_shuffled);
    186    eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
    187    eob = _mm_max_epi16(eob, eob_shuffled);
    188    *eob_ptr = _mm_extract_epi16(eob, 1);
    189  }
    190 }
    191 
    192 static inline void quantize_lp(const int16_t *iscan_ptr,
    193                               const int16_t *coeff_ptr, intptr_t n_coeffs,
    194                               int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
    195                               const __m128i *round0, const __m128i *round1,
    196                               const __m128i *quant0, const __m128i *quant1,
    197                               const __m128i *dequant0, const __m128i *dequant1,
    198                               __m128i *eob) {
    199  const int16_t *read = coeff_ptr + n_coeffs;
    200  __m128i coeff0 = _mm_load_si128((const __m128i *)read);
    201  __m128i coeff1 = _mm_load_si128((const __m128i *)read + 1);
    202 
    203  // Poor man's sign extract
    204  const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15);
    205  const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15);
    206  __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
    207  __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
    208  qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
    209  qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
    210 
    211  qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
    212  qcoeff1 = _mm_adds_epi16(qcoeff1, *round1);
    213  const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0);
    214  const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1);
    215 
    216  // Reinsert signs
    217  qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
    218  qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
    219  qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
    220  qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
    221 
    222  int16_t *addr = qcoeff_ptr + n_coeffs;
    223  _mm_store_si128((__m128i *)addr, qcoeff0);
    224  _mm_store_si128((__m128i *)addr + 1, qcoeff1);
    225 
    226  coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0);
    227  coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1);
    228 
    229  addr = dqcoeff_ptr + n_coeffs;
    230  _mm_store_si128((__m128i *)addr, coeff0);
    231  _mm_store_si128((__m128i *)addr + 1, coeff1);
    232 
    233  const __m128i zero = _mm_setzero_si128();
    234  // Scan for eob
    235  const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
    236  const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
    237  const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
    238  const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
    239 
    240  const __m128i iscan0 =
    241      _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
    242  const __m128i iscan1 =
    243      _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
    244 
    245  // Add one to convert from indices to counts
    246  const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0);
    247  const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1);
    248  const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0);
    249  const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1);
    250  const __m128i eob2 = _mm_max_epi16(eob0, eob1);
    251  *eob = _mm_max_epi16(*eob, eob2);
    252 }
    253 
    254 void av1_quantize_lp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
    255                          const int16_t *round_ptr, const int16_t *quant_ptr,
    256                          int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
    257                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
    258                          const int16_t *scan, const int16_t *iscan) {
    259  (void)scan;
    260  coeff_ptr += n_coeffs;
    261  iscan += n_coeffs;
    262  qcoeff_ptr += n_coeffs;
    263  dqcoeff_ptr += n_coeffs;
    264  n_coeffs = -n_coeffs;
    265 
    266  // Setup global values
    267  const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr);
    268  const __m128i round1 = _mm_unpackhi_epi64(round0, round0);
    269  const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr);
    270  const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0);
    271  const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr);
    272  const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0);
    273  __m128i eob = _mm_setzero_si128();
    274 
    275  // DC and first 15 AC
    276  quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0,
    277              &round1, &quant0, &quant1, &dequant0, &dequant1, &eob);
    278  n_coeffs += 8 * 2;
    279 
    280  // AC only loop
    281  while (n_coeffs < 0) {
    282    quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1,
    283                &round1, &quant1, &quant1, &dequant1, &dequant1, &eob);
    284    n_coeffs += 8 * 2;
    285  }
    286 
    287  // Accumulate EOB
    288  *eob_ptr = accumulate_eob(eob);
    289 }