tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

quantize_ssse3.c (7555B)


      1 /*
      2 * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <tmmintrin.h>
     14 #include <emmintrin.h>
     15 #include <xmmintrin.h>
     16 
     17 #include "config/aom_dsp_rtcd.h"
     18 
     19 #include "aom/aom_integer.h"
     20 #include "aom_dsp/x86/quantize_x86.h"
     21 
     22 static inline void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round,
     23                                          const __m128i quant,
     24                                          const __m128i *shift) {
     25  __m128i tmp, qcoeff, tmp1;
     26  qcoeff = _mm_adds_epi16(*coeff, round);
     27  tmp = _mm_mulhi_epi16(qcoeff, quant);
     28  qcoeff = _mm_add_epi16(tmp, qcoeff);
     29  tmp = _mm_mullo_epi16(qcoeff, *shift);
     30  tmp = _mm_srli_epi16(tmp, 14);
     31  tmp1 = _mm_mulhi_epi16(qcoeff, *shift);
     32  tmp1 = _mm_slli_epi16(tmp1, 2);
     33  *coeff = _mm_or_si128(tmp, tmp1);
     34 }
     35 
     36 static inline void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff,
     37                                                     const __m128i dequant,
     38                                                     const __m128i zero,
     39                                                     tran_low_t *dqcoeff) {
     40  // Un-sign to bias rounding like C.
     41  const __m128i coeff = _mm_abs_epi16(qcoeff);
     42 
     43  const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff);
     44  const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff);
     45 
     46  const __m128i low = _mm_mullo_epi16(coeff, dequant);
     47  const __m128i high = _mm_mulhi_epi16(coeff, dequant);
     48  __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
     49  __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
     50 
     51  // "Divide" by 4.
     52  dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 2);
     53  dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 2);
     54 
     55  dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0);
     56  dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1);
     57 
     58  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
     59  _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
     60 }
     61 
     62 void aom_quantize_b_64x64_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     63                                const int16_t *zbin_ptr,
     64                                const int16_t *round_ptr,
     65                                const int16_t *quant_ptr,
     66                                const int16_t *quant_shift_ptr,
     67                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
     68                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
     69                                const int16_t *scan, const int16_t *iscan) {
     70  const __m128i zero = _mm_setzero_si128();
     71  const __m128i one = _mm_set1_epi16(1);
     72  const __m128i two = _mm_set1_epi16(2);
     73  int index;
     74 
     75  __m128i zbin, round, quant, dequant, shift;
     76  __m128i coeff0, coeff1, qcoeff0, qcoeff1;
     77  __m128i cmp_mask0, cmp_mask1, all_zero;
     78  __m128i eob = zero, eob0;
     79 
     80  (void)scan;
     81  (void)n_coeffs;
     82 
     83  // Setup global values.
     84  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
     85  round = _mm_load_si128((const __m128i *)round_ptr);
     86  quant = _mm_load_si128((const __m128i *)quant_ptr);
     87  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
     88  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
     89 
     90  // Shift with rounding.
     91  zbin = _mm_add_epi16(zbin, two);
     92  round = _mm_add_epi16(round, two);
     93  zbin = _mm_srli_epi16(zbin, 2);
     94  round = _mm_srli_epi16(round, 2);
     95  zbin = _mm_sub_epi16(zbin, one);
     96  // Do DC and first 15 AC.
     97  coeff0 = load_coefficients(coeff_ptr);
     98  coeff1 = load_coefficients(coeff_ptr + 8);
     99 
    100  qcoeff0 = _mm_abs_epi16(coeff0);
    101  qcoeff1 = _mm_abs_epi16(coeff1);
    102 
    103  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    104  zbin = _mm_unpackhi_epi64(zbin, zbin);
    105  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
    106  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    107  if (_mm_movemask_epi8(all_zero) == 0) {
    108    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
    109    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
    110    _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
    111    _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
    112    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
    113    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
    114    _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
    115    _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
    116    round = _mm_unpackhi_epi64(round, round);
    117    quant = _mm_unpackhi_epi64(quant, quant);
    118    shift = _mm_unpackhi_epi64(shift, shift);
    119    dequant = _mm_unpackhi_epi64(dequant, dequant);
    120  } else {
    121    calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift);
    122    round = _mm_unpackhi_epi64(round, round);
    123    quant = _mm_unpackhi_epi64(quant, quant);
    124    shift = _mm_unpackhi_epi64(shift, shift);
    125    calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift);
    126 
    127    // Reinsert signs.
    128    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    129    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
    130 
    131    // Mask out zbin threshold coeffs.
    132    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    133    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
    134 
    135    store_coefficients(qcoeff0, qcoeff_ptr);
    136    store_coefficients(qcoeff1, qcoeff_ptr + 8);
    137 
    138    calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, dqcoeff_ptr);
    139    dequant = _mm_unpackhi_epi64(dequant, dequant);
    140    calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
    141 
    142    eob =
    143        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
    144  }
    145 
    146  // AC only loop.
    147  for (index = 16; index < 1024; index += 16) {
    148    coeff0 = load_coefficients(coeff_ptr + index);
    149    coeff1 = load_coefficients(coeff_ptr + index + 8);
    150 
    151    qcoeff0 = _mm_abs_epi16(coeff0);
    152    qcoeff1 = _mm_abs_epi16(coeff1);
    153 
    154    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    155    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
    156 
    157    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    158    if (_mm_movemask_epi8(all_zero) == 0) {
    159      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
    160      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
    161      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
    162      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
    163      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
    164      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
    165      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
    166      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
    167      continue;
    168    }
    169    calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift);
    170    calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift);
    171 
    172    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    173    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
    174 
    175    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    176    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
    177 
    178    store_coefficients(qcoeff0, qcoeff_ptr + index);
    179    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
    180 
    181    calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero,
    182                                      dqcoeff_ptr + index);
    183    calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero,
    184                                      dqcoeff_ptr + 8 + index);
    185 
    186    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
    187                        zero);
    188    eob = _mm_max_epi16(eob, eob0);
    189  }
    190 
    191  *eob_ptr = accumulate_eob(eob);
    192 }