tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

av1_highbd_quantize_sse4.c (7650B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <smmintrin.h>
     13 #include <stdint.h>
     14 
     15 #include "config/av1_rtcd.h"
     16 
     17 #include "aom_dsp/aom_dsp_common.h"
     18 #include "aom_dsp/x86/synonyms.h"
     19 
     // Coefficient quantization, phase 1: split four 32-bit coefficients into
     // sign and magnitude, then quantize/dequantize the two low lanes.
     //
     // coeff  : in/out. Four coefficients; overwritten with their absolute values.
     // param  : param[0] is added to the magnitudes, param[1] is the multiplier
     //          applied before the right shift by `shift`, param[2] is the
     //          multiplier applied before the right shift by `scale`, and
     //          param[3] is the per-lane threshold used for the zeroing mask.
     // qcoeff : out. [0] = quantized lanes 0-1 held as two 64-bit values,
     //          [1] = rounded (not yet multiplied) lanes 2-3 zero-extended to
     //          64 bits, [2] = all-ones in each 32-bit lane where
     //          (|coeff| << (1 + scale)) < param[3].
     // dquan  : out. [0] = dequantized lanes 0-1 held as two 64-bit values.
     // sign   : out. -1 in lanes whose input was negative, +1 in all others.
     static inline void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
                                              const int shift, const int scale,
                                              __m128i *qcoeff, __m128i *dquan,
                                              __m128i *sign) {
       const __m128i zero = _mm_setzero_si128();

       // The compare yields -1 for negative lanes and 0 otherwise; OR-ing in 1
       // leaves -1 untouched and turns 0 into +1.
       *sign = _mm_or_si128(_mm_cmplt_epi32(*coeff, zero), _mm_set1_epi32(1));

       const __m128i magnitude = _mm_abs_epi32(*coeff);
       *coeff = magnitude;

       // Interleaving with zero widens each rounded value to a 64-bit lane, which
       // is the layout _mm_mul_epi32 reads its operands from.
       const __m128i rounded = _mm_add_epi32(magnitude, param[0]);
       const __m128i rounded_lo = _mm_unpacklo_epi32(rounded, zero);
       qcoeff[1] = _mm_unpackhi_epi32(rounded, zero);

       const __m128i quantized_lo =
           _mm_srli_epi64(_mm_mul_epi32(rounded_lo, param[1]), shift);
       qcoeff[0] = quantized_lo;
       dquan[0] = _mm_srli_epi64(_mm_mul_epi32(quantized_lo, param[2]), scale);

       // Lanes flagged here are cleared by phase 2.
       qcoeff[2] =
           _mm_cmplt_epi32(_mm_slli_epi32(magnitude, 1 + scale), param[3]);
     }
     44 
     45 // Coefficient quantization phase 2
     46 static inline void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
     47                                         const __m128i *sign,
     48                                         const __m128i *param, const int shift,
     49                                         const int scale, tran_low_t *qAddr,
     50                                         tran_low_t *dqAddr) {
     51  __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
     52  __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
     53 
     54  qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
     55  qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
     56  dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
     57  dquan[1] = _mm_srli_epi64(dquan[1], scale);
     58 
     59  // combine L&H
     60  qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
     61  qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
     62 
     63  qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
     64  qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
     65 
     66  dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
     67  dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
     68 
     69  dquan[0] = _mm_and_si128(dquan[0], mask0H);
     70  dquan[1] = _mm_and_si128(dquan[1], mask0L);
     71 
     72  qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
     73  dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
     74 
     75  qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
     76  dquan[0] = _mm_sign_epi32(dquan[0], *sign);
     77  qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]);
     78  dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]);
     79  _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
     80  _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
     81 }
     82 
     83 static inline void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
     84                            __m128i *eob) {
     85  const __m128i zero = _mm_setzero_si128();
     86  __m128i mask, iscanIdx;
     87  const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
     88  const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
     89  __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero);
     90  __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero);
     91 
     92  nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero);
     93  nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero);
     94 
     95  mask = _mm_packs_epi32(nz_flag0, nz_flag1);
     96  iscanIdx = _mm_loadu_si128((__m128i const *)iscan);
     97  iscanIdx = _mm_sub_epi16(iscanIdx, mask);
     98  iscanIdx = _mm_and_si128(iscanIdx, mask);
     99  *eob = _mm_max_epi16(*eob, iscanIdx);
    100 }
    101 
    // Reduces the eight signed 16-bit lanes of *eob to their maximum and returns
    // it. *eob is overwritten with the intermediate reduction state; lane 0 of
    // the final vector holds the returned value.
    static inline uint16_t get_accumulated_eob(__m128i *eob) {
      __m128i acc = *eob;

      // Fold the upper 64 bits onto the lower 64 bits.
      acc = _mm_max_epi16(acc, _mm_shuffle_epi32(acc, 0xe));
      // Fold words 2-3 onto words 0-1.
      acc = _mm_max_epi16(acc, _mm_shufflelo_epi16(acc, 0xe));
      // Fold word 1 onto word 0.
      acc = _mm_max_epi16(acc, _mm_shufflelo_epi16(acc, 0x1));

      *eob = acc;
      return (uint16_t)_mm_extract_epi16(acc, 0);
    }
    114 
    115 void av1_highbd_quantize_fp_sse4_1(
    116    const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
    117    const int16_t *round_ptr, const int16_t *quant_ptr,
    118    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    119    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    120    const int16_t *scan, const int16_t *iscan, int log_scale) {
    121  __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign;
    122  __m128i eob = _mm_setzero_si128();
    123  const tran_low_t *src = coeff_ptr;
    124  tran_low_t *quanAddr = qcoeff_ptr;
    125  tran_low_t *dquanAddr = dqcoeff_ptr;
    126  const int shift = 16 - log_scale;
    127  const int coeff_stride = 4;
    128  const int quan_stride = coeff_stride;
    129  (void)zbin_ptr;
    130  (void)quant_shift_ptr;
    131  (void)scan;
    132 
    133  memset(quanAddr, 0, count * sizeof(quanAddr[0]));
    134  memset(dquanAddr, 0, count * sizeof(dquanAddr[0]));
    135 
    136  coeff[0] = _mm_loadu_si128((__m128i const *)src);
    137  const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
    138  const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
    139 
    140  qparam[0] = _mm_set_epi32(round1, round1, round1, round0);
    141  qparam[1] = _mm_set_epi64x((uint32_t)quant_ptr[1], (uint32_t)quant_ptr[0]);
    142  qparam[2] =
    143      _mm_set_epi64x((uint32_t)dequant_ptr[1], (uint32_t)dequant_ptr[0]);
    144  qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1],
    145                            dequant_ptr[0]);
    146 
    147  // DC and first 3 AC
    148  quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
    149                        &coeff_sign);
    150 
    151  // update round/quan/dquan for AC
    152  qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
    153  qparam[1] = _mm_set1_epi64x((uint32_t)quant_ptr[1]);
    154  qparam[2] = _mm_set1_epi64x((uint32_t)dequant_ptr[1]);
    155  qparam[3] = _mm_set1_epi32(dequant_ptr[1]);
    156  quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
    157                        quanAddr, dquanAddr);
    158 
    159  // next 4 AC
    160  coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
    161  quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
    162                        &coeff_sign);
    163  quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
    164                        quanAddr + quan_stride, dquanAddr + quan_stride);
    165 
    166  find_eob(quanAddr, iscan, &eob);
    167 
    168  count -= 8;
    169 
    170  // loop for the rest of AC
    171  while (count > 0) {
    172    src += coeff_stride << 1;
    173    quanAddr += quan_stride << 1;
    174    dquanAddr += quan_stride << 1;
    175    iscan += quan_stride << 1;
    176 
    177    coeff[0] = _mm_loadu_si128((__m128i const *)src);
    178    coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
    179 
    180    quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
    181                          &coeff_sign);
    182    quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
    183                          log_scale, quanAddr, dquanAddr);
    184 
    185    quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
    186                          &coeff_sign);
    187    quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
    188                          log_scale, quanAddr + quan_stride,
    189                          dquanAddr + quan_stride);
    190 
    191    find_eob(quanAddr, iscan, &eob);
    192 
    193    count -= 8;
    194  }
    195  *eob_ptr = get_accumulated_eob(&eob);
    196 }