tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_quantize_intrin_avx2.c (11029B)


      1 /*
      2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <immintrin.h>
     13 
     14 #include "config/aom_dsp_rtcd.h"
     15 
     16 #include "aom/aom_integer.h"
     17 
     18 static inline void init_one_qp(const __m128i *p, __m256i *qp) {
     19  const __m128i sign = _mm_srai_epi16(*p, 15);
     20  const __m128i dc = _mm_unpacklo_epi16(*p, sign);
     21  const __m128i ac = _mm_unpackhi_epi16(*p, sign);
     22  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
     23 }
     24 
     25 static inline void update_qp(__m256i *qp) {
     26  int i;
     27  for (i = 0; i < 5; ++i) {
     28    qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
     29  }
     30 }
     31 
     32 static inline void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr,
     33                           const int16_t *quant_ptr, const int16_t *dequant_ptr,
     34                           const int16_t *quant_shift_ptr, __m256i *qp,
     35                           int log_scale) {
     36  const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
     37  const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
     38  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
     39  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
     40  const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
     41  init_one_qp(&zbin, &qp[0]);
     42  init_one_qp(&round, &qp[1]);
     43  init_one_qp(&quant, &qp[2]);
     44  init_one_qp(&dequant, &qp[3]);
     45  init_one_qp(&quant_shift, &qp[4]);
     46  if (log_scale > 0) {
     47    const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1)));
     48    qp[0] = _mm256_add_epi32(qp[0], rnd);
     49    qp[0] = _mm256_srai_epi32(qp[0], log_scale);
     50 
     51    qp[1] = _mm256_add_epi32(qp[1], rnd);
     52    qp[1] = _mm256_srai_epi32(qp[1], log_scale);
     53  }
     54  // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
     55  // calculating the zbin mask.
     56  qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1));
     57 }
     58 
     59 // Note:
     60 // *x is vector multiplied by *y which is 16 int32_t parallel multiplication
     61 // and right shift 16.  The output, 16 int32_t is save in *p.
     62 static inline __m256i mm256_mul_shift_epi32(const __m256i *x,
     63                                            const __m256i *y) {
     64  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
     65  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
     66  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
     67  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
     68 
     69  prod_lo = _mm256_srli_epi64(prod_lo, 16);
     70  const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
     71  prod_lo = _mm256_and_si256(prod_lo, mask);
     72  prod_hi = _mm256_srli_epi64(prod_hi, 16);
     73 
     74  prod_hi = _mm256_slli_epi64(prod_hi, 32);
     75  return _mm256_or_si256(prod_lo, prod_hi);
     76 }
     77 
     78 static AOM_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr,
     79                                                 __m256i eobmax,
     80                                                 __m256i nz_mask) {
     81  const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask);
     82  const __m256i packed_nz_mask_perm =
     83      _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
     84  const __m256i iscan =
     85      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
     86  const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm);
     87  const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm);
     88  return _mm256_max_epi16(eobmax, nz_iscan);
     89 }
     90 
     91 // Get the max eob from the lower 128 bits.
     92 static AOM_FORCE_INLINE uint16_t get_max_eob(__m256i eob) {
     93  __m256i eob_s;
     94  eob_s = _mm256_shuffle_epi32(eob, 0xe);
     95  eob = _mm256_max_epi16(eob, eob_s);
     96  eob_s = _mm256_shufflelo_epi16(eob, 0xe);
     97  eob = _mm256_max_epi16(eob, eob_s);
     98  eob_s = _mm256_shufflelo_epi16(eob, 1);
     99  eob = _mm256_max_epi16(eob, eob_s);
    100  return (uint16_t)_mm256_extract_epi16(eob, 0);
    101 }
    102 
    103 static AOM_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
    104                                                               const __m256i *y,
    105                                                               int log_scale) {
    106  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
    107  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
    108  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
    109  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
    110  prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
    111  const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
    112  prod_lo = _mm256_and_si256(prod_lo, mask);
    113  prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
    114  prod_hi = _mm256_slli_epi64(prod_hi, 32);
    115  return _mm256_or_si256(prod_lo, prod_hi);
    116 }
    117 
    118 static AOM_FORCE_INLINE void quantize_logscale(
    119    const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
    120    tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob, int log_scale) {
    121  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
    122  const __m256i abs_coeff = _mm256_abs_epi32(coeff);
    123  const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
    124 
    125  if (UNLIKELY(_mm256_movemask_epi8(zbin_mask) == 0)) {
    126    const __m256i zero = _mm256_setzero_si256();
    127    _mm256_storeu_si256((__m256i *)qcoeff, zero);
    128    _mm256_storeu_si256((__m256i *)dqcoeff, zero);
    129    return;
    130  }
    131 
    132  const __m256i tmp_rnd =
    133      _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
    134  // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
    135  const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0);
    136  const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
    137  // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
    138  //                              (16 - log_scale + AOM_QM_BITS));
    139  const __m256i abs_q =
    140      mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], log_scale);
    141  const __m256i abs_dq =
    142      _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), log_scale);
    143  const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
    144  const __m256i q = _mm256_sign_epi32(abs_q, coeff);
    145  const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
    146 
    147  _mm256_storeu_si256((__m256i *)qcoeff, q);
    148  _mm256_storeu_si256((__m256i *)dqcoeff, dq);
    149 
    150  *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
    151 }
    152 
    153 static AOM_FORCE_INLINE void quantize(const __m256i *qp,
    154                                      const tran_low_t *coeff_ptr,
    155                                      const int16_t *iscan_ptr,
    156                                      tran_low_t *qcoeff, tran_low_t *dqcoeff,
    157                                      __m256i *eob) {
    158  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
    159  const __m256i abs_coeff = _mm256_abs_epi32(coeff);
    160  const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
    161 
    162  if (UNLIKELY(_mm256_movemask_epi8(zbin_mask) == 0)) {
    163    const __m256i zero = _mm256_setzero_si256();
    164    _mm256_storeu_si256((__m256i *)qcoeff, zero);
    165    _mm256_storeu_si256((__m256i *)dqcoeff, zero);
    166    return;
    167  }
    168 
    169  const __m256i tmp_rnd =
    170      _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
    171  const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]);
    172  const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
    173  const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]);
    174  const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]);
    175  const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
    176  const __m256i q = _mm256_sign_epi32(abs_q, coeff);
    177  const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
    178 
    179  _mm256_storeu_si256((__m256i *)qcoeff, q);
    180  _mm256_storeu_si256((__m256i *)dqcoeff, dq);
    181 
    182  *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
    183 }
    184 
    185 void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    186                                const int16_t *zbin_ptr,
    187                                const int16_t *round_ptr,
    188                                const int16_t *quant_ptr,
    189                                const int16_t *quant_shift_ptr,
    190                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
    191                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
    192                                const int16_t *scan, const int16_t *iscan) {
    193  (void)scan;
    194  const int step = 8;
    195 
    196  __m256i eob = _mm256_setzero_si256();
    197  __m256i qp[5];
    198 
    199  init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0);
    200 
    201  quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
    202 
    203  coeff_ptr += step;
    204  qcoeff_ptr += step;
    205  dqcoeff_ptr += step;
    206  iscan += step;
    207  n_coeffs -= step;
    208 
    209  update_qp(qp);
    210 
    211  while (n_coeffs > 0) {
    212    quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
    213 
    214    coeff_ptr += step;
    215    qcoeff_ptr += step;
    216    dqcoeff_ptr += step;
    217    iscan += step;
    218    n_coeffs -= step;
    219  }
    220 
    221  *eob_ptr = get_max_eob(eob);
    222 }
    223 
    224 void aom_highbd_quantize_b_32x32_avx2(
    225    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    226    const int16_t *round_ptr, const int16_t *quant_ptr,
    227    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    228    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    229    const int16_t *scan, const int16_t *iscan) {
    230  (void)scan;
    231  const unsigned int step = 8;
    232 
    233  __m256i eob = _mm256_setzero_si256();
    234  __m256i qp[5];
    235  init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1);
    236 
    237  quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 1);
    238 
    239  coeff_ptr += step;
    240  qcoeff_ptr += step;
    241  dqcoeff_ptr += step;
    242  iscan += step;
    243  n_coeffs -= step;
    244 
    245  update_qp(qp);
    246 
    247  while (n_coeffs > 0) {
    248    quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 1);
    249 
    250    coeff_ptr += step;
    251    qcoeff_ptr += step;
    252    dqcoeff_ptr += step;
    253    iscan += step;
    254    n_coeffs -= step;
    255  }
    256 
    257  *eob_ptr = get_max_eob(eob);
    258 }
    259 
    260 void aom_highbd_quantize_b_64x64_avx2(
    261    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    262    const int16_t *round_ptr, const int16_t *quant_ptr,
    263    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    264    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    265    const int16_t *scan, const int16_t *iscan) {
    266  (void)scan;
    267  const int step = 8;
    268 
    269  __m256i eob = _mm256_setzero_si256();
    270  __m256i qp[5];
    271  init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 2);
    272 
    273  quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 2);
    274 
    275  coeff_ptr += step;
    276  qcoeff_ptr += step;
    277  dqcoeff_ptr += step;
    278  iscan += step;
    279  n_coeffs -= step;
    280 
    281  update_qp(qp);
    282 
    283  while (n_coeffs > 0) {
    284    quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 2);
    285 
    286    coeff_ptr += step;
    287    qcoeff_ptr += step;
    288    dqcoeff_ptr += step;
    289    iscan += step;
    290    n_coeffs -= step;
    291  }
    292 
    293  *eob_ptr = get_max_eob(eob);
    294 }