tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

quantize_neon.c (40521B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 
     14 #include <assert.h>
     15 #include <math.h>
     16 
     17 #include "config/aom_config.h"
     18 
     19 #include "aom_dsp/arm/mem_neon.h"
     20 #include "aom_dsp/arm/sum_neon.h"
     21 #include "aom_mem/aom_mem.h"
     22 
     23 #include "av1/common/quant_common.h"
     24 #include "av1/common/seg_common.h"
     25 
     26 #include "av1/encoder/av1_quantize.h"
     27 #include "av1/encoder/encoder.h"
     28 #include "av1/encoder/rd.h"
     29 
     30 static inline uint16_t get_max_eob(int16x8_t v_eobmax) {
     31 #if AOM_ARCH_AARCH64
     32  return (uint16_t)vmaxvq_s16(v_eobmax);
     33 #else
     34  const int16x4_t v_eobmax_3210 =
     35      vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
     36  const int64x1_t v_eobmax_xx32 =
     37      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
     38  const int16x4_t v_eobmax_tmp =
     39      vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
     40  const int64x1_t v_eobmax_xxx3 =
     41      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
     42  const int16x4_t v_eobmax_final =
     43      vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
     44  return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
     45 #endif
     46 }
     47 
     48 static inline int16x8_t get_max_lane_eob(const int16_t *iscan,
     49                                         int16x8_t v_eobmax,
     50                                         uint16x8_t v_mask) {
     51  const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
     52  const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
     53  const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
     54  return vmaxq_s16(v_eobmax, v_nz_iscan);
     55 }
     56 
     57 static inline uint16x8_t quantize_fp_8(const tran_low_t *coeff_ptr,
     58                                       tran_low_t *qcoeff_ptr,
     59                                       tran_low_t *dqcoeff_ptr,
     60                                       int16x8_t v_quant, int16x8_t v_dequant,
     61                                       int16x8_t v_round, int16x8_t v_zero) {
     62  const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
     63  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
     64  const int16x8_t v_abs = vabsq_s16(v_coeff);
     65  const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
     66  const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1);
     67  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
     68  const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
     69  const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
     70  const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
     71  store_s16q_to_tran_low(&qcoeff_ptr[0], v_qcoeff);
     72  store_s16q_to_tran_low(&dqcoeff_ptr[0], v_dqcoeff);
     73  return v_nz_mask;
     74 }
     75 
/*
 * Forward-path (fp) quantizer, NEON, log_scale == 0.
 * Processes coefficients 8 at a time: the first group uses DC constants in
 * lane 0 and AC constants elsewhere; all later groups use AC constants only.
 * Writes qcoeff/dqcoeff and sets *eob_ptr to the count of coefficients up to
 * and including the last non-zero one in scan order.
 * NOTE(review): the trailing do-while assumes count is a multiple of 8 and
 * at least 16 — confirm against callers.
 */
void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
                          const int16_t *zbin_ptr, const int16_t *round_ptr,
                          const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr,
                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan, const int16_t *iscan) {
  // TODO(jingning) Decide the need of these arguments after the
  // quantization process is completed.
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;

  // Quantization pass: All coefficients with index >= zero_flag are
  // skippable. Note: zero_flag can be zero.
  const int16x8_t v_zero = vdupq_n_s16(0);
  // Lane 0 of quant/dequant/round holds the DC constant; lanes 1..7 hold AC.
  int16x8_t v_quant = vld1q_s16(quant_ptr);
  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
  int16x8_t v_round = vld1q_s16(round_ptr);
  // -1 in every lane so an all-zero block reduces to eob == 0 after max+1.
  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
  uint16x8_t v_nz_mask;
  // process dc and the first seven ac coeffs
  v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                            v_dequant, v_round, v_zero);
  v_eobmax_76543210 = get_max_lane_eob(&iscan[0], v_eobmax_76543210, v_nz_mask);
  // overwrite the dc constants with ac constants
  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);

  count -= 8;
  // now process the rest of the ac coeffs
  do {
    coeff_ptr += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    iscan += 8;
    v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                              v_dequant, v_round, v_zero);
    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
    count -= 8;
  } while (count > 0);
  *eob_ptr = get_max_eob(v_eobmax_76543210);
}
    120 
    121 static inline uint16x8_t quantize_lp_8(const int16_t *coeff_ptr,
    122                                       int16_t *qcoeff_ptr,
    123                                       int16_t *dqcoeff_ptr, int16x8_t v_quant,
    124                                       int16x8_t v_dequant, int16x8_t v_round,
    125                                       int16x8_t v_zero) {
    126  const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
    127  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    128  const int16x8_t v_abs = vabsq_s16(v_coeff);
    129  const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
    130  const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1);
    131  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
    132  const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
    133  const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
    134  const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
    135  vst1q_s16(qcoeff_ptr, v_qcoeff);
    136  vst1q_s16(dqcoeff_ptr, v_dqcoeff);
    137  return v_nz_mask;
    138 }
    139 
/*
 * Low-precision forward-path quantizer, NEON (int16 coefficient buffers).
 * Mirrors av1_quantize_fp_neon: first group of 8 uses DC constants in lane 0,
 * subsequent groups use AC constants; *eob_ptr gets (last nz index + 1).
 * NOTE(review): the do-while with `count != 0` assumes n_coeffs is a multiple
 * of 8 and at least 16 — confirm against callers.
 */
void av1_quantize_lp_neon(const int16_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan, const int16_t *iscan) {
  (void)scan;
  // Quantization pass: All coefficients with index >= zero_flag are
  // skippable. Note: zero_flag can be zero.
  const int16x8_t v_zero = vdupq_n_s16(0);
  // Lane 0 holds the DC constant; lanes 1..7 hold the AC constant.
  int16x8_t v_quant = vld1q_s16(quant_ptr);
  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
  int16x8_t v_round = vld1q_s16(round_ptr);
  // -1 everywhere so an all-zero block reduces to eob == 0.
  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
  uint16x8_t v_nz_mask;
  intptr_t count = n_coeffs;

  // process dc and the first seven ac coeffs
  v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                            v_dequant, v_round, v_zero);
  v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
  // overwrite the dc constants with ac constants
  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);

  count -= 8;
  // now process the rest of the ac coeffs
  do {
    coeff_ptr += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    iscan += 8;
    v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                              v_dequant, v_round, v_zero);
    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
    count -= 8;
  } while (count != 0);
  *eob_ptr = get_max_eob(v_eobmax_76543210);
}
    179 
/*
 * Forward-path quantization of 8 coefficients with a runtime log_scale
 * (used for log_scale == 1, i.e. 32x32 transforms).
 * Coefficients with |c| below dequant >> (1 + log_scale) are zeroed via
 * v_mask; the rest are quantized as ((|c| + round) * quant) >> (16 + 1 -
 * log_scale) using the doubled vqdmulhq product, and dequantized with a
 * final >> log_scale. Returns the per-lane non-zero mask.
 */
static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale_8(
    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant,
    int16x8_t v_round, int16x8_t v_zero, int log_scale) {
  // Negative shift counts make vshlq_s16 shift right.
  const int16x8_t v_log_scale_minus_1 = vdupq_n_s16(log_scale - 1);
  const int16x8_t v_neg_log_scale_plus_1 = vdupq_n_s16(-(1 + log_scale));
  const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
  // Lanes whose magnitude reaches the scaled dequant threshold survive.
  const uint16x8_t v_mask =
      vcgeq_s16(v_abs_coeff, vshlq_s16(v_dequant, v_neg_log_scale_plus_1));
  // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
  const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round),
                                    vreinterpretq_s16_u16(v_mask));
  // vqdmulhq doubles the product; the pre-shift by (log_scale - 1)
  // folds the log_scale scaling and the un-doubling into one step.
  const int16x8_t v_tmp2 =
      vqdmulhq_s16(vshlq_s16(v_tmp, v_log_scale_minus_1), v_quant);
  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
  // (x ^ s) - s restores the coefficient's sign.
  const int16x8_t v_qcoeff =
      vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign);
  // Multiplying by dequant here will use all 16 bits. Cast to unsigned before
  // shifting right. (vshlq_s16 will shift right if shift value is negative)
  const uint16x8_t v_abs_dqcoeff =
      vshlq_u16(vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)),
                vdupq_n_s16(-log_scale));
  const int16x8_t v_dqcoeff =
      vsubq_s16(veorq_s16(vreinterpretq_s16_u16(v_abs_dqcoeff), v_coeff_sign),
                v_coeff_sign);
  store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
  store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
  return v_nz_mask;
}
    211 
/*
 * Forward-path quantization of 8 coefficients specialized for log_scale == 2
 * (64x64 transforms). The wider effective shift (16 - 2 = 14) does not fit
 * the single vqdmulhq trick, so the full shifted product is assembled from
 * the high half (vqdmulhq, doubled) OR-ed with the low half (plain 16-bit
 * multiply shifted down). Returns the per-lane non-zero mask.
 */
static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale2_8(
    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant,
    int16x8_t v_round, int16x8_t v_zero) {
  const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
  // Threshold test: 2 * |c| >= dequant >> 2, i.e. |c| >= dequant >> 3
  // (the 1 + log_scale == 3 scaling split across both operands).
  const uint16x8_t v_mask =
      vcgeq_u16(vshlq_n_u16(vreinterpretq_u16_s16(v_abs_coeff), 1),
                vshrq_n_u16(vreinterpretq_u16_s16(v_dequant), 2));
  // abs_coeff = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
  const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round),
                                    vreinterpretq_s16_u16(v_mask));
  // tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
  const int16x8_t v_tmp2 =
      vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1),
                vreinterpretq_s16_u16(vshrq_n_u16(
                    vreinterpretq_u16_s16(vmulq_s16(v_tmp, v_quant)), 14)));
  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
  // (x ^ s) - s restores the coefficient's sign.
  const int16x8_t v_qcoeff =
      vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign);
  // const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[rc != 0]) >> log_scale;
  const int16x8_t v_abs_dqcoeff =
      vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp2, v_dequant), 13),
                vreinterpretq_s16_u16(vshrq_n_u16(
                    vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)), 2)));
  const int16x8_t v_dqcoeff =
      vsubq_s16(veorq_s16(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
  store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
  store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
  return v_nz_mask;
}
    244 
/*
 * Shared forward-path quantizer for log_scale 1 and 2 (32x32 / 64x64).
 * First runs a pre-scan from the tail of the scan order to find the last
 * 16-coefficient group that can contain a non-zero quantized value, zeroes
 * everything after it, then quantizes the surviving prefix 8 at a time
 * (DC constants for the first group, AC constants afterwards).
 */
static AOM_FORCE_INLINE void quantize_fp_no_qmatrix_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
    const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
    const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *iscan,
    int log_scale) {
  const int16x8_t v_zero = vdupq_n_s16(0);
  int16x8_t v_quant = vld1q_s16(quant_ptr);
  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
  const int16x8_t v_round_no_scale = vld1q_s16(round_ptr);
  // Rounding multiply by 1 << (15 - log_scale) computes
  // ROUND_POWER_OF_TWO(round, log_scale) in each lane.
  int16x8_t v_round =
      vqrdmulhq_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
  intptr_t non_zero_count = n_coeffs;

  // The pre-scan consumes 16 coefficients per step.
  assert(n_coeffs > 16);
  // Pre-scan pass
  // Skip threshold: AC dequant >> (1 + log_scale), broadcast to all lanes.
  const int16x8_t v_dequant_scaled =
      vshlq_s16(v_dequant, vdupq_n_s16(-(1 + log_scale)));
  const int16x8_t v_zbin_s16 =
      vdupq_lane_s16(vget_low_s16(v_dequant_scaled), 1);
  intptr_t i = n_coeffs;
  do {
    const int16x8_t v_coeff_a = load_tran_low_to_s16q(coeff_ptr + i - 8);
    const int16x8_t v_coeff_b = load_tran_low_to_s16q(coeff_ptr + i - 16);
    const int16x8_t v_abs_coeff_a = vabsq_s16(v_coeff_a);
    const int16x8_t v_abs_coeff_b = vabsq_s16(v_coeff_b);
    const uint16x8_t v_mask_a = vcgeq_s16(v_abs_coeff_a, v_zbin_s16);
    const uint16x8_t v_mask_b = vcgeq_s16(v_abs_coeff_b, v_zbin_s16);
    // If the coefficient is in the base ZBIN range, then discard.
    if (horizontal_long_add_u16x8(v_mask_a, v_mask_b) == 0) {
      non_zero_count -= 16;
    } else {
      break;
    }
    i -= 16;
  } while (i > 0);

  // Zero the tail that the pre-scan proved cannot quantize to non-zero.
  const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count;
  memset(qcoeff_ptr + non_zero_count, 0,
         remaining_zcoeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr + non_zero_count, 0,
         remaining_zcoeffs * sizeof(*dqcoeff_ptr));

  // process dc and the first seven ac coeffs
  uint16x8_t v_nz_mask;
  if (log_scale == 2) {
    v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
                                        v_quant, v_dequant, v_round, v_zero);
  } else {
    v_nz_mask =
        quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                               v_dequant, v_round, v_zero, log_scale);
  }
  v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
  // overwrite the dc constants with ac constants
  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);

  for (intptr_t count = non_zero_count - 8; count > 0; count -= 8) {
    coeff_ptr += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    iscan += 8;
    if (log_scale == 2) {
      v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
                                          v_quant, v_dequant, v_round, v_zero);
    } else {
      v_nz_mask =
          quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                                 v_dequant, v_round, v_zero, log_scale);
    }
    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210);
}
    321 
    322 void av1_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    323                                const int16_t *zbin_ptr,
    324                                const int16_t *round_ptr,
    325                                const int16_t *quant_ptr,
    326                                const int16_t *quant_shift_ptr,
    327                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
    328                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
    329                                const int16_t *scan, const int16_t *iscan) {
    330  (void)zbin_ptr;
    331  (void)quant_shift_ptr;
    332  (void)scan;
    333  quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
    334                              qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
    335                              iscan, 1);
    336 }
    337 
    338 void av1_quantize_fp_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    339                                const int16_t *zbin_ptr,
    340                                const int16_t *round_ptr,
    341                                const int16_t *quant_ptr,
    342                                const int16_t *quant_shift_ptr,
    343                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
    344                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
    345                                const int16_t *scan, const int16_t *iscan) {
    346  (void)zbin_ptr;
    347  (void)quant_shift_ptr;
    348  (void)scan;
    349  quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
    350                              qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
    351                              iscan, 2);
    352 }
    353 
    354 void aom_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    355                         const int16_t *zbin_ptr, const int16_t *round_ptr,
    356                         const int16_t *quant_ptr,
    357                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    358                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
    359                         uint16_t *eob_ptr, const int16_t *scan,
    360                         const int16_t *iscan) {
    361  (void)quant_shift_ptr;
    362  (void)scan;
    363 
    364  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
    365 
    366  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
    367  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
    368 
    369  const int16x8_t zero = vdupq_n_s16(0);
    370  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
    371 
    372  int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
    373  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
    374  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
    375  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
    376 
    377  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
    378  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    379  int16x8_t v_abs = vabsq_s16(v_coeff);
    380 
    381  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
    382 
    383  uint16x8_t vcond = vcgeq_s16(v_abs, vzbins);
    384  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    385  if (nz_check) {
    386    vround = vsetq_lane_s16(round_ptr[0], vround, 0);
    387    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    388    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    389    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
    390 
    391    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
    392    int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    393    vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
    394 
    395    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    396    int16x8_t coeff_nz_mask =
    397        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    398    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
    399    int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
    400 
    401    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    402    coeff_nz_mask =
    403        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    404    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
    405 
    406    vround = vsetq_lane_s16(round_ptr[1], vround, 0);
    407    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    408    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    409    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
    410 
    411    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    412    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    413    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    414    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    415    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    416  }
    417  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
    418 
    419  for (int i = 8; i < n_coeffs; i += 8) {
    420    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    421    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    422    v_abs = vabsq_s16(v_coeff);
    423    vcond = vcgeq_s16(v_abs, vzbins);
    424 
    425    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    426    if (nz_check) {
    427      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
    428      int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    429 
    430      vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
    431      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    432      int16x8_t coeff_nz_mask =
    433          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
    434      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
    435      int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
    436      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    437      coeff_nz_mask =
    438          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
    439      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
    440 
    441      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    442      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    443      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
    444      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    445      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    446    }
    447  }
    448  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
    449 }
    450 
// Multiply the 16-bit lanes of x0 by the unsigned quantization-matrix
// weights in x1 and shift the product right by AOM_QM_BITS. The high bits
// of the product come from vqdmulhq (doubled, hence the 15 - AOM_QM_BITS
// left shift) and the low bits from a plain 16-bit multiply shifted down;
// OR-ing the two halves reassembles the full shifted product.
#define QM_MULL_SHIFT(x0, x1)                                              \
  vreinterpretq_s16_u16(vorrq_u16(                                         \
      vreinterpretq_u16_s16(vshlq_n_s16(                                   \
          vqdmulhq_s16(x0, vreinterpretq_s16_u16(x1)), 15 - AOM_QM_BITS)), \
      vshrq_n_u16(vmulq_u16(vreinterpretq_u16_s16(x0), x1), AOM_QM_BITS)))
    456 
/*
 * "b" quantizer for 16x16-class transforms with optional quantization
 * matrices, NEON. Same structure as aom_quantize_b_neon, but when qm_ptr /
 * iqm_ptr are non-NULL each coefficient's zbin test and quantization are
 * scaled by the per-position matrix weight (via QM_MULL_SHIFT), and the
 * dequant constant is scaled by the inverse-matrix weight.
 * *eob_ptr is set to (scan index of the last non-zero coefficient) + 1.
 * NOTE(review): inside the main loop, `vdequant = QM_MULL_SHIFT(vdequant,
 * viwt)` rescales the already-rescaled value from the previous non-zero
 * group rather than a freshly loaded dequant — verify against the C
 * reference whether a reload is intended here.
 */
static void aom_quantize_b_helper_16x16_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr) {
  (void)scan;

  uint16x8_t vwt, viwt;
  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };

  // Zero outputs up front; groups failing the zbin check are skipped.
  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  // All-ones == -1 per lane: eob accumulator starts below any scan index.
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));

  // AC constants in all lanes; lane 0 patched to DC for the first group.
  int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  int16x8_t v_abs = vabsq_s16(v_coeff);
  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
  uint16x8_t vcond;
  if (qm_ptr == NULL) {
    vcond = vcgeq_s16(v_abs, vzbins);
  } else {
    // Weight the magnitude by the quantization matrix before the zbin test.
    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
  }
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    vround = vsetq_lane_s16(round_ptr[0], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);

    int16x8_t vtmp = vqaddq_s16(v_abs, vround);

    int16x8_t vtmp2;
    if (qm_ptr == NULL) {
      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    } else {
      // Matrix path: scale by the weight first, then add back tmp.
      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
      vtmp2 = vaddq_s16(vtmp2, vtmp);
    }

    vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
    // (x ^ s) - s restores the coefficient's sign.
    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);

    if (iqm_ptr != NULL) {
      // Scale dequant by the inverse quantization-matrix weight.
      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
      vdequant = QM_MULL_SHIFT(vdequant, viwt);
    }
    int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    // Restore lane 0 to AC constants for subsequent groups.
    vround = vsetq_lane_s16(round_ptr[1], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);

  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);

    if (qm_ptr == NULL) {
      vcond = vcgeq_s16(v_abs, vzbins);
    } else {
      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    }
    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);

      int16x8_t vtmp2;
      if (qm_ptr == NULL) {
        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
      } else {
        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
        vtmp2 = vaddq_s16(vtmp2, vtmp);
      }

      vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);

      if (iqm_ptr != NULL) {
        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
        vdequant = QM_MULL_SHIFT(vdequant, viwt);
      }
      int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
    585 
    586 static void aom_quantize_b_helper_32x32_neon(
    587    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    588    const int16_t *round_ptr, const int16_t *quant_ptr,
    589    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    590    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    591    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    592    const qm_val_t *iqm_ptr) {
    593  (void)scan;
    594 
    595  uint16x8_t vwt, viwt;
    596  const int log_scale = 1;
    597  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
    598                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
    599 
    600  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
    601  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
    602 
    603  const int16x8_t zero = vdupq_n_s16(0);
    604  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
    605  const int16x8_t v_log_scale = v_eobmax_76543210;
    606 
    607  int16x8_t vzbins = vdupq_n_s16(zbins[1]),
    608            vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
    609  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
    610  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
    611  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
    612 
    613  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
    614  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    615  int16x8_t v_abs = vabsq_s16(v_coeff);
    616  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
    617  uint16x8_t vcond;
    618  if (qm_ptr == NULL) {
    619    vcond = vcgeq_s16(v_abs, vzbins);
    620  } else {
    621    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
    622    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    623  }
    624  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    625  if (nz_check) {
    626    vround =
    627        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
    628    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    629    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    630    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
    631 
    632    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
    633 
    634    int16x8_t vtmp2;
    635    if (qm_ptr == NULL) {
    636      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    637    } else {
    638      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
    639      vtmp2 = vaddq_s16(vtmp2, vtmp);
    640    }
    641 
    642    vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);
    643    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    644    int16x8_t coeff_nz_mask =
    645        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    646    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
    647 
    648    if (iqm_ptr != NULL) {
    649      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
    650      vdequant = QM_MULL_SHIFT(vdequant, viwt);
    651    }
    652    int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
    653        vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
    654    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    655    coeff_nz_mask =
    656        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    657    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
    658 
    659    vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
    660    vround =
    661        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
    662    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    663    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    664    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
    665 
    666    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    667    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    668    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    669    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    670    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    671  }
    672  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
    673 
    674  for (int i = 8; i < n_coeffs; i += 8) {
    675    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    676    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    677    v_abs = vabsq_s16(v_coeff);
    678 
    679    if (qm_ptr == NULL) {
    680      vcond = vcgeq_s16(v_abs, vzbins);
    681    } else {
    682      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
    683      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    684    }
    685    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    686    if (nz_check) {
    687      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
    688 
    689      int16x8_t vtmp2;
    690      if (qm_ptr == NULL) {
    691        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    692      } else {
    693        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
    694        vtmp2 = vaddq_s16(vtmp2, vtmp);
    695      }
    696      vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);
    697 
    698      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    699      int16x8_t coeff_nz_mask =
    700          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
    701      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
    702 
    703      if (iqm_ptr != NULL) {
    704        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
    705        vdequant = QM_MULL_SHIFT(vdequant, viwt);
    706      }
    707      int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
    708          vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
    709      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    710      coeff_nz_mask =
    711          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
    712      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
    713 
    714      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    715      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    716      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
    717      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    718      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    719    }
    720  }
    721  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
    722 }
    723 
    724 static void aom_quantize_b_helper_64x64_neon(
    725    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    726    const int16_t *round_ptr, const int16_t *quant_ptr,
    727    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    728    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    729    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    730    const qm_val_t *iqm_ptr) {
    731  (void)scan;
    732 
    733  uint16x8_t vwt, viwt;
    734  const int log_scale = 2;
    735  const int16x8_t v_log_scale =
    736      vreinterpretq_s16_s64(vdupq_n_s64(0xFFFEFFFEFFFEFFFE));
    737 
    738  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
    739                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
    740 
    741  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
    742  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
    743 
    744  const int16x8_t zero = vdupq_n_s16(0);
    745  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
    746  int16x8_t v_ones = vnegq_s16(v_eobmax_76543210);
    747 
    748  int16x8_t vzbins = vdupq_n_s16(zbins[1]),
    749            vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
    750  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
    751  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
    752  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
    753 
    754  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
    755  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    756  int16x8_t v_abs = vabsq_s16(v_coeff);
    757  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
    758  uint16x8_t vcond;
    759  if (qm_ptr == NULL) {
    760    vcond = vcgeq_s16(v_abs, vzbins);
    761  } else {
    762    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
    763    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    764  }
    765  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    766  if (nz_check) {
    767    vround =
    768        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
    769    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    770    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    771    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
    772    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
    773 
    774    int16x8_t vtmp2;
    775    if (qm_ptr == NULL) {
    776      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    777    } else {
    778      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
    779      vtmp2 = vaddq_s16(vtmp2, vtmp);
    780    }
    781 
    782    int16x8_t ones =
    783        vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
    784    vtmp2 =
    785        vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
    786    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    787    int16x8_t coeff_nz_mask =
    788        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    789    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
    790 
    791    if (iqm_ptr != NULL) {
    792      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
    793      vdequant = QM_MULL_SHIFT(vdequant, viwt);
    794    }
    795    int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
    796        vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
    797    v_deq_abs =
    798        vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs);
    799    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    800    coeff_nz_mask =
    801        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    802    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
    803 
    804    vround =
    805        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
    806    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    807    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    808    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
    809 
    810    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    811    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    812    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    813    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    814    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    815  }
    816  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
    817 
    818  for (int i = 8; i < n_coeffs; i += 8) {
    819    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    820    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    821    v_abs = vabsq_s16(v_coeff);
    822 
    823    if (qm_ptr == NULL) {
    824      vcond = vcgeq_s16(v_abs, vzbins);
    825    } else {
    826      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
    827      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    828    }
    829    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    830    if (nz_check) {
    831      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
    832 
    833      int16x8_t vtmp2;
    834      if (qm_ptr == NULL) {
    835        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    836      } else {
    837        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
    838        vtmp2 = vaddq_s16(vtmp2, vtmp);
    839      }
    840 
    841      int16x8_t ones =
    842          vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
    843      vtmp2 =
    844          vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
    845      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    846      int16x8_t coeff_nz_mask =
    847          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
    848      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
    849 
    850      if (iqm_ptr != NULL) {
    851        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
    852        vdequant = QM_MULL_SHIFT(vdequant, viwt);
    853      }
    854      int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
    855          vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
    856      v_deq_abs =
    857          vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs);
    858      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    859      coeff_nz_mask =
    860          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
    861      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
    862 
    863      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    864      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    865      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
    866      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    867      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    868    }
    869  }
    870  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
    871 }
    872 
    873 void aom_quantize_b_helper_neon(
    874    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    875    const int16_t *round_ptr, const int16_t *quant_ptr,
    876    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    877    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    878    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    879    const qm_val_t *iqm_ptr, const int log_scale) {
    880  switch (log_scale) {  // log_scale for AV1 encoder can be only 0, 1, 2
    881    case 0:
    882      aom_quantize_b_helper_16x16_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
    883                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
    884                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
    885                                       iscan, qm_ptr, iqm_ptr);
    886      break;
    887    case 1:
    888      aom_quantize_b_helper_32x32_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
    889                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
    890                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
    891                                       iscan, qm_ptr, iqm_ptr);
    892      break;
    893    case 2:
    894      aom_quantize_b_helper_64x64_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
    895                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
    896                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
    897                                       iscan, qm_ptr, iqm_ptr);
    898      break;
    899  }
    900 }
    901 
    902 void aom_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    903                               const int16_t *zbin_ptr,
    904                               const int16_t *round_ptr,
    905                               const int16_t *quant_ptr,
    906                               const int16_t *quant_shift_ptr,
    907                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
    908                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
    909                               const int16_t *scan, const int16_t *iscan) {
    910  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
    911                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
    912                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
    913                             NULL, NULL, 1);
    914 }
    915 
    916 void aom_quantize_b_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    917                               const int16_t *zbin_ptr,
    918                               const int16_t *round_ptr,
    919                               const int16_t *quant_ptr,
    920                               const int16_t *quant_shift_ptr,
    921                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
    922                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
    923                               const int16_t *scan, const int16_t *iscan) {
    924  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
    925                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
    926                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
    927                             NULL, NULL, 2);
    928 }