/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include <assert.h>
#include <math.h>
#include <string.h>  // memset

#include "config/aom_config.h"

#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_mem/aom_mem.h"

#include "av1/common/quant_common.h"
#include "av1/common/seg_common.h"

#include "av1/encoder/av1_quantize.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/rd.h"
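
// Reduce the eight lane maxima in v_eobmax to a single scalar end-of-block
// value. AArch64 provides a horizontal-max instruction; on Armv7 the vector
// is instead folded in halves (8 -> 4 -> 2 -> 1 lanes) with pairwise maxima.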
static inline uint16_t get_max_eob(int16x8_t v_eobmax) {
#if AOM_ARCH_AARCH64
  return (uint16_t)vmaxvq_s16(v_eobmax);
#else
  const int16x4_t v_eobmax_3210 =
      vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
  const int64x1_t v_eobmax_xx32 =
      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
  const int16x4_t v_eobmax_tmp =
      vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
  const int64x1_t v_eobmax_xxx3 =
      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
  const int16x4_t v_eobmax_final =
      vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
  return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
#endif
}

static inline int16x8_t get_max_lane_eob(const int16_t *iscan,
                                         int16x8_t v_eobmax,
                                         uint16x8_t v_mask) {
  const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
  const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
  const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
  return vmaxq_s16(v_eobmax, v_nz_iscan);
}
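
// Vector form of the scalar quantizer
//   qcoeff = (abs(coeff) + round) * quant >> 16
// vqdmulhq_s16 computes a saturating (2 * a * b) >> 16; the doubling is
// undone by the following right shift by 1. The sign is restored with the
// identity (x ^ sign) - sign, where sign is all-ones for negative
// coefficients (arithmetic shift right by 15) and all-zeros otherwise.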
static inline uint16x8_t quantize_fp_8(const tran_low_t *coeff_ptr,
                                       tran_low_t *qcoeff_ptr,
                                       tran_low_t *dqcoeff_ptr,
                                       int16x8_t v_quant, int16x8_t v_dequant,
                                       int16x8_t v_round, int16x8_t v_zero) {
  const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  const int16x8_t v_abs = vabsq_s16(v_coeff);
  const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
  const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1);
  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
  const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
  const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
  const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
  store_s16q_to_tran_low(&qcoeff_ptr[0], v_qcoeff);
  store_s16q_to_tran_low(&dqcoeff_ptr[0], v_dqcoeff);
  return v_nz_mask;
}

void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
                          const int16_t *zbin_ptr, const int16_t *round_ptr,
                          const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr,
                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan, const int16_t *iscan) {
  // TODO(jingning) Decide the need of these arguments after the
  // quantization process is completed.
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;

  // Quantization pass: All coefficients with index >= zero_flag are
  // skippable. Note: zero_flag can be zero.
  const int16x8_t v_zero = vdupq_n_s16(0);
  int16x8_t v_quant = vld1q_s16(quant_ptr);
  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
  int16x8_t v_round = vld1q_s16(round_ptr);
  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
  uint16x8_t v_nz_mask;
  // process dc and the first seven ac coeffs
  v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                            v_dequant, v_round, v_zero);
  v_eobmax_76543210 = get_max_lane_eob(&iscan[0], v_eobmax_76543210, v_nz_mask);
  // overwrite the dc constants with ac constants
  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);

  count -= 8;
  // now process the rest of the ac coeffs
  do {
    coeff_ptr += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    iscan += 8;
    v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                              v_dequant, v_round, v_zero);
    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
    count -= 8;
  } while (count > 0);
  *eob_ptr = get_max_eob(v_eobmax_76543210);
}
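
// Low-precision variant: the arithmetic matches quantize_fp_8() above, but
// coefficients are loaded from and stored to plain int16_t buffers rather
// than tran_low_t.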
static inline uint16x8_t quantize_lp_8(const int16_t *coeff_ptr,
                                       int16_t *qcoeff_ptr,
                                       int16_t *dqcoeff_ptr, int16x8_t v_quant,
                                       int16x8_t v_dequant, int16x8_t v_round,
                                       int16x8_t v_zero) {
  const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  const int16x8_t v_abs = vabsq_s16(v_coeff);
  const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
  const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1);
  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
  const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
  const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
  const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
  vst1q_s16(qcoeff_ptr, v_qcoeff);
  vst1q_s16(dqcoeff_ptr, v_dqcoeff);
  return v_nz_mask;
}

void av1_quantize_lp_neon(const int16_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan, const int16_t *iscan) {
  (void)scan;
  // Quantization pass: All coefficients with index >= zero_flag are
  // skippable. Note: zero_flag can be zero.
  const int16x8_t v_zero = vdupq_n_s16(0);
  int16x8_t v_quant = vld1q_s16(quant_ptr);
  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
  int16x8_t v_round = vld1q_s16(round_ptr);
  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
  uint16x8_t v_nz_mask;
  intptr_t count = n_coeffs;

  // process dc and the first seven ac coeffs
  v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                            v_dequant, v_round, v_zero);
  v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
  // overwrite the dc constants with ac constants
  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);

  count -= 8;
  // now process the rest of the ac coeffs
  do {
    coeff_ptr += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    iscan += 8;
    v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                              v_dequant, v_round, v_zero);
    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
    count -= 8;
  } while (count != 0);
  *eob_ptr = get_max_eob(v_eobmax_76543210);
}

static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale_8(
    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant,
    int16x8_t v_round, int16x8_t v_zero, int log_scale) {
  const int16x8_t v_log_scale_minus_1 = vdupq_n_s16(log_scale - 1);
  const int16x8_t v_neg_log_scale_plus_1 = vdupq_n_s16(-(1 + log_scale));
  const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
  const uint16x8_t v_mask =
      vcgeq_s16(v_abs_coeff, vshlq_s16(v_dequant, v_neg_log_scale_plus_1));
  // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
  const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round),
                                    vreinterpretq_s16_u16(v_mask));
  const int16x8_t v_tmp2 =
      vqdmulhq_s16(vshlq_s16(v_tmp, v_log_scale_minus_1), v_quant);
  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
  const int16x8_t v_qcoeff =
      vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign);
  // Multiplying by dequant here will use all 16 bits. Cast to unsigned before
  // shifting right. (vshlq_s16 will shift right if shift value is negative)
  const uint16x8_t v_abs_dqcoeff =
      vshlq_u16(vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)),
                vdupq_n_s16(-log_scale));
  const int16x8_t v_dqcoeff =
      vsubq_s16(veorq_s16(vreinterpretq_s16_u16(v_abs_dqcoeff), v_coeff_sign),
                v_coeff_sign);
  store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
  store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
  return v_nz_mask;
}
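
// For log_scale == 2 the product (tmp * quant) >> 14 needs a low-order bit
// that the high-half product alone cannot supply, so the vqdmulhq_s16 result
// (equal to (a * b) >> 15) is shifted into place and OR-ed with the relevant
// bits of the plain 16-bit low-half product. The dequantized magnitude,
// (tmp32 * dequant) >> 2, is reconstructed the same way.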
static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale2_8(
    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant,
    int16x8_t v_round, int16x8_t v_zero) {
  const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
  const uint16x8_t v_mask =
      vcgeq_u16(vshlq_n_u16(vreinterpretq_u16_s16(v_abs_coeff), 1),
                vshrq_n_u16(vreinterpretq_u16_s16(v_dequant), 2));
  // abs_coeff = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
  const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round),
                                    vreinterpretq_s16_u16(v_mask));
  // tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
  const int16x8_t v_tmp2 =
      vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1),
                vreinterpretq_s16_u16(vshrq_n_u16(
                    vreinterpretq_u16_s16(vmulq_s16(v_tmp, v_quant)), 14)));
  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
  const int16x8_t v_qcoeff =
      vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign);
  // const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[rc != 0]) >> log_scale;
  const int16x8_t v_abs_dqcoeff =
      vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp2, v_dequant), 13),
                vreinterpretq_s16_u16(vshrq_n_u16(
                    vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)), 2)));
  const int16x8_t v_dqcoeff =
      vsubq_s16(veorq_s16(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
  store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
  store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
  return v_nz_mask;
}

static AOM_FORCE_INLINE void quantize_fp_no_qmatrix_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
    const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
    const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *iscan,
    int log_scale) {
  const int16x8_t v_zero = vdupq_n_s16(0);
  int16x8_t v_quant = vld1q_s16(quant_ptr);
  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
  const int16x8_t v_round_no_scale = vld1q_s16(round_ptr);
  int16x8_t v_round =
      vqrdmulhq_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
  intptr_t non_zero_count = n_coeffs;

  assert(n_coeffs > 16);
  // Pre-scan pass
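  // Walk the coefficients backwards in groups of 16 and trim non_zero_count
  // down to the last group containing a value at or above the zero-bin
  // threshold; the all-zero tail is then cleared with one pair of memsets
  // and skipped entirely by the quantization loop below.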
  const int16x8_t v_dequant_scaled =
      vshlq_s16(v_dequant, vdupq_n_s16(-(1 + log_scale)));
  const int16x8_t v_zbin_s16 =
      vdupq_lane_s16(vget_low_s16(v_dequant_scaled), 1);
  intptr_t i = n_coeffs;
  do {
    const int16x8_t v_coeff_a = load_tran_low_to_s16q(coeff_ptr + i - 8);
    const int16x8_t v_coeff_b = load_tran_low_to_s16q(coeff_ptr + i - 16);
    const int16x8_t v_abs_coeff_a = vabsq_s16(v_coeff_a);
    const int16x8_t v_abs_coeff_b = vabsq_s16(v_coeff_b);
    const uint16x8_t v_mask_a = vcgeq_s16(v_abs_coeff_a, v_zbin_s16);
    const uint16x8_t v_mask_b = vcgeq_s16(v_abs_coeff_b, v_zbin_s16);
    // If the coefficient is in the base ZBIN range, then discard.
    if (horizontal_long_add_u16x8(v_mask_a, v_mask_b) == 0) {
      non_zero_count -= 16;
    } else {
      break;
    }
    i -= 16;
  } while (i > 0);

  const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count;
  memset(qcoeff_ptr + non_zero_count, 0,
         remaining_zcoeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr + non_zero_count, 0,
         remaining_zcoeffs * sizeof(*dqcoeff_ptr));

  // process dc and the first seven ac coeffs
  uint16x8_t v_nz_mask;
  if (log_scale == 2) {
    v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
                                        v_quant, v_dequant, v_round, v_zero);
  } else {
    v_nz_mask =
        quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                               v_dequant, v_round, v_zero, log_scale);
  }
  v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
  // overwrite the dc constants with ac constants
  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);

  for (intptr_t count = non_zero_count - 8; count > 0; count -= 8) {
    coeff_ptr += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    iscan += 8;
    if (log_scale == 2) {
      v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
                                          v_quant, v_dequant, v_round, v_zero);
    } else {
      v_nz_mask =
          quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                                 v_dequant, v_round, v_zero, log_scale);
    }
    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210);
}

void av1_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;
  quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
                              qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
                              iscan, 1);
}

void av1_quantize_fp_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;
  quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
                              qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
                              iscan, 2);
}
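
// The quantize_b functions below implement the two-stage AV1 quantizer
//   tmp2 = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16
// with tmp = saturate16(abs(coeff) + round), applied only to coefficients at
// or above the zero-bin threshold. For each group of eight coefficients,
// vmovn_u16 narrows the zbin comparison mask so that a single 64-bit lane
// read (nz_check) decides whether the whole group can be skipped.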
void aom_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const int16_t *zbin_ptr, const int16_t *round_ptr,
                         const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                         uint16_t *eob_ptr, const int16_t *scan,
                         const int16_t *iscan) {
  (void)quant_shift_ptr;
  (void)scan;

  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));

  int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  int16x8_t v_abs = vabsq_s16(v_coeff);

  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);

  uint16x8_t vcond = vcgeq_s16(v_abs, vzbins);
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    vround = vsetq_lane_s16(round_ptr[0], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);

    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
    int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);

    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
    int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);

    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    vround = vsetq_lane_s16(round_ptr[1], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);

  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);
    vcond = vcgeq_s16(v_abs, vzbins);

    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
      int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);

      vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
      int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
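
// QM_MULL_SHIFT(x0, x1) computes (x0 * x1) >> AOM_QM_BITS for a signed
// 16-bit value x0 and a 16-bit quantization-matrix weight x1. As in
// quantize_fp_logscale2_8(), the high half of the product delivered by
// vqdmulhq_s16 is OR-ed with the low-half 16-bit product to recover the
// middle bits of the full 32-bit product.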
#define QM_MULL_SHIFT(x0, x1)                                              \
  vreinterpretq_s16_u16(vorrq_u16(                                         \
      vreinterpretq_u16_s16(vshlq_n_s16(                                   \
          vqdmulhq_s16(x0, vreinterpretq_s16_u16(x1)), 15 - AOM_QM_BITS)), \
      vshrq_n_u16(vmulq_u16(vreinterpretq_u16_s16(x0), x1), AOM_QM_BITS)))

static void aom_quantize_b_helper_16x16_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr) {
  (void)scan;

  uint16x8_t vwt, viwt;
  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));

  int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  int16x8_t v_abs = vabsq_s16(v_coeff);
  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
  uint16x8_t vcond;
  if (qm_ptr == NULL) {
    vcond = vcgeq_s16(v_abs, vzbins);
  } else {
    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
  }
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    vround = vsetq_lane_s16(round_ptr[0], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);

    int16x8_t vtmp = vqaddq_s16(v_abs, vround);

    int16x8_t vtmp2;
    if (qm_ptr == NULL) {
      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    } else {
      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
      vtmp2 = vaddq_s16(vtmp2, vtmp);
    }

    vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);

    if (iqm_ptr != NULL) {
      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
      vdequant = QM_MULL_SHIFT(vdequant, viwt);
    }
    int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    vround = vsetq_lane_s16(round_ptr[1], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);

  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);

    if (qm_ptr == NULL) {
      vcond = vcgeq_s16(v_abs, vzbins);
    } else {
      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    }
    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);

      int16x8_t vtmp2;
      if (qm_ptr == NULL) {
        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
      } else {
        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
        vtmp2 = vaddq_s16(vtmp2, vtmp);
      }

      vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);

      if (iqm_ptr != NULL) {
        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
        vdequant = QM_MULL_SHIFT(vdequant, viwt);
      }
      int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
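
// log_scale == 1 variant: zbin and round are pre-scaled with
// ROUND_POWER_OF_TWO, and the quant_shift stage keeps the vqdmulhq_s16
// result unshifted, which equals (tmp2 * quant_shift) >> 15. The all-ones
// eob vector doubles as the per-lane shift amount of -1 (v_log_scale) that
// divides the dequantized magnitude by 2.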
static void aom_quantize_b_helper_32x32_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr) {
  (void)scan;

  uint16x8_t vwt, viwt;
  const int log_scale = 1;
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
  const int16x8_t v_log_scale = v_eobmax_76543210;

  int16x8_t vzbins = vdupq_n_s16(zbins[1]),
            vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  int16x8_t v_abs = vabsq_s16(v_coeff);
  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
  uint16x8_t vcond;
  if (qm_ptr == NULL) {
    vcond = vcgeq_s16(v_abs, vzbins);
  } else {
    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
  }
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    vround =
        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);

    int16x8_t vtmp = vqaddq_s16(v_abs, vround);

    int16x8_t vtmp2;
    if (qm_ptr == NULL) {
      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    } else {
      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
      vtmp2 = vaddq_s16(vtmp2, vtmp);
    }

    vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);
    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);

    if (iqm_ptr != NULL) {
      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
      vdequant = QM_MULL_SHIFT(vdequant, viwt);
    }
    int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
        vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
    vround =
        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);

  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);

    if (qm_ptr == NULL) {
      vcond = vcgeq_s16(v_abs, vzbins);
    } else {
      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    }
    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);

      int16x8_t vtmp2;
      if (qm_ptr == NULL) {
        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
      } else {
        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
        vtmp2 = vaddq_s16(vtmp2, vtmp);
      }
      vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);

      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);

      if (iqm_ptr != NULL) {
        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
        vdequant = QM_MULL_SHIFT(vdequant, viwt);
      }
      int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
          vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
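
// log_scale == 2 variant: the quant_shift stage needs
// (tmp2 * quant_shift) >> 14, so bit 14 of the low-half product ("ones") is
// added back onto the doubled vqdmulhq_s16 result. v_log_scale holds -2 in
// every lane, shifting the dequantized magnitude right by log_scale, and the
// bits lost from the 16-bit low half are OR-ed back in from the high-half
// product.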
static void aom_quantize_b_helper_64x64_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr) {
  (void)scan;

  uint16x8_t vwt, viwt;
  const int log_scale = 2;
  const int16x8_t v_log_scale =
      vreinterpretq_s16_s64(vdupq_n_s64(0xFFFEFFFEFFFEFFFE));

  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
  int16x8_t v_ones = vnegq_s16(v_eobmax_76543210);

  int16x8_t vzbins = vdupq_n_s16(zbins[1]),
            vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  int16x8_t v_abs = vabsq_s16(v_coeff);
  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
  uint16x8_t vcond;
  if (qm_ptr == NULL) {
    vcond = vcgeq_s16(v_abs, vzbins);
  } else {
    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
  }
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    vround =
        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
    int16x8_t vtmp = vqaddq_s16(v_abs, vround);

    int16x8_t vtmp2;
    if (qm_ptr == NULL) {
      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    } else {
      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
      vtmp2 = vaddq_s16(vtmp2, vtmp);
    }

    int16x8_t ones =
        vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
    vtmp2 =
        vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);

    if (iqm_ptr != NULL) {
      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
      vdequant = QM_MULL_SHIFT(vdequant, viwt);
    }
    int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
        vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
    v_deq_abs =
        vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs);
    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    vround =
        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);

  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);

    if (qm_ptr == NULL) {
      vcond = vcgeq_s16(v_abs, vzbins);
    } else {
      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    }
    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);

      int16x8_t vtmp2;
      if (qm_ptr == NULL) {
        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
      } else {
        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
        vtmp2 = vaddq_s16(vtmp2, vtmp);
      }

      int16x8_t ones =
          vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
      vtmp2 =
          vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);

      if (iqm_ptr != NULL) {
        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
        vdequant = QM_MULL_SHIFT(vdequant, viwt);
      }
      int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
          vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
      v_deq_abs =
          vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs);
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
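
// Dispatch on log_scale; the _16x16/_32x32/_64x64 suffixes of the helpers
// name the log_scale 0/1/2 tiers rather than one literal block size.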
void aom_quantize_b_helper_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr, const int log_scale) {
  switch (log_scale) {  // log_scale for AV1 encoder can be only 0, 1, 2
    case 0:
      aom_quantize_b_helper_16x16_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
                                       iscan, qm_ptr, iqm_ptr);
      break;
    case 1:
      aom_quantize_b_helper_32x32_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
                                       iscan, qm_ptr, iqm_ptr);
      break;
    case 2:
      aom_quantize_b_helper_64x64_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
                                       iscan, qm_ptr, iqm_ptr);
      break;
  }
}

void aom_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                               const int16_t *zbin_ptr,
                               const int16_t *round_ptr,
                               const int16_t *quant_ptr,
                               const int16_t *quant_shift_ptr,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
                               const int16_t *scan, const int16_t *iscan) {
  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
                             NULL, NULL, 1);
}

void aom_quantize_b_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                               const int16_t *zbin_ptr,
                               const int16_t *round_ptr,
                               const int16_t *quant_ptr,
                               const int16_t *quant_shift_ptr,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
                               const int16_t *scan, const int16_t *iscan) {
  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
                             NULL, NULL, 2);
}