av1_quantize_avx2.c (15984B)
1 /* 2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <immintrin.h> 13 14 #include "config/av1_rtcd.h" 15 16 #include "aom/aom_integer.h" 17 #include "aom_dsp/aom_dsp_common.h" 18 19 static inline void write_zero(tran_low_t *qcoeff) { 20 const __m256i zero = _mm256_setzero_si256(); 21 _mm256_storeu_si256((__m256i *)qcoeff, zero); 22 _mm256_storeu_si256((__m256i *)qcoeff + 1, zero); 23 } 24 25 static inline void init_one_qp(const __m128i *p, __m256i *qp) { 26 const __m128i ac = _mm_unpackhi_epi64(*p, *p); 27 *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(*p), ac, 1); 28 } 29 30 static inline void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, 31 const int16_t *dequant_ptr, int log_scale, 32 __m256i *thr, __m256i *qp) { 33 __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); 34 const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); 35 const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); 36 37 if (log_scale > 0) { 38 const __m128i rnd = _mm_set1_epi16((int16_t)1 << (log_scale - 1)); 39 round = _mm_add_epi16(round, rnd); 40 round = _mm_srai_epi16(round, log_scale); 41 } 42 43 init_one_qp(&round, &qp[0]); 44 init_one_qp(&quant, &qp[1]); 45 46 if (log_scale == 1) { 47 qp[1] = _mm256_slli_epi16(qp[1], log_scale); 48 } 49 50 init_one_qp(&dequant, &qp[2]); 51 *thr = _mm256_srai_epi16(qp[2], 1 + log_scale); 52 // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when 53 // calculating the zbin mask. 54 *thr = _mm256_sub_epi16(*thr, _mm256_set1_epi16(1)); 55 } 56 57 static inline void update_qp(__m256i *thr, __m256i *qp) { 58 qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11); 59 qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11); 60 qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11); 61 *thr = _mm256_permute2x128_si256(*thr, *thr, 0x11); 62 } 63 64 static inline __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { 65 const __m256i coeff1 = _mm256_load_si256((__m256i *)coeff_ptr); 66 const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); 67 return _mm256_packs_epi32(coeff1, coeff2); 68 } 69 70 static inline void store_coefficients_avx2(__m256i coeff_vals, 71 tran_low_t *coeff_ptr) { 72 __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); 73 __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); 74 __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); 75 _mm256_store_si256((__m256i *)coeff_ptr, coeff_vals_lo); 76 _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); 77 } 78 79 static inline uint16_t quant_gather_eob(__m256i eob) { 80 const __m128i eob_lo = _mm256_castsi256_si128(eob); 81 const __m128i eob_hi = _mm256_extractf128_si256(eob, 1); 82 __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi); 83 eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s); 84 eob_s = _mm_minpos_epu16(eob_s); 85 return INT16_MAX - _mm_extract_epi16(eob_s, 0); 86 } 87 88 static inline int16_t accumulate_eob256(__m256i eob256) { 89 const __m128i eob_lo = _mm256_castsi256_si128(eob256); 90 const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1); 91 __m128i eob = _mm_max_epi16(eob_lo, eob_hi); 92 __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe); 93 eob = _mm_max_epi16(eob, eob_shuffled); 94 eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); 95 eob = _mm_max_epi16(eob, eob_shuffled); 96 eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); 97 eob = _mm_max_epi16(eob, eob_shuffled); 98 return _mm_extract_epi16(eob, 1); 99 } 100 101 static AOM_FORCE_INLINE void quantize_lp_16_first( 102 const int16_t *coeff_ptr, const int16_t *iscan_ptr, int16_t *qcoeff_ptr, 103 int16_t *dqcoeff_ptr, __m256i *round256, __m256i *quant256, 104 __m256i *dequant256, __m256i *eob) { 105 const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); 106 const __m256i abs_coeff = _mm256_abs_epi16(coeff); 107 const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round256); 108 const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant256); 109 const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); 110 const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant256); 111 const __m256i nz_mask = 112 _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); 113 114 _mm256_storeu_si256((__m256i *)qcoeff_ptr, qcoeff); 115 _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dqcoeff); 116 117 const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr); 118 const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, nz_mask); 119 const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, nz_mask); 120 *eob = _mm256_max_epi16(*eob, nz_iscan); 121 } 122 123 static AOM_FORCE_INLINE void quantize_lp_16( 124 const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *iscan_ptr, 125 int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, __m256i *round256, 126 __m256i *quant256, __m256i *dequant256, __m256i *eob) { 127 const __m256i coeff = 128 _mm256_loadu_si256((const __m256i *)(coeff_ptr + n_coeffs)); 129 const __m256i abs_coeff = _mm256_abs_epi16(coeff); 130 const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round256); 131 const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant256); 132 const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); 133 const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant256); 134 const __m256i nz_mask = 135 _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); 136 137 _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff); 138 _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), dqcoeff); 139 140 const __m256i iscan = 141 _mm256_loadu_si256((const __m256i *)(iscan_ptr + n_coeffs)); 142 const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, nz_mask); 143 const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, nz_mask); 144 *eob = _mm256_max_epi16(*eob, nz_iscan); 145 } 146 147 void av1_quantize_lp_avx2(const int16_t *coeff_ptr, intptr_t n_coeffs, 148 const int16_t *round_ptr, const int16_t *quant_ptr, 149 int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, 150 const int16_t *dequant_ptr, uint16_t *eob_ptr, 151 const int16_t *scan, const int16_t *iscan) { 152 (void)scan; 153 __m256i eob256 = _mm256_setzero_si256(); 154 155 // Setup global values. 156 __m256i round256 = 157 _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); 158 __m256i quant256 = 159 _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); 160 __m256i dequant256 = 161 _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); 162 163 // Populate upper AC values. 164 round256 = _mm256_permute4x64_epi64(round256, 0x54); 165 quant256 = _mm256_permute4x64_epi64(quant256, 0x54); 166 dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54); 167 168 // Process DC and the first 15 AC coeffs. 169 quantize_lp_16_first(coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &round256, 170 &quant256, &dequant256, &eob256); 171 172 if (n_coeffs > 16) { 173 // Overwrite the DC constants with AC constants 174 dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31); 175 quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31); 176 round256 = _mm256_permute2x128_si256(round256, round256, 0x31); 177 178 // AC only loop. 179 for (int idx = 16; idx < n_coeffs; idx += 16) { 180 quantize_lp_16(coeff_ptr, idx, iscan, qcoeff_ptr, dqcoeff_ptr, &round256, 181 &quant256, &dequant256, &eob256); 182 } 183 } 184 185 *eob_ptr = accumulate_eob256(eob256); 186 } 187 188 static AOM_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, 189 __m256i v_eobmax, 190 __m256i v_mask) { 191 const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); 192 const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); 193 const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask); 194 const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask); 195 return _mm256_max_epi16(v_eobmax, v_nz_iscan); 196 } 197 198 static AOM_FORCE_INLINE void quantize_fp_16( 199 const __m256i *thr, const __m256i *qp, const tran_low_t *coeff_ptr, 200 const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, 201 __m256i *eob) { 202 const __m256i coeff = load_coefficients_avx2(coeff_ptr); 203 const __m256i abs_coeff = _mm256_abs_epi16(coeff); 204 const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); 205 const int nzflag = _mm256_movemask_epi8(mask); 206 207 if (nzflag) { 208 const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, qp[0]); 209 const __m256i abs_q = _mm256_mulhi_epi16(tmp_rnd, qp[1]); 210 const __m256i q = _mm256_sign_epi16(abs_q, coeff); 211 const __m256i dq = _mm256_mullo_epi16(q, qp[2]); 212 const __m256i nz_mask = _mm256_cmpgt_epi16(abs_q, _mm256_setzero_si256()); 213 214 store_coefficients_avx2(q, qcoeff_ptr); 215 store_coefficients_avx2(dq, dqcoeff_ptr); 216 217 *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); 218 } else { 219 write_zero(qcoeff_ptr); 220 write_zero(dqcoeff_ptr); 221 } 222 } 223 224 void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, 225 const int16_t *zbin_ptr, const int16_t *round_ptr, 226 const int16_t *quant_ptr, 227 const int16_t *quant_shift_ptr, 228 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, 229 const int16_t *dequant_ptr, uint16_t *eob_ptr, 230 const int16_t *scan_ptr, const int16_t *iscan_ptr) { 231 (void)scan_ptr; 232 (void)zbin_ptr; 233 (void)quant_shift_ptr; 234 235 const int log_scale = 0; 236 const int step = 16; 237 __m256i qp[3], thr; 238 __m256i eob = _mm256_setzero_si256(); 239 240 init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); 241 242 quantize_fp_16(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); 243 244 coeff_ptr += step; 245 qcoeff_ptr += step; 246 dqcoeff_ptr += step; 247 iscan_ptr += step; 248 n_coeffs -= step; 249 250 update_qp(&thr, qp); 251 252 while (n_coeffs > 0) { 253 quantize_fp_16(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, 254 &eob); 255 256 coeff_ptr += step; 257 qcoeff_ptr += step; 258 dqcoeff_ptr += step; 259 iscan_ptr += step; 260 n_coeffs -= step; 261 } 262 *eob_ptr = quant_gather_eob(eob); 263 } 264 265 static AOM_FORCE_INLINE void quantize_fp_32x32( 266 const __m256i *thr, const __m256i *qp, const tran_low_t *coeff_ptr, 267 const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, 268 __m256i *eob) { 269 const __m256i coeff = load_coefficients_avx2(coeff_ptr); 270 const __m256i abs_coeff = _mm256_abs_epi16(coeff); 271 const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); 272 const int nzflag = _mm256_movemask_epi8(mask); 273 274 if (nzflag) { 275 const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, qp[0]); 276 const __m256i abs_q = _mm256_mulhi_epu16(tmp_rnd, qp[1]); 277 const __m256i q = _mm256_sign_epi16(abs_q, coeff); 278 const __m256i abs_dq = 279 _mm256_srli_epi16(_mm256_mullo_epi16(abs_q, qp[2]), 1); 280 const __m256i nz_mask = _mm256_cmpgt_epi16(abs_q, _mm256_setzero_si256()); 281 const __m256i dq = _mm256_sign_epi16(abs_dq, coeff); 282 283 store_coefficients_avx2(q, qcoeff_ptr); 284 store_coefficients_avx2(dq, dqcoeff_ptr); 285 286 *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); 287 } else { 288 write_zero(qcoeff_ptr); 289 write_zero(dqcoeff_ptr); 290 } 291 } 292 293 void av1_quantize_fp_32x32_avx2( 294 const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, 295 const int16_t *round_ptr, const int16_t *quant_ptr, 296 const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 297 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, 298 const int16_t *scan_ptr, const int16_t *iscan_ptr) { 299 (void)scan_ptr; 300 (void)zbin_ptr; 301 (void)quant_shift_ptr; 302 303 const int log_scale = 1; 304 const unsigned int step = 16; 305 __m256i qp[3], thr; 306 __m256i eob = _mm256_setzero_si256(); 307 308 init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); 309 310 quantize_fp_32x32(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, 311 &eob); 312 313 coeff_ptr += step; 314 qcoeff_ptr += step; 315 dqcoeff_ptr += step; 316 iscan_ptr += step; 317 n_coeffs -= step; 318 319 update_qp(&thr, qp); 320 321 while (n_coeffs > 0) { 322 quantize_fp_32x32(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, 323 &eob); 324 325 coeff_ptr += step; 326 qcoeff_ptr += step; 327 dqcoeff_ptr += step; 328 iscan_ptr += step; 329 n_coeffs -= step; 330 } 331 *eob_ptr = quant_gather_eob(eob); 332 } 333 334 static inline void quantize_fp_64x64(const __m256i *thr, const __m256i *qp, 335 const tran_low_t *coeff_ptr, 336 const int16_t *iscan_ptr, 337 tran_low_t *qcoeff_ptr, 338 tran_low_t *dqcoeff_ptr, __m256i *eob) { 339 const __m256i coeff = load_coefficients_avx2(coeff_ptr); 340 const __m256i abs_coeff = _mm256_abs_epi16(coeff); 341 const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); 342 const int nzflag = _mm256_movemask_epi8(mask); 343 344 if (nzflag) { 345 const __m256i tmp_rnd = 346 _mm256_and_si256(_mm256_adds_epi16(abs_coeff, qp[0]), mask); 347 const __m256i qh = _mm256_slli_epi16(_mm256_mulhi_epi16(tmp_rnd, qp[1]), 2); 348 const __m256i ql = 349 _mm256_srli_epi16(_mm256_mullo_epi16(tmp_rnd, qp[1]), 14); 350 const __m256i abs_q = _mm256_or_si256(qh, ql); 351 const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(abs_q, qp[2]), 14); 352 const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(abs_q, qp[2]), 2); 353 const __m256i abs_dq = _mm256_or_si256(dqh, dql); 354 const __m256i q = _mm256_sign_epi16(abs_q, coeff); 355 const __m256i dq = _mm256_sign_epi16(abs_dq, coeff); 356 // Check the signed q/dq value here instead of the absolute value. When 357 // dequant equals 4, the dequant threshold (*thr) becomes 0 after being 358 // scaled down by (1 + log_scale). See init_qp(). When *thr is 0 and the 359 // abs_coeff is 0, the nzflag will be set. As a result, the eob will be 360 // incorrectly calculated. The psign instruction corrects the error by 361 // zeroing out q/dq if coeff is zero. 362 const __m256i z_mask = _mm256_cmpeq_epi16(dq, _mm256_setzero_si256()); 363 const __m256i nz_mask = _mm256_cmpeq_epi16(z_mask, _mm256_setzero_si256()); 364 365 store_coefficients_avx2(q, qcoeff_ptr); 366 store_coefficients_avx2(dq, dqcoeff_ptr); 367 368 *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); 369 } else { 370 write_zero(qcoeff_ptr); 371 write_zero(dqcoeff_ptr); 372 } 373 } 374 375 void av1_quantize_fp_64x64_avx2( 376 const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, 377 const int16_t *round_ptr, const int16_t *quant_ptr, 378 const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 379 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, 380 const int16_t *scan_ptr, const int16_t *iscan_ptr) { 381 (void)scan_ptr; 382 (void)zbin_ptr; 383 (void)quant_shift_ptr; 384 385 const int log_scale = 2; 386 const unsigned int step = 16; 387 __m256i qp[3], thr; 388 __m256i eob = _mm256_setzero_si256(); 389 390 init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); 391 392 quantize_fp_64x64(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, 393 &eob); 394 395 coeff_ptr += step; 396 qcoeff_ptr += step; 397 dqcoeff_ptr += step; 398 iscan_ptr += step; 399 n_coeffs -= step; 400 401 update_qp(&thr, qp); 402 403 while (n_coeffs > 0) { 404 quantize_fp_64x64(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, 405 &eob); 406 407 coeff_ptr += step; 408 qcoeff_ptr += step; 409 dqcoeff_ptr += step; 410 iscan_ptr += step; 411 n_coeffs -= step; 412 } 413 *eob_ptr = quant_gather_eob(eob); 414 }