quantize_sse2.c (4297B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 #include <emmintrin.h> 14 #include <xmmintrin.h> 15 16 #include "config/aom_dsp_rtcd.h" 17 18 #include "aom/aom_integer.h" 19 #include "aom_dsp/x86/quantize_x86.h" 20 21 void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, 22 const int16_t *zbin_ptr, const int16_t *round_ptr, 23 const int16_t *quant_ptr, 24 const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 25 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, 26 uint16_t *eob_ptr, const int16_t *scan_ptr, 27 const int16_t *iscan_ptr) { 28 const __m128i zero = _mm_setzero_si128(); 29 int index = 16; 30 31 __m128i zbin, round, quant, dequant, shift; 32 __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; 33 __m128i qcoeff0, qcoeff1; 34 __m128i cmp_mask0, cmp_mask1; 35 __m128i eob, eob0; 36 37 (void)scan_ptr; 38 39 // Setup global values. 40 load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, 41 dequant_ptr, &dequant, quant_shift_ptr, &shift); 42 43 // Do DC and first 15 AC. 44 coeff0 = load_coefficients(coeff_ptr); 45 coeff1 = load_coefficients(coeff_ptr + 8); 46 47 // Poor man's abs(). 48 coeff0_sign = _mm_srai_epi16(coeff0, 15); 49 coeff1_sign = _mm_srai_epi16(coeff1, 15); 50 qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); 51 qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); 52 53 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 54 zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC 55 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 56 57 calculate_qcoeff(&qcoeff0, round, quant, shift); 58 59 round = _mm_unpackhi_epi64(round, round); 60 quant = _mm_unpackhi_epi64(quant, quant); 61 shift = _mm_unpackhi_epi64(shift, shift); 62 63 calculate_qcoeff(&qcoeff1, round, quant, shift); 64 65 // Reinsert signs 66 qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); 67 qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); 68 69 // Mask out zbin threshold coeffs 70 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 71 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 72 73 store_coefficients(qcoeff0, qcoeff_ptr); 74 store_coefficients(qcoeff1, qcoeff_ptr + 8); 75 76 coeff0 = calculate_dqcoeff(qcoeff0, dequant); 77 dequant = _mm_unpackhi_epi64(dequant, dequant); 78 coeff1 = calculate_dqcoeff(qcoeff1, dequant); 79 80 store_coefficients(coeff0, dqcoeff_ptr); 81 store_coefficients(coeff1, dqcoeff_ptr + 8); 82 83 eob = 84 scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); 85 86 // AC only loop. 87 while (index < n_coeffs) { 88 coeff0 = load_coefficients(coeff_ptr + index); 89 coeff1 = load_coefficients(coeff_ptr + index + 8); 90 91 coeff0_sign = _mm_srai_epi16(coeff0, 15); 92 coeff1_sign = _mm_srai_epi16(coeff1, 15); 93 qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); 94 qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); 95 96 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 97 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 98 99 calculate_qcoeff(&qcoeff0, round, quant, shift); 100 calculate_qcoeff(&qcoeff1, round, quant, shift); 101 102 qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); 103 qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); 104 105 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 106 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 107 108 store_coefficients(qcoeff0, qcoeff_ptr + index); 109 store_coefficients(qcoeff1, qcoeff_ptr + index + 8); 110 111 coeff0 = calculate_dqcoeff(qcoeff0, dequant); 112 coeff1 = calculate_dqcoeff(qcoeff1, dequant); 113 114 store_coefficients(coeff0, dqcoeff_ptr + index); 115 store_coefficients(coeff1, dqcoeff_ptr + index + 8); 116 117 eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 118 index, zero); 119 eob = _mm_max_epi16(eob, eob0); 120 121 index += 16; 122 } 123 124 *eob_ptr = accumulate_eob(eob); 125 }