quantize_ssse3.c (7555B)
1 /* 2 * Copyright (c) 2019, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 #include <tmmintrin.h> 14 #include <emmintrin.h> 15 #include <xmmintrin.h> 16 17 #include "config/aom_dsp_rtcd.h" 18 19 #include "aom/aom_integer.h" 20 #include "aom_dsp/x86/quantize_x86.h" 21 22 static inline void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round, 23 const __m128i quant, 24 const __m128i *shift) { 25 __m128i tmp, qcoeff, tmp1; 26 qcoeff = _mm_adds_epi16(*coeff, round); 27 tmp = _mm_mulhi_epi16(qcoeff, quant); 28 qcoeff = _mm_add_epi16(tmp, qcoeff); 29 tmp = _mm_mullo_epi16(qcoeff, *shift); 30 tmp = _mm_srli_epi16(tmp, 14); 31 tmp1 = _mm_mulhi_epi16(qcoeff, *shift); 32 tmp1 = _mm_slli_epi16(tmp1, 2); 33 *coeff = _mm_or_si128(tmp, tmp1); 34 } 35 36 static inline void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff, 37 const __m128i dequant, 38 const __m128i zero, 39 tran_low_t *dqcoeff) { 40 // Un-sign to bias rounding like C. 41 const __m128i coeff = _mm_abs_epi16(qcoeff); 42 43 const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff); 44 const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff); 45 46 const __m128i low = _mm_mullo_epi16(coeff, dequant); 47 const __m128i high = _mm_mulhi_epi16(coeff, dequant); 48 __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); 49 __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); 50 51 // "Divide" by 4. 52 dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 2); 53 dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 2); 54 55 dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0); 56 dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1); 57 58 _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); 59 _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); 60 } 61 62 void aom_quantize_b_64x64_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, 63 const int16_t *zbin_ptr, 64 const int16_t *round_ptr, 65 const int16_t *quant_ptr, 66 const int16_t *quant_shift_ptr, 67 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, 68 const int16_t *dequant_ptr, uint16_t *eob_ptr, 69 const int16_t *scan, const int16_t *iscan) { 70 const __m128i zero = _mm_setzero_si128(); 71 const __m128i one = _mm_set1_epi16(1); 72 const __m128i two = _mm_set1_epi16(2); 73 int index; 74 75 __m128i zbin, round, quant, dequant, shift; 76 __m128i coeff0, coeff1, qcoeff0, qcoeff1; 77 __m128i cmp_mask0, cmp_mask1, all_zero; 78 __m128i eob = zero, eob0; 79 80 (void)scan; 81 (void)n_coeffs; 82 83 // Setup global values. 84 zbin = _mm_load_si128((const __m128i *)zbin_ptr); 85 round = _mm_load_si128((const __m128i *)round_ptr); 86 quant = _mm_load_si128((const __m128i *)quant_ptr); 87 dequant = _mm_load_si128((const __m128i *)dequant_ptr); 88 shift = _mm_load_si128((const __m128i *)quant_shift_ptr); 89 90 // Shift with rounding. 91 zbin = _mm_add_epi16(zbin, two); 92 round = _mm_add_epi16(round, two); 93 zbin = _mm_srli_epi16(zbin, 2); 94 round = _mm_srli_epi16(round, 2); 95 zbin = _mm_sub_epi16(zbin, one); 96 // Do DC and first 15 AC. 97 coeff0 = load_coefficients(coeff_ptr); 98 coeff1 = load_coefficients(coeff_ptr + 8); 99 100 qcoeff0 = _mm_abs_epi16(coeff0); 101 qcoeff1 = _mm_abs_epi16(coeff1); 102 103 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 104 zbin = _mm_unpackhi_epi64(zbin, zbin); 105 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 106 all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); 107 if (_mm_movemask_epi8(all_zero) == 0) { 108 _mm_store_si128((__m128i *)(qcoeff_ptr), zero); 109 _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); 110 _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); 111 _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); 112 _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); 113 _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); 114 _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); 115 _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); 116 round = _mm_unpackhi_epi64(round, round); 117 quant = _mm_unpackhi_epi64(quant, quant); 118 shift = _mm_unpackhi_epi64(shift, shift); 119 dequant = _mm_unpackhi_epi64(dequant, dequant); 120 } else { 121 calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift); 122 round = _mm_unpackhi_epi64(round, round); 123 quant = _mm_unpackhi_epi64(quant, quant); 124 shift = _mm_unpackhi_epi64(shift, shift); 125 calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift); 126 127 // Reinsert signs. 128 qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); 129 qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); 130 131 // Mask out zbin threshold coeffs. 132 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 133 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 134 135 store_coefficients(qcoeff0, qcoeff_ptr); 136 store_coefficients(qcoeff1, qcoeff_ptr + 8); 137 138 calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, dqcoeff_ptr); 139 dequant = _mm_unpackhi_epi64(dequant, dequant); 140 calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, dqcoeff_ptr + 8); 141 142 eob = 143 scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); 144 } 145 146 // AC only loop. 147 for (index = 16; index < 1024; index += 16) { 148 coeff0 = load_coefficients(coeff_ptr + index); 149 coeff1 = load_coefficients(coeff_ptr + index + 8); 150 151 qcoeff0 = _mm_abs_epi16(coeff0); 152 qcoeff1 = _mm_abs_epi16(coeff1); 153 154 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 155 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 156 157 all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); 158 if (_mm_movemask_epi8(all_zero) == 0) { 159 _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); 160 _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); 161 _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); 162 _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); 163 _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); 164 _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); 165 _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); 166 _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); 167 continue; 168 } 169 calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift); 170 calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift); 171 172 qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); 173 qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); 174 175 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 176 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 177 178 store_coefficients(qcoeff0, qcoeff_ptr + index); 179 store_coefficients(qcoeff1, qcoeff_ptr + index + 8); 180 181 calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, 182 dqcoeff_ptr + index); 183 calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, 184 dqcoeff_ptr + 8 + index); 185 186 eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, 187 zero); 188 eob = _mm_max_epi16(eob, eob0); 189 } 190 191 *eob_ptr = accumulate_eob(eob); 192 }