aom_quantize_avx.c (10558B)
1 /* 2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <immintrin.h> 13 14 #include "config/aom_dsp_rtcd.h" 15 #include "aom/aom_integer.h" 16 #include "aom_dsp/x86/bitdepth_conversion_sse2.h" 17 #include "aom_dsp/x86/quantize_x86.h" 18 19 static inline void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, 20 tran_low_t *dqcoeff) { 21 const __m128i low = _mm_mullo_epi16(qcoeff, dequant); 22 const __m128i high = _mm_mulhi_epi16(qcoeff, dequant); 23 24 const __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); 25 const __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); 26 27 _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); 28 _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); 29 } 30 31 void aom_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, 32 const int16_t *zbin_ptr, const int16_t *round_ptr, 33 const int16_t *quant_ptr, 34 const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 35 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, 36 uint16_t *eob_ptr, const int16_t *scan, 37 const int16_t *iscan) { 38 const __m128i zero = _mm_setzero_si128(); 39 const __m256i big_zero = _mm256_setzero_si256(); 40 int index; 41 42 __m128i zbin, round, quant, dequant, shift; 43 __m128i coeff0, coeff1; 44 __m128i qcoeff0, qcoeff1; 45 __m128i cmp_mask0, cmp_mask1; 46 __m128i all_zero; 47 __m128i eob = zero, eob0; 48 49 (void)scan; 50 51 *eob_ptr = 0; 52 53 load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, 54 dequant_ptr, &dequant, quant_shift_ptr, &shift); 55 56 // Do DC and first 15 AC. 57 coeff0 = load_tran_low(coeff_ptr); 58 coeff1 = load_tran_low(coeff_ptr + 8); 59 60 qcoeff0 = _mm_abs_epi16(coeff0); 61 qcoeff1 = _mm_abs_epi16(coeff1); 62 63 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 64 zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC 65 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 66 67 all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); 68 if (_mm_test_all_zeros(all_zero, all_zero)) { 69 _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); 70 _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); 71 _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); 72 _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); 73 74 if (n_coeffs == 16) return; 75 76 round = _mm_unpackhi_epi64(round, round); 77 quant = _mm_unpackhi_epi64(quant, quant); 78 shift = _mm_unpackhi_epi64(shift, shift); 79 dequant = _mm_unpackhi_epi64(dequant, dequant); 80 } else { 81 calculate_qcoeff(&qcoeff0, round, quant, shift); 82 round = _mm_unpackhi_epi64(round, round); 83 quant = _mm_unpackhi_epi64(quant, quant); 84 shift = _mm_unpackhi_epi64(shift, shift); 85 calculate_qcoeff(&qcoeff1, round, quant, shift); 86 87 // Reinsert signs 88 qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); 89 qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); 90 91 // Mask out zbin threshold coeffs 92 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 93 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 94 95 store_tran_low(qcoeff0, qcoeff_ptr); 96 store_tran_low(qcoeff1, qcoeff_ptr + 8); 97 98 calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); 99 dequant = _mm_unpackhi_epi64(dequant, dequant); 100 calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); 101 102 eob = 103 scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); 104 } 105 106 // AC only loop. 107 for (index = 16; index < n_coeffs; index += 16) { 108 coeff0 = load_tran_low(coeff_ptr + index); 109 coeff1 = load_tran_low(coeff_ptr + index + 8); 110 111 qcoeff0 = _mm_abs_epi16(coeff0); 112 qcoeff1 = _mm_abs_epi16(coeff1); 113 114 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 115 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 116 117 all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); 118 if (_mm_test_all_zeros(all_zero, all_zero)) { 119 _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); 120 _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); 121 _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); 122 _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); 123 continue; 124 } 125 126 calculate_qcoeff(&qcoeff0, round, quant, shift); 127 calculate_qcoeff(&qcoeff1, round, quant, shift); 128 129 qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); 130 qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); 131 132 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 133 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 134 135 store_tran_low(qcoeff0, qcoeff_ptr + index); 136 store_tran_low(qcoeff1, qcoeff_ptr + index + 8); 137 138 calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); 139 calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); 140 141 eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, 142 zero); 143 eob = _mm_max_epi16(eob, eob0); 144 } 145 146 *eob_ptr = accumulate_eob(eob); 147 } 148 149 void aom_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, 150 const int16_t *zbin_ptr, const int16_t *round_ptr, 151 const int16_t *quant_ptr, 152 const int16_t *quant_shift_ptr, 153 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, 154 const int16_t *dequant_ptr, uint16_t *eob_ptr, 155 const int16_t *scan, const int16_t *iscan) { 156 const __m128i zero = _mm_setzero_si128(); 157 const __m128i one = _mm_set1_epi16(1); 158 const __m256i big_zero = _mm256_setzero_si256(); 159 int index; 160 const int log_scale = 1; 161 162 __m128i zbin, round, quant, dequant, shift; 163 __m128i coeff0, coeff1; 164 __m128i qcoeff0, qcoeff1; 165 __m128i cmp_mask0, cmp_mask1; 166 __m128i all_zero; 167 __m128i eob = zero, eob0; 168 169 (void)scan; 170 171 // Setup global values. 172 // The 32x32 halves zbin and round. 173 zbin = _mm_load_si128((const __m128i *)zbin_ptr); 174 // Shift with rounding. 175 zbin = _mm_add_epi16(zbin, one); 176 zbin = _mm_srli_epi16(zbin, 1); 177 // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so 178 // it is a strict "greater" comparison. 179 zbin = _mm_sub_epi16(zbin, one); 180 181 round = _mm_load_si128((const __m128i *)round_ptr); 182 round = _mm_add_epi16(round, one); 183 round = _mm_srli_epi16(round, 1); 184 185 quant = _mm_load_si128((const __m128i *)quant_ptr); 186 dequant = _mm_load_si128((const __m128i *)dequant_ptr); 187 shift = _mm_load_si128((const __m128i *)quant_shift_ptr); 188 189 // Do DC and first 15 AC. 190 coeff0 = load_tran_low(coeff_ptr); 191 coeff1 = load_tran_low(coeff_ptr + 8); 192 193 qcoeff0 = _mm_abs_epi16(coeff0); 194 qcoeff1 = _mm_abs_epi16(coeff1); 195 196 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 197 zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. 198 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 199 200 all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); 201 if (_mm_test_all_zeros(all_zero, all_zero)) { 202 _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); 203 _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); 204 _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); 205 _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); 206 207 round = _mm_unpackhi_epi64(round, round); 208 quant = _mm_unpackhi_epi64(quant, quant); 209 shift = _mm_unpackhi_epi64(shift, shift); 210 dequant = _mm_unpackhi_epi64(dequant, dequant); 211 } else { 212 calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); 213 round = _mm_unpackhi_epi64(round, round); 214 quant = _mm_unpackhi_epi64(quant, quant); 215 shift = _mm_unpackhi_epi64(shift, shift); 216 calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); 217 218 // Reinsert signs. 219 qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); 220 qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); 221 222 // Mask out zbin threshold coeffs. 223 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 224 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 225 226 store_tran_low(qcoeff0, qcoeff_ptr); 227 store_tran_low(qcoeff1, qcoeff_ptr + 8); 228 229 calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr, 230 &log_scale); 231 dequant = _mm_unpackhi_epi64(dequant, dequant); 232 calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, 233 dqcoeff_ptr + 8, &log_scale); 234 235 eob = 236 scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); 237 } 238 239 // AC only loop. 240 for (index = 16; index < n_coeffs; index += 16) { 241 coeff0 = load_tran_low(coeff_ptr + index); 242 coeff1 = load_tran_low(coeff_ptr + index + 8); 243 244 qcoeff0 = _mm_abs_epi16(coeff0); 245 qcoeff1 = _mm_abs_epi16(coeff1); 246 247 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 248 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 249 250 all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); 251 if (_mm_test_all_zeros(all_zero, all_zero)) { 252 _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); 253 _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); 254 _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); 255 _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); 256 continue; 257 } 258 259 calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); 260 calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); 261 262 qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); 263 qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); 264 265 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 266 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 267 268 store_tran_low(qcoeff0, qcoeff_ptr + index); 269 store_tran_low(qcoeff1, qcoeff_ptr + index + 8); 270 271 calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, 272 dqcoeff_ptr + index, &log_scale); 273 calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, 274 dqcoeff_ptr + index + 8, &log_scale); 275 276 eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, 277 zero); 278 eob = _mm_max_epi16(eob, eob0); 279 } 280 281 *eob_ptr = accumulate_eob(eob); 282 }