adaptive_quantize_avx2.c (10063B)
1 /* 2 * Copyright (c) 2019, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <immintrin.h> 13 #include "config/aom_dsp_rtcd.h" 14 #include "aom/aom_integer.h" 15 #include "aom_dsp/quantize.h" 16 #include "aom_dsp/x86/quantize_x86.h" 17 18 static inline void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin, 19 const int16_t *round_ptr, __m256i *round, 20 const int16_t *quant_ptr, __m256i *quant, 21 const int16_t *dequant_ptr, 22 __m256i *dequant, 23 const int16_t *shift_ptr, 24 __m256i *shift) { 25 *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); 26 *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); 27 *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); 28 *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); 29 *round = _mm256_permute4x64_epi64(*round, 0x54); 30 *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); 31 *quant = _mm256_permute4x64_epi64(*quant, 0x54); 32 *dequant = 33 _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); 34 *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); 35 *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)); 36 *shift = _mm256_permute4x64_epi64(*shift, 0x54); 37 } 38 39 static inline __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { 40 const __m256i coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr)); 41 const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); 42 return _mm256_packs_epi32(coeff1, coeff2); 43 } 44 45 static inline void update_mask1_avx2(__m256i *cmp_mask, 46 const int16_t *iscan_ptr, int *is_found, 47 __m256i *mask) { 48 __m256i temp_mask = _mm256_setzero_si256(); 49 if (_mm256_movemask_epi8(*cmp_mask)) { 50 __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr)); 51 temp_mask = _mm256_and_si256(*cmp_mask, iscan); 52 *is_found = 1; 53 } 54 *mask = _mm256_max_epi16(temp_mask, *mask); 55 } 56 57 static inline void update_mask0_avx2(__m256i *qcoeff, __m256i *threshold, 58 const int16_t *iscan_ptr, int *is_found, 59 __m256i *mask) { 60 __m256i zero = _mm256_setzero_si256(); 61 __m256i coeff[2], cmp_mask0, cmp_mask1; 62 coeff[0] = _mm256_unpacklo_epi16(*qcoeff, zero); 63 coeff[1] = _mm256_unpackhi_epi16(*qcoeff, zero); 64 coeff[0] = _mm256_slli_epi32(coeff[0], AOM_QM_BITS); 65 cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]); 66 coeff[1] = _mm256_slli_epi32(coeff[1], AOM_QM_BITS); 67 cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]); 68 cmp_mask0 = 69 _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8); 70 update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask); 71 } 72 73 static inline void calculate_qcoeff_avx2(__m256i *coeff, const __m256i *round, 74 const __m256i *quant, 75 const __m256i *shift) { 76 __m256i tmp, qcoeff; 77 qcoeff = _mm256_adds_epi16(*coeff, *round); 78 tmp = _mm256_mulhi_epi16(qcoeff, *quant); 79 qcoeff = _mm256_add_epi16(tmp, qcoeff); 80 *coeff = _mm256_mulhi_epi16(qcoeff, *shift); 81 } 82 83 static inline __m256i calculate_dqcoeff_avx2(__m256i qcoeff, __m256i dequant) { 84 return _mm256_mullo_epi16(qcoeff, dequant); 85 } 86 87 static inline void store_coefficients_avx2(__m256i coeff_vals, 88 tran_low_t *coeff_ptr) { 89 __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); 90 __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); 91 __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); 92 _mm256_store_si256((__m256i *)(coeff_ptr), coeff_vals_lo); 93 _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); 94 } 95 96 void aom_quantize_b_adaptive_avx2( 97 const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, 98 const int16_t *round_ptr, const int16_t *quant_ptr, 99 const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 100 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, 101 const int16_t *scan, const int16_t *iscan) { 102 int index = 16; 103 int non_zero_count = 0; 104 int non_zero_count_prescan_add_zero = 0; 105 int is_found0 = 0, is_found1 = 0; 106 int eob = -1; 107 const __m256i zero = _mm256_setzero_si256(); 108 __m256i zbin, round, quant, dequant, shift; 109 __m256i coeff, qcoeff; 110 __m256i cmp_mask, mask0 = zero, mask1 = zero; 111 __m128i temp_mask0, temp_mask1; 112 int prescan_add[2]; 113 int thresh[2]; 114 const qm_val_t wt = (1 << AOM_QM_BITS); 115 for (int i = 0; i < 2; ++i) { 116 prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); 117 thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; 118 } 119 __m256i threshold[2]; 120 threshold[0] = _mm256_set1_epi32(thresh[0]); 121 threshold[1] = _mm256_set1_epi32(thresh[1]); 122 threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); 123 124 #if SKIP_EOB_FACTOR_ADJUST 125 int first = -1; 126 #endif 127 128 // Setup global values. 129 load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, 130 dequant_ptr, &dequant, quant_shift_ptr, &shift); 131 132 // Do DC and first 15 AC. 133 coeff = load_coefficients_avx2(coeff_ptr); 134 qcoeff = _mm256_abs_epi16(coeff); 135 update_mask0_avx2(&qcoeff, threshold, iscan, &is_found0, &mask0); 136 __m256i temp0 = _mm256_cmpgt_epi16(qcoeff, zbin); 137 zbin = _mm256_unpackhi_epi64(zbin, zbin); 138 cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8); 139 update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); 140 threshold[0] = threshold[1]; 141 if (_mm256_movemask_epi8(cmp_mask) == 0) { 142 _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); 143 _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); 144 _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); 145 _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); 146 round = _mm256_unpackhi_epi64(round, round); 147 quant = _mm256_unpackhi_epi64(quant, quant); 148 shift = _mm256_unpackhi_epi64(shift, shift); 149 dequant = _mm256_unpackhi_epi64(dequant, dequant); 150 } else { 151 calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift); 152 round = _mm256_unpackhi_epi64(round, round); 153 quant = _mm256_unpackhi_epi64(quant, quant); 154 shift = _mm256_unpackhi_epi64(shift, shift); 155 // Reinsert signs 156 qcoeff = _mm256_sign_epi16(qcoeff, coeff); 157 // Mask out zbin threshold coeffs 158 qcoeff = _mm256_and_si256(qcoeff, temp0); 159 store_coefficients_avx2(qcoeff, qcoeff_ptr); 160 coeff = calculate_dqcoeff_avx2(qcoeff, dequant); 161 dequant = _mm256_unpackhi_epi64(dequant, dequant); 162 store_coefficients_avx2(coeff, dqcoeff_ptr); 163 } 164 165 // AC only loop. 166 while (index < n_coeffs) { 167 coeff = load_coefficients_avx2(coeff_ptr + index); 168 qcoeff = _mm256_abs_epi16(coeff); 169 update_mask0_avx2(&qcoeff, threshold, iscan + index, &is_found0, &mask0); 170 temp0 = _mm256_cmpgt_epi16(qcoeff, zbin); 171 cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8); 172 update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); 173 if (_mm256_movemask_epi8(cmp_mask) == 0) { 174 _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); 175 _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); 176 _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); 177 _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); 178 index += 16; 179 continue; 180 } 181 calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift); 182 qcoeff = _mm256_sign_epi16(qcoeff, coeff); 183 qcoeff = _mm256_and_si256(qcoeff, temp0); 184 store_coefficients_avx2(qcoeff, qcoeff_ptr + index); 185 coeff = calculate_dqcoeff_avx2(qcoeff, dequant); 186 store_coefficients_avx2(coeff, dqcoeff_ptr + index); 187 index += 16; 188 } 189 if (is_found0) { 190 temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), 191 _mm256_extracti128_si256(mask0, 1)); 192 non_zero_count = calculate_non_zero_count(temp_mask0); 193 } 194 if (is_found1) { 195 temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), 196 _mm256_extracti128_si256(mask1, 1)); 197 non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); 198 } 199 200 for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { 201 const int rc = scan[i]; 202 qcoeff_ptr[rc] = 0; 203 dqcoeff_ptr[rc] = 0; 204 } 205 206 for (int i = non_zero_count - 1; i >= 0; i--) { 207 const int rc = scan[i]; 208 if (qcoeff_ptr[rc]) { 209 eob = i; 210 break; 211 } 212 } 213 214 *eob_ptr = eob + 1; 215 #if SKIP_EOB_FACTOR_ADJUST 216 // TODO(Aniket): Experiment the following loop with intrinsic by combining 217 // with the quantization loop above 218 for (int i = 0; i < non_zero_count; i++) { 219 const int rc = scan[i]; 220 const int qcoeff0 = qcoeff_ptr[rc]; 221 if (qcoeff0) { 222 first = i; 223 break; 224 } 225 } 226 if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { 227 const int rc = scan[(*eob_ptr - 1)]; 228 if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { 229 const int coeff0 = coeff_ptr[rc] * wt; 230 const int coeff_sign = AOMSIGN(coeff0); 231 const int abs_coeff = (coeff0 ^ coeff_sign) - coeff_sign; 232 const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; 233 const int prescan_add_val = 234 ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); 235 if (abs_coeff < 236 (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { 237 qcoeff_ptr[rc] = 0; 238 dqcoeff_ptr[rc] = 0; 239 *eob_ptr = 0; 240 } 241 } 242 } 243 #endif 244 }