highbd_quantize_intrin_avx2.c (11029B)
1 /* 2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <immintrin.h> 13 14 #include "config/aom_dsp_rtcd.h" 15 16 #include "aom/aom_integer.h" 17 18 static inline void init_one_qp(const __m128i *p, __m256i *qp) { 19 const __m128i sign = _mm_srai_epi16(*p, 15); 20 const __m128i dc = _mm_unpacklo_epi16(*p, sign); 21 const __m128i ac = _mm_unpackhi_epi16(*p, sign); 22 *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); 23 } 24 25 static inline void update_qp(__m256i *qp) { 26 int i; 27 for (i = 0; i < 5; ++i) { 28 qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11); 29 } 30 } 31 32 static inline void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr, 33 const int16_t *quant_ptr, const int16_t *dequant_ptr, 34 const int16_t *quant_shift_ptr, __m256i *qp, 35 int log_scale) { 36 const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr); 37 const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); 38 const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); 39 const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); 40 const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr); 41 init_one_qp(&zbin, &qp[0]); 42 init_one_qp(&round, &qp[1]); 43 init_one_qp(&quant, &qp[2]); 44 init_one_qp(&dequant, &qp[3]); 45 init_one_qp(&quant_shift, &qp[4]); 46 if (log_scale > 0) { 47 const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1))); 48 qp[0] = _mm256_add_epi32(qp[0], rnd); 49 qp[0] = _mm256_srai_epi32(qp[0], log_scale); 50 51 qp[1] = _mm256_add_epi32(qp[1], rnd); 52 qp[1] = _mm256_srai_epi32(qp[1], log_scale); 53 } 54 // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when 55 // calculating the zbin mask. 56 qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1)); 57 } 58 59 // Note: 60 // *x is vector multiplied by *y which is 16 int32_t parallel multiplication 61 // and right shift 16. The output, 16 int32_t is save in *p. 62 static inline __m256i mm256_mul_shift_epi32(const __m256i *x, 63 const __m256i *y) { 64 __m256i prod_lo = _mm256_mul_epi32(*x, *y); 65 __m256i prod_hi = _mm256_srli_epi64(*x, 32); 66 const __m256i mult_hi = _mm256_srli_epi64(*y, 32); 67 prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); 68 69 prod_lo = _mm256_srli_epi64(prod_lo, 16); 70 const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); 71 prod_lo = _mm256_and_si256(prod_lo, mask); 72 prod_hi = _mm256_srli_epi64(prod_hi, 16); 73 74 prod_hi = _mm256_slli_epi64(prod_hi, 32); 75 return _mm256_or_si256(prod_lo, prod_hi); 76 } 77 78 static AOM_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr, 79 __m256i eobmax, 80 __m256i nz_mask) { 81 const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask); 82 const __m256i packed_nz_mask_perm = 83 _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); 84 const __m256i iscan = 85 _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); 86 const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm); 87 const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm); 88 return _mm256_max_epi16(eobmax, nz_iscan); 89 } 90 91 // Get the max eob from the lower 128 bits. 92 static AOM_FORCE_INLINE uint16_t get_max_eob(__m256i eob) { 93 __m256i eob_s; 94 eob_s = _mm256_shuffle_epi32(eob, 0xe); 95 eob = _mm256_max_epi16(eob, eob_s); 96 eob_s = _mm256_shufflelo_epi16(eob, 0xe); 97 eob = _mm256_max_epi16(eob, eob_s); 98 eob_s = _mm256_shufflelo_epi16(eob, 1); 99 eob = _mm256_max_epi16(eob, eob_s); 100 return (uint16_t)_mm256_extract_epi16(eob, 0); 101 } 102 103 static AOM_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x, 104 const __m256i *y, 105 int log_scale) { 106 __m256i prod_lo = _mm256_mul_epi32(*x, *y); 107 __m256i prod_hi = _mm256_srli_epi64(*x, 32); 108 const __m256i mult_hi = _mm256_srli_epi64(*y, 32); 109 prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); 110 prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale); 111 const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); 112 prod_lo = _mm256_and_si256(prod_lo, mask); 113 prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale); 114 prod_hi = _mm256_slli_epi64(prod_hi, 32); 115 return _mm256_or_si256(prod_lo, prod_hi); 116 } 117 118 static AOM_FORCE_INLINE void quantize_logscale( 119 const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, 120 tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob, int log_scale) { 121 const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); 122 const __m256i abs_coeff = _mm256_abs_epi32(coeff); 123 const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); 124 125 if (UNLIKELY(_mm256_movemask_epi8(zbin_mask) == 0)) { 126 const __m256i zero = _mm256_setzero_si256(); 127 _mm256_storeu_si256((__m256i *)qcoeff, zero); 128 _mm256_storeu_si256((__m256i *)dqcoeff, zero); 129 return; 130 } 131 132 const __m256i tmp_rnd = 133 _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); 134 // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; 135 const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0); 136 const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); 137 // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 138 // (16 - log_scale + AOM_QM_BITS)); 139 const __m256i abs_q = 140 mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], log_scale); 141 const __m256i abs_dq = 142 _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), log_scale); 143 const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); 144 const __m256i q = _mm256_sign_epi32(abs_q, coeff); 145 const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); 146 147 _mm256_storeu_si256((__m256i *)qcoeff, q); 148 _mm256_storeu_si256((__m256i *)dqcoeff, dq); 149 150 *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); 151 } 152 153 static AOM_FORCE_INLINE void quantize(const __m256i *qp, 154 const tran_low_t *coeff_ptr, 155 const int16_t *iscan_ptr, 156 tran_low_t *qcoeff, tran_low_t *dqcoeff, 157 __m256i *eob) { 158 const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); 159 const __m256i abs_coeff = _mm256_abs_epi32(coeff); 160 const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); 161 162 if (UNLIKELY(_mm256_movemask_epi8(zbin_mask) == 0)) { 163 const __m256i zero = _mm256_setzero_si256(); 164 _mm256_storeu_si256((__m256i *)qcoeff, zero); 165 _mm256_storeu_si256((__m256i *)dqcoeff, zero); 166 return; 167 } 168 169 const __m256i tmp_rnd = 170 _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); 171 const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]); 172 const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); 173 const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]); 174 const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]); 175 const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); 176 const __m256i q = _mm256_sign_epi32(abs_q, coeff); 177 const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); 178 179 _mm256_storeu_si256((__m256i *)qcoeff, q); 180 _mm256_storeu_si256((__m256i *)dqcoeff, dq); 181 182 *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); 183 } 184 185 void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, 186 const int16_t *zbin_ptr, 187 const int16_t *round_ptr, 188 const int16_t *quant_ptr, 189 const int16_t *quant_shift_ptr, 190 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, 191 const int16_t *dequant_ptr, uint16_t *eob_ptr, 192 const int16_t *scan, const int16_t *iscan) { 193 (void)scan; 194 const int step = 8; 195 196 __m256i eob = _mm256_setzero_si256(); 197 __m256i qp[5]; 198 199 init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0); 200 201 quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); 202 203 coeff_ptr += step; 204 qcoeff_ptr += step; 205 dqcoeff_ptr += step; 206 iscan += step; 207 n_coeffs -= step; 208 209 update_qp(qp); 210 211 while (n_coeffs > 0) { 212 quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); 213 214 coeff_ptr += step; 215 qcoeff_ptr += step; 216 dqcoeff_ptr += step; 217 iscan += step; 218 n_coeffs -= step; 219 } 220 221 *eob_ptr = get_max_eob(eob); 222 } 223 224 void aom_highbd_quantize_b_32x32_avx2( 225 const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, 226 const int16_t *round_ptr, const int16_t *quant_ptr, 227 const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 228 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, 229 const int16_t *scan, const int16_t *iscan) { 230 (void)scan; 231 const unsigned int step = 8; 232 233 __m256i eob = _mm256_setzero_si256(); 234 __m256i qp[5]; 235 init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1); 236 237 quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 1); 238 239 coeff_ptr += step; 240 qcoeff_ptr += step; 241 dqcoeff_ptr += step; 242 iscan += step; 243 n_coeffs -= step; 244 245 update_qp(qp); 246 247 while (n_coeffs > 0) { 248 quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 1); 249 250 coeff_ptr += step; 251 qcoeff_ptr += step; 252 dqcoeff_ptr += step; 253 iscan += step; 254 n_coeffs -= step; 255 } 256 257 *eob_ptr = get_max_eob(eob); 258 } 259 260 void aom_highbd_quantize_b_64x64_avx2( 261 const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, 262 const int16_t *round_ptr, const int16_t *quant_ptr, 263 const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 264 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, 265 const int16_t *scan, const int16_t *iscan) { 266 (void)scan; 267 const int step = 8; 268 269 __m256i eob = _mm256_setzero_si256(); 270 __m256i qp[5]; 271 init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 2); 272 273 quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 2); 274 275 coeff_ptr += step; 276 qcoeff_ptr += step; 277 dqcoeff_ptr += step; 278 iscan += step; 279 n_coeffs -= step; 280 281 update_qp(qp); 282 283 while (n_coeffs > 0) { 284 quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 2); 285 286 coeff_ptr += step; 287 qcoeff_ptr += step; 288 dqcoeff_ptr += step; 289 iscan += step; 290 n_coeffs -= step; 291 } 292 293 *eob_ptr = get_max_eob(eob); 294 }