av1_highbd_quantize_sse4.c (7650B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <smmintrin.h> 13 #include <stdint.h> 14 15 #include "config/av1_rtcd.h" 16 17 #include "aom_dsp/aom_dsp_common.h" 18 #include "aom_dsp/x86/synonyms.h" 19 20 // Coefficient quantization phase 1 21 // param[0-2] : rounding/quan/dequan constants 22 static inline void quantize_coeff_phase1(__m128i *coeff, const __m128i *param, 23 const int shift, const int scale, 24 __m128i *qcoeff, __m128i *dquan, 25 __m128i *sign) { 26 const __m128i zero = _mm_setzero_si128(); 27 const __m128i one = _mm_set1_epi32(1); 28 29 *sign = _mm_cmplt_epi32(*coeff, zero); 30 *sign = _mm_or_si128(*sign, one); 31 *coeff = _mm_abs_epi32(*coeff); 32 33 qcoeff[0] = _mm_add_epi32(*coeff, param[0]); 34 qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero); 35 qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero); 36 37 qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]); 38 qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift); 39 dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]); 40 dquan[0] = _mm_srli_epi64(dquan[0], scale); 41 const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale); 42 qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]); 43 } 44 45 // Coefficient quantization phase 2 46 static inline void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan, 47 const __m128i *sign, 48 const __m128i *param, const int shift, 49 const int scale, tran_low_t *qAddr, 50 tran_low_t *dqAddr) { 51 __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0); 52 __m128i mask0H = _mm_set_epi32(0, 0, -1, -1); 53 54 qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]); 55 qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift); 56 dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]); 57 dquan[1] = _mm_srli_epi64(dquan[1], scale); 58 59 // combine L&H 60 qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8); 61 qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d); 62 63 qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H); 64 qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L); 65 66 dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8); 67 dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d); 68 69 dquan[0] = _mm_and_si128(dquan[0], mask0H); 70 dquan[1] = _mm_and_si128(dquan[1], mask0L); 71 72 qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]); 73 dquan[0] = _mm_or_si128(dquan[0], dquan[1]); 74 75 qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign); 76 dquan[0] = _mm_sign_epi32(dquan[0], *sign); 77 qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]); 78 dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]); 79 _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]); 80 _mm_storeu_si128((__m128i *)dqAddr, dquan[0]); 81 } 82 83 static inline void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan, 84 __m128i *eob) { 85 const __m128i zero = _mm_setzero_si128(); 86 __m128i mask, iscanIdx; 87 const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr); 88 const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4)); 89 __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero); 90 __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero); 91 92 nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero); 93 nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero); 94 95 mask = _mm_packs_epi32(nz_flag0, nz_flag1); 96 iscanIdx = _mm_loadu_si128((__m128i const *)iscan); 97 iscanIdx = _mm_sub_epi16(iscanIdx, mask); 98 iscanIdx = _mm_and_si128(iscanIdx, mask); 99 *eob = _mm_max_epi16(*eob, iscanIdx); 100 } 101 102 static inline uint16_t get_accumulated_eob(__m128i *eob) { 103 __m128i eob_shuffled; 104 uint16_t eobValue; 105 eob_shuffled = _mm_shuffle_epi32(*eob, 0xe); 106 *eob = _mm_max_epi16(*eob, eob_shuffled); 107 eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe); 108 *eob = _mm_max_epi16(*eob, eob_shuffled); 109 eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1); 110 *eob = _mm_max_epi16(*eob, eob_shuffled); 111 eobValue = _mm_extract_epi16(*eob, 0); 112 return eobValue; 113 } 114 115 void av1_highbd_quantize_fp_sse4_1( 116 const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, 117 const int16_t *round_ptr, const int16_t *quant_ptr, 118 const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 119 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, 120 const int16_t *scan, const int16_t *iscan, int log_scale) { 121 __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign; 122 __m128i eob = _mm_setzero_si128(); 123 const tran_low_t *src = coeff_ptr; 124 tran_low_t *quanAddr = qcoeff_ptr; 125 tran_low_t *dquanAddr = dqcoeff_ptr; 126 const int shift = 16 - log_scale; 127 const int coeff_stride = 4; 128 const int quan_stride = coeff_stride; 129 (void)zbin_ptr; 130 (void)quant_shift_ptr; 131 (void)scan; 132 133 memset(quanAddr, 0, count * sizeof(quanAddr[0])); 134 memset(dquanAddr, 0, count * sizeof(dquanAddr[0])); 135 136 coeff[0] = _mm_loadu_si128((__m128i const *)src); 137 const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale); 138 const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale); 139 140 qparam[0] = _mm_set_epi32(round1, round1, round1, round0); 141 qparam[1] = _mm_set_epi64x((uint32_t)quant_ptr[1], (uint32_t)quant_ptr[0]); 142 qparam[2] = 143 _mm_set_epi64x((uint32_t)dequant_ptr[1], (uint32_t)dequant_ptr[0]); 144 qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1], 145 dequant_ptr[0]); 146 147 // DC and first 3 AC 148 quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, 149 &coeff_sign); 150 151 // update round/quan/dquan for AC 152 qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); 153 qparam[1] = _mm_set1_epi64x((uint32_t)quant_ptr[1]); 154 qparam[2] = _mm_set1_epi64x((uint32_t)dequant_ptr[1]); 155 qparam[3] = _mm_set1_epi32(dequant_ptr[1]); 156 quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, 157 quanAddr, dquanAddr); 158 159 // next 4 AC 160 coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); 161 quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, 162 &coeff_sign); 163 quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, 164 quanAddr + quan_stride, dquanAddr + quan_stride); 165 166 find_eob(quanAddr, iscan, &eob); 167 168 count -= 8; 169 170 // loop for the rest of AC 171 while (count > 0) { 172 src += coeff_stride << 1; 173 quanAddr += quan_stride << 1; 174 dquanAddr += quan_stride << 1; 175 iscan += quan_stride << 1; 176 177 coeff[0] = _mm_loadu_si128((__m128i const *)src); 178 coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); 179 180 quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, 181 &coeff_sign); 182 quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, 183 log_scale, quanAddr, dquanAddr); 184 185 quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, 186 &coeff_sign); 187 quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, 188 log_scale, quanAddr + quan_stride, 189 dquanAddr + quan_stride); 190 191 find_eob(quanAddr, iscan, &eob); 192 193 count -= 8; 194 } 195 *eob_ptr = get_accumulated_eob(&eob); 196 }