highbd_quantize_intrin_sse2.c (8171B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <emmintrin.h> 13 14 #include "aom_dsp/aom_dsp_common.h" 15 #include "aom_mem/aom_mem.h" 16 #include "aom_ports/mem.h" 17 #include "config/aom_dsp_rtcd.h" 18 19 void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, 20 const int16_t *zbin_ptr, 21 const int16_t *round_ptr, 22 const int16_t *quant_ptr, 23 const int16_t *quant_shift_ptr, 24 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, 25 const int16_t *dequant_ptr, uint16_t *eob_ptr, 26 const int16_t *scan, const int16_t *iscan) { 27 int i, j, non_zero_regs = (int)count / 4, eob_i = -1; 28 __m128i zbins[2]; 29 __m128i nzbins[2]; 30 31 zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], 32 (int)zbin_ptr[0]); 33 zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]); 34 35 nzbins[0] = _mm_setzero_si128(); 36 nzbins[1] = _mm_setzero_si128(); 37 nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); 38 nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); 39 40 (void)scan; 41 42 memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); 43 memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); 44 45 // Pre-scan pass 46 for (i = ((int)count / 4) - 1; i >= 0; i--) { 47 __m128i coeffs, cmp1, cmp2; 48 int test; 49 coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); 50 cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); 51 cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); 52 cmp1 = _mm_and_si128(cmp1, cmp2); 53 test = _mm_movemask_epi8(cmp1); 54 if (test == 0xffff) 55 non_zero_regs--; 56 else 57 break; 58 } 59 60 // Quantization pass: 61 for (i = 0; i < non_zero_regs; i++) { 62 __m128i coeffs, coeffs_sign, tmp1, tmp2; 63 int test; 64 int abs_coeff[4]; 65 int coeff_sign[4]; 66 67 coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); 68 coeffs_sign = _mm_srai_epi32(coeffs, 31); 69 coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); 70 tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); 71 tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); 72 tmp1 = _mm_or_si128(tmp1, tmp2); 73 test = _mm_movemask_epi8(tmp1); 74 _mm_storeu_si128((__m128i *)abs_coeff, coeffs); 75 _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); 76 77 for (j = 0; j < 4; j++) { 78 if (test & (1 << (4 * j))) { 79 int k = 4 * i + j; 80 const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; 81 const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; 82 const uint32_t abs_qcoeff = 83 (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); 84 qcoeff_ptr[k] = 85 (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j]; 86 dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; 87 if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; 88 } 89 } 90 } 91 *eob_ptr = eob_i + 1; 92 } 93 94 void aom_highbd_quantize_b_32x32_sse2( 95 const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, 96 const int16_t *round_ptr, const int16_t *quant_ptr, 97 const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 98 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, 99 const int16_t *scan, const int16_t *iscan) { 100 __m128i zbins[2]; 101 __m128i nzbins[2]; 102 int idx = 0; 103 int idx_arr[1024]; 104 int i, eob = -1; 105 const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); 106 const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); 107 (void)scan; 108 zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); 109 zbins[1] = _mm_set1_epi32(zbin1_tmp); 110 111 nzbins[0] = _mm_setzero_si128(); 112 nzbins[1] = _mm_setzero_si128(); 113 nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); 114 nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); 115 116 memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); 117 memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); 118 119 // Pre-scan pass 120 for (i = 0; i < n_coeffs / 4; i++) { 121 __m128i coeffs, cmp1, cmp2; 122 int test; 123 coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); 124 cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); 125 cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); 126 cmp1 = _mm_and_si128(cmp1, cmp2); 127 test = _mm_movemask_epi8(cmp1); 128 if (!(test & 0xf)) idx_arr[idx++] = i * 4; 129 if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; 130 if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; 131 if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; 132 } 133 134 // Quantization pass: only process the coefficients selected in 135 // pre-scan pass. Note: idx can be zero. 136 for (i = 0; i < idx; i++) { 137 const int rc = idx_arr[i]; 138 const int coeff = coeff_ptr[rc]; 139 const int coeff_sign = AOMSIGN(coeff); 140 const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; 141 const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); 142 const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; 143 const uint32_t abs_qcoeff = 144 (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); 145 qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; 146 dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; 147 if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; 148 } 149 *eob_ptr = eob + 1; 150 } 151 152 void aom_highbd_quantize_b_64x64_sse2( 153 const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, 154 const int16_t *round_ptr, const int16_t *quant_ptr, 155 const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 156 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, 157 const int16_t *scan, const int16_t *iscan) { 158 __m128i zbins[2]; 159 __m128i nzbins[2]; 160 int idx = 0; 161 int idx_arr[1024]; 162 int i, eob = -1; 163 const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 2); 164 const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 2); 165 (void)scan; 166 zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); 167 zbins[1] = _mm_set1_epi32(zbin1_tmp); 168 169 nzbins[0] = _mm_setzero_si128(); 170 nzbins[1] = _mm_setzero_si128(); 171 nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); 172 nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); 173 174 memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); 175 memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); 176 177 // Pre-scan pass 178 for (i = 0; i < n_coeffs / 4; i++) { 179 __m128i coeffs, cmp1, cmp2; 180 int test; 181 coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); 182 cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); 183 cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); 184 cmp1 = _mm_and_si128(cmp1, cmp2); 185 test = _mm_movemask_epi8(cmp1); 186 if (!(test & 0xf)) idx_arr[idx++] = i * 4; 187 if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; 188 if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; 189 if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; 190 } 191 192 // Quantization pass: only process the coefficients selected in 193 // pre-scan pass. Note: idx can be zero. 194 for (i = 0; i < idx; i++) { 195 const int rc = idx_arr[i]; 196 const int coeff = coeff_ptr[rc]; 197 const int coeff_sign = AOMSIGN(coeff); 198 const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; 199 const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); 200 const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; 201 const uint32_t abs_qcoeff = 202 (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14); 203 qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; 204 dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4; 205 if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; 206 } 207 *eob_ptr = eob + 1; 208 }