av1_quantize_sse2.c (11870B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <emmintrin.h> 13 #include <xmmintrin.h> 14 15 #include "config/av1_rtcd.h" 16 17 #include "aom/aom_integer.h" 18 #include "aom_dsp/x86/quantize_x86.h" 19 20 static inline void read_coeff(const tran_low_t *coeff, intptr_t offset, 21 __m128i *c0, __m128i *c1) { 22 const tran_low_t *addr = coeff + offset; 23 if (sizeof(tran_low_t) == 4) { 24 const __m128i x0 = _mm_load_si128((const __m128i *)addr); 25 const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1); 26 const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2); 27 const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3); 28 *c0 = _mm_packs_epi32(x0, x1); 29 *c1 = _mm_packs_epi32(x2, x3); 30 } else { 31 *c0 = _mm_load_si128((const __m128i *)addr); 32 *c1 = _mm_load_si128((const __m128i *)addr + 1); 33 } 34 } 35 36 static inline void write_qcoeff(const __m128i *qc0, const __m128i *qc1, 37 tran_low_t *qcoeff, intptr_t offset) { 38 tran_low_t *addr = qcoeff + offset; 39 if (sizeof(tran_low_t) == 4) { 40 const __m128i zero = _mm_setzero_si128(); 41 __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero); 42 __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits); 43 __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits); 44 _mm_store_si128((__m128i *)addr, y0); 45 _mm_store_si128((__m128i *)addr + 1, y1); 46 47 sign_bits = _mm_cmplt_epi16(*qc1, zero); 48 y0 = _mm_unpacklo_epi16(*qc1, sign_bits); 49 y1 = _mm_unpackhi_epi16(*qc1, sign_bits); 50 _mm_store_si128((__m128i *)addr + 2, y0); 51 _mm_store_si128((__m128i *)addr + 3, y1); 52 } else { 53 _mm_store_si128((__m128i *)addr, *qc0); 54 _mm_store_si128((__m128i *)addr + 1, *qc1); 55 } 56 } 57 58 static inline void write_zero(tran_low_t *qcoeff, intptr_t offset) { 59 const __m128i zero = _mm_setzero_si128(); 60 tran_low_t *addr = qcoeff + offset; 61 if (sizeof(tran_low_t) == 4) { 62 _mm_store_si128((__m128i *)addr, zero); 63 _mm_store_si128((__m128i *)addr + 1, zero); 64 _mm_store_si128((__m128i *)addr + 2, zero); 65 _mm_store_si128((__m128i *)addr + 3, zero); 66 } else { 67 _mm_store_si128((__m128i *)addr, zero); 68 _mm_store_si128((__m128i *)addr + 1, zero); 69 } 70 } 71 72 static inline void quantize(const int16_t *iscan_ptr, 73 const tran_low_t *coeff_ptr, intptr_t n_coeffs, 74 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, 75 const __m128i *round0, const __m128i *round1, 76 const __m128i *quant0, const __m128i *quant1, 77 const __m128i *dequant0, const __m128i *dequant1, 78 const __m128i *thr0, const __m128i *thr1, 79 __m128i *eob) { 80 __m128i coeff0, coeff1; 81 // Do DC and first 15 AC 82 read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); 83 84 // Poor man's sign extract 85 const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15); 86 const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15); 87 __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); 88 __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); 89 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 90 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 91 const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0), 92 _mm_cmpeq_epi16(qcoeff0, *thr0)); 93 const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1), 94 _mm_cmpeq_epi16(qcoeff1, *thr1)); 95 const int nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1); 96 97 if (nzflag) { 98 qcoeff0 = _mm_adds_epi16(qcoeff0, *round0); 99 qcoeff1 = _mm_adds_epi16(qcoeff1, *round1); 100 const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0); 101 const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1); 102 103 // Reinsert signs 104 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); 105 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); 106 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 107 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 108 109 write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); 110 111 coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0); 112 coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1); 113 114 write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); 115 116 const __m128i zero = _mm_setzero_si128(); 117 // Scan for eob 118 const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); 119 const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); 120 const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); 121 const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); 122 const __m128i iscan0 = 123 _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); 124 const __m128i iscan1 = 125 _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); 126 // Add one to convert from indices to counts 127 const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0); 128 const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1); 129 const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0); 130 const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1); 131 const __m128i eob2 = _mm_max_epi16(eob0, eob1); 132 *eob = _mm_max_epi16(*eob, eob2); 133 } else { 134 write_zero(qcoeff_ptr, n_coeffs); 135 write_zero(dqcoeff_ptr, n_coeffs); 136 } 137 } 138 139 void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, 140 const int16_t *zbin_ptr, const int16_t *round_ptr, 141 const int16_t *quant_ptr, 142 const int16_t *quant_shift_ptr, 143 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, 144 const int16_t *dequant_ptr, uint16_t *eob_ptr, 145 const int16_t *scan_ptr, const int16_t *iscan_ptr) { 146 (void)scan_ptr; 147 (void)zbin_ptr; 148 (void)quant_shift_ptr; 149 150 coeff_ptr += n_coeffs; 151 iscan_ptr += n_coeffs; 152 qcoeff_ptr += n_coeffs; 153 dqcoeff_ptr += n_coeffs; 154 n_coeffs = -n_coeffs; 155 156 const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr); 157 const __m128i round1 = _mm_unpackhi_epi64(round0, round0); 158 const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr); 159 const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0); 160 const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr); 161 const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0); 162 const __m128i thr0 = _mm_srai_epi16(dequant0, 1); 163 const __m128i thr1 = _mm_srai_epi16(dequant1, 1); 164 __m128i eob = _mm_setzero_si128(); 165 166 quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0, 167 &round1, &quant0, &quant1, &dequant0, &dequant1, &thr0, &thr1, &eob); 168 169 n_coeffs += 8 * 2; 170 171 // AC only loop 172 while (n_coeffs < 0) { 173 quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1, 174 &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1, 175 &eob); 176 n_coeffs += 8 * 2; 177 } 178 179 // Accumulate EOB 180 { 181 __m128i eob_shuffled; 182 eob_shuffled = _mm_shuffle_epi32(eob, 0xe); 183 eob = _mm_max_epi16(eob, eob_shuffled); 184 eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); 185 eob = _mm_max_epi16(eob, eob_shuffled); 186 eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); 187 eob = _mm_max_epi16(eob, eob_shuffled); 188 *eob_ptr = _mm_extract_epi16(eob, 1); 189 } 190 } 191 192 static inline void quantize_lp(const int16_t *iscan_ptr, 193 const int16_t *coeff_ptr, intptr_t n_coeffs, 194 int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, 195 const __m128i *round0, const __m128i *round1, 196 const __m128i *quant0, const __m128i *quant1, 197 const __m128i *dequant0, const __m128i *dequant1, 198 __m128i *eob) { 199 const int16_t *read = coeff_ptr + n_coeffs; 200 __m128i coeff0 = _mm_load_si128((const __m128i *)read); 201 __m128i coeff1 = _mm_load_si128((const __m128i *)read + 1); 202 203 // Poor man's sign extract 204 const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15); 205 const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15); 206 __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); 207 __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); 208 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 209 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 210 211 qcoeff0 = _mm_adds_epi16(qcoeff0, *round0); 212 qcoeff1 = _mm_adds_epi16(qcoeff1, *round1); 213 const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0); 214 const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1); 215 216 // Reinsert signs 217 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); 218 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); 219 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 220 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 221 222 int16_t *addr = qcoeff_ptr + n_coeffs; 223 _mm_store_si128((__m128i *)addr, qcoeff0); 224 _mm_store_si128((__m128i *)addr + 1, qcoeff1); 225 226 coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0); 227 coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1); 228 229 addr = dqcoeff_ptr + n_coeffs; 230 _mm_store_si128((__m128i *)addr, coeff0); 231 _mm_store_si128((__m128i *)addr + 1, coeff1); 232 233 const __m128i zero = _mm_setzero_si128(); 234 // Scan for eob 235 const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); 236 const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); 237 const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); 238 const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); 239 240 const __m128i iscan0 = 241 _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); 242 const __m128i iscan1 = 243 _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); 244 245 // Add one to convert from indices to counts 246 const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0); 247 const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1); 248 const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0); 249 const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1); 250 const __m128i eob2 = _mm_max_epi16(eob0, eob1); 251 *eob = _mm_max_epi16(*eob, eob2); 252 } 253 254 void av1_quantize_lp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, 255 const int16_t *round_ptr, const int16_t *quant_ptr, 256 int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, 257 const int16_t *dequant_ptr, uint16_t *eob_ptr, 258 const int16_t *scan, const int16_t *iscan) { 259 (void)scan; 260 coeff_ptr += n_coeffs; 261 iscan += n_coeffs; 262 qcoeff_ptr += n_coeffs; 263 dqcoeff_ptr += n_coeffs; 264 n_coeffs = -n_coeffs; 265 266 // Setup global values 267 const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr); 268 const __m128i round1 = _mm_unpackhi_epi64(round0, round0); 269 const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr); 270 const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0); 271 const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr); 272 const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0); 273 __m128i eob = _mm_setzero_si128(); 274 275 // DC and first 15 AC 276 quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0, 277 &round1, &quant0, &quant1, &dequant0, &dequant1, &eob); 278 n_coeffs += 8 * 2; 279 280 // AC only loop 281 while (n_coeffs < 0) { 282 quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1, 283 &round1, &quant1, &quant1, &dequant1, &dequant1, &eob); 284 n_coeffs += 8 * 2; 285 } 286 287 // Accumulate EOB 288 *eob_ptr = accumulate_eob(eob); 289 }