av1_fwd_txfm2d_sse4.c (14058B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include "config/av1_rtcd.h" 13 14 #include "av1/common/enums.h" 15 #include "av1/common/av1_txfm.h" 16 #include "av1/common/x86/av1_txfm_sse2.h" 17 #include "av1/common/x86/highbd_txfm_utility_sse4.h" 18 #include "av1/encoder/av1_fwd_txfm1d_cfg.h" 19 #include "av1/encoder/x86/av1_txfm1d_sse4.h" 20 #include "av1/encoder/x86/av1_fwd_txfm_sse2.h" 21 22 static inline void int16_array_with_stride_to_int32_array_without_stride( 23 const int16_t *input, int stride, int32_t *output, int txfm1d_size) { 24 int r, c; 25 for (r = 0; r < txfm1d_size; r++) { 26 for (c = 0; c < txfm1d_size; c++) { 27 output[r * txfm1d_size + c] = (int32_t)input[r * stride + c]; 28 } 29 } 30 } 31 32 static inline void store_output_32bit_w8(int32_t *const out, 33 const __m128i *const in1, 34 const __m128i *const in2, 35 const int stride, const int out_size) { 36 for (int i = 0; i < out_size; ++i) { 37 _mm_store_si128((__m128i *)(out + stride * i), in1[i]); 38 _mm_store_si128((__m128i *)(out + stride * i + 4), in2[i]); 39 } 40 } 41 42 typedef void (*TxfmFuncSSE2)(__m128i *input, __m128i *output, 43 const int8_t cos_bit, const int8_t *stage_range); 44 45 static void fdct32_sse4_1(__m128i *input, __m128i *output, const int8_t cos_bit, 46 const int8_t *stage_range) { 47 const int txfm_size = 32; 48 const int num_per_128 = 4; 49 int col_num = txfm_size / num_per_128; 50 int col; 51 (void)stage_range; 52 for (col = 0; col < col_num; col++) { 53 av1_fdct32_sse4_1((input + col), (output + col), cos_bit, col_num); 54 } 55 } 56 57 static void fdct64_new_sse4_1(__m128i *input, __m128i *output, 58 const int8_t cos_bit, const int8_t *stage_range) { 59 const int txfm_size = 64; 60 const int num_per_128 = 4; 61 int col_num = txfm_size / num_per_128; 62 (void)stage_range; 63 for (int col = 0; col < col_num; col++) { 64 av1_fdct64_sse4_1((input + col), (output + col), cos_bit, col_num, col_num); 65 } 66 } 67 static void idtx32x32_sse4_1(__m128i *input, __m128i *output, 68 const int8_t cos_bit, const int8_t *stage_range) { 69 (void)stage_range; 70 71 for (int i = 0; i < 8; i++) { 72 av1_idtx32_sse4_1(&input[i * 32], &output[i * 32], cos_bit, 1); 73 } 74 } 75 76 static inline TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { 77 switch (txfm_type) { 78 case TXFM_TYPE_DCT32: return fdct32_sse4_1; 79 case TXFM_TYPE_DCT64: return fdct64_new_sse4_1; 80 case TXFM_TYPE_IDENTITY32: return idtx32x32_sse4_1; 81 default: assert(0); 82 } 83 return NULL; 84 } 85 86 static inline void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, 87 const int stride, 88 const TXFM_2D_FLIP_CFG *cfg, 89 int32_t *txfm_buf) { 90 // TODO(sarahparker) This does not currently support rectangular transforms 91 // and will break without splitting txfm_size out into row and col size. 92 // Rectangular transforms use c code only, so it should be ok for now. 93 // It will be corrected when there are sse implementations for rectangular 94 // transforms. 95 assert(cfg->tx_size < TX_SIZES); 96 const int txfm_size = tx_size_wide[cfg->tx_size]; 97 const int8_t *shift = cfg->shift; 98 const int8_t *stage_range_col = cfg->stage_range_col; 99 const int8_t *stage_range_row = cfg->stage_range_row; 100 const int8_t cos_bit_col = cfg->cos_bit_col; 101 const int8_t cos_bit_row = cfg->cos_bit_row; 102 const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); 103 const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); 104 105 __m128i *buf_128 = (__m128i *)txfm_buf; 106 __m128i *out_128 = (__m128i *)output; 107 int num_per_128 = 4; 108 int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; 109 110 int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf, 111 txfm_size); 112 av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]); 113 txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); 114 av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); 115 transpose_32(txfm_size, out_128, buf_128); 116 txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row); 117 av1_round_shift_array_32_sse4_1(out_128, out_128, txfm2d_size_128, -shift[2]); 118 } 119 120 static inline void fwd_txfm2d_64x64_sse4_1(const int16_t *input, 121 int32_t *output, const int stride, 122 const TXFM_2D_FLIP_CFG *cfg, 123 int32_t *txfm_buf) { 124 assert(cfg->tx_size < TX_SIZES); 125 const int txfm_size = tx_size_wide[cfg->tx_size]; 126 const int8_t *shift = cfg->shift; 127 const int8_t *stage_range_col = cfg->stage_range_col; 128 const int8_t cos_bit_col = cfg->cos_bit_col; 129 const int8_t cos_bit_row = cfg->cos_bit_row; 130 const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); 131 __m128i *buf_128 = (__m128i *)txfm_buf; 132 __m128i *out_128 = (__m128i *)output; 133 134 const int num_per_128 = 4; 135 int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; 136 int col_num = txfm_size / num_per_128; 137 138 int16_array_with_stride_to_int32_array_without_stride(input, stride, output, 139 txfm_size); 140 /*col wise transform*/ 141 txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); 142 av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); 143 transpose_32(txfm_size, out_128, buf_128); 144 145 /*row wise transform*/ 146 for (int col = 0; col < (col_num >> 1); col++) { 147 av1_fdct64_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row, col_num, 148 (col_num >> 1)); 149 } 150 151 txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1); 152 av1_round_shift_array_32_sse4_1(out_128, out_128, txfm2d_size_128, -shift[2]); 153 } 154 155 void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, 156 int stride, TX_TYPE tx_type, int bd) { 157 DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); 158 TXFM_2D_FLIP_CFG cfg; 159 av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg); 160 (void)bd; 161 fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); 162 } 163 164 void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, 165 int stride, TX_TYPE tx_type, int bd) { 166 DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]); 167 TXFM_2D_FLIP_CFG cfg; 168 av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); 169 (void)bd; 170 fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf); 171 } 172 173 static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, 174 int stride, TX_TYPE tx_type, int bd) { 175 (void)bd; 176 (void)tx_type; 177 assert(tx_type == DCT_DCT); 178 const TX_SIZE tx_size = TX_64X64; 179 __m128i buf0[64], buf1[512]; 180 const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; 181 const int txw_idx = get_txw_idx(tx_size); 182 const int txh_idx = get_txh_idx(tx_size); 183 const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 184 const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; 185 const int width = tx_size_wide[tx_size]; 186 const int height = tx_size_high[tx_size]; 187 const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; 188 const int width_div8 = (width >> 3); 189 const int height_div8 = (height >> 3); 190 191 for (int i = 0; i < width_div8; i++) { 192 load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); 193 round_shift_16bit(buf0, height, shift[0]); 194 col_txfm(buf0, buf0, cos_bit_col); 195 round_shift_16bit(buf0, height, shift[1]); 196 for (int j = 0; j < AOMMIN(4, height_div8); ++j) { 197 transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); 198 } 199 } 200 for (int i = 0; i < AOMMIN(4, height_div8); i++) { 201 __m128i bufA[64]; 202 __m128i bufB[64]; 203 __m128i *buf = buf1 + width * i; 204 for (int j = 0; j < width; ++j) { 205 bufA[j] = _mm_cvtepi16_epi32(buf[j]); 206 bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); 207 } 208 av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1); 209 av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1); 210 av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]); 211 av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]); 212 213 store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32); 214 } 215 } 216 217 static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output, 218 int stride, TX_TYPE tx_type, int bd) { 219 (void)bd; 220 const TX_SIZE tx_size = TX_64X32; 221 __m128i buf0[64], buf1[256]; 222 const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; 223 const int txw_idx = get_txw_idx(tx_size); 224 const int txh_idx = get_txh_idx(tx_size); 225 const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 226 const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; 227 const int width = tx_size_wide[tx_size]; 228 const int height = tx_size_high[tx_size]; 229 const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; 230 const int width_div8 = (width >> 3); 231 const int height_div8 = (height >> 3); 232 233 for (int i = 0; i < width_div8; i++) { 234 load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); 235 round_shift_16bit(buf0, height, shift[0]); 236 col_txfm(buf0, buf0, cos_bit_col); 237 round_shift_16bit(buf0, height, shift[1]); 238 for (int j = 0; j < AOMMIN(4, height_div8); ++j) { 239 transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); 240 } 241 } 242 assert(tx_type == DCT_DCT); 243 for (int i = 0; i < AOMMIN(4, height_div8); i++) { 244 __m128i bufA[64]; 245 __m128i bufB[64]; 246 __m128i *buf = buf1 + width * i; 247 for (int j = 0; j < width; ++j) { 248 bufA[j] = _mm_cvtepi16_epi32(buf[j]); 249 bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); 250 } 251 av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1); 252 av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1); 253 av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); 254 av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); 255 256 store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32); 257 } 258 } 259 260 static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output, 261 int stride, TX_TYPE tx_type, int bd) { 262 (void)bd; 263 (void)tx_type; 264 assert(tx_type == DCT_DCT); 265 const TX_SIZE tx_size = TX_32X64; 266 __m128i buf0[64], buf1[256]; 267 const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; 268 const int txw_idx = get_txw_idx(tx_size); 269 const int txh_idx = get_txh_idx(tx_size); 270 const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; 271 const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; 272 const int width = tx_size_wide[tx_size]; 273 const int height = tx_size_high[tx_size]; 274 const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; 275 const int width_div8 = (width >> 3); 276 const int height_div8 = (height >> 3); 277 278 for (int i = 0; i < width_div8; i++) { 279 load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); 280 round_shift_16bit(buf0, height, shift[0]); 281 col_txfm(buf0, buf0, cos_bit_col); 282 round_shift_16bit(buf0, height, shift[1]); 283 for (int j = 0; j < AOMMIN(4, height_div8); ++j) { 284 transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); 285 } 286 } 287 288 for (int i = 0; i < AOMMIN(4, height_div8); i++) { 289 __m128i bufA[32]; 290 __m128i bufB[32]; 291 __m128i *buf = buf1 + width * i; 292 for (int j = 0; j < width; ++j) { 293 bufA[j] = _mm_cvtepi16_epi32(buf[j]); 294 bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); 295 } 296 av1_fdct32_sse4_1(bufA, bufA, cos_bit_row, 1); 297 av1_fdct32_sse4_1(bufB, bufB, cos_bit_row, 1); 298 av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); 299 av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); 300 301 store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32); 302 } 303 } 304 305 static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { 306 av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform 307 av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform 308 av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform 309 av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform 310 lowbd_fwd_txfm2d_64x64_sse4_1, // 64x64 transform 311 av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform 312 av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform 313 av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform 314 av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform 315 av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform 316 av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform 317 lowbd_fwd_txfm2d_32x64_sse4_1, // 32x64 transform 318 lowbd_fwd_txfm2d_64x32_sse4_1, // 64x32 transform 319 av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform 320 av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform 321 av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform 322 av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform 323 av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform 324 av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform 325 }; 326 327 void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff, 328 int diff_stride, TxfmParam *txfm_param) { 329 FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; 330 if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) { 331 av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); 332 } else { 333 fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, 334 txfm_param->bd); 335 } 336 }