reconinter_enc_sse2.c (12715B)
1 /* 2 * Copyright (c) 2021, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 #include <emmintrin.h> // SSE2 14 15 #include "config/aom_config.h" 16 #include "config/aom_dsp_rtcd.h" 17 #include "config/aom_scale_rtcd.h" 18 19 #include "aom/aom_integer.h" 20 #include "aom_dsp/blend.h" 21 #include "aom_dsp/x86/mem_sse2.h" 22 #include "aom_dsp/x86/synonyms.h" 23 24 #include "av1/common/av1_common_int.h" 25 #include "av1/common/blockd.h" 26 #include "av1/common/mvref_common.h" 27 #include "av1/common/obmc.h" 28 #include "av1/common/reconinter.h" 29 #include "av1/common/reconintra.h" 30 #include "av1/encoder/reconinter_enc.h" 31 32 void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, 33 int mi_row, int mi_col, const MV *const mv, 34 uint8_t *comp_pred, int width, int height, 35 int subpel_x_q3, int subpel_y_q3, 36 const uint8_t *ref, int ref_stride, 37 int subpel_search) { 38 // expect xd == NULL only in tests 39 if (xd != NULL) { 40 const MB_MODE_INFO *mi = xd->mi[0]; 41 const int ref_num = 0; 42 const int is_intrabc = is_intrabc_block(mi); 43 const struct scale_factors *const sf = 44 is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; 45 const int is_scaled = av1_is_scaled(sf); 46 47 if (is_scaled) { 48 int plane = 0; 49 const int mi_x = mi_col * MI_SIZE; 50 const int mi_y = mi_row * MI_SIZE; 51 const struct macroblockd_plane *const pd = &xd->plane[plane]; 52 const struct buf_2d *const dst_buf = &pd->dst; 53 const struct buf_2d *const pre_buf = 54 is_intrabc ? dst_buf : &pd->pre[ref_num]; 55 56 InterPredParams inter_pred_params; 57 inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); 58 const int_interpfilters filters = 59 av1_broadcast_interp_filter(EIGHTTAP_REGULAR); 60 av1_init_inter_params( 61 &inter_pred_params, width, height, mi_y >> pd->subsampling_y, 62 mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, 63 xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); 64 av1_enc_build_one_inter_predictor(comp_pred, width, mv, 65 &inter_pred_params); 66 return; 67 } 68 } 69 70 const InterpFilterParams *filter = av1_get_filter(subpel_search); 71 // (TODO:yunqing) 2-tap case uses 4-tap functions since there is no SIMD for 72 // 2-tap yet. 73 int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS; 74 75 if (!subpel_x_q3 && !subpel_y_q3) { 76 if (width >= 16) { 77 int i; 78 assert(!(width & 15)); 79 /*Read 16 pixels one row at a time.*/ 80 for (i = 0; i < height; i++) { 81 int j; 82 for (j = 0; j < width; j += 16) { 83 xx_storeu_128(comp_pred, xx_loadu_128(ref)); 84 comp_pred += 16; 85 ref += 16; 86 } 87 ref += ref_stride - width; 88 } 89 } else if (width >= 8) { 90 int i; 91 assert(!(width & 7)); 92 assert(!(height & 1)); 93 /*Read 8 pixels two rows at a time.*/ 94 for (i = 0; i < height; i += 2) { 95 __m128i s0 = xx_loadl_64(ref + 0 * ref_stride); 96 __m128i s1 = xx_loadl_64(ref + 1 * ref_stride); 97 xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1)); 98 comp_pred += 16; 99 ref += 2 * ref_stride; 100 } 101 } else { 102 int i; 103 assert(!(width & 3)); 104 assert(!(height & 3)); 105 /*Read 4 pixels four rows at a time.*/ 106 for (i = 0; i < height; i++) { 107 const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride); 108 const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride); 109 const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride); 110 const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride); 111 const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1), 112 _mm_unpacklo_epi32(row2, row3)); 113 xx_storeu_128(comp_pred, reg); 114 comp_pred += 16; 115 ref += 4 * ref_stride; 116 } 117 } 118 } else if (!subpel_y_q3) { 119 const int16_t *const kernel = 120 av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); 121 aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1, 122 width, height); 123 } else if (!subpel_x_q3) { 124 const int16_t *const kernel = 125 av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); 126 aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16, 127 width, height); 128 } else { 129 DECLARE_ALIGNED(16, uint8_t, 130 temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); 131 const int16_t *const kernel_x = 132 av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); 133 const int16_t *const kernel_y = 134 av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); 135 const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1); 136 uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS) 137 ? temp + (filter_taps >> 1) * MAX_SB_SIZE 138 : temp; 139 uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); 140 int intermediate_height = 141 (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; 142 assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); 143 aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE, 144 kernel_x, 16, NULL, -1, width, intermediate_height); 145 aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1, 146 kernel_y, 16, width, height); 147 } 148 } 149 150 #if CONFIG_AV1_HIGHBITDEPTH 151 void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, 152 const struct AV1Common *const cm, 153 int mi_row, int mi_col, const MV *const mv, 154 uint8_t *comp_pred8, int width, int height, 155 int subpel_x_q3, int subpel_y_q3, 156 const uint8_t *ref8, int ref_stride, int bd, 157 int subpel_search) { 158 // expect xd == NULL only in tests 159 if (xd != NULL) { 160 const MB_MODE_INFO *mi = xd->mi[0]; 161 const int ref_num = 0; 162 const int is_intrabc = is_intrabc_block(mi); 163 const struct scale_factors *const sf = 164 is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; 165 const int is_scaled = av1_is_scaled(sf); 166 167 if (is_scaled) { 168 int plane = 0; 169 const int mi_x = mi_col * MI_SIZE; 170 const int mi_y = mi_row * MI_SIZE; 171 const struct macroblockd_plane *const pd = &xd->plane[plane]; 172 const struct buf_2d *const dst_buf = &pd->dst; 173 const struct buf_2d *const pre_buf = 174 is_intrabc ? dst_buf : &pd->pre[ref_num]; 175 176 InterPredParams inter_pred_params; 177 inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); 178 const int_interpfilters filters = 179 av1_broadcast_interp_filter(EIGHTTAP_REGULAR); 180 av1_init_inter_params( 181 &inter_pred_params, width, height, mi_y >> pd->subsampling_y, 182 mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, 183 xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); 184 av1_enc_build_one_inter_predictor(comp_pred8, width, mv, 185 &inter_pred_params); 186 return; 187 } 188 } 189 190 const InterpFilterParams *filter = av1_get_filter(subpel_search); 191 int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS; 192 if (!subpel_x_q3 && !subpel_y_q3) { 193 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); 194 uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); 195 if (width >= 8) { 196 int i; 197 assert(!(width & 7)); 198 /*Read 8 pixels one row at a time.*/ 199 for (i = 0; i < height; i++) { 200 int j; 201 for (j = 0; j < width; j += 8) { 202 __m128i s0 = _mm_loadu_si128((const __m128i *)ref); 203 _mm_storeu_si128((__m128i *)comp_pred, s0); 204 comp_pred += 8; 205 ref += 8; 206 } 207 ref += ref_stride - width; 208 } 209 } else { 210 int i; 211 assert(!(width & 3)); 212 /*Read 4 pixels two rows at a time.*/ 213 for (i = 0; i < height; i += 2) { 214 __m128i s0 = _mm_loadl_epi64((const __m128i *)ref); 215 __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride)); 216 __m128i t0 = _mm_unpacklo_epi64(s0, s1); 217 _mm_storeu_si128((__m128i *)comp_pred, t0); 218 comp_pred += 8; 219 ref += 2 * ref_stride; 220 } 221 } 222 } else if (!subpel_y_q3) { 223 const int16_t *const kernel = 224 av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); 225 aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16, 226 NULL, -1, width, height, bd); 227 } else if (!subpel_x_q3) { 228 const int16_t *const kernel = 229 av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); 230 aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1, 231 kernel, 16, width, height, bd); 232 } else { 233 DECLARE_ALIGNED(16, uint16_t, 234 temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); 235 const int16_t *const kernel_x = 236 av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); 237 const int16_t *const kernel_y = 238 av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); 239 const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1); 240 uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS) 241 ? temp + (filter_taps >> 1) * MAX_SB_SIZE 242 : temp; 243 uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); 244 const int intermediate_height = 245 (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; 246 assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); 247 aom_highbd_convolve8_horiz( 248 ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz), 249 MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd); 250 aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE, 251 comp_pred8, width, NULL, -1, kernel_y, 16, width, 252 height, bd); 253 } 254 } 255 256 void aom_highbd_comp_avg_upsampled_pred_sse2( 257 MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, 258 const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, 259 int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, 260 int ref_stride, int bd, int subpel_search) { 261 aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, 262 height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, 263 bd, subpel_search); 264 uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); 265 uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); 266 /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/ 267 assert(!(width * height & 7)); 268 int n = width * height >> 3; 269 for (int i = 0; i < n; i++) { 270 __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16); 271 __m128i p0 = _mm_loadu_si128((const __m128i *)pred); 272 _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0)); 273 comp_pred16 += 8; 274 pred += 8; 275 } 276 } 277 #endif // CONFIG_AV1_HIGHBITDEPTH 278 279 void aom_comp_avg_upsampled_pred_sse2( 280 MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, 281 const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, 282 int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, 283 int ref_stride, int subpel_search) { 284 int n; 285 int i; 286 aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, 287 subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); 288 /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ 289 assert(!(width * height & 15)); 290 n = width * height >> 4; 291 for (i = 0; i < n; i++) { 292 __m128i s0 = xx_loadu_128(comp_pred); 293 __m128i p0 = xx_loadu_128(pred); 294 xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0)); 295 comp_pred += 16; 296 pred += 16; 297 } 298 }