resize_sse2.c (15713B)
1 /* 2 * Copyright (c) 2024, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 #include <immintrin.h> 12 13 #include "config/av1_rtcd.h" 14 15 #include "av1/common/resize.h" 16 17 #include "aom_dsp/x86/synonyms.h" 18 19 #define ROW_OFFSET 5 20 21 #define PROCESS_RESIZE_Y_WD8 \ 22 /* ah0 ah1 ... ah7 */ \ 23 const __m128i AH = _mm_add_epi16(l0, l7); \ 24 /* bg0 bg1 ... bh7 */ \ 25 const __m128i BG = _mm_add_epi16(l1, l6); \ 26 /* cf0 cf1 ... cf7 */ \ 27 const __m128i CF = _mm_add_epi16(l2, l5); \ 28 /* de0 de1 ... de7 */ \ 29 const __m128i DE = _mm_add_epi16(l3, l4); \ 30 \ 31 /* ah0 bg0 ... ah3 bg3 */ \ 32 const __m128i AHBG_low = _mm_unpacklo_epi16(AH, BG); \ 33 /*cf0 de0 ... cf2 de2 */ \ 34 const __m128i CFDE_low = _mm_unpacklo_epi16(CF, DE); \ 35 \ 36 /* ah4 bg4... ah7 bg7 */ \ 37 const __m128i AHBG_hi = _mm_unpackhi_epi16(AH, BG); \ 38 /* cf4 de4... cf7 de7 */ \ 39 const __m128i CFDE_hi = _mm_unpackhi_epi16(CF, DE); \ 40 \ 41 /* r00 r01 r02 r03 */ \ 42 const __m128i r00 = _mm_madd_epi16(AHBG_low, coeffs_y[0]); \ 43 const __m128i r01 = _mm_madd_epi16(CFDE_low, coeffs_y[1]); \ 44 __m128i r0 = _mm_add_epi32(r00, r01); \ 45 /* r04 r05 r06 r07 */ \ 46 const __m128i r10 = _mm_madd_epi16(AHBG_hi, coeffs_y[0]); \ 47 const __m128i r11 = _mm_madd_epi16(CFDE_hi, coeffs_y[1]); \ 48 __m128i r1 = _mm_add_epi32(r10, r11); \ 49 \ 50 r0 = _mm_add_epi32(r0, round_const_bits); \ 51 r1 = _mm_add_epi32(r1, round_const_bits); \ 52 r0 = _mm_sra_epi32(r0, round_shift_bits); \ 53 r1 = _mm_sra_epi32(r1, round_shift_bits); \ 54 \ 55 /* r00 ... r07 (8 values of each 16bit) */ \ 56 const __m128i res_16b = _mm_packs_epi32(r0, r1); \ 57 /* r00 ... r07 | r00 ... r07 (16 values of each 8bit) */ \ 58 const __m128i res_8b0 = _mm_packus_epi16(res_16b, res_16b); \ 59 \ 60 __m128i res = _mm_min_epu8(res_8b0, clip_pixel); \ 61 res = _mm_max_epu8(res, zero); \ 62 _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + j], res); \ 63 \ 64 l0 = l2; \ 65 l1 = l3; \ 66 l2 = l4; \ 67 l3 = l5; \ 68 l4 = l6; \ 69 l5 = l7; \ 70 data += 2 * stride; 71 72 static inline void prepare_filter_coeffs(const int16_t *filter, 73 __m128i *const coeffs /* [2] */) { 74 // f0 f1 f2 f3 x x x x 75 const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter); 76 77 // f1 f0 f3 f2 x x x x 78 const __m128i tmp1 = _mm_shufflelo_epi16(sym_even_filter, 0xb1); 79 80 // f3 f2 f3 f2 ... 81 coeffs[0] = _mm_shuffle_epi32(tmp1, 0x55); 82 // f1 f0 f1 f0 ... 83 coeffs[1] = _mm_shuffle_epi32(tmp1, 0x00); 84 } 85 86 bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, 87 int height, int height2, int stride, 88 int start_col) { 89 // For the GM tool, the input layer height or width is assured to be an even 90 // number. Hence the function 'down2_symodd()' is not invoked and SIMD 91 // optimization of the same is not implemented. 92 // When the input height is less than 8 and even, the potential input 93 // heights are limited to 2, 4, or 6. These scenarios require seperate 94 // handling due to padding requirements. Invoking the C function here will 95 // eliminate the need for conditional statements within the subsequent SIMD 96 // code to manage these cases. 97 if (height & 1 || height < 8) { 98 return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2, 99 stride, start_col); 100 } 101 102 __m128i coeffs_y[2]; 103 const int bits = FILTER_BITS; 104 const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); 105 const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); 106 const uint8_t max_pixel = 255; 107 const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel); 108 const __m128i zero = _mm_setzero_si128(); 109 prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y); 110 111 const int remain_col = stride % 8; 112 113 for (int j = start_col; j < stride - remain_col; j += 8) { 114 uint8_t *data = &intbuf[j]; 115 // d0 ... d7 116 const __m128i l8_3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride)); 117 // Padding top 3 rows with the last available row at the top. 118 // a0 ... a7 119 const __m128i l8_0 = l8_3; 120 // b0 ... b7 121 const __m128i l8_1 = l8_3; 122 // c0 ... c7 123 const __m128i l8_2 = l8_3; 124 // e0 ... e7 125 const __m128i l8_4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride)); 126 // f0 ... f7 127 const __m128i l8_5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride)); 128 129 // Convert to 16bit as addition of 2 source pixel crosses 8 bit. 130 __m128i l0 = _mm_unpacklo_epi8(l8_0, zero); // A(128bit) = a0 - a7(16 bit) 131 __m128i l1 = _mm_unpacklo_epi8(l8_1, zero); // B(128bit) = b0 - b7(16 bit) 132 __m128i l2 = _mm_unpacklo_epi8(l8_2, zero); // C(128bit) = c0 - c7(16 bit) 133 __m128i l3 = _mm_unpacklo_epi8(l8_3, zero); // D(128bit) = d0 - d7(16 bit) 134 __m128i l4 = _mm_unpacklo_epi8(l8_4, zero); // E(128bit) = e0 - e7(16 bit) 135 __m128i l5 = _mm_unpacklo_epi8(l8_5, zero); // F(128bit) = f0 - f7(16 bit) 136 137 // Increment the pointer such that the loading starts from row G. 138 data = data + 3 * stride; 139 // The core vertical SIMD processes 2 input rows simultaneously to generate 140 // output corresponding to 1 row. To streamline the core loop and eliminate 141 // the need for conditional checks, the remaining rows 4 are processed 142 // separately. 143 for (int i = 0; i < height - 4; i += 2) { 144 // g0 ... g7 145 __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data)); 146 // h0 ... h7 147 __m128i l8_7 = _mm_loadl_epi64((__m128i *)(data + stride)); 148 __m128i l6 = _mm_unpacklo_epi8(l8_6, zero); // G(128bit):g0-g7(16b) 149 __m128i l7 = _mm_unpacklo_epi8(l8_7, zero); // H(128bit):h0-h7(16b) 150 151 PROCESS_RESIZE_Y_WD8 152 } 153 154 __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data)); 155 __m128i l6 = _mm_unpacklo_epi8(l8_6, zero); 156 // Process the last 4 input rows here. 157 for (int i = height - 4; i < height; i += 2) { 158 __m128i l7 = l6; 159 PROCESS_RESIZE_Y_WD8 160 } 161 } 162 163 if (remain_col) 164 return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2, 165 stride, stride - remain_col); 166 167 return true; 168 } 169 170 // Blends a and b using mask and returns the result. 171 static inline __m128i blend(__m128i a, __m128i b, __m128i mask) { 172 const __m128i masked_b = _mm_and_si128(mask, b); 173 const __m128i masked_a = _mm_andnot_si128(mask, a); 174 return (_mm_or_si128(masked_a, masked_b)); 175 } 176 177 // Masks used for width 16 pixels, with left and right padding 178 // requirements. 179 static const uint8_t left_padding_mask[16] = { 180 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 181 }; 182 183 static const uint8_t right_padding_mask[16] = { 0, 0, 0, 0, 0, 0, 184 0, 0, 0, 0, 255, 255, 185 255, 255, 255, 255 }; 186 187 static const uint8_t mask_16[16] = { 188 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 189 }; 190 191 void av1_resize_horz_dir_sse2(const uint8_t *const input, int in_stride, 192 uint8_t *intbuf, int height, int filtered_length, 193 int width2) { 194 assert(height % 2 == 0); 195 // Invoke C for width less than 16. 196 if (filtered_length < 16) { 197 av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length, 198 width2); 199 return; 200 } 201 202 __m128i coeffs_x[2]; 203 const int bits = FILTER_BITS; 204 const int dst_stride = width2; 205 const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); 206 const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); 207 208 const uint8_t max_pixel = 255; 209 const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel); 210 const __m128i zero = _mm_setzero_si128(); 211 212 const __m128i start_pad_mask = _mm_loadu_si128((__m128i *)left_padding_mask); 213 const __m128i end_pad_mask = _mm_loadu_si128((__m128i *)right_padding_mask); 214 const __m128i mask_even = _mm_loadu_si128((__m128i *)mask_16); 215 prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x); 216 217 for (int i = 0; i < height; ++i) { 218 int filter_offset = 0; 219 int row01_offset = ROW_OFFSET; 220 int remain_col = filtered_length; 221 // To avoid pixel over-read at frame boundary, processing of 16 pixels 222 // is done using the core loop only if sufficient number of pixels required 223 // for the load are present.The remaining pixels are processed separately. 224 for (int j = 0; j <= filtered_length - 16; j += 16) { 225 if (remain_col == 18 || remain_col == 20) { 226 break; 227 } 228 const int is_last_cols16 = (j == filtered_length - 16); 229 // While processing the last 16 pixels of the row, ensure that only valid 230 // pixels are loaded. 231 if (is_last_cols16) row01_offset = 0; 232 const int in_idx = i * in_stride + j - filter_offset; 233 const int out_idx = i * dst_stride + j / 2; 234 remain_col -= 16; 235 // a0 a1 a2 a3 .... a15 236 __m128i row00 = _mm_loadu_si128((__m128i *)&input[in_idx]); 237 // a8 a9 a10 a11 .... a23 238 __m128i row01 = _mm_loadu_si128( 239 (__m128i *)&input[in_idx + row01_offset + filter_offset]); 240 filter_offset = 3; 241 242 // Pad start pixels to the left, while processing the first pixels in the 243 // row. 244 if (j == 0) { 245 const __m128i start_pixel_row0 = 246 _mm_set1_epi8((char)input[i * in_stride]); 247 row00 = 248 blend(_mm_slli_si128(row00, 3), start_pixel_row0, start_pad_mask); 249 } 250 251 // Pad end pixels to the right, while processing the last pixels in the 252 // row. 253 if (is_last_cols16) { 254 const __m128i end_pixel_row0 = 255 _mm_set1_epi8((char)input[i * in_stride + filtered_length - 1]); 256 row01 = blend(_mm_srli_si128(row01, ROW_OFFSET), end_pixel_row0, 257 end_pad_mask); 258 } 259 260 // a2 a3 a4 a5 a6 a7 a8 a9 .... a17 261 const __m128i row0_1 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 2), 262 _mm_srli_si128(row01, 2)); 263 // a4 a5 a6 a7 a9 10 a11 a12 .... a19 264 const __m128i row0_2 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 4), 265 _mm_srli_si128(row01, 4)); 266 // a6 a7 a8 a9 a10 a11 a12 a13 .... a21 267 const __m128i row0_3 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 6), 268 _mm_srli_si128(row01, 6)); 269 270 // a0 a2 a4 a6 a8 a10 a12 a14 (each 16 bit) 271 const __m128i s0 = _mm_and_si128(row00, mask_even); 272 // a1 a3 a5 a7 a9 a11 a13 a15 273 const __m128i s1 = _mm_and_si128(_mm_srli_epi16(row00, 8), mask_even); 274 // a2 a4 a6 a8 a10 a12 a14 a16 275 const __m128i s2 = _mm_and_si128(row0_1, mask_even); 276 // a3 a5 a7 a9 a11 a13 a15 a17 277 const __m128i s3 = _mm_and_si128(_mm_srli_epi16(row0_1, 8), mask_even); 278 // a4 a6 a8 a10 a12 a14 a16 a18 279 const __m128i s4 = _mm_and_si128(row0_2, mask_even); 280 // a5 a7 a9 a11 a13 a15 a17 a19 281 const __m128i s5 = _mm_and_si128(_mm_srli_epi16(row0_2, 8), mask_even); 282 // a6 a8 a10 a12 a14 a16 a18 a20 283 const __m128i s6 = _mm_and_si128(row0_3, mask_even); 284 // a7 a9 a11 a13 a15 a17 a19 a21 285 const __m128i s7 = _mm_and_si128(_mm_srli_epi16(row0_3, 8), mask_even); 286 287 // a0a7 a2a9 a4a11 .... a12a19 a14a21 288 const __m128i s07 = _mm_add_epi16(s0, s7); 289 // a1a6 a3a8 a5a10 .... a13a18 a15a20 290 const __m128i s16 = _mm_add_epi16(s1, s6); 291 // a2a5 a4a7 a6a9 .... a14a17 a16a19 292 const __m128i s25 = _mm_add_epi16(s2, s5); 293 // a3a4 a5a6 a7a8 .... a15a16 a17a18 294 const __m128i s34 = _mm_add_epi16(s3, s4); 295 296 // a0a7 a1a6 a2a9 a3a8 a4a11 a5a10 a6a13 a7a12 297 const __m128i s1607_low = _mm_unpacklo_epi16(s07, s16); 298 // a2a5 a3a4 a4a7 a5a6 a6a9 a7a8 a8a11 a9a10 299 const __m128i s3425_low = _mm_unpacklo_epi16(s25, s34); 300 301 // a8a15 a9a14 a10a17 a11a16 a12a19 a13a18 a14a21 a15a20 302 const __m128i s1607_high = _mm_unpackhi_epi16(s07, s16); 303 // a10a13 a11a12 a12a15 a13a14 a14a17 a15a16 a16a19 a17a18 304 const __m128i s3425_high = _mm_unpackhi_epi16(s25, s34); 305 306 const __m128i r01_0 = _mm_madd_epi16(s3425_low, coeffs_x[1]); 307 const __m128i r01_1 = _mm_madd_epi16(s1607_low, coeffs_x[0]); 308 const __m128i r01_2 = _mm_madd_epi16(s3425_high, coeffs_x[1]); 309 const __m128i r01_3 = _mm_madd_epi16(s1607_high, coeffs_x[0]); 310 311 // Result of first 8 pixels of row0 (a0 to a7). 312 // r0_0 r0_1 r0_2 r0_3 313 __m128i r00 = _mm_add_epi32(r01_0, r01_1); 314 r00 = _mm_add_epi32(r00, round_const_bits); 315 r00 = _mm_sra_epi32(r00, round_shift_bits); 316 317 // Result of next 8 pixels of row0 (a8 to 15). 318 // r0_4 r0_5 r0_6 r0_7 319 __m128i r01 = _mm_add_epi32(r01_2, r01_3); 320 r01 = _mm_add_epi32(r01, round_const_bits); 321 r01 = _mm_sra_epi32(r01, round_shift_bits); 322 323 // r0_0 r0_1 r1_2 r0_3 r0_4 r0_5 r0_6 r0_7 324 const __m128i res_16 = _mm_packs_epi32(r00, r01); 325 const __m128i res_8 = _mm_packus_epi16(res_16, res_16); 326 __m128i res = _mm_min_epu8(res_8, clip_pixel); 327 res = _mm_max_epu8(res, zero); 328 329 // r0_0 r0_1 r1_2 r0_3 r0_4 r0_5 r0_6 r0_7 330 _mm_storel_epi64((__m128i *)&intbuf[out_idx], res); 331 } 332 333 int wd_processed = filtered_length - remain_col; 334 if (remain_col) { 335 const int in_idx = (in_stride * i); 336 const int out_idx = (wd_processed / 2) + width2 * i; 337 338 down2_symeven(input + in_idx, filtered_length, intbuf + out_idx, 339 wd_processed); 340 } 341 } 342 }