highbd_convolve_ssse3.c (17314B)
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <tmmintrin.h>
#include <assert.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/x86/convolve_sse2.h"
#include "aom_dsp/x86/convolve_common_intrin.h"

// Vertical (y-only) single-reference sub-pel convolution for high-bitdepth
// (uint16_t) pixels, SSSE3 path.
//
// Supports the 12-tap filter and (in the else branch) shorter filters driven
// by prepare_coeffs()/convolve() (8-tap layout: 4 coefficient registers).
// Results are rounded by FILTER_BITS and clamped to [0, (1 << bd) - 1] via
// clip_pixel (bd is 8, 10 or 12 — see the _mm_set1_epi16 below).
//
// Tiling: columns are processed 8 at a time (j += 8) and rows 2 at a time
// (i += 2). The narrow-width store paths handle w == 4 and w == 2; wider
// widths use full 128-bit stores. NOTE(review): the i += 2 / j += 8 stepping
// implies h is even and w is 2, 4 or a multiple of 8 — presumably guaranteed
// by callers; confirm against the convolve dispatch code.
void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_y_qn, int bd) {
  int i, j;
  // Back the source pointer up so the filter window is centered on the
  // output row: taps/2 - 1 rows above.
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_vert * src_stride;
  const int bits = FILTER_BITS;

  // Round-half-up by FILTER_BITS: add (1 << bits) >> 1, then arithmetic
  // shift right by bits.
  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
  // Saturate to the pixel range of the coded bit depth.
  const __m128i clip_pixel =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i zero = _mm_setzero_si128();
  if (filter_params_y->taps == 12) {
    // Sliding window of interleaved row pairs:
    //   s[0..5]   low 64 bits of rows (0,1)(2,3)(4,5)(6,7)(8,9)(10,11)
    //             -> feeds the even output row,
    //   s[6..11]  high 64 bits of the same pairs,
    //   s[12..17] low halves of rows (1,2)(3,4)(5,6)(7,8)(9,10)(11,12)
    //             -> feeds the odd output row,
    //   s[18..23] high halves of those pairs.
    __m128i s[24], coeffs_y[6];

    prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs_y);

    for (j = 0; j < w; j += 8) {
      const uint16_t *data = &src_ptr[j];
      /* Vertical filter */
      // Prime the window with the first 11 source rows; rows 11 and 12 are
      // loaded inside the row loop.
      __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
      __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
      __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
      __m128i s9 = _mm_loadu_si128((__m128i *)(data + 9 * src_stride));
      __m128i s10 = _mm_loadu_si128((__m128i *)(data + 10 * src_stride));

      // Even output row, columns 0..3 (low halves interleaved pairwise).
      s[0] = _mm_unpacklo_epi16(s0, s1);
      s[1] = _mm_unpacklo_epi16(s2, s3);
      s[2] = _mm_unpacklo_epi16(s4, s5);
      s[3] = _mm_unpacklo_epi16(s6, s7);
      s[4] = _mm_unpacklo_epi16(s8, s9);

      // Even output row, columns 4..7 (high halves).
      s[6] = _mm_unpackhi_epi16(s0, s1);
      s[7] = _mm_unpackhi_epi16(s2, s3);
      s[8] = _mm_unpackhi_epi16(s4, s5);
      s[9] = _mm_unpackhi_epi16(s6, s7);
      s[10] = _mm_unpackhi_epi16(s8, s9);

      // Odd output row, columns 0..3 (pairs shifted down one source row).
      s[12] = _mm_unpacklo_epi16(s1, s2);
      s[13] = _mm_unpacklo_epi16(s3, s4);
      s[14] = _mm_unpacklo_epi16(s5, s6);
      s[15] = _mm_unpacklo_epi16(s7, s8);
      s[16] = _mm_unpacklo_epi16(s9, s10);

      // Odd output row, columns 4..7.
      s[18] = _mm_unpackhi_epi16(s1, s2);
      s[19] = _mm_unpackhi_epi16(s3, s4);
      s[20] = _mm_unpackhi_epi16(s5, s6);
      s[21] = _mm_unpackhi_epi16(s7, s8);
      s[22] = _mm_unpackhi_epi16(s9, s10);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];

        // Bring in the two new bottom rows for this pair of output rows.
        __m128i s11 = _mm_loadu_si128((__m128i *)(data + 11 * src_stride));
        __m128i s12 = _mm_loadu_si128((__m128i *)(data + 12 * src_stride));

        s[5] = _mm_unpacklo_epi16(s10, s11);
        s[11] = _mm_unpackhi_epi16(s10, s11);

        s[17] = _mm_unpacklo_epi16(s11, s12);
        s[23] = _mm_unpackhi_epi16(s11, s12);

        // Even row, columns 0..3: filter, then round by FILTER_BITS.
        const __m128i res_a0 = convolve_12tap(s, coeffs_y);
        __m128i res_a_round0 = _mm_sra_epi32(
            _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);

        // Odd row, columns 0..3.
        const __m128i res_a1 = convolve_12tap(s + 12, coeffs_y);
        __m128i res_a_round1 = _mm_sra_epi32(
            _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);

        if (w - j > 4) {
          // Full 8-wide tile: also compute columns 4..7 for both rows.
          const __m128i res_b0 = convolve_12tap(s + 6, coeffs_y);
          __m128i res_b_round0 = _mm_sra_epi32(
              _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);

          const __m128i res_b1 = convolve_12tap(s + 18, coeffs_y);
          __m128i res_b_round1 = _mm_sra_epi32(
              _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);

          // Pack 32->16 with saturation, then clamp to [0, max pixel].
          __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
          res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
          res_16bit0 = _mm_max_epi16(res_16bit0, zero);

          __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
          res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
          res_16bit1 = _mm_max_epi16(res_16bit1, zero);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_16bit1);
        } else if (w == 4) {
          // 4-wide block: pack/clamp and store the low 64 bits (4 pixels).
          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
          res_a_round0 = _mm_max_epi16(res_a_round0, zero);

          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
          res_a_round1 = _mm_max_epi16(res_a_round1, zero);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_a_round1);
        } else {
          // w == 2: store two pixels via a 32-bit scalar store.
          // NOTE(review): type-puns uint16_t storage through int* — works
          // with the compilers libaom targets, but a memcpy or
          // _mm_storeu_si32-style helper would be cleaner aliasing-wise.
          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
          res_a_round0 = _mm_max_epi16(res_a_round0, zero);

          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
          res_a_round1 = _mm_max_epi16(res_a_round1, zero);

          *((int *)(&dst[i * dst_stride + j])) =
              _mm_cvtsi128_si32(res_a_round0);

          *((int *)(&dst[i * dst_stride + j + dst_stride])) =
              _mm_cvtsi128_si32(res_a_round1);
        }

        // Slide the window down two rows: drop the top pair from each group.
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];
        s[3] = s[4];
        s[4] = s[5];

        s[6] = s[7];
        s[7] = s[8];
        s[8] = s[9];
        s[9] = s[10];
        s[10] = s[11];

        s[12] = s[13];
        s[13] = s[14];
        s[14] = s[15];
        s[15] = s[16];
        s[16] = s[17];

        s[18] = s[19];
        s[19] = s[20];
        s[20] = s[21];
        s[21] = s[22];
        s[22] = s[23];

        // Old row 12 becomes row 10 of the next iteration's window.
        s10 = s12;
      }
    }
  } else {
    // Shorter-tap path (coeffs from prepare_coeffs(), 4 coefficient
    // registers). Window layout mirrors the 12-tap path but with 8 slots:
    //   s[0..3]  even-row low halves, s[4..7]  even-row high halves,
    //   s[8..11] odd-row  low halves, s[12..15] odd-row high halves.
    __m128i s[16], coeffs_y[4];

    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);

    for (j = 0; j < w; j += 8) {
      const uint16_t *data = &src_ptr[j];
      /* Vertical filter */
      {
        // Prime with the first 7 rows; rows 7 and 8 load inside the loop.
        __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
        __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
        __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
        __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
        __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
        __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
        __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));

        s[0] = _mm_unpacklo_epi16(s0, s1);
        s[1] = _mm_unpacklo_epi16(s2, s3);
        s[2] = _mm_unpacklo_epi16(s4, s5);

        s[4] = _mm_unpackhi_epi16(s0, s1);
        s[5] = _mm_unpackhi_epi16(s2, s3);
        s[6] = _mm_unpackhi_epi16(s4, s5);

        s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
        s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
        s[2 + 8] = _mm_unpacklo_epi16(s5, s6);

        s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
        s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
        s[6 + 8] = _mm_unpackhi_epi16(s5, s6);

        for (i = 0; i < h; i += 2) {
          data = &src_ptr[i * src_stride + j];

          __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
          __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));

          s[3] = _mm_unpacklo_epi16(s6, s7);
          s[7] = _mm_unpackhi_epi16(s6, s7);

          s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
          s[7 + 8] = _mm_unpackhi_epi16(s7, s8);

          // Even row cols 0..3, then odd row cols 0..3; round by
          // FILTER_BITS.
          const __m128i res_a0 = convolve(s, coeffs_y);
          __m128i res_a_round0 = _mm_sra_epi32(
              _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);

          const __m128i res_a1 = convolve(s + 8, coeffs_y);
          __m128i res_a_round1 = _mm_sra_epi32(
              _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);

          if (w - j > 4) {
            // Columns 4..7 for both rows, then pack/clamp/store 8 pixels.
            const __m128i res_b0 = convolve(s + 4, coeffs_y);
            __m128i res_b_round0 = _mm_sra_epi32(
                _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);

            const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
            __m128i res_b_round1 = _mm_sra_epi32(
                _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);

            __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
            res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
            res_16bit0 = _mm_max_epi16(res_16bit0, zero);

            __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
            res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
            res_16bit1 = _mm_max_epi16(res_16bit1, zero);

            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_16bit1);
          } else if (w == 4) {
            // 4-wide: store the low 64 bits of each clamped row.
            res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
            res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
            res_a_round0 = _mm_max_epi16(res_a_round0, zero);

            res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
            res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
            res_a_round1 = _mm_max_epi16(res_a_round1, zero);

            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_a_round1);
          } else {
            // w == 2: 32-bit scalar store of two pixels (see NOTE(review)
            // on the 12-tap path about the int* type-pun).
            res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
            res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
            res_a_round0 = _mm_max_epi16(res_a_round0, zero);

            res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
            res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
            res_a_round1 = _mm_max_epi16(res_a_round1, zero);

            *((int *)(&dst[i * dst_stride + j])) =
                _mm_cvtsi128_si32(res_a_round0);

            *((int *)(&dst[i * dst_stride + j + dst_stride])) =
                _mm_cvtsi128_si32(res_a_round1);
          }

          // Slide the window down two rows.
          s[0] = s[1];
          s[1] = s[2];
          s[2] = s[3];

          s[4] = s[5];
          s[5] = s[6];
          s[6] = s[7];

          s[0 + 8] = s[1 + 8];
          s[1 + 8] = s[2 + 8];
          s[2 + 8] = s[3 + 8];

          s[4 + 8] = s[5 + 8];
          s[5 + 8] = s[6 + 8];
          s[6 + 8] = s[7 + 8];

          // Old row 8 becomes row 6 of the next window.
          s6 = s8;
        }
      }
    }
  }
}

// Horizontal (x-only) single-reference sub-pel convolution for high-bitdepth
// (uint16_t) pixels, SSSE3 path.
//
// Two rounding stages: first by conv_params->round_0, then by
// FILTER_BITS - round_0, so the total shift equals FILTER_BITS. Even and odd
// output pixels are filtered separately (inputs gathered with
// _mm_alignr_epi8 at 4-byte vs. 2-byte offsets) and re-interleaved with
// _mm_unpacklo_epi16 before clamping to [0, (1 << bd) - 1].
// NOTE(review): as in the vertical path, w is presumably 2, 4 or a multiple
// of 8 — confirm against callers.
void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const int subpel_x_qn,
                                    ConvolveParams *conv_params, int bd) {
  int i, j;
  // Center the filter window: start taps/2 - 1 pixels to the left.
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_horiz;

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  // Stage-1 rounding by round_0 (round half up).
  const __m128i round_const_x =
      _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);

  // Stage-2 rounding by the remaining FILTER_BITS - round_0.
  const int bits = FILTER_BITS - conv_params->round_0;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
  // Saturate to the pixel range of the coded bit depth.
  const __m128i clip_pixel =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i zero = _mm_setzero_si128();

  if (filter_params_x->taps == 12) {
    __m128i s[6], coeffs_x[6];

    prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs_x);

    for (j = 0; j < w; j += 8) {
      /* Horizontal filter */
      {
        for (i = 0; i < h; i += 1) {
          // 12 taps across 8 outputs needs 19 input pixels -> three
          // overlapping 8-pixel loads.
          const __m128i row00 =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
          const __m128i row01 =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
          const __m128i row02 =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 16)]);

          // even pixels
          // alignr by multiples of 4 bytes (2 pixels) gathers the inputs
          // for the even-indexed outputs.
          s[0] = _mm_alignr_epi8(row01, row00, 0);
          s[1] = _mm_alignr_epi8(row01, row00, 4);
          s[2] = _mm_alignr_epi8(row01, row00, 8);
          s[3] = _mm_alignr_epi8(row01, row00, 12);
          s[4] = _mm_alignr_epi8(row02, row01, 0);
          s[5] = _mm_alignr_epi8(row02, row01, 4);

          __m128i res_even = convolve_12tap(s, coeffs_x);
          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
                                   round_shift_x);
          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
                                   round_shift_bits);

          // odd pixels
          // Same gathers shifted by one pixel (2 bytes).
          s[0] = _mm_alignr_epi8(row01, row00, 2);
          s[1] = _mm_alignr_epi8(row01, row00, 6);
          s[2] = _mm_alignr_epi8(row01, row00, 10);
          s[3] = _mm_alignr_epi8(row01, row00, 14);
          s[4] = _mm_alignr_epi8(row02, row01, 2);
          s[5] = _mm_alignr_epi8(row02, row01, 6);

          __m128i res_odd = convolve_12tap(s, coeffs_x);
          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
                                  round_shift_x);
          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
                                  round_shift_bits);

          // Pack each half to 16 bits and re-interleave even/odd into
          // pixel order, then clamp.
          __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
          __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
          __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);

          res = _mm_min_epi16(res, clip_pixel);
          res = _mm_max_epi16(res, zero);

          // Store 8, 4 or 2 pixels depending on the remaining width.
          if (w - j > 4) {
            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
          } else if (w == 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
          } else {
            *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
          }
        }
      }
    }
  } else {
    // Shorter-tap path: 15 input pixels per 8 outputs -> two loads.
    __m128i s[4], coeffs_x[4];
    prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);

    for (j = 0; j < w; j += 8) {
      /* Horizontal filter */
      {
        for (i = 0; i < h; i += 1) {
          const __m128i row00 =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
          const __m128i row01 =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);

          // even pixels
          s[0] = _mm_alignr_epi8(row01, row00, 0);
          s[1] = _mm_alignr_epi8(row01, row00, 4);
          s[2] = _mm_alignr_epi8(row01, row00, 8);
          s[3] = _mm_alignr_epi8(row01, row00, 12);

          __m128i res_even = convolve(s, coeffs_x);
          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
                                   round_shift_x);

          // odd pixels
          s[0] = _mm_alignr_epi8(row01, row00, 2);
          s[1] = _mm_alignr_epi8(row01, row00, 6);
          s[2] = _mm_alignr_epi8(row01, row00, 10);
          s[3] = _mm_alignr_epi8(row01, row00, 14);

          __m128i res_odd = convolve(s, coeffs_x);
          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
                                  round_shift_x);

          // Second rounding stage for both halves.
          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
                                   round_shift_bits);
          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
                                  round_shift_bits);

          // Pack, interleave back into pixel order, clamp.
          __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
          __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
          __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);

          res = _mm_min_epi16(res, clip_pixel);
          res = _mm_max_epi16(res, zero);

          // Store 8, 4 or 2 pixels depending on the remaining width.
          if (w - j > 4) {
            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
          } else if (w == 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
          } else {
            *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
          }
        }
      }
    }
  }
}