highbd_convolve_sse2.c (16169B)
1 /* 2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 #include <emmintrin.h> 12 13 #include "config/aom_dsp_rtcd.h" 14 #include "aom_dsp/x86/convolve.h" 15 16 // ----------------------------------------------------------------------------- 17 18 static void aom_highbd_filter_block1d4_v4_sse2( 19 const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 20 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 21 __m128i filtersReg; 22 __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; 23 __m128i srcReg23_lo, srcReg34_lo; 24 __m128i srcReg45_lo, srcReg56_lo; 25 __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; 26 __m128i resReg23_45_lo, resReg34_56_lo; 27 __m128i resReg23_45, resReg34_56; 28 __m128i addFilterReg64, secondFilters, thirdFilters; 29 unsigned int i; 30 ptrdiff_t src_stride, dst_stride; 31 32 const __m128i max = _mm_set1_epi16((1 << bd) - 1); 33 addFilterReg64 = _mm_set1_epi32(64); 34 filtersReg = _mm_loadu_si128((const __m128i *)filter); 35 36 // coeffs 0 1 0 1 2 3 2 3 37 const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); 38 // coeffs 4 5 4 5 6 7 6 7 39 const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); 40 41 secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 42 thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 43 44 // multiply the size of the source and destination stride by two 45 src_stride = src_pitch << 1; 46 dst_stride = dst_pitch << 1; 47 48 srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); 49 srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); 50 srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3); 51 52 srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); 53 srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4); 54 55 for (i = height; i > 1; i -= 2) { 56 srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); 57 srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5); 58 59 srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); 60 srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6); 61 62 // multiply 2 adjacent elements with the filter and add the result 63 64 resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters); 65 resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters); 66 resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters); 67 resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters); 68 69 resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo); 70 resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo); 71 72 // shift by 7 bit each 32 bit 73 resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64); 74 resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64); 75 resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7); 76 resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7); 77 78 // shrink to 16 bit each 32 bits, the first lane contain the first 79 // convolve result and the second lane contain the second convolve 80 // result 81 resReg23_45 = _mm_packs_epi32(resReg23_45_lo, _mm_setzero_si128()); 82 resReg34_56 = _mm_packs_epi32(resReg34_56_lo, _mm_setzero_si128()); 83 84 resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128()); 85 resReg23_45 = _mm_min_epi16(resReg23_45, max); 86 resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128()); 87 resReg34_56 = _mm_min_epi16(resReg34_56, max); 88 89 src_ptr += src_stride; 90 91 _mm_storel_epi64((__m128i *)dst_ptr, (resReg23_45)); 92 _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), (resReg34_56)); 93 94 dst_ptr += dst_stride; 95 96 // save part of the registers for next strides 97 srcReg23_lo = srcReg45_lo; 98 srcReg34_lo = srcReg56_lo; 99 srcReg4 = srcReg6; 100 } 101 } 102 103 static void aom_highbd_filter_block1d4_h4_sse2( 104 const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 105 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 106 __m128i filtersReg; 107 __m128i addFilterReg64; 108 __m128i secondFilters, thirdFilters; 109 __m128i srcRegFilt32b1_1; 110 __m128i srcReg32b1; 111 unsigned int i; 112 src_ptr -= 3; 113 addFilterReg64 = _mm_set1_epi32(64); 114 filtersReg = _mm_loadu_si128((const __m128i *)filter); 115 const __m128i max = _mm_set1_epi16((1 << bd) - 1); 116 117 // coeffs 0 1 0 1 2 3 2 3 118 const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); 119 // coeffs 4 5 4 5 6 7 6 7 120 const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); 121 122 secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 123 thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 124 125 for (i = height; i > 0; i -= 1) { 126 srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2)); 127 128 __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2); 129 __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4); 130 __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6); 131 __m128i ss_23 = _mm_unpacklo_epi32(srcReg32b1, ss_3_1); 132 __m128i ss_45 = _mm_unpacklo_epi32(ss_4_1, ss_5_1); 133 134 ss_23 = _mm_madd_epi16(ss_23, secondFilters); 135 ss_45 = _mm_madd_epi16(ss_45, thirdFilters); 136 srcRegFilt32b1_1 = _mm_add_epi32(ss_23, ss_45); 137 138 // shift by 7 bit each 32 bit 139 srcRegFilt32b1_1 = _mm_add_epi32(srcRegFilt32b1_1, addFilterReg64); 140 srcRegFilt32b1_1 = _mm_srai_epi32(srcRegFilt32b1_1, 7); 141 142 srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128()); 143 srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); 144 srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max); 145 146 src_ptr += src_pitch; 147 148 _mm_storel_epi64((__m128i *)dst_ptr, srcRegFilt32b1_1); 149 150 dst_ptr += dst_pitch; 151 } 152 } 153 154 static void aom_highbd_filter_block1d8_v4_sse2( 155 const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 156 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 157 __m128i filtersReg; 158 __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; 159 __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; 160 __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; 161 __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; 162 __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; 163 __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; 164 __m128i resReg23_45, resReg34_56; 165 __m128i addFilterReg64, secondFilters, thirdFilters; 166 unsigned int i; 167 ptrdiff_t src_stride, dst_stride; 168 169 const __m128i max = _mm_set1_epi16((1 << bd) - 1); 170 addFilterReg64 = _mm_set1_epi32(64); 171 filtersReg = _mm_loadu_si128((const __m128i *)filter); 172 173 // coeffs 0 1 0 1 2 3 2 3 174 const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); 175 // coeffs 4 5 4 5 6 7 6 7 176 const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); 177 178 secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 179 thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 180 181 // multiple the size of the source and destination stride by two 182 src_stride = src_pitch << 1; 183 dst_stride = dst_pitch << 1; 184 185 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); 186 srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); 187 srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3); 188 srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3); 189 190 srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); 191 srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4); 192 srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4); 193 194 for (i = height; i > 1; i -= 2) { 195 srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); 196 197 srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5); 198 srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5); 199 200 srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); 201 202 srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6); 203 srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6); 204 205 // multiply 2 adjacent elements with the filter and add the result 206 207 resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters); 208 resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters); 209 resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters); 210 resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters); 211 212 resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo); 213 resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo); 214 215 // multiply 2 adjacent elements with the filter and add the result 216 217 resReg23_hi = _mm_madd_epi16(srcReg23_hi, secondFilters); 218 resReg34_hi = _mm_madd_epi16(srcReg34_hi, secondFilters); 219 resReg45_hi = _mm_madd_epi16(srcReg45_hi, thirdFilters); 220 resReg56_hi = _mm_madd_epi16(srcReg56_hi, thirdFilters); 221 222 resReg23_45_hi = _mm_add_epi32(resReg23_hi, resReg45_hi); 223 resReg34_56_hi = _mm_add_epi32(resReg34_hi, resReg56_hi); 224 225 // shift by 7 bit each 32 bit 226 resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64); 227 resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64); 228 resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64); 229 resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64); 230 resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7); 231 resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7); 232 resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7); 233 resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7); 234 235 // shrink to 16 bit each 32 bits, the first lane contain the first 236 // convolve result and the second lane contain the second convolve 237 // result 238 resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi); 239 resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi); 240 241 resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128()); 242 resReg23_45 = _mm_min_epi16(resReg23_45, max); 243 resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128()); 244 resReg34_56 = _mm_min_epi16(resReg34_56, max); 245 246 src_ptr += src_stride; 247 248 _mm_store_si128((__m128i *)dst_ptr, (resReg23_45)); 249 _mm_store_si128((__m128i *)(dst_ptr + dst_pitch), (resReg34_56)); 250 251 dst_ptr += dst_stride; 252 253 // save part of the registers for next strides 254 srcReg23_lo = srcReg45_lo; 255 srcReg23_hi = srcReg45_hi; 256 srcReg34_lo = srcReg56_lo; 257 srcReg34_hi = srcReg56_hi; 258 srcReg4 = srcReg6; 259 } 260 } 261 262 static void aom_highbd_filter_block1d8_h4_sse2( 263 const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 264 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 265 __m128i filtersReg; 266 __m128i addFilterReg64; 267 __m128i secondFilters, thirdFilters; 268 __m128i srcRegFilt32b1_1, srcRegFilt32b1_2; 269 __m128i srcReg32b1, srcReg32b2; 270 unsigned int i; 271 src_ptr -= 3; 272 addFilterReg64 = _mm_set1_epi32(64); 273 filtersReg = _mm_loadu_si128((const __m128i *)filter); 274 const __m128i max = _mm_set1_epi16((1 << bd) - 1); 275 276 // coeffs 0 1 0 1 2 3 2 3 277 const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); 278 // coeffs 4 5 4 5 6 7 6 7 279 const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); 280 281 secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 282 thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 283 284 for (i = height; i > 0; i -= 1) { 285 srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2)); 286 srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 6)); 287 288 __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4); 289 __m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4); 290 __m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2); 291 292 __m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters); 293 __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters); 294 srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); 295 296 __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2); 297 __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6); 298 __m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2); 299 __m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6); 300 __m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2); 301 __m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2); 302 303 d1 = _mm_madd_epi16(ss_3, secondFilters); 304 d2 = _mm_madd_epi16(ss_5, thirdFilters); 305 srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); 306 307 __m128i res_lo_1 = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); 308 __m128i res_hi_1 = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); 309 310 // shift by 7 bit each 32 bit 311 res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64); 312 res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64); 313 res_lo_1 = _mm_srai_epi32(res_lo_1, 7); 314 res_hi_1 = _mm_srai_epi32(res_hi_1, 7); 315 316 srcRegFilt32b1_1 = _mm_packs_epi32(res_lo_1, res_hi_1); 317 318 srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); 319 srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max); 320 321 src_ptr += src_pitch; 322 323 _mm_store_si128((__m128i *)dst_ptr, srcRegFilt32b1_1); 324 325 dst_ptr += dst_pitch; 326 } 327 } 328 329 static void aom_highbd_filter_block1d16_v4_sse2( 330 const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 331 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 332 aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, 333 height, filter, bd); 334 aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), 335 dst_pitch, height, filter, bd); 336 } 337 338 static void aom_highbd_filter_block1d16_h4_sse2( 339 const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 340 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 341 aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, 342 height, filter, bd); 343 aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), 344 dst_pitch, height, filter, bd); 345 } 346 347 // From aom_dsp/x86/aom_high_subpixel_8t_sse2.asm 348 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; 349 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; 350 highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; 351 highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; 352 highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; 353 highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; 354 355 // From aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm 356 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; 357 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; 358 highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; 359 highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2; 360 highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2; 361 highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; 362 363 // void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, 364 // ptrdiff_t src_stride, 365 // uint8_t *dst, 366 // ptrdiff_t dst_stride, 367 // const int16_t *filter_x, 368 // int x_step_q4, 369 // const int16_t *filter_y, 370 // int y_step_q4, 371 // int w, int h, int bd); 372 // void aom_highbd_convolve8_vert_sse2(const uint8_t *src, 373 // ptrdiff_t src_stride, 374 // uint8_t *dst, 375 // ptrdiff_t dst_stride, 376 // const int16_t *filter_x, 377 // int x_step_q4, 378 // const int16_t *filter_y, 379 // int y_step_q4, 380 // int w, int h, int bd); 381 HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2) 382 HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)