aom_subpixel_8t_intrin_avx2.c (60817B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <immintrin.h> 13 14 #include "config/aom_dsp_rtcd.h" 15 16 #include "aom_dsp/x86/convolve.h" 17 #include "aom_dsp/x86/convolve_avx2.h" 18 #include "aom_dsp/x86/synonyms_avx2.h" 19 #include "aom_ports/mem.h" 20 21 #if defined(__clang__) 22 #if (__clang_major__ > 0 && __clang_major__ < 3) || \ 23 (__clang_major__ == 3 && __clang_minor__ <= 3) || \ 24 (defined(__APPLE__) && defined(__apple_build_version__) && \ 25 ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ 26 (__clang_major__ == 5 && __clang_minor__ == 0))) 27 #define MM256_BROADCASTSI128_SI256(x) \ 28 _mm_broadcastsi128_si256((__m128i const *)&(x)) 29 #else // clang > 3.3, and not 5.0 on macosx. 
30 #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) 31 #endif // clang <= 3.3 32 #elif defined(__GNUC__) 33 #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) 34 #define MM256_BROADCASTSI128_SI256(x) \ 35 _mm_broadcastsi128_si256((__m128i const *)&(x)) 36 #elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 37 #define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) 38 #else // gcc > 4.7 39 #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) 40 #endif // gcc <= 4.6 41 #else // !(gcc || clang) 42 #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) 43 #endif // __clang__ 44 45 static inline void xx_storeu2_epi32(const uint8_t *output_ptr, 46 const ptrdiff_t stride, const __m256i *a) { 47 *((int *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a)); 48 *((int *)(output_ptr + stride)) = 49 _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1)); 50 } 51 52 static inline __m256i xx_loadu2_epi64(const void *hi, const void *lo) { 53 __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo))); 54 a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1); 55 return a; 56 } 57 58 static inline void xx_storeu2_epi64(const uint8_t *output_ptr, 59 const ptrdiff_t stride, const __m256i *a) { 60 _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); 61 _mm_storel_epi64((__m128i *)(output_ptr + stride), 62 _mm256_extractf128_si256(*a, 1)); 63 } 64 65 static inline void xx_store2_mi128(const uint8_t *output_ptr, 66 const ptrdiff_t stride, const __m256i *a) { 67 _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); 68 _mm_store_si128((__m128i *)(output_ptr + stride), 69 _mm256_extractf128_si256(*a, 1)); 70 } 71 72 static void aom_filter_block1d4_h4_avx2( 73 const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, 74 ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { 75 __m128i filtersReg; 76 __m256i addFilterReg32, 
filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; 77 unsigned int i; 78 ptrdiff_t src_stride, dst_stride; 79 src_ptr -= 3; 80 addFilterReg32 = _mm256_set1_epi16(32); 81 filtersReg = _mm_loadu_si128((const __m128i *)filter); 82 filtersReg = _mm_srai_epi16(filtersReg, 1); 83 // converting the 16 bit (short) to 8 bit (byte) and have the same data 84 // in both lanes of 128 bit register. 85 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); 86 // have the same data in both lanes of a 256 bit register 87 const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); 88 89 firstFilters = 90 _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); 91 filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2)); 92 93 // multiple the size of the source and destination stride by two 94 src_stride = src_pixels_per_line << 1; 95 dst_stride = output_pitch << 1; 96 for (i = output_height; i > 1; i -= 2) { 97 // load the 2 strides of source 98 srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); 99 100 // filter the source buffer 101 srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); 102 103 // multiply 4 adjacent elements with the filter and add the result 104 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); 105 106 srcRegFilt32b1_1 = 107 _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); 108 109 // shift by 6 bit each 16 bit 110 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); 111 srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); 112 113 // shrink to 8 bit each 16 bits, the first lane contain the first 114 // convolve result and the second lane contain the second convolve result 115 srcRegFilt32b1_1 = 116 _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); 117 118 src_ptr += src_stride; 119 120 xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); 121 output_ptr += dst_stride; 122 } 123 124 // if the number of strides is odd. 
125 // process only 4 bytes 126 if (i > 0) { 127 __m128i srcReg1, srcRegFilt1_1; 128 129 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); 130 131 // filter the source buffer 132 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); 133 134 // multiply 4 adjacent elements with the filter and add the result 135 srcRegFilt1_1 = 136 _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); 137 138 srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); 139 // shift by 6 bit each 16 bit 140 srcRegFilt1_1 = 141 _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); 142 srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); 143 144 // shrink to 8 bit each 16 bits, the first lane contain the first 145 // convolve result and the second lane contain the second convolve result 146 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); 147 148 // save 4 bytes 149 *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); 150 } 151 } 152 153 static void aom_filter_block1d4_h8_avx2( 154 const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, 155 ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { 156 __m128i filtersReg; 157 __m256i addFilterReg32, filt1Reg, filt2Reg; 158 __m256i firstFilters, secondFilters; 159 __m256i srcRegFilt32b1_1, srcRegFilt32b2; 160 __m256i srcReg32b1; 161 unsigned int i; 162 ptrdiff_t src_stride, dst_stride; 163 src_ptr -= 3; 164 addFilterReg32 = _mm256_set1_epi16(32); 165 filtersReg = _mm_loadu_si128((const __m128i *)filter); 166 filtersReg = _mm_srai_epi16(filtersReg, 1); 167 // converting the 16 bit (short) to 8 bit (byte) and have the same data 168 // in both lanes of 128 bit register. 
169 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); 170 // have the same data in both lanes of a 256 bit register 171 const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); 172 173 // duplicate only the first 32 bits 174 firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); 175 // duplicate only the second 32 bits 176 secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); 177 178 filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); 179 filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); 180 181 // multiple the size of the source and destination stride by two 182 src_stride = src_pixels_per_line << 1; 183 dst_stride = output_pitch << 1; 184 for (i = output_height; i > 1; i -= 2) { 185 // load the 2 strides of source 186 srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); 187 188 // filter the source buffer 189 srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); 190 191 // multiply 4 adjacent elements with the filter and add the result 192 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); 193 194 // filter the source buffer 195 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); 196 197 // multiply 4 adjacent elements with the filter and add the result 198 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); 199 200 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); 201 202 srcRegFilt32b1_1 = 203 _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); 204 205 // shift by 6 bit each 16 bit 206 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); 207 srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); 208 209 // shrink to 8 bit each 16 bits, the first lane contain the first 210 // convolve result and the second lane contain the second convolve result 211 srcRegFilt32b1_1 = 212 _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); 213 214 src_ptr += src_stride; 215 216 
xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); 217 output_ptr += dst_stride; 218 } 219 220 // if the number of strides is odd. 221 // process only 4 bytes 222 if (i > 0) { 223 __m128i srcReg1, srcRegFilt1_1; 224 __m128i srcRegFilt2; 225 226 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); 227 228 // filter the source buffer 229 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); 230 231 // multiply 4 adjacent elements with the filter and add the result 232 srcRegFilt1_1 = 233 _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); 234 235 // filter the source buffer 236 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); 237 238 // multiply 4 adjacent elements with the filter and add the result 239 srcRegFilt2 = 240 _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); 241 242 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); 243 srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); 244 // shift by 6 bit each 16 bit 245 srcRegFilt1_1 = 246 _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); 247 srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); 248 249 // shrink to 8 bit each 16 bits, the first lane contain the first 250 // convolve result and the second lane contain the second convolve result 251 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); 252 253 // save 4 bytes 254 *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); 255 } 256 } 257 258 static void aom_filter_block1d8_h4_avx2( 259 const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, 260 ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { 261 __m128i filtersReg; 262 __m256i addFilterReg32, filt2Reg, filt3Reg; 263 __m256i secondFilters, thirdFilters; 264 __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; 265 __m256i srcReg32b1, filtersReg32; 266 unsigned int i; 267 ptrdiff_t src_stride, 
dst_stride; 268 src_ptr -= 3; 269 addFilterReg32 = _mm256_set1_epi16(32); 270 filtersReg = _mm_loadu_si128((const __m128i *)filter); 271 filtersReg = _mm_srai_epi16(filtersReg, 1); 272 // converting the 16 bit (short) to 8 bit (byte) and have the same data 273 // in both lanes of 128 bit register. 274 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); 275 // have the same data in both lanes of a 256 bit register 276 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); 277 278 // duplicate only the second 16 bits (third and forth byte) 279 // across 256 bit register 280 secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); 281 // duplicate only the third 16 bits (fifth and sixth byte) 282 // across 256 bit register 283 thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); 284 285 filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); 286 filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); 287 288 // multiply the size of the source and destination stride by two 289 src_stride = src_pixels_per_line << 1; 290 dst_stride = output_pitch << 1; 291 for (i = output_height; i > 1; i -= 2) { 292 // load the 2 strides of source 293 srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); 294 295 // filter the source buffer 296 srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); 297 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); 298 299 // multiply 2 adjacent elements with the filter and add the result 300 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); 301 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); 302 303 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); 304 305 // shift by 6 bit each 16 bit 306 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); 307 srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); 308 309 // shrink to 8 bit each 16 bits 310 
srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1); 311 312 src_ptr += src_stride; 313 314 xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); 315 output_ptr += dst_stride; 316 } 317 318 // if the number of strides is odd. 319 // process only 8 bytes 320 if (i > 0) { 321 __m128i srcReg1, srcRegFilt1_1; 322 __m128i srcRegFilt2, srcRegFilt3; 323 324 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); 325 326 // filter the source buffer 327 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); 328 srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); 329 330 // multiply 2 adjacent elements with the filter and add the result 331 srcRegFilt2 = 332 _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); 333 srcRegFilt3 = 334 _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters)); 335 336 // add and saturate the results together 337 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3); 338 339 // shift by 6 bit each 16 bit 340 srcRegFilt1_1 = 341 _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); 342 srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); 343 344 // shrink to 8 bit each 16 bits 345 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); 346 347 // save 8 bytes 348 _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); 349 } 350 } 351 352 static void aom_filter_block1d8_h8_avx2( 353 const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, 354 ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { 355 __m128i filtersReg; 356 __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; 357 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; 358 __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; 359 __m256i srcReg32b1; 360 unsigned int i; 361 ptrdiff_t src_stride, dst_stride; 362 src_ptr -= 3; 363 addFilterReg32 = _mm256_set1_epi16(32); 364 filtersReg = 
_mm_loadu_si128((const __m128i *)filter); 365 filtersReg = _mm_srai_epi16(filtersReg, 1); 366 // converting the 16 bit (short) to 8 bit (byte) and have the same data 367 // in both lanes of 128 bit register. 368 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); 369 // have the same data in both lanes of a 256 bit register 370 const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); 371 372 // duplicate only the first 16 bits (first and second byte) 373 // across 256 bit register 374 firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); 375 // duplicate only the second 16 bits (third and forth byte) 376 // across 256 bit register 377 secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); 378 // duplicate only the third 16 bits (fifth and sixth byte) 379 // across 256 bit register 380 thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); 381 // duplicate only the forth 16 bits (seventh and eighth byte) 382 // across 256 bit register 383 forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); 384 385 filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); 386 filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); 387 filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); 388 filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); 389 390 // multiple the size of the source and destination stride by two 391 src_stride = src_pixels_per_line << 1; 392 dst_stride = output_pitch << 1; 393 for (i = output_height; i > 1; i -= 2) { 394 // load the 2 strides of source 395 srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); 396 397 // filter the source buffer 398 srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); 399 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); 400 401 // multiply 2 adjacent elements with the filter and add the result 402 srcRegFilt32b1_1 = 
_mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); 403 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); 404 405 // add and saturate the results together 406 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); 407 408 // filter the source buffer 409 srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); 410 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); 411 412 // multiply 2 adjacent elements with the filter and add the result 413 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); 414 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); 415 416 __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); 417 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); 418 419 // shift by 6 bit each 16 bit 420 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); 421 srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); 422 423 // shrink to 8 bit each 16 bits, the first lane contain the first 424 // convolve result and the second lane contain the second convolve result 425 srcRegFilt32b1_1 = 426 _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); 427 428 src_ptr += src_stride; 429 430 xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); 431 output_ptr += dst_stride; 432 } 433 434 // if the number of strides is odd. 
435 // process only 8 bytes 436 if (i > 0) { 437 __m128i srcReg1, srcRegFilt1_1; 438 __m128i srcRegFilt2, srcRegFilt3; 439 440 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); 441 442 // filter the source buffer 443 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); 444 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); 445 446 // multiply 2 adjacent elements with the filter and add the result 447 srcRegFilt1_1 = 448 _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); 449 srcRegFilt2 = 450 _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); 451 452 // add and saturate the results together 453 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); 454 455 // filter the source buffer 456 srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); 457 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); 458 459 // multiply 2 adjacent elements with the filter and add the result 460 srcRegFilt3 = 461 _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); 462 srcRegFilt2 = 463 _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); 464 465 // add and saturate the results together 466 srcRegFilt1_1 = 467 _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); 468 469 // shift by 6 bit each 16 bit 470 srcRegFilt1_1 = 471 _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); 472 srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); 473 474 // shrink to 8 bit each 16 bits, the first lane contain the first 475 // convolve result and the second lane contain the second convolve 476 // result 477 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); 478 479 // save 8 bytes 480 _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); 481 } 482 } 483 484 static void aom_filter_block1d16_h4_avx2( 485 const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, 
486 ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { 487 __m128i filtersReg; 488 __m256i addFilterReg32, filt2Reg, filt3Reg; 489 __m256i secondFilters, thirdFilters; 490 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; 491 __m256i srcReg32b1, srcReg32b2, filtersReg32; 492 unsigned int i; 493 ptrdiff_t src_stride, dst_stride; 494 src_ptr -= 3; 495 addFilterReg32 = _mm256_set1_epi16(32); 496 filtersReg = _mm_loadu_si128((const __m128i *)filter); 497 filtersReg = _mm_srai_epi16(filtersReg, 1); 498 // converting the 16 bit (short) to 8 bit (byte) and have the same data 499 // in both lanes of 128 bit register. 500 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); 501 // have the same data in both lanes of a 256 bit register 502 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); 503 504 // duplicate only the second 16 bits (third and forth byte) 505 // across 256 bit register 506 secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); 507 // duplicate only the third 16 bits (fifth and sixth byte) 508 // across 256 bit register 509 thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); 510 511 filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); 512 filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); 513 514 // multiply the size of the source and destination stride by two 515 src_stride = src_pixels_per_line << 1; 516 dst_stride = output_pitch << 1; 517 for (i = output_height; i > 1; i -= 2) { 518 // load the 2 strides of source 519 srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); 520 521 // filter the source buffer 522 srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); 523 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); 524 525 // multiply 2 adjacent elements with the filter and add the result 526 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); 527 srcRegFilt32b2 = 
_mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); 528 529 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); 530 531 // reading 2 strides of the next 16 bytes 532 // (part of it was being read by earlier read) 533 srcReg32b2 = yy_loadu2_128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); 534 535 // filter the source buffer 536 srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); 537 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); 538 539 // multiply 2 adjacent elements with the filter and add the result 540 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); 541 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); 542 543 // add and saturate the results together 544 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); 545 546 // shift by 6 bit each 16 bit 547 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); 548 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); 549 srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); 550 srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); 551 552 // shrink to 8 bit each 16 bits, the first lane contain the first 553 // convolve result and the second lane contain the second convolve result 554 srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); 555 556 src_ptr += src_stride; 557 558 xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); 559 output_ptr += dst_stride; 560 } 561 562 // if the number of strides is odd. 
563 // process only 16 bytes 564 if (i > 0) { 565 __m256i srcReg1, srcReg12; 566 __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1; 567 568 srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr)); 569 srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94); 570 571 // filter the source buffer 572 srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg); 573 srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg); 574 575 // multiply 2 adjacent elements with the filter and add the result 576 srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters); 577 srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters); 578 579 // add and saturate the results together 580 srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3); 581 582 // shift by 6 bit each 16 bit 583 srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32); 584 srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6); 585 586 // shrink to 8 bit each 16 bits, the first lane contain the first 587 // convolve result and the second lane contain the second convolve 588 // result 589 srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1); 590 srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8); 591 592 // save 16 bytes 593 _mm_store_si128((__m128i *)output_ptr, 594 _mm256_castsi256_si128(srcRegFilt1_1)); 595 } 596 } 597 598 static void aom_filter_block1d16_h8_avx2( 599 const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, 600 ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { 601 __m128i filtersReg; 602 __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; 603 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; 604 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; 605 __m256i srcReg32b1, srcReg32b2, filtersReg32; 606 unsigned int i; 607 ptrdiff_t src_stride, dst_stride; 608 src_ptr -= 3; 609 addFilterReg32 = _mm256_set1_epi16(32); 610 filtersReg = _mm_loadu_si128((const __m128i *)filter); 611 
filtersReg = _mm_srai_epi16(filtersReg, 1); 612 // converting the 16 bit (short) to 8 bit (byte) and have the same data 613 // in both lanes of 128 bit register. 614 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); 615 // have the same data in both lanes of a 256 bit register 616 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); 617 618 // duplicate only the first 16 bits (first and second byte) 619 // across 256 bit register 620 firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); 621 // duplicate only the second 16 bits (third and forth byte) 622 // across 256 bit register 623 secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); 624 // duplicate only the third 16 bits (fifth and sixth byte) 625 // across 256 bit register 626 thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); 627 // duplicate only the forth 16 bits (seventh and eighth byte) 628 // across 256 bit register 629 forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); 630 631 filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); 632 filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); 633 filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); 634 filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); 635 636 // multiple the size of the source and destination stride by two 637 src_stride = src_pixels_per_line << 1; 638 dst_stride = output_pitch << 1; 639 for (i = output_height; i > 1; i -= 2) { 640 // load the 2 strides of source 641 srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); 642 643 // filter the source buffer 644 srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); 645 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); 646 647 // multiply 2 adjacent elements with the filter and add the result 648 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); 649 
srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); 650 651 // add and saturate the results together 652 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); 653 654 // filter the source buffer 655 srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); 656 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); 657 658 // multiply 2 adjacent elements with the filter and add the result 659 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); 660 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); 661 662 __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); 663 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); 664 665 // reading 2 strides of the next 16 bytes 666 // (part of it was being read by earlier read) 667 srcReg32b2 = yy_loadu2_128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); 668 669 // filter the source buffer 670 srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); 671 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); 672 673 // multiply 2 adjacent elements with the filter and add the result 674 srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); 675 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); 676 677 // add and saturate the results together 678 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); 679 680 // filter the source buffer 681 srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); 682 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); 683 684 // multiply 2 adjacent elements with the filter and add the result 685 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); 686 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); 687 688 // add and saturate the results together 689 srcRegFilt32b2_1 = _mm256_adds_epi16( 690 srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2)); 691 692 // shift by 6 bit 
each 16 bit 693 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); 694 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); 695 srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); 696 srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); 697 698 // shrink to 8 bit each 16 bits, the first lane contain the first 699 // convolve result and the second lane contain the second convolve result 700 srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); 701 702 src_ptr += src_stride; 703 704 xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); 705 output_ptr += dst_stride; 706 } 707 708 // if the number of strides is odd. 709 // process only 16 bytes 710 if (i > 0) { 711 __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; 712 __m128i srcRegFilt2, srcRegFilt3; 713 714 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); 715 716 // filter the source buffer 717 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); 718 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); 719 720 // multiply 2 adjacent elements with the filter and add the result 721 srcRegFilt1_1 = 722 _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); 723 srcRegFilt2 = 724 _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); 725 726 // add and saturate the results together 727 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); 728 729 // filter the source buffer 730 srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); 731 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); 732 733 // multiply 2 adjacent elements with the filter and add the result 734 srcRegFilt3 = 735 _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); 736 srcRegFilt2 = 737 _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); 738 739 // add and saturate the results together 740 
srcRegFilt1_1 = 741 _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); 742 743 // reading the next 16 bytes 744 // (part of it was being read by earlier read) 745 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); 746 747 // filter the source buffer 748 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); 749 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); 750 751 // multiply 2 adjacent elements with the filter and add the result 752 srcRegFilt2_1 = 753 _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); 754 srcRegFilt2 = 755 _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); 756 757 // add and saturate the results together 758 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); 759 760 // filter the source buffer 761 srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); 762 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); 763 764 // multiply 2 adjacent elements with the filter and add the result 765 srcRegFilt3 = 766 _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); 767 srcRegFilt2 = 768 _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); 769 770 // add and saturate the results together 771 srcRegFilt2_1 = 772 _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); 773 774 // shift by 6 bit each 16 bit 775 srcRegFilt1_1 = 776 _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); 777 srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); 778 779 srcRegFilt2_1 = 780 _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32)); 781 srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6); 782 783 // shrink to 8 bit each 16 bits, the first lane contain the first 784 // convolve result and the second lane contain the second convolve 785 // result 786 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); 787 788 // save 
16 bytes 789 _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1); 790 } 791 }

// Vertical 4-tap filter for a column 8 pixels wide.
//
// Two output rows are produced per loop iteration: the low 128-bit lane of
// every 256-bit register works on the first row of the pair and the high lane
// on the second.
//
//   src_ptr, src_pitch    source rows; 8 bytes are read from each of the rows
//                         src_ptr + 2 * src_pitch .. src_ptr + 5 * src_pitch
//                         for the first output row.
//   output_ptr, out_pitch destination; 8 bytes are written per output row
//                         (unaligned 64-bit stores).
//   output_height         number of output rows. Rows are only produced in
//                         pairs and there is no single-row tail in this
//                         function, so with an odd height the last row is not
//                         written.
//   filter                8 int16 coefficients; only coefficients 2..5 are
//                         applied.
//                         NOTE(review): presumably this kernel is only
//                         selected when coefficients 0, 1, 6 and 7 are zero --
//                         confirm against the dispatch code.
static void aom_filter_block1d8_v4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i filtersReg32, addFilterReg32;
  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
  __m256i srcReg23_34_lo, srcReg45_56_lo;
  __m256i resReg23_34_lo, resReg45_56_lo;
  __m256i resReglo, resReg;
  __m256i secondFilters, thirdFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // rounding constant for the final ">> 6" (32 == half of 1 << 6)
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve every 16-bit coefficient, then pack to signed 8 bit (with
  // saturation); both 64-bit halves of the 128-bit register hold the same
  // eight bytes.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // replicate coefficient bytes 2 and 3 (the second byte pair) across the
  // whole 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // replicate coefficient bytes 4 and 5 (the third byte pair) across the
  // whole 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));

  // each iteration consumes two rows, so step by twice the pitch
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // low lane = row 2, high lane = row 3 (8 bytes each)
  srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
  // row 4 in the low lane
  srcReg4x = _mm256_castsi128_si256(
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));

  // low lane = row 3, high lane = row 4
  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);

  // interleave bytes of vertically adjacent rows:
  // low lane = rows 2/3, high lane = rows 3/4
  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);

  for (i = output_height; i > 1; i -= 2) {
    // load 8 bytes from each of the next two rows (5 and 6) and pair every
    // two consecutive rows in one 256 bit register:
    // srcReg45 = rows 4 | 5, srcReg56 = rows 5 | 6
    srcReg5x = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
    srcReg45 =
        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);

    srcReg6x = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
    srcReg56 =
        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);

    // interleave: low lane = rows 4/5, high lane = rows 5/6
    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);

    // multiply each pair of vertically adjacent pixels by its pair of
    // coefficients and add the two products
    resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
    resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);

    // add and saturate the results together
    resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);

    // round (+32, saturating) and shift each 16 bit value right by 6
    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
    resReglo = _mm256_srai_epi16(resReglo, 6);

    // pack each 16 bit value to 8 bit with unsigned saturation; the low lane
    // holds the first output row and the high lane the second
    resReg = _mm256_packus_epi16(resReglo, resReglo);

    src_ptr += src_stride;

    // write the low 8 bytes of each lane to two consecutive output rows
    xx_storeu2_epi64(output_ptr, out_pitch, &resReg);

    output_ptr += dst_stride;

    // after advancing two rows, old rows 4/5/6 become rows 2/3/4
    srcReg23_34_lo = srcReg45_56_lo;
    srcReg4x = srcReg6x;
  }
}

// Vertical 8-tap filter for a column 8 pixels wide.
//
// Two output rows are produced per loop iteration (low lane = first row of
// the pair, high lane = second row); a final odd row is handled by the
// 128-bit tail after the loop.
//
//   src_ptr, src_pitch    source rows; 8 bytes are read from each of the rows
//                         src_ptr .. src_ptr + 7 * src_pitch for the first
//                         output row.
//   output_ptr, out_pitch destination; 8 bytes are written per output row
//                         (unaligned 64-bit stores).
//   output_height         number of output rows (odd values are supported).
//   filter                8 int16 coefficients, all of which are applied.
static void aom_filter_block1d8_v8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // rounding constant for the final ">> 6" (32 == half of 1 << 6)
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve every 16-bit coefficient, then pack to signed 8 bit (with
  // saturation); both 64-bit halves of the 128-bit register hold the same
  // eight bytes.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // replicate coefficient bytes 0 and 1 (the first byte pair) across the
  // whole 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // replicate coefficient bytes 2 and 3 (the second byte pair)
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // replicate coefficient bytes 4 and 5 (the third byte pair)
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // replicate coefficient bytes 6 and 7 (the fourth byte pair)
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  // each iteration consumes two rows, so step by twice the pitch
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 8 bytes from each of the first 7 rows:
  // b1 = rows 0 | 1, b3 = rows 2 | 3, b5 = rows 4 | 5, b7 = row 6 (low lane)
  srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
  srcReg32b3 =
      xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
  srcReg32b5 =
      xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
  srcReg32b7 = _mm256_castsi128_si256(
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));

  // build the in-between pairs: b2 = rows 1 | 2, b4 = rows 3 | 4,
  // b6 = rows 5 | 6
  srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
  srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
  srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
  // interleave bytes of vertically adjacent rows (rows 6/7 and 7/8 are
  // completed inside the loop):
  // b10 = rows 0/1 | 1/2, b11 = rows 2/3 | 3/4, b2 = rows 4/5 | 5/6
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);

  for (i = output_height; i > 1; i -= 2) {
    // load 8 bytes from each of rows 7 and 8 and pair consecutive rows:
    // b7 = rows 6 | 7, b8 = rows 7 | 8; b9 keeps row 8 for the next iteration
    srcReg32b8 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                                         _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                                         _mm256_castsi256_si128(srcReg32b9), 1);

    // interleave: b4 = rows 6/7 | 7/8 (kept for the next iteration)
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);

    // first and fourth coefficient pairs: multiply adjacent pixels by their
    // coefficients and add the two products
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

    // second and third coefficient pairs
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));

    // round (+32, saturating) and shift each 16 bit value right by 6
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);

    // pack each 16 bit value to 8 bit with unsigned saturation; the low lane
    // holds the first output row and the high lane the second
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());

    src_ptr += src_stride;

    // write the low 8 bytes of each lane to two consecutive output rows
    xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);

    output_ptr += dst_stride;

    // after advancing two rows every interleaved pair moves up one slot and
    // old row 8 becomes the new row 6
    srcReg32b10 = srcReg32b11;
    srcReg32b11 = srcReg32b2;
    srcReg32b2 = srcReg32b4;
    srcReg32b7 = srcReg32b9;
  }
  if (i > 0) {
    // one row left: repeat the computation on the low 128-bit lanes only
    __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
    // load the last 8 bytes (row 7)
    srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

    // interleave rows 6 and 7
    srcRegFilt4 =
        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // first and fourth coefficient pairs
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 =
        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

    // second coefficient pair
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                                    _mm256_castsi256_si128(secondFilters));

    // third coefficient pair
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));

    // round (+32, saturating) and shift each 16 bit value right by 6
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);

    // pack each 16 bit value to 8 bit with unsigned saturation; the results
    // end up in the low 8 bytes
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());

    // save 8 bytes
    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
  }
}

// Vertical 4-tap filter for a column 16 pixels wide.
//
// Same scheme as the 8-wide 4-tap kernel, but 16 bytes are read per row and
// the interleaved data is split into a "lo" half (pixels 0..7) and a "hi" half
// (pixels 8..15).
//
//   src_ptr, src_pitch    source rows; 16 bytes are read (unaligned) from each
//                         of the rows src_ptr + 2 * src_pitch ..
//                         src_ptr + 5 * src_pitch for the first output row.
//   output_ptr, out_pitch destination; 16 bytes are written per output row
//                         through xx_store2_mi128(), which uses aligned
//                         128-bit stores, so every output row must be 16-byte
//                         aligned.
//   output_height         number of output rows. Rows are only produced in
//                         pairs and there is no single-row tail in this
//                         function, so with an odd height the last row is not
//                         written.
//   filter                8 int16 coefficients; only coefficients 2..5 are
//                         applied.
//                         NOTE(review): presumably this kernel is only
//                         selected when coefficients 0, 1, 6 and 7 are zero --
//                         confirm against the dispatch code.
static void aom_filter_block1d16_v4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i filtersReg32, addFilterReg32;
  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
  __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
  __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
  __m256i resReglo, resReghi, resReg;
  __m256i secondFilters, thirdFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // rounding constant for the final ">> 6" (32 == half of 1 << 6)
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve every 16-bit coefficient, then pack to signed 8 bit (with
  // saturation); both 64-bit halves of the 128-bit register hold the same
  // eight bytes.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // replicate coefficient bytes 2 and 3 (the second byte pair) across the
  // whole 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // replicate coefficient bytes 4 and 5 (the third byte pair) across the
  // whole 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));

  // each iteration consumes two rows, so step by twice the pitch
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // rows 2 and 3, 16 bytes each
  // NOTE(review): assumes yy_loadu2_128(hi, lo) puts its second argument in
  // the low lane, like xx_loadu2_epi64() -- confirm in synonyms_avx2.h
  srcReg23 = yy_loadu2_128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
  // row 4 in the low lane
  srcReg4x = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));

  // low lane = high lane of srcReg23, high lane = row 4
  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);

  // interleave bytes of vertically adjacent rows, pixels 0..7 in _lo and
  // pixels 8..15 in _hi
  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
  srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);

  for (i = output_height; i > 1; i -= 2) {
    // load 16 bytes from each of the next two rows (5 and 6) and pair every
    // two consecutive rows in one 256 bit register:
    // srcReg45 = rows 4 | 5, srcReg56 = rows 5 | 6
    srcReg5x = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
    srcReg45 =
        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);

    srcReg6x = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
    srcReg56 =
        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);

    // interleave: low lane = rows 4/5, high lane = rows 5/6
    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
    srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);

    // pixels 0..7: multiply adjacent pixels by their coefficients and add
    // the two products
    resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
    resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);

    // add and saturate the results together
    resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);

    // pixels 8..15: same computation on the hi halves
    resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
    resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);

    // add and saturate the results together
    resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);

    // round (+32, saturating) and shift each 16 bit value right by 6
    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
    resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
    resReglo = _mm256_srai_epi16(resReglo, 6);
    resReghi = _mm256_srai_epi16(resReghi, 6);

    // pack each 16 bit value to 8 bit with unsigned saturation; the low lane
    // holds the first output row and the high lane the second
    resReg = _mm256_packus_epi16(resReglo, resReghi);

    src_ptr += src_stride;

    // write one lane to each of two consecutive output rows (aligned stores)
    xx_store2_mi128(output_ptr, out_pitch, &resReg);

    output_ptr += dst_stride;

    // after advancing two rows, old rows 4/5/6 become rows 2/3/4
    srcReg23_34_lo = srcReg45_56_lo;
    srcReg23_34_hi = srcReg45_56_hi;
    srcReg4x = srcReg6x;
  }
}

// Vertical 8-tap filter for a column 16 pixels wide.
//
// Same scheme as the 8-wide 8-tap kernel, but 16 bytes are read per row; the
// interleaved data for pixels 0..7 lives in b10/b11/b2/b4 and the data for
// pixels 8..15 in b1/b3/b5/b7. A final odd row is handled by the 128-bit tail
// after the loop.
//
//   src_ptr, src_pitch    source rows; 16 bytes are read (unaligned) from each
//                         of the rows src_ptr .. src_ptr + 7 * src_pitch for
//                         the first output row.
//   output_ptr, out_pitch destination; 16 bytes are written per output row
//                         with aligned 128-bit stores, so every output row
//                         must be 16-byte aligned.
//   output_height         number of output rows (odd values are supported).
//   filter                8 int16 coefficients, all of which are applied.
static void aom_filter_block1d16_v8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // rounding constant for the final ">> 6" (32 == half of 1 << 6)
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve every 16-bit coefficient, then pack to signed 8 bit (with
  // saturation); both 64-bit halves of the 128-bit register hold the same
  // eight bytes.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // replicate coefficient bytes 0 and 1 (the first byte pair) across the
  // whole 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // replicate coefficient bytes 2 and 3 (the second byte pair)
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // replicate coefficient bytes 4 and 5 (the third byte pair)
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // replicate coefficient bytes 6 and 7 (the fourth byte pair)
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  // each iteration consumes two rows, so step by twice the pitch
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes from each of the first 7 rows (rows 0..5 in pairs, row 6
  // in the low lane of b7)
  // NOTE(review): assumes yy_loadu2_128(hi, lo) puts its second argument in
  // the low lane, like xx_loadu2_epi64() -- confirm in synonyms_avx2.h
  srcReg32b1 = yy_loadu2_128(src_ptr + src_pitch, src_ptr);
  srcReg32b3 = yy_loadu2_128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
  srcReg32b5 = yy_loadu2_128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
  srcReg32b7 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));

  // build the in-between pairs (high lane of the first operand followed by
  // the low lane of the second)
  srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
  srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
  srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
  // interleave bytes of vertically adjacent rows; the last pair is completed
  // inside the loop. b10 = pixels 0..7, b1 = pixels 8..15 of the first pair.
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);

  // second pair (b11 lo / b3 hi) and third pair (b2 lo / b5 hi)
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);

  for (i = output_height; i > 1; i -= 2) {
    // load 16 bytes from each of rows 7 and 8 and pair consecutive rows:
    // b7 = rows 6 | 7, b8 = rows 7 | 8; b9 keeps row 8 for the next iteration
    srcReg32b8 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                                         _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                                         _mm256_castsi256_si128(srcReg32b9), 1);

    // interleave the fourth pair: b4 = pixels 0..7, b7 = pixels 8..15
    // (b7 is overwritten in place; both are kept for the next iteration)
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

    // pixels 0..7, first and fourth coefficient pairs
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

    // pixels 0..7, second and third coefficient pairs
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));

    // pixels 8..15, first and fourth coefficient pairs
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

    // add and saturate the results together
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);

    // pixels 8..15, second and third coefficient pairs
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

    // add and saturate the results together
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_adds_epi16(srcReg32b8, srcReg32b12));

    // round (+32, saturating) and shift each 16 bit value right by 6
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);

    // pack each 16 bit value to 8 bit with unsigned saturation; the low lane
    // holds the first output row and the high lane the second
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

    src_ptr += src_stride;

    // write one lane to each of two consecutive output rows (aligned stores)
    xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);

    output_ptr += dst_stride;

    // after advancing two rows every interleaved pair (lo and hi half) moves
    // up one slot and old row 8 becomes the new row 6
    srcReg32b10 = srcReg32b11;
    srcReg32b1 = srcReg32b3;
    srcReg32b11 = srcReg32b2;
    srcReg32b3 = srcReg32b5;
    srcReg32b2 = srcReg32b4;
    srcReg32b5 = srcReg32b7;
    srcReg32b7 = srcReg32b9;
  }
  if (i > 0) {
    // one row left: repeat the computation on the low 128-bit lanes only
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load the last 16 bytes (row 7)
    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // interleave rows 6 and 7: Filt4 = pixels 0..7, Filt7 = pixels 8..15
    srcRegFilt4 =
        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 =
        _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // first and fourth coefficient pairs (lo half, then hi half)
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 =
        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 =
        _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);

    // second coefficient pair (lo half, then hi half)
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                                    _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                                    _mm256_castsi256_si128(secondFilters));

    // third coefficient pair (lo half, then hi half)
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                                    _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));

    // round (+32, saturating) and shift each 16 bit value right by 6
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);

    // pack each 16 bit value to 8 bit with unsigned saturation: pixels 0..7
    // in the low 8 bytes, pixels 8..15 in the high 8 bytes
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes (aligned store)
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
  }
}

// Vertical 4-tap filter for a column 4 pixels wide.
//
// All four coefficient bytes are kept in one register and the four source
// rows of a pixel are gathered into one 32-bit group, so a single multiply-add
// followed by a horizontal add yields the filtered value.
//
//   src_ptr, src_pitch    source rows; 8 bytes are loaded from each of the
//                         rows src_ptr + 2 * src_pitch ..
//                         src_ptr + 5 * src_pitch for the first output row,
//                         although only 4 results per row are stored.
//   output_ptr, out_pitch destination; 4 bytes are written per output row.
//   output_height         number of output rows. Rows are only produced in
//                         pairs and there is no single-row tail in this
//                         function, so with an odd height the last row is not
//                         written.
//   filter                8 int16 coefficients; only coefficients 2..5 are
//                         applied.
//                         NOTE(review): presumably this kernel is only
//                         selected when coefficients 0, 1, 6 and 7 are zero --
//                         confirm against the dispatch code.
static void aom_filter_block1d4_v4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i filtersReg32, addFilterReg32;
  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
  __m256i srcReg23_34_lo, srcReg45_56_lo;
  __m256i srcReg2345_3456_lo;
  __m256i resReglo, resReg;
  __m256i firstFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // rounding constant for the final ">> 6" (32 == half of 1 << 6)
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve every 16-bit coefficient, then pack to signed 8 bit (with
  // saturation); both 64-bit halves of the 128-bit register hold the same
  // eight bytes.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // replicate coefficient bytes 2, 3, 4 and 5 in every 32-bit group
  firstFilters =
      _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));

  // each iteration consumes two rows, so step by twice the pitch
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // low lane = row 2, high lane = row 3
  srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
  // row 4 in the low lane
  srcReg4x = _mm256_castsi128_si256(
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));

  // low lane = row 3, high lane = row 4
  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);

  // interleave bytes of vertically adjacent rows:
  // low lane = rows 2/3, high lane = rows 3/4
  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);

  for (i = output_height; i > 1; i -= 2) {
    // load rows 5 and 6 and pair every two consecutive rows in one 256 bit
    // register: srcReg45 = rows 4 | 5, srcReg56 = rows 5 | 6
    srcReg5x = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
    srcReg45 =
        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);

    srcReg6x = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
    srcReg56 =
        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);

    // interleave: low lane = rows 4/5, high lane = rows 5/6
    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);

    // interleave the 16-bit row pairs so that every 32-bit group holds the
    // four vertically adjacent pixels of one column
    // (low lane = rows 2,3,4,5; high lane = rows 3,4,5,6)
    srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);

    // multiply by the four coefficients; each 32-bit group now holds two
    // 16-bit partial sums
    resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);

    // horizontally add (with saturation) the two partial sums of each column;
    // the four results land in the low 64 bits of each lane
    resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());

    // round (+32, saturating) and shift each 16 bit value right by 6
    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
    resReglo = _mm256_srai_epi16(resReglo, 6);

    // pack each 16 bit value to 8 bit with unsigned saturation; the low lane
    // holds the first output row and the high lane the second
    resReg = _mm256_packus_epi16(resReglo, resReglo);

    src_ptr += src_stride;

    // write the low 4 bytes of each lane to two consecutive output rows
    xx_storeu2_epi32(output_ptr, out_pitch, &resReg);

    output_ptr += dst_stride;

    // after advancing two rows, old rows 4/5/6 become rows 2/3/4
    srcReg23_34_lo = srcReg45_56_lo;
    srcReg4x = srcReg6x;
  }
}

#if HAVE_AVX2 && HAVE_SSSE3
// Block sizes without a dedicated AVX2 kernel in this file (4-wide 8-tap
// vertical and every 2-tap variant) are declared here and mapped onto their
// SSSE3 counterparts by the defines below.
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
filter8_1dfunction aom_filter_block1d8_v2_ssse3;
filter8_1dfunction aom_filter_block1d8_h2_ssse3;
filter8_1dfunction aom_filter_block1d4_v2_ssse3;
filter8_1dfunction aom_filter_block1d4_h2_ssse3;
#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const int16_t *filter_x, int x_step_q4,
//                                const int16_t *filter_y, int y_step_q4,
//                                int w, int h);
// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// NOTE(review): the vertical instantiation passes `src - src_stride * 3`;
// presumably that is the src_ptr the vertical kernels above receive, i.e. the
// source is rewound by three rows -- confirm against FUN_CONV_1D in
// convolve.h.
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)

#endif  // HAVE_AVX2 && HAVE_SSSE3