aom_subpixel_8t_intrin_ssse3.c (33339B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <tmmintrin.h> 13 14 #include "config/aom_dsp_rtcd.h" 15 16 #include "aom_dsp/aom_filter.h" 17 #include "aom_dsp/x86/convolve.h" 18 #include "aom_dsp/x86/convolve_sse2.h" 19 #include "aom_dsp/x86/convolve_ssse3.h" 20 #include "aom_dsp/x86/mem_sse2.h" 21 #include "aom_dsp/x86/transpose_sse2.h" 22 #include "aom_mem/aom_mem.h" 23 #include "aom_ports/mem.h" 24 #include "aom_ports/emmintrin_compat.h" 25 26 DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = { 27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 28 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, 29 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 30 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 31 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 32 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, 33 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 34 }; 35 36 DECLARE_ALIGNED(32, static const uint8_t, filtd4[]) = { 37 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 38 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 39 }; 40 41 static void aom_filter_block1d4_h4_ssse3( 42 const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, 43 ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { 44 __m128i filtersReg; 45 __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; 46 unsigned int i; 47 src_ptr -= 3; 48 addFilterReg32 = _mm_set1_epi16(32); 49 filtersReg = _mm_loadu_si128((const __m128i *)filter); 50 filtersReg = _mm_srai_epi16(filtersReg, 1); 51 // converting the 16 bit (short) to 8 bit (byte) and have the same data 52 // in both lanes of 128 bit register. 53 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); 54 55 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); 56 filt1Reg = _mm_load_si128((__m128i const *)(filtd4)); 57 58 for (i = output_height; i > 0; i -= 1) { 59 // load the 2 strides of source 60 srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); 61 62 // filter the source buffer 63 srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg); 64 65 // multiply 4 adjacent elements with the filter and add the result 66 srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters); 67 68 srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); 69 70 // shift by 6 bit each 16 bit 71 srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); 72 srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); 73 74 // shrink to 8 bit each 16 bits, the first lane contain the first 75 // convolve result and the second lane contain the second convolve result 76 srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); 77 78 src_ptr += src_pixels_per_line; 79 80 *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); 81 output_ptr += output_pitch; 82 } 83 } 84 85 static void aom_filter_block1d4_v4_ssse3( 86 const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, 87 ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { 88 __m128i filtersReg; 89 __m128i addFilterReg32; 90 __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45, 91 srcReg6, srcReg56; 92 __m128i srcReg23_34_lo, srcReg45_56_lo; 93 __m128i srcReg2345_3456_lo, srcReg2345_3456_hi; 94 __m128i resReglo, resReghi; 95 __m128i firstFilters; 96 unsigned int i; 97 ptrdiff_t src_stride, dst_stride; 98 99 addFilterReg32 = _mm_set1_epi16(32); 100 filtersReg = _mm_loadu_si128((const __m128i *)filter); 101 // converting the 16 bit (short) to 8 bit (byte) and have the 102 // same data in both lanes of 128 bit register. 103 filtersReg = _mm_srai_epi16(filtersReg, 1); 104 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); 105 106 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); 107 108 // multiple the size of the source and destination stride by two 109 src_stride = src_pitch << 1; 110 dst_stride = out_pitch << 1; 111 112 srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); 113 srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); 114 srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3); 115 116 srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); 117 118 // have consecutive loads on the same 256 register 119 srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4); 120 121 srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34); 122 123 for (i = output_height; i > 1; i -= 2) { 124 srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); 125 srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5); 126 127 srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); 128 srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6); 129 130 // merge every two consecutive registers 131 srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56); 132 133 srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); 134 srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo); 135 136 // multiply 2 adjacent elements with the filter and add the result 137 resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters); 138 resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters); 139 140 resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128()); 141 resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128()); 142 143 // shift by 6 bit each 16 bit 144 resReglo = _mm_adds_epi16(resReglo, addFilterReg32); 145 resReghi = _mm_adds_epi16(resReghi, addFilterReg32); 146 resReglo = _mm_srai_epi16(resReglo, 6); 147 resReghi = _mm_srai_epi16(resReghi, 6); 148 149 // shrink to 8 bit each 16 bits, the first lane contain the first 150 // convolve result and the second lane contain the second convolve 151 // result 152 resReglo = _mm_packus_epi16(resReglo, resReglo); 153 resReghi = _mm_packus_epi16(resReghi, resReghi); 154 155 src_ptr += src_stride; 156 157 *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReglo); 158 *((int *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi); 159 160 output_ptr += dst_stride; 161 162 // save part of the registers for next strides 163 srcReg23_34_lo = srcReg45_56_lo; 164 srcReg4 = srcReg6; 165 } 166 } 167 168 static void aom_filter_block1d8_h4_ssse3( 169 const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, 170 ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { 171 __m128i filtersReg; 172 __m128i addFilterReg32, filt2Reg, filt3Reg; 173 __m128i secondFilters, thirdFilters; 174 __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; 175 __m128i srcReg32b1; 176 unsigned int i; 177 src_ptr -= 3; 178 addFilterReg32 = _mm_set1_epi16(32); 179 filtersReg = _mm_loadu_si128((const __m128i *)filter); 180 filtersReg = _mm_srai_epi16(filtersReg, 1); 181 // converting the 16 bit (short) to 8 bit (byte) and have the same data 182 // in both lanes of 128 bit register. 183 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); 184 185 // duplicate only the second 16 bits (third and forth byte) 186 // across 256 bit register 187 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 188 // duplicate only the third 16 bits (fifth and sixth byte) 189 // across 256 bit register 190 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 191 192 filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32)); 193 filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2)); 194 195 for (i = output_height; i > 0; i -= 1) { 196 srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); 197 198 // filter the source buffer 199 srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg); 200 srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg); 201 202 // multiply 2 adjacent elements with the filter and add the result 203 srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); 204 srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); 205 206 srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); 207 208 // shift by 6 bit each 16 bit 209 srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); 210 srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); 211 212 // shrink to 8 bit each 16 bits 213 srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); 214 215 src_ptr += src_pixels_per_line; 216 217 _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1); 218 219 output_ptr += output_pitch; 220 } 221 } 222 223 static void aom_filter_block1d8_v4_ssse3( 224 const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, 225 ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { 226 __m128i filtersReg; 227 __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; 228 __m128i srcReg23, srcReg34, srcReg45, srcReg56; 229 __m128i resReg23, resReg34, resReg45, resReg56; 230 __m128i resReg23_45, resReg34_56; 231 __m128i addFilterReg32, secondFilters, thirdFilters; 232 unsigned int i; 233 ptrdiff_t src_stride, dst_stride; 234 235 addFilterReg32 = _mm_set1_epi16(32); 236 filtersReg = _mm_loadu_si128((const __m128i *)filter); 237 // converting the 16 bit (short) to 8 bit (byte) and have the 238 // same data in both lanes of 128 bit register. 239 filtersReg = _mm_srai_epi16(filtersReg, 1); 240 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); 241 242 // duplicate only the second 16 bits (third and forth byte) 243 // across 128 bit register 244 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 245 // duplicate only the third 16 bits (fifth and sixth byte) 246 // across 128 bit register 247 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 248 249 // multiple the size of the source and destination stride by two 250 src_stride = src_pitch << 1; 251 dst_stride = out_pitch << 1; 252 253 srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); 254 srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); 255 srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3); 256 257 srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); 258 259 // have consecutive loads on the same 256 register 260 srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4); 261 262 for (i = output_height; i > 1; i -= 2) { 263 srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); 264 265 srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5); 266 267 srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); 268 269 srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6); 270 271 // multiply 2 adjacent elements with the filter and add the result 272 resReg23 = _mm_maddubs_epi16(srcReg23, secondFilters); 273 resReg34 = _mm_maddubs_epi16(srcReg34, secondFilters); 274 resReg45 = _mm_maddubs_epi16(srcReg45, thirdFilters); 275 resReg56 = _mm_maddubs_epi16(srcReg56, thirdFilters); 276 277 // add and saturate the results together 278 resReg23_45 = _mm_adds_epi16(resReg23, resReg45); 279 resReg34_56 = _mm_adds_epi16(resReg34, resReg56); 280 281 // shift by 6 bit each 16 bit 282 resReg23_45 = _mm_adds_epi16(resReg23_45, addFilterReg32); 283 resReg34_56 = _mm_adds_epi16(resReg34_56, addFilterReg32); 284 resReg23_45 = _mm_srai_epi16(resReg23_45, 6); 285 resReg34_56 = _mm_srai_epi16(resReg34_56, 6); 286 287 // shrink to 8 bit each 16 bits, the first lane contain the first 288 // convolve result and the second lane contain the second convolve 289 // result 290 resReg23_45 = _mm_packus_epi16(resReg23_45, _mm_setzero_si128()); 291 resReg34_56 = _mm_packus_epi16(resReg34_56, _mm_setzero_si128()); 292 293 src_ptr += src_stride; 294 295 _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45)); 296 _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56)); 297 298 output_ptr += dst_stride; 299 300 // save part of the registers for next strides 301 srcReg23 = srcReg45; 302 srcReg34 = srcReg56; 303 srcReg4 = srcReg6; 304 } 305 } 306 307 static void aom_filter_block1d16_h4_ssse3( 308 const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, 309 ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { 310 __m128i filtersReg; 311 __m128i addFilterReg32, filt2Reg, filt3Reg; 312 __m128i secondFilters, thirdFilters; 313 __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; 314 __m128i srcReg32b1, srcReg32b2; 315 unsigned int i; 316 src_ptr -= 3; 317 addFilterReg32 = _mm_set1_epi16(32); 318 filtersReg = _mm_loadu_si128((const __m128i *)filter); 319 filtersReg = _mm_srai_epi16(filtersReg, 1); 320 // converting the 16 bit (short) to 8 bit (byte) and have the same data 321 // in both lanes of 128 bit register. 322 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); 323 324 // duplicate only the second 16 bits (third and forth byte) 325 // across 256 bit register 326 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 327 // duplicate only the third 16 bits (fifth and sixth byte) 328 // across 256 bit register 329 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 330 331 filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32)); 332 filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2)); 333 334 for (i = output_height; i > 0; i -= 1) { 335 srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); 336 337 // filter the source buffer 338 srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg); 339 srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg); 340 341 // multiply 2 adjacent elements with the filter and add the result 342 srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); 343 srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); 344 345 srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); 346 347 // reading stride of the next 16 bytes 348 // (part of it was being read by earlier read) 349 srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); 350 351 // filter the source buffer 352 srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg); 353 srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg); 354 355 // multiply 2 adjacent elements with the filter and add the result 356 srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); 357 srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); 358 359 // add and saturate the results together 360 srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); 361 362 // shift by 6 bit each 16 bit 363 srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); 364 srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32); 365 srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); 366 srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6); 367 368 // shrink to 8 bit each 16 bits, the first lane contain the first 369 // convolve result and the second lane contain the second convolve result 370 srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); 371 372 src_ptr += src_pixels_per_line; 373 374 _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1); 375 376 output_ptr += output_pitch; 377 } 378 } 379 380 static void aom_filter_block1d16_v4_ssse3( 381 const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, 382 ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { 383 __m128i filtersReg; 384 __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; 385 __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; 386 __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; 387 __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; 388 __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; 389 __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; 390 __m128i resReg23_45, resReg34_56; 391 __m128i addFilterReg32, secondFilters, thirdFilters; 392 unsigned int i; 393 ptrdiff_t src_stride, dst_stride; 394 395 addFilterReg32 = _mm_set1_epi16(32); 396 filtersReg = _mm_loadu_si128((const __m128i *)filter); 397 // converting the 16 bit (short) to 8 bit (byte) and have the 398 // same data in both lanes of 128 bit register. 399 filtersReg = _mm_srai_epi16(filtersReg, 1); 400 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); 401 402 // duplicate only the second 16 bits (third and forth byte) 403 // across 128 bit register 404 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 405 // duplicate only the third 16 bits (fifth and sixth byte) 406 // across 128 bit register 407 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 408 409 // multiple the size of the source and destination stride by two 410 src_stride = src_pitch << 1; 411 dst_stride = out_pitch << 1; 412 413 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); 414 srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); 415 srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); 416 srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3); 417 418 srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); 419 420 // have consecutive loads on the same 256 register 421 srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); 422 srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4); 423 424 for (i = output_height; i > 1; i -= 2) { 425 srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); 426 427 srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); 428 srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5); 429 430 srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); 431 432 srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); 433 srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6); 434 435 // multiply 2 adjacent elements with the filter and add the result 436 resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters); 437 resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters); 438 resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters); 439 resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters); 440 441 // add and saturate the results together 442 resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); 443 resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); 444 445 // multiply 2 adjacent elements with the filter and add the result 446 447 resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters); 448 resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters); 449 resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters); 450 resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters); 451 452 // add and saturate the results together 453 resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi); 454 resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi); 455 456 // shift by 6 bit each 16 bit 457 resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); 458 resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); 459 resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32); 460 resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32); 461 resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); 462 resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); 463 resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6); 464 resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6); 465 466 // shrink to 8 bit each 16 bits, the first lane contain the first 467 // convolve result and the second lane contain the second convolve 468 // result 469 resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi); 470 resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi); 471 472 src_ptr += src_stride; 473 474 _mm_store_si128((__m128i *)output_ptr, (resReg23_45)); 475 _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56)); 476 477 output_ptr += dst_stride; 478 479 // save part of the registers for next strides 480 srcReg23_lo = srcReg45_lo; 481 srcReg34_lo = srcReg56_lo; 482 srcReg23_hi = srcReg45_hi; 483 srcReg34_hi = srcReg56_hi; 484 srcReg4 = srcReg6; 485 } 486 } 487 488 static inline __m128i shuffle_filter_convolve8_8_ssse3( 489 const __m128i *const s, const int16_t *const filter) { 490 __m128i f[4]; 491 shuffle_filter_ssse3(filter, f); 492 return convolve8_8_ssse3(s, f); 493 } 494 495 static void filter_horiz_w8_ssse3(const uint8_t *const src, 496 const ptrdiff_t src_stride, 497 uint8_t *const dst, 498 const int16_t *const x_filter) { 499 __m128i s[8], ss[4], temp; 500 501 load_8bit_8x8(src, src_stride, s); 502 // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 503 // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 504 // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 505 // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 506 transpose_16bit_4x8(s, ss); 507 temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter); 508 // shrink to 8 bit each 16 bits 509 temp = _mm_packus_epi16(temp, temp); 510 // save only 8 bytes convolve result 511 _mm_storel_epi64((__m128i *)dst, temp); 512 } 513 514 static void transpose8x8_to_dst(const uint8_t *const src, 515 const ptrdiff_t src_stride, uint8_t *const dst, 516 const ptrdiff_t dst_stride) { 517 __m128i s[8]; 518 519 load_8bit_8x8(src, src_stride, s); 520 transpose_8bit_8x8(s, s); 521 store_8bit_8x8(s, dst, dst_stride); 522 } 523 524 static void scaledconvolve_horiz_w8(const uint8_t *src, 525 const ptrdiff_t src_stride, uint8_t *dst, 526 const ptrdiff_t dst_stride, 527 const InterpKernel *const x_filters, 528 const int x0_q4, const int x_step_q4, 529 const int w, const int h) { 530 DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); 531 int x, y, z; 532 src -= SUBPEL_TAPS / 2 - 1; 533 534 // This function processes 8x8 areas. The intermediate height is not always 535 // a multiple of 8, so force it to be a multiple of 8 here. 536 y = h + (8 - (h & 0x7)); 537 538 do { 539 int x_q4 = x0_q4; 540 for (x = 0; x < w; x += 8) { 541 // process 8 src_x steps 542 for (z = 0; z < 8; ++z) { 543 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; 544 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; 545 if (x_q4 & SUBPEL_MASK) { 546 filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter); 547 } else { 548 int i; 549 for (i = 0; i < 8; ++i) { 550 temp[z * 8 + i] = src_x[i * src_stride + 3]; 551 } 552 } 553 x_q4 += x_step_q4; 554 } 555 556 // transpose the 8x8 filters values back to dst 557 transpose8x8_to_dst(temp, 8, dst + x, dst_stride); 558 } 559 560 src += src_stride * 8; 561 dst += dst_stride * 8; 562 } while (y -= 8); 563 } 564 565 static void filter_horiz_w4_ssse3(const uint8_t *const src, 566 const ptrdiff_t src_stride, 567 uint8_t *const dst, 568 const int16_t *const filter) { 569 __m128i s[4]; 570 __m128i temp; 571 572 load_8bit_8x4(src, src_stride, s); 573 transpose_16bit_4x4(s, s); 574 575 temp = shuffle_filter_convolve8_8_ssse3(s, filter); 576 // shrink to 8 bit each 16 bits 577 temp = _mm_packus_epi16(temp, temp); 578 // save only 4 bytes 579 *(int *)dst = _mm_cvtsi128_si32(temp); 580 } 581 582 static void transpose4x4_to_dst(const uint8_t *const src, 583 const ptrdiff_t src_stride, uint8_t *const dst, 584 const ptrdiff_t dst_stride) { 585 __m128i s[4]; 586 587 load_8bit_4x4(src, src_stride, s); 588 s[0] = transpose_8bit_4x4(s); 589 s[1] = _mm_srli_si128(s[0], 4); 590 s[2] = _mm_srli_si128(s[0], 8); 591 s[3] = _mm_srli_si128(s[0], 12); 592 store_8bit_4x4(s, dst, dst_stride); 593 } 594 595 static void scaledconvolve_horiz_w4(const uint8_t *src, 596 const ptrdiff_t src_stride, uint8_t *dst, 597 const ptrdiff_t dst_stride, 598 const InterpKernel *const x_filters, 599 const int x0_q4, const int x_step_q4, 600 const int w, const int h) { 601 DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); 602 int x, y, z; 603 src -= SUBPEL_TAPS / 2 - 1; 604 605 for (y = 0; y < h; y += 4) { 606 int x_q4 = x0_q4; 607 for (x = 0; x < w; x += 4) { 608 // process 4 src_x steps 609 for (z = 0; z < 4; ++z) { 610 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; 611 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; 612 if (x_q4 & SUBPEL_MASK) { 613 filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter); 614 } else { 615 int i; 616 for (i = 0; i < 4; ++i) { 617 temp[z * 4 + i] = src_x[i * src_stride + 3]; 618 } 619 } 620 x_q4 += x_step_q4; 621 } 622 623 // transpose the 4x4 filters values back to dst 624 transpose4x4_to_dst(temp, 4, dst + x, dst_stride); 625 } 626 627 src += src_stride * 4; 628 dst += dst_stride * 4; 629 } 630 } 631 632 static __m128i filter_vert_kernel(const __m128i *const s, 633 const int16_t *const filter) { 634 __m128i ss[4]; 635 __m128i temp; 636 637 // 00 10 01 11 02 12 03 13 638 ss[0] = _mm_unpacklo_epi8(s[0], s[1]); 639 // 20 30 21 31 22 32 23 33 640 ss[1] = _mm_unpacklo_epi8(s[2], s[3]); 641 // 40 50 41 51 42 52 43 53 642 ss[2] = _mm_unpacklo_epi8(s[4], s[5]); 643 // 60 70 61 71 62 72 63 73 644 ss[3] = _mm_unpacklo_epi8(s[6], s[7]); 645 646 temp = shuffle_filter_convolve8_8_ssse3(ss, filter); 647 // shrink to 8 bit each 16 bits 648 return _mm_packus_epi16(temp, temp); 649 } 650 651 static void filter_vert_w4_ssse3(const uint8_t *const src, 652 const ptrdiff_t src_stride, uint8_t *const dst, 653 const int16_t *const filter) { 654 __m128i s[8]; 655 __m128i temp; 656 657 load_8bit_4x8(src, src_stride, s); 658 temp = filter_vert_kernel(s, filter); 659 // save only 4 bytes 660 *(int *)dst = _mm_cvtsi128_si32(temp); 661 } 662 663 static void scaledconvolve_vert_w4( 664 const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, 665 const ptrdiff_t dst_stride, const InterpKernel *const y_filters, 666 const int y0_q4, const int y_step_q4, const int w, const int h) { 667 int y; 668 int y_q4 = y0_q4; 669 670 src -= src_stride * (SUBPEL_TAPS / 2 - 1); 671 for (y = 0; y < h; ++y) { 672 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 673 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; 674 675 if (y_q4 & SUBPEL_MASK) { 676 filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); 677 } else { 678 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); 679 } 680 681 y_q4 += y_step_q4; 682 } 683 } 684 685 static void filter_vert_w8_ssse3(const uint8_t *const src, 686 const ptrdiff_t src_stride, uint8_t *const dst, 687 const int16_t *const filter) { 688 __m128i s[8], temp; 689 690 load_8bit_8x8(src, src_stride, s); 691 temp = filter_vert_kernel(s, filter); 692 // save only 8 bytes convolve result 693 _mm_storel_epi64((__m128i *)dst, temp); 694 } 695 696 static void scaledconvolve_vert_w8( 697 const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, 698 const ptrdiff_t dst_stride, const InterpKernel *const y_filters, 699 const int y0_q4, const int y_step_q4, const int w, const int h) { 700 int y; 701 int y_q4 = y0_q4; 702 703 src -= src_stride * (SUBPEL_TAPS / 2 - 1); 704 for (y = 0; y < h; ++y) { 705 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 706 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; 707 if (y_q4 & SUBPEL_MASK) { 708 filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); 709 } else { 710 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); 711 } 712 y_q4 += y_step_q4; 713 } 714 } 715 716 static void filter_vert_w16_ssse3(const uint8_t *src, 717 const ptrdiff_t src_stride, 718 uint8_t *const dst, 719 const int16_t *const filter, const int w) { 720 int i; 721 __m128i f[4]; 722 shuffle_filter_ssse3(filter, f); 723 724 for (i = 0; i < w; i += 16) { 725 __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi; 726 727 loadu_8bit_16x8(src, src_stride, s); 728 729 // merge the result together 730 s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]); 731 s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]); 732 s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]); 733 s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]); 734 s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]); 735 s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]); 736 s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]); 737 s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]); 738 temp_lo = convolve8_8_ssse3(s_lo, f); 739 temp_hi = convolve8_8_ssse3(s_hi, f); 740 741 // shrink to 8 bit each 16 bits, the first lane contain the first convolve 742 // result and the second lane contain the second convolve result 743 temp_hi = _mm_packus_epi16(temp_lo, temp_hi); 744 src += 16; 745 // save 16 bytes convolve result 746 _mm_store_si128((__m128i *)&dst[i], temp_hi); 747 } 748 } 749 750 static void scaledconvolve_vert_w16( 751 const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, 752 const ptrdiff_t dst_stride, const InterpKernel *const y_filters, 753 const int y0_q4, const int y_step_q4, const int w, const int h) { 754 int y; 755 int y_q4 = y0_q4; 756 757 src -= src_stride * (SUBPEL_TAPS / 2 - 1); 758 for (y = 0; y < h; ++y) { 759 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 760 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; 761 if (y_q4 & SUBPEL_MASK) { 762 filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter, 763 w); 764 } else { 765 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); 766 } 767 y_q4 += y_step_q4; 768 } 769 } 770 771 void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, 772 ptrdiff_t dst_stride, const InterpKernel *filter, 773 int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, 774 int w, int h) { 775 // Note: Fixed size intermediate buffer, temp, places limits on parameters. 776 // 2d filtering proceeds in 2 steps: 777 // (1) Interpolate horizontally into an intermediate buffer, temp. 778 // (2) Interpolate temp vertically to derive the sub-pixel result. 779 // Deriving the maximum number of rows in the temp buffer (135): 780 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). 781 // --Largest block size is 64x64 pixels. 782 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the 783 // original frame (in 1/16th pixel units). 784 // --Must round-up because block may be located at sub-pixel position. 785 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. 786 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 787 // --Require an additional 8 rows for the horiz_w8 transpose tail. 788 // When calling in frame scaling function, the smallest scaling factor is x1/4 789 // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still 790 // big enough. 791 DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); 792 const int intermediate_height = 793 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; 794 795 assert(w <= 64); 796 assert(h <= 64); 797 assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); 798 assert(x_step_q4 <= 64); 799 800 if (w >= 8) { 801 scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), 802 src_stride, temp, 64, filter, x0_q4, x_step_q4, w, 803 intermediate_height); 804 } else { 805 scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), 806 src_stride, temp, 64, filter, x0_q4, x_step_q4, w, 807 intermediate_height); 808 } 809 810 if (w >= 16) { 811 scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, 812 dst_stride, filter, y0_q4, y_step_q4, w, h); 813 } else if (w == 8) { 814 scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, 815 dst_stride, filter, y0_q4, y_step_q4, w, h); 816 } else { 817 scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, 818 dst_stride, filter, y0_q4, y_step_q4, w, h); 819 } 820 } 821 822 filter8_1dfunction aom_filter_block1d16_v8_ssse3; 823 filter8_1dfunction aom_filter_block1d16_h8_ssse3; 824 filter8_1dfunction aom_filter_block1d8_v8_ssse3; 825 filter8_1dfunction aom_filter_block1d8_h8_ssse3; 826 filter8_1dfunction aom_filter_block1d4_v8_ssse3; 827 filter8_1dfunction aom_filter_block1d4_h8_ssse3; 828 829 filter8_1dfunction aom_filter_block1d16_v2_ssse3; 830 filter8_1dfunction aom_filter_block1d16_h2_ssse3; 831 filter8_1dfunction aom_filter_block1d8_v2_ssse3; 832 filter8_1dfunction aom_filter_block1d8_h2_ssse3; 833 filter8_1dfunction aom_filter_block1d4_v2_ssse3; 834 filter8_1dfunction aom_filter_block1d4_h2_ssse3; 835 836 // void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, 837 // uint8_t *dst, ptrdiff_t dst_stride, 838 // const int16_t *filter_x, int x_step_q4, 839 // const int16_t *filter_y, int y_step_q4, 840 // int w, int h); 841 // void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, 842 // uint8_t *dst, ptrdiff_t dst_stride, 843 // const int16_t *filter_x, int x_step_q4, 844 // const int16_t *filter_y, int y_step_q4, 845 // int w, int h); 846 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3) 847 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3)