tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

aom_subpixel_8t_intrin_ssse3.c (33339B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <tmmintrin.h>
     13 
     14 #include "config/aom_dsp_rtcd.h"
     15 
     16 #include "aom_dsp/aom_filter.h"
     17 #include "aom_dsp/x86/convolve.h"
     18 #include "aom_dsp/x86/convolve_sse2.h"
     19 #include "aom_dsp/x86/convolve_ssse3.h"
     20 #include "aom_dsp/x86/mem_sse2.h"
     21 #include "aom_dsp/x86/transpose_sse2.h"
     22 #include "aom_mem/aom_mem.h"
     23 #include "aom_ports/mem.h"
     24 #include "aom_ports/emmintrin_compat.h"
     25 
     26 DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = {
     27  0,  1,  1,  2,  2, 3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  0,  1,  1,
     28  2,  2,  3,  3,  4, 4,  5,  5,  6,  6,  7,  7,  8,  2,  3,  3,  4,  4,  5,
     29  5,  6,  6,  7,  7, 8,  8,  9,  9,  10, 2,  3,  3,  4,  4,  5,  5,  6,  6,
     30  7,  7,  8,  8,  9, 9,  10, 4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10,
     31  10, 11, 11, 12, 4, 5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10, 10, 11, 11,
     32  12, 6,  7,  7,  8, 8,  9,  9,  10, 10, 11, 11, 12, 12, 13, 13, 14, 6,  7,
     33  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14
     34 };
     35 
     36 DECLARE_ALIGNED(32, static const uint8_t, filtd4[]) = {
     37  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
     38  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
     39 };
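         // Note on the shuffle masks above (my reading of the tables): each 32-byte
         // row of filt_h4 gathers the source byte pairs for one pair of filter taps
         // (taps 0/1, 2/3, 4/5, 6/7), with the pattern repeated for a second 16-byte
         // lane; the code below loads only the tap 2/3 and tap 4/5 rows. filtd4
         // gathers, for each of four output pixels, the four consecutive source
         // bytes covered by the middle four taps.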
     40 
     41 static void aom_filter_block1d4_h4_ssse3(
     42    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
     43    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
     44  __m128i filtersReg;
     45  __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
     46  unsigned int i;
     47  src_ptr -= 3;
     48  addFilterReg32 = _mm_set1_epi16(32);
     49  filtersReg = _mm_loadu_si128((const __m128i *)filter);
     50  filtersReg = _mm_srai_epi16(filtersReg, 1);
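          // Halving the taps here (the filters are Q7 in libaom, summing to 128)
          // keeps the u8 * s8 pair sums of _mm_maddubs_epi16() within the signed
          // 16-bit range; the +32 / >> 6 rounding below compensates for the
          // dropped bit.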
      51  // convert the 16-bit (short) coefficients to 8-bit (byte) and replicate
      52  // the same data in both lanes of the 128-bit register.
     53  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
     54 
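          // The broadcast pattern 0x05040302 selects coefficient bytes 2..5, i.e.
          // the middle four taps; the *_h4/_v4 kernels apply only these four taps.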
     55  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
     56  filt1Reg = _mm_load_si128((__m128i const *)(filtd4));
     57 
     58  for (i = output_height; i > 0; i -= 1) {
      59    // load a row of the source buffer
     60    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
     61 
     62    // filter the source buffer
     63    srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg);
     64 
     65    // multiply 4 adjacent elements with the filter and add the result
     66    srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
     67 
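            // Each 16-bit lane now holds a two-tap partial sum; the horizontal add
            // below folds adjacent lanes into the full four-tap sum per output pixel.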
     68    srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
     69 
      70    // add the rounding offset and shift each 16-bit value right by 6
     71    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
     72    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
     73 
      74    // shrink each 16-bit result to 8 bits with unsigned saturation; the
      75    // low four bytes now hold the four output pixels
     76    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
     77 
     78    src_ptr += src_pixels_per_line;
     79 
     80    *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
     81    output_ptr += output_pitch;
     82  }
     83 }
     84 
     85 static void aom_filter_block1d4_v4_ssse3(
     86    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
     87    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
     88  __m128i filtersReg;
     89  __m128i addFilterReg32;
     90  __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45,
     91      srcReg6, srcReg56;
     92  __m128i srcReg23_34_lo, srcReg45_56_lo;
     93  __m128i srcReg2345_3456_lo, srcReg2345_3456_hi;
     94  __m128i resReglo, resReghi;
     95  __m128i firstFilters;
     96  unsigned int i;
     97  ptrdiff_t src_stride, dst_stride;
     98 
     99  addFilterReg32 = _mm_set1_epi16(32);
    100  filtersReg = _mm_loadu_si128((const __m128i *)filter);
     101  // convert the 16-bit (short) coefficients to 8-bit (byte) and replicate
     102  // the same data in both lanes of the 128-bit register.
    103  filtersReg = _mm_srai_epi16(filtersReg, 1);
    104  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
    105 
    106  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
    107 
     108  // the loop below produces two rows per iteration, so double both strides
    109  src_stride = src_pitch << 1;
    110  dst_stride = out_pitch << 1;
    111 
    112  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
    113  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
    114  srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3);
    115 
    116  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
    117 
     118  // keep consecutive rows together in the same 128-bit register
    119  srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4);
    120 
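          // Interleave the rows bytewise so that each 16-bit lane holds a vertically
          // adjacent source pair (rows 2/3 in the low half, rows 3/4 in the high
          // half), the layout _mm_maddubs_epi16() expects.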
    121  srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34);
    122 
    123  for (i = output_height; i > 1; i -= 2) {
    124    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
    125    srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5);
    126 
    127    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
    128    srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6);
    129 
    130    // merge every two consecutive registers
    131    srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56);
    132 
    133    srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
    134    srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo);
    135 
    136    // multiply 2 adjacent elements with the filter and add the result
    137    resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
    138    resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters);
    139 
    140    resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128());
    141    resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128());
    142 
     143    // add the rounding offset and shift each 16-bit value right by 6
    144    resReglo = _mm_adds_epi16(resReglo, addFilterReg32);
    145    resReghi = _mm_adds_epi16(resReghi, addFilterReg32);
    146    resReglo = _mm_srai_epi16(resReglo, 6);
    147    resReghi = _mm_srai_epi16(resReghi, 6);
    148 
     149    // shrink each 16-bit result to 8 bits with unsigned saturation; the
     150    // low four bytes of resReglo and resReghi hold the two four-pixel
     151    // output rows
    152    resReglo = _mm_packus_epi16(resReglo, resReglo);
    153    resReghi = _mm_packus_epi16(resReghi, resReghi);
    154 
    155    src_ptr += src_stride;
    156 
    157    *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReglo);
    158    *((int *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi);
    159 
    160    output_ptr += dst_stride;
    161 
     162    // carry the interleaved rows over to the next iteration
    163    srcReg23_34_lo = srcReg45_56_lo;
    164    srcReg4 = srcReg6;
    165  }
    166 }
    167 
    168 static void aom_filter_block1d8_h4_ssse3(
    169    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    170    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
    171  __m128i filtersReg;
    172  __m128i addFilterReg32, filt2Reg, filt3Reg;
    173  __m128i secondFilters, thirdFilters;
    174  __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
    175  __m128i srcReg32b1;
    176  unsigned int i;
    177  src_ptr -= 3;
    178  addFilterReg32 = _mm_set1_epi16(32);
    179  filtersReg = _mm_loadu_si128((const __m128i *)filter);
    180  filtersReg = _mm_srai_epi16(filtersReg, 1);
     181  // convert the 16-bit (short) coefficients to 8-bit (byte) and replicate
     182  // the same data in both lanes of the 128-bit register.
    183  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
    184 
     185  // duplicate only the second 16 bits (third and fourth byte)
     186  // across the 128-bit register
    187  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
     188  // duplicate only the third 16 bits (fifth and sixth byte)
     189  // across the 128-bit register
    190  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
    191 
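          // filt_h4 + 32 and filt_h4 + 64 gather the source byte pairs that line up
          // with taps 2/3 and taps 4/5, matching secondFilters and thirdFilters above.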
    192  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
    193  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));
    194 
    195  for (i = output_height; i > 0; i -= 1) {
    196    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
    197 
    198    // filter the source buffer
    199    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
    200    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);
    201 
    202    // multiply 2 adjacent elements with the filter and add the result
    203    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    204    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
    205 
    206    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
    207 
     208    // add the rounding offset and shift each 16-bit value right by 6
    209    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    210    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
    211 
     212    // shrink each 16-bit result to 8 bits with unsigned saturation
    213    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
    214 
    215    src_ptr += src_pixels_per_line;
    216 
    217    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);
    218 
    219    output_ptr += output_pitch;
    220  }
    221 }
    222 
    223 static void aom_filter_block1d8_v4_ssse3(
    224    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    225    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
    226  __m128i filtersReg;
    227  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
    228  __m128i srcReg23, srcReg34, srcReg45, srcReg56;
    229  __m128i resReg23, resReg34, resReg45, resReg56;
    230  __m128i resReg23_45, resReg34_56;
    231  __m128i addFilterReg32, secondFilters, thirdFilters;
    232  unsigned int i;
    233  ptrdiff_t src_stride, dst_stride;
    234 
    235  addFilterReg32 = _mm_set1_epi16(32);
    236  filtersReg = _mm_loadu_si128((const __m128i *)filter);
     237  // convert the 16-bit (short) coefficients to 8-bit (byte) and replicate
     238  // the same data in both lanes of the 128-bit register.
    239  filtersReg = _mm_srai_epi16(filtersReg, 1);
    240  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
    241 
     242  // duplicate only the second 16 bits (third and fourth byte)
    243  // across 128 bit register
    244  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
    245  // duplicate only the third 16 bits (fifth and sixth byte)
    246  // across 128 bit register
    247  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
    248 
     249  // the loop below produces two rows per iteration, so double both strides
    250  src_stride = src_pitch << 1;
    251  dst_stride = out_pitch << 1;
    252 
    253  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
    254  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
    255  srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);
    256 
    257  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
    258 
     259  // interleave consecutive rows into the same 128-bit register
    260  srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);
    261 
    262  for (i = output_height; i > 1; i -= 2) {
    263    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
    264 
    265    srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);
    266 
    267    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
    268 
    269    srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);
    270 
    271    // multiply 2 adjacent elements with the filter and add the result
    272    resReg23 = _mm_maddubs_epi16(srcReg23, secondFilters);
    273    resReg34 = _mm_maddubs_epi16(srcReg34, secondFilters);
    274    resReg45 = _mm_maddubs_epi16(srcReg45, thirdFilters);
    275    resReg56 = _mm_maddubs_epi16(srcReg56, thirdFilters);
    276 
    277    // add and saturate the results together
    278    resReg23_45 = _mm_adds_epi16(resReg23, resReg45);
    279    resReg34_56 = _mm_adds_epi16(resReg34, resReg56);
    280 
     281    // add the rounding offset and shift each 16-bit value right by 6
    282    resReg23_45 = _mm_adds_epi16(resReg23_45, addFilterReg32);
    283    resReg34_56 = _mm_adds_epi16(resReg34_56, addFilterReg32);
    284    resReg23_45 = _mm_srai_epi16(resReg23_45, 6);
    285    resReg34_56 = _mm_srai_epi16(resReg34_56, 6);
    286 
     287    // shrink each 16-bit result to 8 bits with unsigned saturation; the
     288    // low eight bytes of resReg23_45 and resReg34_56 hold the two
     289    // eight-pixel output rows
    290    resReg23_45 = _mm_packus_epi16(resReg23_45, _mm_setzero_si128());
    291    resReg34_56 = _mm_packus_epi16(resReg34_56, _mm_setzero_si128());
    292 
    293    src_ptr += src_stride;
    294 
    295    _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
    296    _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));
    297 
    298    output_ptr += dst_stride;
    299 
     300    // carry the interleaved rows over to the next iteration
    301    srcReg23 = srcReg45;
    302    srcReg34 = srcReg56;
    303    srcReg4 = srcReg6;
    304  }
    305 }
    306 
    307 static void aom_filter_block1d16_h4_ssse3(
    308    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    309    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
    310  __m128i filtersReg;
    311  __m128i addFilterReg32, filt2Reg, filt3Reg;
    312  __m128i secondFilters, thirdFilters;
    313  __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
    314  __m128i srcReg32b1, srcReg32b2;
    315  unsigned int i;
    316  src_ptr -= 3;
    317  addFilterReg32 = _mm_set1_epi16(32);
    318  filtersReg = _mm_loadu_si128((const __m128i *)filter);
    319  filtersReg = _mm_srai_epi16(filtersReg, 1);
     320  // convert the 16-bit (short) coefficients to 8-bit (byte) and replicate
     321  // the same data in both lanes of the 128-bit register.
    322  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
    323 
     324  // duplicate only the second 16 bits (third and fourth byte)
     325  // across the 128-bit register
    326  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
     327  // duplicate only the third 16 bits (fifth and sixth byte)
     328  // across the 128-bit register
    329  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
    330 
    331  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
    332  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));
    333 
    334  for (i = output_height; i > 0; i -= 1) {
    335    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
    336 
    337    // filter the source buffer
    338    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
    339    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);
    340 
    341    // multiply 2 adjacent elements with the filter and add the result
    342    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    343    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
    344 
    345    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
    346 
     347    // load the next 16 bytes of the row
     348    // (overlapping the previous 16-byte load by 8 bytes)
    349    srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
    350 
    351    // filter the source buffer
    352    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg);
    353    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg);
    354 
    355    // multiply 2 adjacent elements with the filter and add the result
    356    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    357    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
    358 
    359    // add and saturate the results together
    360    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
    361 
     362    // add the rounding offset and shift each 16-bit value right by 6
    363    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    364    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
    365    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
    366    srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
    367 
     368    // shrink each 16-bit result to 8 bits with unsigned saturation; the low
     369    // half holds pixels 0-7 and the high half holds pixels 8-15 of the row
    370    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
    371 
    372    src_ptr += src_pixels_per_line;
    373 
    374    _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);
    375 
    376    output_ptr += output_pitch;
    377  }
    378 }
    379 
    380 static void aom_filter_block1d16_v4_ssse3(
    381    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    382    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
    383  __m128i filtersReg;
    384  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
    385  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
    386  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
    387  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
    388  __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
    389  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
    390  __m128i resReg23_45, resReg34_56;
    391  __m128i addFilterReg32, secondFilters, thirdFilters;
    392  unsigned int i;
    393  ptrdiff_t src_stride, dst_stride;
    394 
    395  addFilterReg32 = _mm_set1_epi16(32);
    396  filtersReg = _mm_loadu_si128((const __m128i *)filter);
     397  // convert the 16-bit (short) coefficients to 8-bit (byte) and replicate
     398  // the same data in both lanes of the 128-bit register.
    399  filtersReg = _mm_srai_epi16(filtersReg, 1);
    400  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
    401 
     402  // duplicate only the second 16 bits (third and fourth byte)
    403  // across 128 bit register
    404  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
    405  // duplicate only the third 16 bits (fifth and sixth byte)
    406  // across 128 bit register
    407  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
    408 
     409  // the loop below produces two rows per iteration, so double both strides
    410  src_stride = src_pitch << 1;
    411  dst_stride = out_pitch << 1;
    412 
    413  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
    414  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
    415  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
    416  srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);
    417 
    418  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
    419 
     420  // interleave consecutive rows into the same 128-bit register
    421  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
    422  srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
    423 
    424  for (i = output_height; i > 1; i -= 2) {
    425    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    426 
    427    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
    428    srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);
    429 
    430    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
    431 
    432    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
    433    srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);
    434 
    435    // multiply 2 adjacent elements with the filter and add the result
    436    resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters);
    437    resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters);
    438    resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters);
    439    resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters);
    440 
    441    // add and saturate the results together
    442    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
    443    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
    444 
    445    // multiply 2 adjacent elements with the filter and add the result
    446 
    447    resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters);
    448    resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters);
    449    resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters);
    450    resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters);
    451 
    452    // add and saturate the results together
    453    resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
    454    resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);
    455 
     456    // add the rounding offset and shift each 16-bit value right by 6
    457    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
    458    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
    459    resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
    460    resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
    461    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
    462    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
    463    resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
    464    resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);
    465 
     466    // shrink each 16-bit result to 8 bits with unsigned saturation,
     467    // recombining the low and high halves into the two full 16-pixel
     468    // output rows
    469    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
    470    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);
    471 
    472    src_ptr += src_stride;
    473 
    474    _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
    475    _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));
    476 
    477    output_ptr += dst_stride;
    478 
     479    // carry the interleaved rows over to the next iteration
    480    srcReg23_lo = srcReg45_lo;
    481    srcReg34_lo = srcReg56_lo;
    482    srcReg23_hi = srcReg45_hi;
    483    srcReg34_hi = srcReg56_hi;
    484    srcReg4 = srcReg6;
    485  }
    486 }
    487 
    488 static inline __m128i shuffle_filter_convolve8_8_ssse3(
    489    const __m128i *const s, const int16_t *const filter) {
    490  __m128i f[4];
    491  shuffle_filter_ssse3(filter, f);
    492  return convolve8_8_ssse3(s, f);
    493 }
    494 
    495 static void filter_horiz_w8_ssse3(const uint8_t *const src,
    496                                  const ptrdiff_t src_stride,
    497                                  uint8_t *const dst,
    498                                  const int16_t *const x_filter) {
    499  __m128i s[8], ss[4], temp;
    500 
    501  load_8bit_8x8(src, src_stride, s);
    502  // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
    503  // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
    504  // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
    505  // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
    506  transpose_16bit_4x8(s, ss);
    507  temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter);
     508  // shrink each 16-bit result to 8 bits with unsigned saturation
    509  temp = _mm_packus_epi16(temp, temp);
     510  // store only the low 8 bytes of the convolve result
    511  _mm_storel_epi64((__m128i *)dst, temp);
    512 }
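         // For the scaled horizontal pass, filter_horiz_w8_ssse3() loads an 8x8
         // block, partially transposes it so the taps for one output column line up,
         // and convolves all eight rows at that column in a single call;
         // scaledconvolve_horiz_w8() below then transposes the 8x8 block of results
         // back into row order with transpose8x8_to_dst().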
    513 
    514 static void transpose8x8_to_dst(const uint8_t *const src,
    515                                const ptrdiff_t src_stride, uint8_t *const dst,
    516                                const ptrdiff_t dst_stride) {
    517  __m128i s[8];
    518 
    519  load_8bit_8x8(src, src_stride, s);
    520  transpose_8bit_8x8(s, s);
    521  store_8bit_8x8(s, dst, dst_stride);
    522 }
    523 
    524 static void scaledconvolve_horiz_w8(const uint8_t *src,
    525                                    const ptrdiff_t src_stride, uint8_t *dst,
    526                                    const ptrdiff_t dst_stride,
    527                                    const InterpKernel *const x_filters,
    528                                    const int x0_q4, const int x_step_q4,
    529                                    const int w, const int h) {
    530  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
    531  int x, y, z;
    532  src -= SUBPEL_TAPS / 2 - 1;
    533 
    534  // This function processes 8x8 areas. The intermediate height is not always
    535  // a multiple of 8, so force it to be a multiple of 8 here.
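          // (When h is already a multiple of 8 this still adds a full extra 8 rows;
          // the extra tail rows reserved for temp in aom_scaled_2d_ssse3() below
          // account for this.)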
    536  y = h + (8 - (h & 0x7));
    537 
    538  do {
    539    int x_q4 = x0_q4;
    540    for (x = 0; x < w; x += 8) {
    541      // process 8 src_x steps
    542      for (z = 0; z < 8; ++z) {
    543        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
    544        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
    545        if (x_q4 & SUBPEL_MASK) {
    546          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
    547        } else {
    548          int i;
    549          for (i = 0; i < 8; ++i) {
    550            temp[z * 8 + i] = src_x[i * src_stride + 3];
    551          }
    552        }
    553        x_q4 += x_step_q4;
    554      }
    555 
     556      // transpose the 8x8 block of filtered values back to dst
    557      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
    558    }
    559 
    560    src += src_stride * 8;
    561    dst += dst_stride * 8;
    562  } while (y -= 8);
    563 }
    564 
    565 static void filter_horiz_w4_ssse3(const uint8_t *const src,
    566                                  const ptrdiff_t src_stride,
    567                                  uint8_t *const dst,
    568                                  const int16_t *const filter) {
    569  __m128i s[4];
    570  __m128i temp;
    571 
    572  load_8bit_8x4(src, src_stride, s);
    573  transpose_16bit_4x4(s, s);
    574 
    575  temp = shuffle_filter_convolve8_8_ssse3(s, filter);
     576  // shrink each 16-bit result to 8 bits with unsigned saturation
    577  temp = _mm_packus_epi16(temp, temp);
    578  // save only 4 bytes
    579  *(int *)dst = _mm_cvtsi128_si32(temp);
    580 }
    581 
    582 static void transpose4x4_to_dst(const uint8_t *const src,
    583                                const ptrdiff_t src_stride, uint8_t *const dst,
    584                                const ptrdiff_t dst_stride) {
    585  __m128i s[4];
    586 
    587  load_8bit_4x4(src, src_stride, s);
    588  s[0] = transpose_8bit_4x4(s);
    589  s[1] = _mm_srli_si128(s[0], 4);
    590  s[2] = _mm_srli_si128(s[0], 8);
    591  s[3] = _mm_srli_si128(s[0], 12);
    592  store_8bit_4x4(s, dst, dst_stride);
    593 }
    594 
    595 static void scaledconvolve_horiz_w4(const uint8_t *src,
    596                                    const ptrdiff_t src_stride, uint8_t *dst,
    597                                    const ptrdiff_t dst_stride,
    598                                    const InterpKernel *const x_filters,
    599                                    const int x0_q4, const int x_step_q4,
    600                                    const int w, const int h) {
    601  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
    602  int x, y, z;
    603  src -= SUBPEL_TAPS / 2 - 1;
    604 
    605  for (y = 0; y < h; y += 4) {
    606    int x_q4 = x0_q4;
    607    for (x = 0; x < w; x += 4) {
    608      // process 4 src_x steps
    609      for (z = 0; z < 4; ++z) {
    610        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
    611        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
    612        if (x_q4 & SUBPEL_MASK) {
    613          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
    614        } else {
    615          int i;
    616          for (i = 0; i < 4; ++i) {
    617            temp[z * 4 + i] = src_x[i * src_stride + 3];
    618          }
    619        }
    620        x_q4 += x_step_q4;
    621      }
    622 
     623      // transpose the 4x4 block of filtered values back to dst
    624      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
    625    }
    626 
    627    src += src_stride * 4;
    628    dst += dst_stride * 4;
    629  }
    630 }
    631 
    632 static __m128i filter_vert_kernel(const __m128i *const s,
    633                                  const int16_t *const filter) {
    634  __m128i ss[4];
    635  __m128i temp;
    636 
    637  // 00 10 01 11 02 12 03 13
    638  ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
    639  // 20 30 21 31 22 32 23 33
    640  ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
    641  // 40 50 41 51 42 52 43 53
    642  ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
    643  // 60 70 61 71 62 72 63 73
    644  ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
    645 
    646  temp = shuffle_filter_convolve8_8_ssse3(ss, filter);
     647  // shrink each 16-bit result to 8 bits with unsigned saturation
    648  return _mm_packus_epi16(temp, temp);
    649 }
    650 
    651 static void filter_vert_w4_ssse3(const uint8_t *const src,
    652                                 const ptrdiff_t src_stride, uint8_t *const dst,
    653                                 const int16_t *const filter) {
    654  __m128i s[8];
    655  __m128i temp;
    656 
    657  load_8bit_4x8(src, src_stride, s);
    658  temp = filter_vert_kernel(s, filter);
    659  // save only 4 bytes
    660  *(int *)dst = _mm_cvtsi128_si32(temp);
    661 }
    662 
    663 static void scaledconvolve_vert_w4(
    664    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
    665    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    666    const int y0_q4, const int y_step_q4, const int w, const int h) {
    667  int y;
    668  int y_q4 = y0_q4;
    669 
    670  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
    671  for (y = 0; y < h; ++y) {
    672    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    673    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    674 
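            // The low SUBPEL_BITS of y_q4 select one of the 16 sub-pixel filter
            // phases. A zero phase means the output row coincides with a source row,
            // so it is copied directly; the 3-row offset (SUBPEL_TAPS / 2 - 1) undoes
            // the rewind applied to src above.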
    675    if (y_q4 & SUBPEL_MASK) {
    676      filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
    677    } else {
    678      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    679    }
    680 
    681    y_q4 += y_step_q4;
    682  }
    683 }
    684 
    685 static void filter_vert_w8_ssse3(const uint8_t *const src,
    686                                 const ptrdiff_t src_stride, uint8_t *const dst,
    687                                 const int16_t *const filter) {
    688  __m128i s[8], temp;
    689 
    690  load_8bit_8x8(src, src_stride, s);
    691  temp = filter_vert_kernel(s, filter);
     692  // store only the low 8 bytes of the convolve result
    693  _mm_storel_epi64((__m128i *)dst, temp);
    694 }
    695 
    696 static void scaledconvolve_vert_w8(
    697    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
    698    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    699    const int y0_q4, const int y_step_q4, const int w, const int h) {
    700  int y;
    701  int y_q4 = y0_q4;
    702 
    703  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
    704  for (y = 0; y < h; ++y) {
    705    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    706    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    707    if (y_q4 & SUBPEL_MASK) {
    708      filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
    709    } else {
    710      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    711    }
    712    y_q4 += y_step_q4;
    713  }
    714 }
    715 
    716 static void filter_vert_w16_ssse3(const uint8_t *src,
    717                                  const ptrdiff_t src_stride,
    718                                  uint8_t *const dst,
    719                                  const int16_t *const filter, const int w) {
    720  int i;
    721  __m128i f[4];
    722  shuffle_filter_ssse3(filter, f);
    723 
    724  for (i = 0; i < w; i += 16) {
    725    __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi;
    726 
    727    loadu_8bit_16x8(src, src_stride, s);
    728 
     729    // interleave consecutive source rows bytewise
    730    s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]);
    731    s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]);
    732    s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]);
    733    s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]);
    734    s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]);
    735    s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]);
    736    s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]);
    737    s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]);
    738    temp_lo = convolve8_8_ssse3(s_lo, f);
    739    temp_hi = convolve8_8_ssse3(s_hi, f);
    740 
     741    // shrink each 16-bit result to 8 bits with unsigned saturation; the low
     742    // and high halves together form the 16 output pixels of this row
    743    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
    744    src += 16;
     745    // store the 16-byte convolve result
    746    _mm_store_si128((__m128i *)&dst[i], temp_hi);
    747  }
    748 }
    749 
    750 static void scaledconvolve_vert_w16(
    751    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
    752    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    753    const int y0_q4, const int y_step_q4, const int w, const int h) {
    754  int y;
    755  int y_q4 = y0_q4;
    756 
    757  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
    758  for (y = 0; y < h; ++y) {
    759    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    760    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    761    if (y_q4 & SUBPEL_MASK) {
    762      filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
    763                            w);
    764    } else {
    765      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    766    }
    767    y_q4 += y_step_q4;
    768  }
    769 }
    770 
    771 void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    772                         ptrdiff_t dst_stride, const InterpKernel *filter,
    773                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
    774                         int w, int h) {
    775  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
    776  // 2d filtering proceeds in 2 steps:
    777  //   (1) Interpolate horizontally into an intermediate buffer, temp.
    778  //   (2) Interpolate temp vertically to derive the sub-pixel result.
    779  // Deriving the maximum number of rows in the temp buffer (135):
    780  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
    781  // --Largest block size is 64x64 pixels.
    782  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
    783  //   original frame (in 1/16th pixel units).
    784  // --Must round-up because block may be located at sub-pixel position.
    785  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
    786  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
    787  // --Require an additional 8 rows for the horiz_w8 transpose tail.
     788  // When called from the frame-scaling path, the smallest scaling factor is
     789  // x1/4 ==> y_step_q4 = 64. Since w and h are at most 16 there, the temp
     790  // buffer is still big enough.
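          // Illustrative worst case for the formula below: with h = 64,
          // y_step_q4 = 32 and sub-pixel phase y0_q4 = 15, intermediate_height is
          // ((63 * 32 + 15) >> 4) + 8 = 126 + 8 = 134 rows, which fits within the
          // 135-row budget derived above.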
    791  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
    792  const int intermediate_height =
    793      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
    794 
    795  assert(w <= 64);
    796  assert(h <= 64);
    797  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
    798  assert(x_step_q4 <= 64);
    799 
    800  if (w >= 8) {
    801    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
    802                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
    803                            intermediate_height);
    804  } else {
    805    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
    806                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
    807                            intermediate_height);
    808  }
    809 
    810  if (w >= 16) {
    811    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
    812                            dst_stride, filter, y0_q4, y_step_q4, w, h);
    813  } else if (w == 8) {
    814    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
    815                           dst_stride, filter, y0_q4, y_step_q4, w, h);
    816  } else {
    817    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
    818                           dst_stride, filter, y0_q4, y_step_q4, w, h);
    819  }
    820 }
    821 
    822 filter8_1dfunction aom_filter_block1d16_v8_ssse3;
    823 filter8_1dfunction aom_filter_block1d16_h8_ssse3;
    824 filter8_1dfunction aom_filter_block1d8_v8_ssse3;
    825 filter8_1dfunction aom_filter_block1d8_h8_ssse3;
    826 filter8_1dfunction aom_filter_block1d4_v8_ssse3;
    827 filter8_1dfunction aom_filter_block1d4_h8_ssse3;
    828 
    829 filter8_1dfunction aom_filter_block1d16_v2_ssse3;
    830 filter8_1dfunction aom_filter_block1d16_h2_ssse3;
    831 filter8_1dfunction aom_filter_block1d8_v2_ssse3;
    832 filter8_1dfunction aom_filter_block1d8_h2_ssse3;
    833 filter8_1dfunction aom_filter_block1d4_v2_ssse3;
    834 filter8_1dfunction aom_filter_block1d4_h2_ssse3;
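         // The 8-tap (v8/h8) and bilinear (v2/h2) kernels declared above are defined
         // outside this file (likely in the aom_dsp SSSE3 assembly sources); they are
         // wired up together with the intrinsics kernels above by FUN_CONV_1D below.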
    835 
    836 // void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
    837 //                                uint8_t *dst, ptrdiff_t dst_stride,
    838 //                                const int16_t *filter_x, int x_step_q4,
    839 //                                const int16_t *filter_y, int y_step_q4,
    840 //                                int w, int h);
    841 // void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
    842 //                               uint8_t *dst, ptrdiff_t dst_stride,
    843 //                               const int16_t *filter_x, int x_step_q4,
    844 //                               const int16_t *filter_y, int y_step_q4,
    845 //                               int w, int h);
    846 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3)
    847 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3)
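         // FUN_CONV_1D comes from aom_dsp/x86/convolve.h (included above). As the
         // commented prototypes suggest, each invocation generates the corresponding
         // aom_convolve8_horiz_ssse3() / aom_convolve8_vert_ssse3() wrapper, which
         // dispatches to the aom_filter_block1d{4,8,16}_{h,v}{2,4,8}_ssse3 kernels by
         // block width and filter shape (see convolve.h for the exact dispatch logic).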