tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

subpel_variance_ssse3.asm (42539B)


      1 ;
      2 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 ;
      4 ; This source code is subject to the terms of the BSD 2 Clause License and
      5 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 ; was not distributed with this source code in the LICENSE file, you can
      7 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 ; Media Patent License 1.0 was not distributed with this source code in the
      9 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 ;
     11 
     12 ;
     13 
     14 %include "third_party/x86inc/x86inc.asm"
     15 
     16 SECTION_RODATA
     17 pw_8: times  8 dw  8  ; 8 words of 8: rounding constant added before the >> 4 after bilinear filtering
     18 
     19 bilin_filter_m_ssse3: times  8 db 16,  0  ; 1/8-pel bilinear taps for pmaddubsw; row k repeats byte pair (16-2k, 2k); taps sum to 16, matching psraw 4
     20                      times  8 db 14,  2  ; offset 1/8
     21                      times  8 db 12,  4  ; offset 2/8
     22                      times  8 db 10,  6  ; offset 3/8
     23                      times 16 db  8      ; offset 4/8: pair (8, 8) (half-pel row; normally handled by pavgb paths instead)
     24                      times  8 db  6, 10  ; offset 5/8
     25                      times  8 db  4, 12  ; offset 6/8
     26                      times  8 db  2, 14  ; offset 7/8
     28 SECTION .text
     29 
     30 ; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
     31 ;                               int x_offset, int y_offset,
     32 ;                               const uint8_t *dst, ptrdiff_t dst_stride,
     33 ;                               int height, unsigned int *sse);
     34 ;
     35 ; This function returns the SE and stores SSE in the given pointer.
     36 
     37 %macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
     38  psubw                %3, %4           ; %3 = src2 - dst2 (signed word diffs)
     39  psubw                %1, %2           ; %1 = src1 - dst1
     40  paddw                %5, %3           ; sum += diffs (word accumulator; sign-extended later in STORE_AND_RET)
     41  pmaddwd              %3, %3           ; square diffs and pair-add into dwords
     42  paddw                %5, %1           ; sum += second set of diffs
     43  pmaddwd              %1, %1           ; square second set of diffs
     44  paddd                %6, %3           ; sse += squared diffs (dword accumulator)
     45  paddd                %6, %1
     46 %endmacro
     47 
     48 %macro STORE_AND_RET 1  ; %1 = block width; horizontally reduces m6 (word sum) and m7 (dword sse), stores *sse, returns sum in eax
     49 %if %1 > 4
     50  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
     51  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
     52  ; We have to sign-extend it before adding the words within the register
     53  ; and outputing to a dword.
     54  pcmpgtw              m5, m6           ; mask for 0 > x
     55  movhlps              m3, m7           ; high 2 sse dwords
     56  punpcklwd            m4, m6, m5       ; sign-extend low 4 sum words -> dwords
     57  punpckhwd            m6, m5           ; sign-extend m6 word->dword
     58  paddd                m7, m3           ; fold sse 4 -> 2 dwords
     59  paddd                m6, m4           ; fold sum 8 -> 4 dwords
     60  pshufd               m3, m7, 0x1      ; remaining high sse dword
     61  movhlps              m4, m6           ; high 2 sum dwords
     62  paddd                m7, m3           ; sse fully reduced in low dword
     63  paddd                m6, m4           ; fold sum 4 -> 2 dwords
     64  mov                  r1, ssem         ; r1 = unsigned int *sse
     65  pshufd               m4, m6, 0x1      ; remaining high sum dword
     66  movd               [r1], m7           ; store sse
     67  paddd                m6, m4           ; sum fully reduced in low dword
     68  movd               raxd, m6           ; store sum as return value
     69 %else ; 4xh
     70  pshuflw              m4, m6, 0xe      ; sum words 2..3 -> lanes 0..1
     71  pshuflw              m3, m7, 0xe      ; sse dword 1 -> lane 0
     72  paddw                m6, m4           ; fold sum 4 -> 2 words
     73  paddd                m7, m3           ; sse fully reduced in low dword
     74  pcmpgtw              m5, m6           ; mask for 0 > x
     75  mov                  r1, ssem         ; r1 = unsigned int *sse
     76  punpcklwd            m6, m5           ; sign-extend m6 word->dword
     77  movd               [r1], m7           ; store sse
     78  pshuflw              m4, m6, 0xe      ; remaining sum dword -> lane 0
     79  paddd                m6, m4           ; sum fully reduced in low dword
     80  movd               raxd, m6           ; store sum as return value
     81 %endif
     82  RET
     83 %endmacro
     84 
     85 %macro INC_SRC_BY_SRC_STRIDE  0  ; advance srcq by one source row
     86 %if AOM_ARCH_X86=1 && CONFIG_PIC=1
     87  add                srcq, src_stridemp  ; x86-32 PIC: stride was spilled, read it from its stack/memory slot
     88 %else
     89  add                srcq, src_strideq   ; stride is register-resident
     90 %endif
     91 %endmacro
     92 
     93 %macro SUBPEL_VARIANCE 1-2 0 ; W
     94 %if cpuflag(ssse3)
     95 %define bilin_filter_m bilin_filter_m_ssse3
     96 %define filter_idx_shift 4
     97 %endif
     98 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
     99 ; 11, not 13, if the registers are ordered correctly. May make a minor speed
    100 ; difference on Win64
    101 
    102 %if AOM_ARCH_X86_64
    103  %if %2 == 1 ; avg
    104    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
    105                                        x_offset, y_offset, dst, dst_stride, \
    106                                        sec, sec_stride, height, sse
    107    %define sec_str sec_strideq
    108  %else
    109    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
    110                                    x_offset, y_offset, dst, dst_stride, \
    111                                    height, sse
    112  %endif
    113  %define block_height heightd
    114  %define bilin_filter sseq
    115 %else
    116  %if CONFIG_PIC=1
    117    %if %2 == 1 ; avg
    118      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
    119                                          x_offset, y_offset, dst, dst_stride, \
    120                                          sec, sec_stride, height, sse
    121      %define block_height dword heightm
    122      %define sec_str sec_stridemp
    123    %else
    124      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
    125                                      x_offset, y_offset, dst, dst_stride, \
    126                                      height, sse
    127      %define block_height heightd
    128    %endif
    129 
    130    ; reuse argument stack space
    131    %define g_bilin_filterm x_offsetm
    132    %define g_pw_8m y_offsetm
    133 
    134    ;Store bilin_filter and pw_8 location in stack
    135    %if GET_GOT_DEFINED == 1
    136      GET_GOT eax
    137      add esp, 4                ; restore esp
    138    %endif
    139 
    140    lea ecx, [GLOBAL(bilin_filter_m)]
    141    mov g_bilin_filterm, ecx
    142 
    143    lea ecx, [GLOBAL(pw_8)]
    144    mov g_pw_8m, ecx
    145 
    146    LOAD_IF_USED 0, 1         ; load eax, ecx back
    147  %else
    148    %if %2 == 1 ; avg
    149      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
    150                                          x_offset, y_offset, \
    151                                          dst, dst_stride, sec, sec_stride, \
    152                                          height, sse
    153      %define block_height dword heightm
    154      %define sec_str sec_stridemp
    155    %else
    156      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
    157                                      x_offset, y_offset, dst, dst_stride, \
    158                                      height, sse
    159      %define block_height heightd
    160    %endif
    161    %define bilin_filter bilin_filter_m
    162  %endif
    163 %endif
    164 
    165 %if %1 == 4
    166  %define movx movd
    167 %else
    168  %define movx movh
    169 %endif
    170 
    171  ASSERT               %1 <= 16         ; m6 overflows if w > 16
    172  pxor                 m6, m6           ; sum
    173  pxor                 m7, m7           ; sse
    174  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
    175  ; could perhaps use it for something more productive then
    176  pxor                 m5, m5           ; dedicated zero register
    177 %if %1 < 16
    178  sar                   block_height, 1
    179 %if %2 == 1 ; avg
    180  shl             sec_str, 1
    181 %endif
    182 %endif
    183 
    184  ; FIXME(rbultje) replace by jumptable?
    185  test          x_offsetd, x_offsetd
    186  jnz .x_nonzero
    187  ; x_offset == 0
    188  test          y_offsetd, y_offsetd
    189  jnz .x_zero_y_nonzero
    190 
    191  ; x_offset == 0 && y_offset == 0
    192 .x_zero_y_zero_loop:
    193 %if %1 == 16
    194  movu                 m0, [srcq]
    195  mova                 m1, [dstq]
    196 %if %2 == 1 ; avg
    197  pavgb                m0, [secq]
    198  punpckhbw            m3, m1, m5
    199  punpcklbw            m1, m5
    200 %endif
    201  punpckhbw            m2, m0, m5
    202  punpcklbw            m0, m5
    203 
    204 %if %2 == 0 ; !avg
    205  punpckhbw            m3, m1, m5
    206  punpcklbw            m1, m5
    207 %endif
    208  SUM_SSE              m0, m1, m2, m3, m6, m7
    209 
    210  add                srcq, src_strideq
    211  add                dstq, dst_strideq
    212 %else ; %1 < 16
    213  movx                 m0, [srcq]
    214 %if %2 == 1 ; avg
    215 %if %1 > 4
    216  movhps               m0, [srcq+src_strideq]
    217 %else ; 4xh
    218  movx                 m1, [srcq+src_strideq]
    219  punpckldq            m0, m1
    220 %endif
    221 %else ; !avg
    222  movx                 m2, [srcq+src_strideq]
    223 %endif
    224 
    225  movx                 m1, [dstq]
    226  movx                 m3, [dstq+dst_strideq]
    227 
    228 %if %2 == 1 ; avg
    229 %if %1 > 4
    230  pavgb                m0, [secq]
    231 %else
    232  movh                 m2, [secq]
    233  pavgb                m0, m2
    234 %endif
    235  punpcklbw            m3, m5
    236  punpcklbw            m1, m5
    237 %if %1 > 4
    238  punpckhbw            m2, m0, m5
    239  punpcklbw            m0, m5
    240 %else ; 4xh
    241  punpcklbw            m0, m5
    242  movhlps              m2, m0
    243 %endif
    244 %else ; !avg
    245  punpcklbw            m0, m5
    246  punpcklbw            m2, m5
    247  punpcklbw            m3, m5
    248  punpcklbw            m1, m5
    249 %endif
    250  SUM_SSE              m0, m1, m2, m3, m6, m7
    251 
    252  lea                srcq, [srcq+src_strideq*2]
    253  lea                dstq, [dstq+dst_strideq*2]
    254 %endif
    255 %if %2 == 1 ; avg
    256  add                secq, sec_str
    257 %endif
    258  dec                   block_height
    259  jg .x_zero_y_zero_loop
    260  STORE_AND_RET %1
    261 
    262 .x_zero_y_nonzero:
    263  cmp           y_offsetd, 4
    264  jne .x_zero_y_nonhalf
    265 
    266  ; x_offset == 0 && y_offset == 0.5
    267 .x_zero_y_half_loop:
    268 %if %1 == 16
    269  movu                 m0, [srcq]
    270  movu                 m4, [srcq+src_strideq]
    271  mova                 m1, [dstq]
    272  pavgb                m0, m4
    273  punpckhbw            m3, m1, m5
    274 %if %2 == 1 ; avg
    275  pavgb                m0, [secq]
    276 %endif
    277  punpcklbw            m1, m5
    278  punpckhbw            m2, m0, m5
    279  punpcklbw            m0, m5
    280  SUM_SSE              m0, m1, m2, m3, m6, m7
    281 
    282  add                srcq, src_strideq
    283  add                dstq, dst_strideq
    284 %else ; %1 < 16
    285  movx                 m0, [srcq]
    286  movx                 m2, [srcq+src_strideq]
    287 %if %2 == 1 ; avg
    288 %if %1 > 4
    289  movhps               m2, [srcq+src_strideq*2]
    290 %else ; 4xh
    291  movx                 m1, [srcq+src_strideq*2]
    292  punpckldq            m2, m1
    293 %endif
    294  movx                 m1, [dstq]
    295 %if %1 > 4
    296  movlhps              m0, m2
    297 %else ; 4xh
    298  punpckldq            m0, m2
    299 %endif
    300  movx                 m3, [dstq+dst_strideq]
    301  pavgb                m0, m2
    302  punpcklbw            m1, m5
    303 %if %1 > 4
    304  pavgb                m0, [secq]
    305  punpcklbw            m3, m5
    306  punpckhbw            m2, m0, m5
    307  punpcklbw            m0, m5
    308 %else ; 4xh
    309  movh                 m4, [secq]
    310  pavgb                m0, m4
    311  punpcklbw            m3, m5
    312  punpcklbw            m0, m5
    313  movhlps              m2, m0
    314 %endif
    315 %else ; !avg
    316  movx                 m4, [srcq+src_strideq*2]
    317  movx                 m1, [dstq]
    318  pavgb                m0, m2
    319  movx                 m3, [dstq+dst_strideq]
    320  pavgb                m2, m4
    321  punpcklbw            m0, m5
    322  punpcklbw            m2, m5
    323  punpcklbw            m3, m5
    324  punpcklbw            m1, m5
    325 %endif
    326  SUM_SSE              m0, m1, m2, m3, m6, m7
    327 
    328  lea                srcq, [srcq+src_strideq*2]
    329  lea                dstq, [dstq+dst_strideq*2]
    330 %endif
    331 %if %2 == 1 ; avg
    332  add                secq, sec_str
    333 %endif
    334  dec                   block_height
    335  jg .x_zero_y_half_loop
    336  STORE_AND_RET %1
    337 
    338 .x_zero_y_nonhalf:
    339  ; x_offset == 0 && y_offset == bilin interpolation
    340 %if AOM_ARCH_X86_64
    341  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
    342 %endif
    343  shl           y_offsetd, filter_idx_shift
    344 %if AOM_ARCH_X86_64 && %1 > 4
    345  mova                 m8, [bilin_filter+y_offsetq]
    346 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
    347  mova                 m9, [bilin_filter+y_offsetq+16]
    348 %endif
    349  mova                m10, [GLOBAL(pw_8)]
    350 %define filter_y_a m8
    351 %define filter_y_b m9
    352 %define filter_rnd m10
    353 %else ; x86-32 or mmx
    354 %if AOM_ARCH_X86=1 && CONFIG_PIC=1
    355 ; x_offset == 0, reuse x_offset reg
    356 %define tempq x_offsetq
    357  add y_offsetq, g_bilin_filterm
    358 %define filter_y_a [y_offsetq]
    359 %define filter_y_b [y_offsetq+16]
    360  mov tempq, g_pw_8m
    361 %define filter_rnd [tempq]
    362 %else
    363  add           y_offsetq, bilin_filter
    364 %define filter_y_a [y_offsetq]
    365 %define filter_y_b [y_offsetq+16]
    366 %define filter_rnd [GLOBAL(pw_8)]
    367 %endif
    368 %endif
    369 
    370 .x_zero_y_other_loop:
    371 %if %1 == 16
    372  movu                 m0, [srcq]
    373  movu                 m4, [srcq+src_strideq]
    374  mova                 m1, [dstq]
    375 %if cpuflag(ssse3)
    376  punpckhbw            m2, m0, m4
    377  punpcklbw            m0, m4
    378  pmaddubsw            m2, filter_y_a
    379  pmaddubsw            m0, filter_y_a
    380  paddw                m2, filter_rnd
    381  paddw                m0, filter_rnd
    382 %else
    383  punpckhbw            m2, m0, m5
    384  punpckhbw            m3, m4, m5
    385  punpcklbw            m0, m5
    386  punpcklbw            m4, m5
    387  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
    388  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
    389  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
    390  ; slightly faster because of pmullw latency. It would also cut our rodata
    391  ; tables in half for this function, and save 1-2 registers on x86-64.
    392  pmullw               m2, filter_y_a
    393  pmullw               m3, filter_y_b
    394  paddw                m2, filter_rnd
    395  pmullw               m0, filter_y_a
    396  pmullw               m4, filter_y_b
    397  paddw                m0, filter_rnd
    398  paddw                m2, m3
    399  paddw                m0, m4
    400 %endif
    401  psraw                m2, 4
    402  psraw                m0, 4
    403 %if %2 == 1 ; avg
    404  ; FIXME(rbultje) pipeline
    405  packuswb             m0, m2
    406  pavgb                m0, [secq]
    407  punpckhbw            m2, m0, m5
    408  punpcklbw            m0, m5
    409 %endif
    410  punpckhbw            m3, m1, m5
    411  punpcklbw            m1, m5
    412  SUM_SSE              m0, m1, m2, m3, m6, m7
    413 
    414  add                srcq, src_strideq
    415  add                dstq, dst_strideq
    416 %else ; %1 < 16
    417  movx                 m0, [srcq]
    418  movx                 m2, [srcq+src_strideq]
    419  movx                 m4, [srcq+src_strideq*2]
    420  movx                 m3, [dstq+dst_strideq]
    421 %if cpuflag(ssse3)
    422  movx                 m1, [dstq]
    423  punpcklbw            m0, m2
    424  punpcklbw            m2, m4
    425  pmaddubsw            m0, filter_y_a
    426  pmaddubsw            m2, filter_y_a
    427  punpcklbw            m3, m5
    428  paddw                m2, filter_rnd
    429  paddw                m0, filter_rnd
    430 %else
    431  punpcklbw            m0, m5
    432  punpcklbw            m2, m5
    433  punpcklbw            m4, m5
    434  pmullw               m0, filter_y_a
    435  pmullw               m1, m2, filter_y_b
    436  punpcklbw            m3, m5
    437  paddw                m0, filter_rnd
    438  pmullw               m2, filter_y_a
    439  pmullw               m4, filter_y_b
    440  paddw                m0, m1
    441  paddw                m2, filter_rnd
    442  movx                 m1, [dstq]
    443  paddw                m2, m4
    444 %endif
    445  psraw                m0, 4
    446  psraw                m2, 4
    447 %if %2 == 1 ; avg
    448  ; FIXME(rbultje) pipeline
    449 %if %1 == 4
    450  movlhps              m0, m2
    451 %endif
    452  packuswb             m0, m2
    453 %if %1 > 4
    454  pavgb                m0, [secq]
    455  punpckhbw            m2, m0, m5
    456  punpcklbw            m0, m5
    457 %else ; 4xh
    458  movh                 m2, [secq]
    459  pavgb                m0, m2
    460  punpcklbw            m0, m5
    461  movhlps              m2, m0
    462 %endif
    463 %endif
    464  punpcklbw            m1, m5
    465  SUM_SSE              m0, m1, m2, m3, m6, m7
    466 
    467  lea                srcq, [srcq+src_strideq*2]
    468  lea                dstq, [dstq+dst_strideq*2]
    469 %endif
    470 %if %2 == 1 ; avg
    471  add                secq, sec_str
    472 %endif
    473  dec                   block_height
    474  jg .x_zero_y_other_loop
    475 %undef filter_y_a
    476 %undef filter_y_b
    477 %undef filter_rnd
    478  STORE_AND_RET %1
    479 
    480 .x_nonzero:
    481  cmp           x_offsetd, 4
    482  jne .x_nonhalf
    483  ; x_offset == 0.5
    484  test          y_offsetd, y_offsetd
    485  jnz .x_half_y_nonzero
    486 
    487  ; x_offset == 0.5 && y_offset == 0
    488 .x_half_y_zero_loop:
    489 %if %1 == 16
    490  movu                 m0, [srcq]
    491  movu                 m4, [srcq+1]
    492  mova                 m1, [dstq]
    493  pavgb                m0, m4
    494  punpckhbw            m3, m1, m5
    495 %if %2 == 1 ; avg
    496  pavgb                m0, [secq]
    497 %endif
    498  punpcklbw            m1, m5
    499  punpckhbw            m2, m0, m5
    500  punpcklbw            m0, m5
    501  SUM_SSE              m0, m1, m2, m3, m6, m7
    502 
    503  add                srcq, src_strideq
    504  add                dstq, dst_strideq
    505 %else ; %1 < 16
    506  movx                 m0, [srcq]
    507  movx                 m4, [srcq+1]
    508 %if %2 == 1 ; avg
    509 %if %1 > 4
    510  movhps               m0, [srcq+src_strideq]
    511  movhps               m4, [srcq+src_strideq+1]
    512 %else ; 4xh
    513  movx                 m1, [srcq+src_strideq]
    514  punpckldq            m0, m1
    515  movx                 m2, [srcq+src_strideq+1]
    516  punpckldq            m4, m2
    517 %endif
    518  movx                 m1, [dstq]
    519  movx                 m3, [dstq+dst_strideq]
    520  pavgb                m0, m4
    521  punpcklbw            m3, m5
    522 %if %1 > 4
    523  pavgb                m0, [secq]
    524  punpcklbw            m1, m5
    525  punpckhbw            m2, m0, m5
    526  punpcklbw            m0, m5
    527 %else ; 4xh
    528  movh                 m2, [secq]
    529  pavgb                m0, m2
    530  punpcklbw            m1, m5
    531  punpcklbw            m0, m5
    532  movhlps              m2, m0
    533 %endif
    534 %else ; !avg
    535  movx                 m2, [srcq+src_strideq]
    536  movx                 m1, [dstq]
    537  pavgb                m0, m4
    538  movx                 m4, [srcq+src_strideq+1]
    539  movx                 m3, [dstq+dst_strideq]
    540  pavgb                m2, m4
    541  punpcklbw            m0, m5
    542  punpcklbw            m2, m5
    543  punpcklbw            m3, m5
    544  punpcklbw            m1, m5
    545 %endif
    546  SUM_SSE              m0, m1, m2, m3, m6, m7
    547 
    548  lea                srcq, [srcq+src_strideq*2]
    549  lea                dstq, [dstq+dst_strideq*2]
    550 %endif
    551 %if %2 == 1 ; avg
    552  add                secq, sec_str
    553 %endif
    554  dec                   block_height
    555  jg .x_half_y_zero_loop
    556  STORE_AND_RET %1
    557 
    558 .x_half_y_nonzero:
    559  cmp           y_offsetd, 4
    560  jne .x_half_y_nonhalf
    561 
    562  ; x_offset == 0.5 && y_offset == 0.5
    563 %if %1 == 16
    564  movu                 m0, [srcq]
    565  movu                 m3, [srcq+1]
    566  add                srcq, src_strideq
    567  pavgb                m0, m3
    568 .x_half_y_half_loop:
    569  movu                 m4, [srcq]
    570  movu                 m3, [srcq+1]
    571  mova                 m1, [dstq]
    572  pavgb                m4, m3
    573  punpckhbw            m3, m1, m5
    574  pavgb                m0, m4
    575 %if %2 == 1 ; avg
    576  punpcklbw            m1, m5
    577  pavgb                m0, [secq]
    578  punpckhbw            m2, m0, m5
    579  punpcklbw            m0, m5
    580 %else
    581  punpckhbw            m2, m0, m5
    582  punpcklbw            m0, m5
    583  punpcklbw            m1, m5
    584 %endif
    585  SUM_SSE              m0, m1, m2, m3, m6, m7
    586  mova                 m0, m4
    587 
    588  add                srcq, src_strideq
    589  add                dstq, dst_strideq
    590 %else ; %1 < 16
    591  movx                 m0, [srcq]
    592  movx                 m3, [srcq+1]
    593  add                srcq, src_strideq
    594  pavgb                m0, m3
    595 .x_half_y_half_loop:
    596  movx                 m2, [srcq]
    597  movx                 m3, [srcq+1]
    598 %if %2 == 1 ; avg
    599 %if %1 > 4
    600  movhps               m2, [srcq+src_strideq]
    601  movhps               m3, [srcq+src_strideq+1]
    602 %else
    603  movx                 m1, [srcq+src_strideq]
    604  punpckldq            m2, m1
    605  movx                 m1, [srcq+src_strideq+1]
    606  punpckldq            m3, m1
    607 %endif
    608  pavgb                m2, m3
    609 %if %1 > 4
    610  movlhps              m0, m2
    611  movhlps              m4, m2
    612 %else ; 4xh
    613  punpckldq            m0, m2
    614  pshuflw              m4, m2, 0xe
    615 %endif
    616  movx                 m1, [dstq]
    617  pavgb                m0, m2
    618  movx                 m3, [dstq+dst_strideq]
    619 %if %1 > 4
    620  pavgb                m0, [secq]
    621 %else
    622  movh                 m2, [secq]
    623  pavgb                m0, m2
    624 %endif
    625  punpcklbw            m3, m5
    626  punpcklbw            m1, m5
    627 %if %1 > 4
    628  punpckhbw            m2, m0, m5
    629  punpcklbw            m0, m5
    630 %else
    631  punpcklbw            m0, m5
    632  movhlps              m2, m0
    633 %endif
    634 %else ; !avg
    635  movx                 m4, [srcq+src_strideq]
    636  movx                 m1, [srcq+src_strideq+1]
    637  pavgb                m2, m3
    638  pavgb                m4, m1
    639  pavgb                m0, m2
    640  pavgb                m2, m4
    641  movx                 m1, [dstq]
    642  movx                 m3, [dstq+dst_strideq]
    643  punpcklbw            m0, m5
    644  punpcklbw            m2, m5
    645  punpcklbw            m3, m5
    646  punpcklbw            m1, m5
    647 %endif
    648  SUM_SSE              m0, m1, m2, m3, m6, m7
    649  mova                 m0, m4
    650 
    651  lea                srcq, [srcq+src_strideq*2]
    652  lea                dstq, [dstq+dst_strideq*2]
    653 %endif
    654 %if %2 == 1 ; avg
    655  add                secq, sec_str
    656 %endif
    657  dec                   block_height
    658  jg .x_half_y_half_loop
    659  STORE_AND_RET %1
    660 
    661 .x_half_y_nonhalf:
    662  ; x_offset == 0.5 && y_offset == bilin interpolation
    663 %if AOM_ARCH_X86_64
    664  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
    665 %endif
    666  shl           y_offsetd, filter_idx_shift
    667 %if AOM_ARCH_X86_64 && %1 > 4
    668  mova                 m8, [bilin_filter+y_offsetq]
    669 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
    670  mova                 m9, [bilin_filter+y_offsetq+16]
    671 %endif
    672  mova                m10, [GLOBAL(pw_8)]
    673 %define filter_y_a m8
    674 %define filter_y_b m9
    675 %define filter_rnd m10
    676 %else  ;x86_32
    677 %if AOM_ARCH_X86=1 && CONFIG_PIC=1
    678 ; x_offset == 0.5. We can reuse x_offset reg
    679 %define tempq x_offsetq
    680  add y_offsetq, g_bilin_filterm
    681 %define filter_y_a [y_offsetq]
    682 %define filter_y_b [y_offsetq+16]
    683  mov tempq, g_pw_8m
    684 %define filter_rnd [tempq]
    685 %else
    686  add           y_offsetq, bilin_filter
    687 %define filter_y_a [y_offsetq]
    688 %define filter_y_b [y_offsetq+16]
    689 %define filter_rnd [GLOBAL(pw_8)]
    690 %endif
    691 %endif
    692 
    693 %if %1 == 16
    694  movu                 m0, [srcq]
    695  movu                 m3, [srcq+1]
    696  add                srcq, src_strideq
    697  pavgb                m0, m3
    698 .x_half_y_other_loop:
    699  movu                 m4, [srcq]
    700  movu                 m2, [srcq+1]
    701  mova                 m1, [dstq]
    702  pavgb                m4, m2
    703 %if cpuflag(ssse3)
    704  punpckhbw            m2, m0, m4
    705  punpcklbw            m0, m4
    706  pmaddubsw            m2, filter_y_a
    707  pmaddubsw            m0, filter_y_a
    708  paddw                m2, filter_rnd
    709  paddw                m0, filter_rnd
    710  psraw                m2, 4
    711 %else
    712  punpckhbw            m2, m0, m5
    713  punpckhbw            m3, m4, m5
    714  pmullw               m2, filter_y_a
    715  pmullw               m3, filter_y_b
    716  paddw                m2, filter_rnd
    717  punpcklbw            m0, m5
    718  paddw                m2, m3
    719  punpcklbw            m3, m4, m5
    720  pmullw               m0, filter_y_a
    721  pmullw               m3, filter_y_b
    722  paddw                m0, filter_rnd
    723  psraw                m2, 4
    724  paddw                m0, m3
    725 %endif
    726  punpckhbw            m3, m1, m5
    727  psraw                m0, 4
    728 %if %2 == 1 ; avg
    729  ; FIXME(rbultje) pipeline
    730  packuswb             m0, m2
    731  pavgb                m0, [secq]
    732  punpckhbw            m2, m0, m5
    733  punpcklbw            m0, m5
    734 %endif
    735  punpcklbw            m1, m5
    736  SUM_SSE              m0, m1, m2, m3, m6, m7
    737  mova                 m0, m4
    738 
    739  add                srcq, src_strideq
    740  add                dstq, dst_strideq
    741 %else ; %1 < 16
    742  movx                 m0, [srcq]
    743  movx                 m3, [srcq+1]
    744  add                srcq, src_strideq
    745  pavgb                m0, m3
    746 %if notcpuflag(ssse3)
    747  punpcklbw            m0, m5
    748 %endif
    749 .x_half_y_other_loop:
    750  movx                 m2, [srcq]
    751  movx                 m1, [srcq+1]
    752  movx                 m4, [srcq+src_strideq]
    753  movx                 m3, [srcq+src_strideq+1]
    754  pavgb                m2, m1
    755  pavgb                m4, m3
    756  movx                 m3, [dstq+dst_strideq]
    757 %if cpuflag(ssse3)
    758  movx                 m1, [dstq]
    759  punpcklbw            m0, m2
    760  punpcklbw            m2, m4
    761  pmaddubsw            m0, filter_y_a
    762  pmaddubsw            m2, filter_y_a
    763  punpcklbw            m3, m5
    764  paddw                m0, filter_rnd
    765  paddw                m2, filter_rnd
    766 %else
    767  punpcklbw            m2, m5
    768  punpcklbw            m4, m5
    769  pmullw               m0, filter_y_a
    770  pmullw               m1, m2, filter_y_b
    771  punpcklbw            m3, m5
    772  paddw                m0, filter_rnd
    773  pmullw               m2, filter_y_a
    774  paddw                m0, m1
    775  pmullw               m1, m4, filter_y_b
    776  paddw                m2, filter_rnd
    777  paddw                m2, m1
    778  movx                 m1, [dstq]
    779 %endif
    780  psraw                m0, 4
    781  psraw                m2, 4
    782 %if %2 == 1 ; avg
    783  ; FIXME(rbultje) pipeline
    784 %if %1 == 4
    785  movlhps              m0, m2
    786 %endif
    787  packuswb             m0, m2
    788 %if %1 > 4
    789  pavgb                m0, [secq]
    790  punpckhbw            m2, m0, m5
    791  punpcklbw            m0, m5
    792 %else
    793  movh                 m2, [secq]
    794  pavgb                m0, m2
    795  punpcklbw            m0, m5
    796  movhlps              m2, m0
    797 %endif
    798 %endif
    799  punpcklbw            m1, m5
    800  SUM_SSE              m0, m1, m2, m3, m6, m7
    801  mova                 m0, m4
    802 
    803  lea                srcq, [srcq+src_strideq*2]
    804  lea                dstq, [dstq+dst_strideq*2]
    805 %endif
    806 %if %2 == 1 ; avg
    807  add                secq, sec_str
    808 %endif
    809  dec                   block_height
    810  jg .x_half_y_other_loop
    811 %undef filter_y_a
    812 %undef filter_y_b
    813 %undef filter_rnd
    814  STORE_AND_RET %1
    815 
    816 .x_nonhalf:
    817  test          y_offsetd, y_offsetd
    818  jnz .x_nonhalf_y_nonzero
    819 
    820  ; x_offset == bilin interpolation && y_offset == 0
    821 %if AOM_ARCH_X86_64
    822  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
    823 %endif
    824  shl           x_offsetd, filter_idx_shift
    825 %if AOM_ARCH_X86_64 && %1 > 4
    826  mova                 m8, [bilin_filter+x_offsetq]
    827 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
    828  mova                 m9, [bilin_filter+x_offsetq+16]
    829 %endif
    830  mova                m10, [GLOBAL(pw_8)]
    831 %define filter_x_a m8
    832 %define filter_x_b m9
    833 %define filter_rnd m10
    834 %else    ; x86-32
    835 %if AOM_ARCH_X86=1 && CONFIG_PIC=1
    836 ;y_offset == 0. We can reuse y_offset reg.
    837 %define tempq y_offsetq
    838  add x_offsetq, g_bilin_filterm
    839 %define filter_x_a [x_offsetq]
    840 %define filter_x_b [x_offsetq+16]
    841  mov tempq, g_pw_8m
    842 %define filter_rnd [tempq]
    843 %else
    844  add           x_offsetq, bilin_filter
    845 %define filter_x_a [x_offsetq]
    846 %define filter_x_b [x_offsetq+16]
    847 %define filter_rnd [GLOBAL(pw_8)]
    848 %endif
    849 %endif
    850 
    851 .x_other_y_zero_loop:
    852 %if %1 == 16
    853  movu                 m0, [srcq]
    854  movu                 m4, [srcq+1]
    855  mova                 m1, [dstq]
    856 %if cpuflag(ssse3)
    857  punpckhbw            m2, m0, m4
    858  punpcklbw            m0, m4
    859  pmaddubsw            m2, filter_x_a
    860  pmaddubsw            m0, filter_x_a
    861  paddw                m2, filter_rnd
    862  paddw                m0, filter_rnd
    863 %else
    864  punpckhbw            m2, m0, m5
    865  punpckhbw            m3, m4, m5
    866  punpcklbw            m0, m5
    867  punpcklbw            m4, m5
    868  pmullw               m2, filter_x_a
    869  pmullw               m3, filter_x_b
    870  paddw                m2, filter_rnd
    871  pmullw               m0, filter_x_a
    872  pmullw               m4, filter_x_b
    873  paddw                m0, filter_rnd
    874  paddw                m2, m3
    875  paddw                m0, m4
    876 %endif
    877  psraw                m2, 4
    878  psraw                m0, 4
    879 %if %2 == 1 ; avg
    880  ; FIXME(rbultje) pipeline
    881  packuswb             m0, m2
    882  pavgb                m0, [secq]
    883  punpckhbw            m2, m0, m5
    884  punpcklbw            m0, m5
    885 %endif
    886  punpckhbw            m3, m1, m5
    887  punpcklbw            m1, m5
    888  SUM_SSE              m0, m1, m2, m3, m6, m7
    889 
    890  add                srcq, src_strideq
    891  add                dstq, dst_strideq
    892 %else ; %1 < 16
    893  movx                 m0, [srcq]
    894  movx                 m1, [srcq+1]
    895  movx                 m2, [srcq+src_strideq]
    896  movx                 m4, [srcq+src_strideq+1]
    897  movx                 m3, [dstq+dst_strideq]
    898 %if cpuflag(ssse3)
    899  punpcklbw            m0, m1
    900  movx                 m1, [dstq]
    901  punpcklbw            m2, m4
    902  pmaddubsw            m0, filter_x_a
    903  pmaddubsw            m2, filter_x_a
    904  punpcklbw            m3, m5
    905  paddw                m0, filter_rnd
    906  paddw                m2, filter_rnd
    907 %else
    908  punpcklbw            m0, m5
    909  punpcklbw            m1, m5
    910  punpcklbw            m2, m5
    911  punpcklbw            m4, m5
    912  pmullw               m0, filter_x_a
    913  pmullw               m1, filter_x_b
    914  punpcklbw            m3, m5
    915  paddw                m0, filter_rnd
    916  pmullw               m2, filter_x_a
    917  pmullw               m4, filter_x_b
    918  paddw                m0, m1
    919  paddw                m2, filter_rnd
    920  movx                 m1, [dstq]
    921  paddw                m2, m4
    922 %endif
    923  psraw                m0, 4
    924  psraw                m2, 4
    925 %if %2 == 1 ; avg
    926  ; FIXME(rbultje) pipeline
    927 %if %1 == 4
    928  movlhps              m0, m2
    929 %endif
    930  packuswb             m0, m2
    931 %if %1 > 4
    932  pavgb                m0, [secq]
    933  punpckhbw            m2, m0, m5
    934  punpcklbw            m0, m5
    935 %else
    936  movh                 m2, [secq]
    937  pavgb                m0, m2
    938  punpcklbw            m0, m5
    939  movhlps              m2, m0
    940 %endif
    941 %endif
    942  punpcklbw            m1, m5
    943  SUM_SSE              m0, m1, m2, m3, m6, m7
    944 
    945  lea                srcq, [srcq+src_strideq*2]
    946  lea                dstq, [dstq+dst_strideq*2]
    947 %endif
    948 %if %2 == 1 ; avg
    949  add                secq, sec_str
    950 %endif
    951  dec                   block_height
    952  jg .x_other_y_zero_loop
    953 %undef filter_x_a
    954 %undef filter_x_b
    955 %undef filter_rnd
    956  STORE_AND_RET %1
    957 
    958 .x_nonhalf_y_nonzero:
    959  cmp           y_offsetd, 4
    960  jne .x_nonhalf_y_nonhalf
    961 
    962  ; x_offset == bilin interpolation && y_offset == 0.5
    963 %if AOM_ARCH_X86_64
    964  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
    965 %endif
    966  shl           x_offsetd, filter_idx_shift
    967 %if AOM_ARCH_X86_64 && %1 > 4
    968  mova                 m8, [bilin_filter+x_offsetq]
    969 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
    970  mova                 m9, [bilin_filter+x_offsetq+16]
    971 %endif
    972  mova                m10, [GLOBAL(pw_8)]
    973 %define filter_x_a m8
    974 %define filter_x_b m9
    975 %define filter_rnd m10
    976 %else    ; x86-32
    977 %if AOM_ARCH_X86=1 && CONFIG_PIC=1
    978 ; y_offset == 0.5. We can reuse y_offset reg.
    979 %define tempq y_offsetq
    980  add x_offsetq, g_bilin_filterm
    981 %define filter_x_a [x_offsetq]
    982 %define filter_x_b [x_offsetq+16]
    983  mov tempq, g_pw_8m
    984 %define filter_rnd [tempq]
    985 %else
    986  add           x_offsetq, bilin_filter
    987 %define filter_x_a [x_offsetq]
    988 %define filter_x_b [x_offsetq+16]
    989 %define filter_rnd [GLOBAL(pw_8)]
    990 %endif
    991 %endif
    992 
    993 %if %1 == 16
    994  movu                 m0, [srcq]
    995  movu                 m1, [srcq+1]
    996 %if cpuflag(ssse3)
    997  punpckhbw            m2, m0, m1
    998  punpcklbw            m0, m1
    999  pmaddubsw            m2, filter_x_a
   1000  pmaddubsw            m0, filter_x_a
   1001  paddw                m2, filter_rnd
   1002  paddw                m0, filter_rnd
   1003 %else
   1004  punpckhbw            m2, m0, m5
   1005  punpckhbw            m3, m1, m5
   1006  punpcklbw            m0, m5
   1007  punpcklbw            m1, m5
   1008  pmullw               m0, filter_x_a
   1009  pmullw               m1, filter_x_b
   1010  paddw                m0, filter_rnd
   1011  pmullw               m2, filter_x_a
   1012  pmullw               m3, filter_x_b
   1013  paddw                m2, filter_rnd
   1014  paddw                m0, m1
   1015  paddw                m2, m3
   1016 %endif
   1017  psraw                m0, 4
   1018  psraw                m2, 4
   1019  add                srcq, src_strideq
   1020  packuswb             m0, m2
   1021 .x_other_y_half_loop:
   1022  movu                 m4, [srcq]
   1023  movu                 m3, [srcq+1]
   1024 %if cpuflag(ssse3)
   1025  mova                 m1, [dstq]
   1026  punpckhbw            m2, m4, m3
   1027  punpcklbw            m4, m3
   1028  pmaddubsw            m2, filter_x_a
   1029  pmaddubsw            m4, filter_x_a
   1030  paddw                m2, filter_rnd
   1031  paddw                m4, filter_rnd
   1032  psraw                m2, 4
   1033  psraw                m4, 4
   1034  packuswb             m4, m2
   1035  pavgb                m0, m4
   1036  punpckhbw            m3, m1, m5
   1037  punpcklbw            m1, m5
   1038 %else
   1039  punpckhbw            m2, m4, m5
   1040  punpckhbw            m1, m3, m5
   1041  punpcklbw            m4, m5
   1042  punpcklbw            m3, m5
   1043  pmullw               m4, filter_x_a
   1044  pmullw               m3, filter_x_b
   1045  paddw                m4, filter_rnd
   1046  pmullw               m2, filter_x_a
   1047  pmullw               m1, filter_x_b
   1048  paddw                m2, filter_rnd
   1049  paddw                m4, m3
   1050  paddw                m2, m1
   1051  mova                 m1, [dstq]
   1052  psraw                m4, 4
   1053  psraw                m2, 4
   1054  punpckhbw            m3, m1, m5
   1055  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
   1056  ; have a 1-register shortage to be able to store the backup of the bilin
   1057  ; filtered second line as words as cache for the next line. Packing into
   1058  ; a byte costs 1 pack and 2 unpacks, but saves a register.
   1059  packuswb             m4, m2
   1060  punpcklbw            m1, m5
   1061  pavgb                m0, m4
   1062 %endif
   1063 %if %2 == 1 ; avg
   1064  ; FIXME(rbultje) pipeline
   1065  pavgb                m0, [secq]
   1066 %endif
   1067  punpckhbw            m2, m0, m5
   1068  punpcklbw            m0, m5
   1069  SUM_SSE              m0, m1, m2, m3, m6, m7
   1070  mova                 m0, m4
   1071 
   1072  add                srcq, src_strideq
   1073  add                dstq, dst_strideq
   1074 %else ; %1 < 16
   1075  movx                 m0, [srcq]
   1076  movx                 m1, [srcq+1]
   1077 %if cpuflag(ssse3)
   1078  punpcklbw            m0, m1
   1079  pmaddubsw            m0, filter_x_a
   1080  paddw                m0, filter_rnd
   1081 %else
   1082  punpcklbw            m0, m5
   1083  punpcklbw            m1, m5
   1084  pmullw               m0, filter_x_a
   1085  pmullw               m1, filter_x_b
   1086  paddw                m0, filter_rnd
   1087  paddw                m0, m1
   1088 %endif
   1089  add                srcq, src_strideq
   1090  psraw                m0, 4
   1091 .x_other_y_half_loop:
   1092  movx                 m2, [srcq]
   1093  movx                 m1, [srcq+1]
   1094  movx                 m4, [srcq+src_strideq]
   1095  movx                 m3, [srcq+src_strideq+1]
   1096 %if cpuflag(ssse3)
   1097  punpcklbw            m2, m1
   1098  punpcklbw            m4, m3
   1099  pmaddubsw            m2, filter_x_a
   1100  pmaddubsw            m4, filter_x_a
   1101  movx                 m1, [dstq]
   1102  movx                 m3, [dstq+dst_strideq]
   1103  paddw                m2, filter_rnd
   1104  paddw                m4, filter_rnd
   1105 %else
   1106  punpcklbw            m2, m5
   1107  punpcklbw            m1, m5
   1108  punpcklbw            m4, m5
   1109  punpcklbw            m3, m5
   1110  pmullw               m2, filter_x_a
   1111  pmullw               m1, filter_x_b
   1112  paddw                m2, filter_rnd
   1113  pmullw               m4, filter_x_a
   1114  pmullw               m3, filter_x_b
   1115  paddw                m4, filter_rnd
   1116  paddw                m2, m1
   1117  movx                 m1, [dstq]
   1118  paddw                m4, m3
   1119  movx                 m3, [dstq+dst_strideq]
   1120 %endif
   1121  psraw                m2, 4
   1122  psraw                m4, 4
   1123  pavgw                m0, m2
   1124  pavgw                m2, m4
   1125 %if %2 == 1 ; avg
   1126  ; FIXME(rbultje) pipeline - also consider going to bytes here
   1127 %if %1 == 4
   1128  movlhps              m0, m2
   1129 %endif
   1130  packuswb             m0, m2
   1131 %if %1 > 4
   1132  pavgb                m0, [secq]
   1133  punpckhbw            m2, m0, m5
   1134  punpcklbw            m0, m5
   1135 %else
   1136  movh                 m2, [secq]
   1137  pavgb                m0, m2
   1138  punpcklbw            m0, m5
   1139  movhlps              m2, m0
   1140 %endif
   1141 %endif
   1142  punpcklbw            m3, m5
   1143  punpcklbw            m1, m5
   1144  SUM_SSE              m0, m1, m2, m3, m6, m7
   1145  mova                 m0, m4
   1146 
   1147  lea                srcq, [srcq+src_strideq*2]
   1148  lea                dstq, [dstq+dst_strideq*2]
   1149 %endif
   1150 %if %2 == 1 ; avg
   1151  add                secq, sec_str
   1152 %endif
   1153  dec                   block_height
   1154  jg .x_other_y_half_loop
   1155 %undef filter_x_a
   1156 %undef filter_x_b
   1157 %undef filter_rnd
   1158  STORE_AND_RET %1
   1159 
   1160 .x_nonhalf_y_nonhalf:
   1161 %if AOM_ARCH_X86_64
   1162  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
   1163 %endif
   1164  shl           x_offsetd, filter_idx_shift
   1165  shl           y_offsetd, filter_idx_shift
   1166 %if AOM_ARCH_X86_64 && %1 > 4
   1167  mova                 m8, [bilin_filter+x_offsetq]
   1168 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   1169  mova                 m9, [bilin_filter+x_offsetq+16]
   1170 %endif
   1171  mova                m10, [bilin_filter+y_offsetq]
   1172 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   1173  mova                m11, [bilin_filter+y_offsetq+16]
   1174 %endif
   1175  mova                m12, [GLOBAL(pw_8)]
   1176 %define filter_x_a m8
   1177 %define filter_x_b m9
   1178 %define filter_y_a m10
   1179 %define filter_y_b m11
   1180 %define filter_rnd m12
   1181 %else   ; x86-32
   1182 %if AOM_ARCH_X86=1 && CONFIG_PIC=1
; In this case there is NO unused register left, so we reuse the src_stride
; register as a temporary; src_stride must then be reloaded from the stack
; whenever it is needed.
   1185 %define tempq src_strideq
   1186  mov tempq, g_bilin_filterm
   1187  add           x_offsetq, tempq
   1188  add           y_offsetq, tempq
   1189 %define filter_x_a [x_offsetq]
   1190 %define filter_x_b [x_offsetq+16]
   1191 %define filter_y_a [y_offsetq]
   1192 %define filter_y_b [y_offsetq+16]
   1193 
   1194  mov tempq, g_pw_8m
   1195 %define filter_rnd [tempq]
   1196 %else
   1197  add           x_offsetq, bilin_filter
   1198  add           y_offsetq, bilin_filter
   1199 %define filter_x_a [x_offsetq]
   1200 %define filter_x_b [x_offsetq+16]
   1201 %define filter_y_a [y_offsetq]
   1202 %define filter_y_b [y_offsetq+16]
   1203 %define filter_rnd [GLOBAL(pw_8)]
   1204 %endif
   1205 %endif
   1206 
   1207  ; x_offset == bilin interpolation && y_offset == bilin interpolation
   1208 %if %1 == 16
   1209  movu                 m0, [srcq]
   1210  movu                 m1, [srcq+1]
   1211 %if cpuflag(ssse3)
   1212  punpckhbw            m2, m0, m1
   1213  punpcklbw            m0, m1
   1214  pmaddubsw            m2, filter_x_a
   1215  pmaddubsw            m0, filter_x_a
   1216  paddw                m2, filter_rnd
   1217  paddw                m0, filter_rnd
   1218 %else
   1219  punpckhbw            m2, m0, m5
   1220  punpckhbw            m3, m1, m5
   1221  punpcklbw            m0, m5
   1222  punpcklbw            m1, m5
   1223  pmullw               m0, filter_x_a
   1224  pmullw               m1, filter_x_b
   1225  paddw                m0, filter_rnd
   1226  pmullw               m2, filter_x_a
   1227  pmullw               m3, filter_x_b
   1228  paddw                m2, filter_rnd
   1229  paddw                m0, m1
   1230  paddw                m2, m3
   1231 %endif
   1232  psraw                m0, 4
   1233  psraw                m2, 4
   1234 
   1235  INC_SRC_BY_SRC_STRIDE
   1236 
   1237  packuswb             m0, m2
   1238 .x_other_y_other_loop:
   1239 %if cpuflag(ssse3)
   1240  movu                 m4, [srcq]
   1241  movu                 m3, [srcq+1]
   1242  mova                 m1, [dstq]
   1243  punpckhbw            m2, m4, m3
   1244  punpcklbw            m4, m3
   1245  pmaddubsw            m2, filter_x_a
   1246  pmaddubsw            m4, filter_x_a
   1247  punpckhbw            m3, m1, m5
   1248  paddw                m2, filter_rnd
   1249  paddw                m4, filter_rnd
   1250  psraw                m2, 4
   1251  psraw                m4, 4
   1252  packuswb             m4, m2
   1253  punpckhbw            m2, m0, m4
   1254  punpcklbw            m0, m4
   1255  pmaddubsw            m2, filter_y_a
   1256  pmaddubsw            m0, filter_y_a
   1257  punpcklbw            m1, m5
   1258  paddw                m2, filter_rnd
   1259  paddw                m0, filter_rnd
   1260  psraw                m2, 4
   1261  psraw                m0, 4
   1262 %else
   1263  movu                 m3, [srcq]
   1264  movu                 m4, [srcq+1]
   1265  punpckhbw            m1, m3, m5
   1266  punpckhbw            m2, m4, m5
   1267  punpcklbw            m3, m5
   1268  punpcklbw            m4, m5
   1269  pmullw               m3, filter_x_a
   1270  pmullw               m4, filter_x_b
   1271  paddw                m3, filter_rnd
   1272  pmullw               m1, filter_x_a
   1273  pmullw               m2, filter_x_b
   1274  paddw                m1, filter_rnd
   1275  paddw                m3, m4
   1276  paddw                m1, m2
   1277  psraw                m3, 4
   1278  psraw                m1, 4
   1279  packuswb             m4, m3, m1
   1280  punpckhbw            m2, m0, m5
   1281  punpcklbw            m0, m5
   1282  pmullw               m2, filter_y_a
   1283  pmullw               m1, filter_y_b
   1284  paddw                m2, filter_rnd
   1285  pmullw               m0, filter_y_a
   1286  pmullw               m3, filter_y_b
   1287  paddw                m2, m1
   1288  mova                 m1, [dstq]
   1289  paddw                m0, filter_rnd
   1290  psraw                m2, 4
   1291  paddw                m0, m3
   1292  punpckhbw            m3, m1, m5
   1293  psraw                m0, 4
   1294  punpcklbw            m1, m5
   1295 %endif
   1296 %if %2 == 1 ; avg
   1297  ; FIXME(rbultje) pipeline
   1298  packuswb             m0, m2
   1299  pavgb                m0, [secq]
   1300  punpckhbw            m2, m0, m5
   1301  punpcklbw            m0, m5
   1302 %endif
   1303  SUM_SSE              m0, m1, m2, m3, m6, m7
   1304  mova                 m0, m4
   1305 
   1306  INC_SRC_BY_SRC_STRIDE
   1307  add                dstq, dst_strideq
   1308 %else ; %1 < 16
   1309  movx                 m0, [srcq]
   1310  movx                 m1, [srcq+1]
   1311 %if cpuflag(ssse3)
   1312  punpcklbw            m0, m1
   1313  pmaddubsw            m0, filter_x_a
   1314  paddw                m0, filter_rnd
   1315 %else
   1316  punpcklbw            m0, m5
   1317  punpcklbw            m1, m5
   1318  pmullw               m0, filter_x_a
   1319  pmullw               m1, filter_x_b
   1320  paddw                m0, filter_rnd
   1321  paddw                m0, m1
   1322 %endif
   1323  psraw                m0, 4
   1324 %if cpuflag(ssse3)
   1325  packuswb             m0, m0
   1326 %endif
   1327 
   1328  INC_SRC_BY_SRC_STRIDE
   1329 
   1330 .x_other_y_other_loop:
   1331  movx                 m2, [srcq]
   1332  movx                 m1, [srcq+1]
   1333 
   1334  INC_SRC_BY_SRC_STRIDE
   1335  movx                 m4, [srcq]
   1336  movx                 m3, [srcq+1]
   1337 
   1338 %if cpuflag(ssse3)
   1339  punpcklbw            m2, m1
   1340  punpcklbw            m4, m3
   1341  pmaddubsw            m2, filter_x_a
   1342  pmaddubsw            m4, filter_x_a
   1343  movx                 m3, [dstq+dst_strideq]
   1344  movx                 m1, [dstq]
   1345  paddw                m2, filter_rnd
   1346  paddw                m4, filter_rnd
   1347  psraw                m2, 4
   1348  psraw                m4, 4
   1349  packuswb             m2, m2
   1350  packuswb             m4, m4
   1351  punpcklbw            m0, m2
   1352  punpcklbw            m2, m4
   1353  pmaddubsw            m0, filter_y_a
   1354  pmaddubsw            m2, filter_y_a
   1355  punpcklbw            m3, m5
   1356  paddw                m0, filter_rnd
   1357  paddw                m2, filter_rnd
   1358  psraw                m0, 4
   1359  psraw                m2, 4
   1360  punpcklbw            m1, m5
   1361 %else
   1362  punpcklbw            m2, m5
   1363  punpcklbw            m1, m5
   1364  punpcklbw            m4, m5
   1365  punpcklbw            m3, m5
   1366  pmullw               m2, filter_x_a
   1367  pmullw               m1, filter_x_b
   1368  paddw                m2, filter_rnd
   1369  pmullw               m4, filter_x_a
   1370  pmullw               m3, filter_x_b
   1371  paddw                m4, filter_rnd
   1372  paddw                m2, m1
   1373  paddw                m4, m3
   1374  psraw                m2, 4
   1375  psraw                m4, 4
   1376  pmullw               m0, filter_y_a
   1377  pmullw               m3, m2, filter_y_b
   1378  paddw                m0, filter_rnd
   1379  pmullw               m2, filter_y_a
   1380  pmullw               m1, m4, filter_y_b
   1381  paddw                m2, filter_rnd
   1382  paddw                m0, m3
   1383  movx                 m3, [dstq+dst_strideq]
   1384  paddw                m2, m1
   1385  movx                 m1, [dstq]
   1386  psraw                m0, 4
   1387  psraw                m2, 4
   1388  punpcklbw            m3, m5
   1389  punpcklbw            m1, m5
   1390 %endif
   1391 %if %2 == 1 ; avg
   1392  ; FIXME(rbultje) pipeline
   1393 %if %1 == 4
   1394  movlhps              m0, m2
   1395 %endif
   1396  packuswb             m0, m2
   1397 %if %1 > 4
   1398  pavgb                m0, [secq]
   1399  punpckhbw            m2, m0, m5
   1400  punpcklbw            m0, m5
   1401 %else
   1402  movh                 m2, [secq]
   1403  pavgb                m0, m2
   1404  punpcklbw            m0, m5
   1405  movhlps              m2, m0
   1406 %endif
   1407 %endif
   1408  SUM_SSE              m0, m1, m2, m3, m6, m7
   1409  mova                 m0, m4
   1410 
   1411  INC_SRC_BY_SRC_STRIDE
   1412  lea                dstq, [dstq+dst_strideq*2]
   1413 %endif
   1414 %if %2 == 1 ; avg
   1415  add                secq, sec_str
   1416 %endif
   1417  dec                   block_height
   1418  jg .x_other_y_other_loop
   1419 %undef filter_x_a
   1420 %undef filter_x_b
   1421 %undef filter_y_a
   1422 %undef filter_y_b
   1423 %undef filter_rnd
   1424 %undef movx
   1425  STORE_AND_RET %1
   1426 %endmacro
   1427 
   1428 ; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
   1429 ; between the ssse3 and non-ssse3 version. It may make sense to merge their
   1430 ; code in the sense that the ssse3 version would jump to the appropriate
   1431 ; location in the sse/2 version, rather than duplicating that code in the
   1432 ; binary.
   1433 
; Instantiate the SSSE3 sub-pixel variance functions for block widths 4, 8
; and 16 (height is a runtime argument; see the prototype comment at the top
; of the file). With a single macro argument the non-averaging path is taken
; (the %2 == 1 branches are not assembled; %2 presumably defaults to 0 in the
; %macro declaration above this window — confirm there).
INIT_XMM ssse3
SUBPEL_VARIANCE  4
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

; Instantiate the compound ("avg") variants: the second argument == 1
; assembles the %2 == 1 branches, which pavgb the bilinearly filtered source
; with the second prediction read from [secq] (advancing by sec_str per row)
; before the sum/SSE accumulation.
INIT_XMM ssse3
SUBPEL_VARIANCE  4, 1
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1