tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_subpel_variance_impl_sse2.asm (31378B)


      1 ;
      2 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 ;
      4 ; This source code is subject to the terms of the BSD 2 Clause License and
      5 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 ; was not distributed with this source code in the LICENSE file, you can
      7 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 ; Media Patent License 1.0 was not distributed with this source code in the
      9 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 ;
     11 
     12 ;
     13 
     14 %include "third_party/x86inc/x86inc.asm"
     15 
     16 SECTION_RODATA
     17 pw_8: times  8 dw  8                ; rounding constant for the >>4 bilinear shift
     18 bilin_filter_m_sse2: times  8 dw 16 ; coefficient pairs (16-x, x), 32 bytes apart,
     19                     times  8 dw  0  ; for x = 0,2,...,14; indexed by
     20                     times  8 dw 14  ; offset << filter_idx_shift (5)
     21                     times  8 dw  2
     22                     times  8 dw 12
     23                     times  8 dw  4
     24                     times  8 dw 10
     25                     times  8 dw  6
     26                     times 16 dw  8  ; x = 8: both taps equal (the half-pel case)
     27                     times  8 dw  6
     28                     times  8 dw 10
     29                     times  8 dw  4
     30                     times  8 dw 12
     31                     times  8 dw  2
     32                     times  8 dw 14
     33 
     34 SECTION .text
     35 
     36 ; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
     37 ;                               int x_offset, int y_offset,
     38 ;                               const uint8_t *dst, ptrdiff_t dst_stride,
     39 ;                               int height, unsigned int *sse);
     40 ;
     41 ; This function returns the sum of errors (SE) and stores the sum of
     41 ; squared errors (SSE) in the given pointer.
     42 
     43 %macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
     44  psubw                %3, %4       ; %3 = per-word diff2 = src2 - dst2
     45  psubw                %1, %2       ; %1 = per-word diff1 = src1 - dst1
     46  mova                 %4, %3       ; make copies to manipulate to calc sum
     47  mova                 %2, %1       ; use originals for calc sse
     48  pmaddwd              %3, %3       ; dwords: adjacent-pair sums of diff2^2
     49  paddw                %4, %2       ; word-wise partial sums of diff1+diff2
     50  pmaddwd              %1, %1       ; dwords: adjacent-pair sums of diff1^2
     51  movhlps              %2, %4       ; high 4 words of the partial sums
     52  paddd                %6, %3       ; sse += diff2 squares
     53  paddw                %4, %2       ; fold into 4 words of partial sums
     54  pxor                 %2, %2
     55  pcmpgtw              %2, %4       ; mask for 0 > %4 (sum)
     56  punpcklwd            %4, %2       ; sign-extend word to dword
     57  paddd                %6, %1       ; sse += diff1 squares
     58  paddd                %5, %4       ; sum += sign-extended partial sums
     59 
     60 %endmacro
     61 
     62 %macro STORE_AND_RET 0
     63 %if mmsize == 16
     64  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
     65  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
     66  ; We have to sign-extend it before adding the words within the register
     67  ; and outputting to a dword.
     68  movhlps              m3, m7           ; grab high halves of sse (m7) / sum (m6)
     69  movhlps              m4, m6
     70  paddd                m7, m3           ; fold 4 dwords down to 2
     71  paddd                m6, m4
     72  pshufd               m3, m7, 0x1      ; bring dword lane 1 down to lane 0
     73  pshufd               m4, m6, 0x1
     74  paddd                m7, m3           ; fold 2 dwords down to 1
     75  paddd                m6, m4
     76  mov                  r1, ssem         ; r1 = unsigned int *sse
     77  movd               [r1], m7           ; store sse
     78  movd                eax, m6           ; store sum as return value
     79 %endif
     80  RET
     81 %endmacro
     82 
     83 %macro INC_SRC_BY_SRC_STRIDE  0 ; advance srcq by one row of 16-bit pixels
     84 %if AOM_ARCH_X86=1 && CONFIG_PIC=1
     85  add                srcq, src_stridemp   ; x86-32 PIC: stride is a stack operand,
     86  add                srcq, src_stridemp   ; so add it twice for the *2 byte scale
     87 %else
     88  lea                srcq, [srcq + src_strideq*2]  ; *2: stride counts pixels, 2 bytes each
     89 %endif
     90 %endmacro
     91 
     92 %macro SUBPEL_VARIANCE 1-2 0 ; W
     93 %define bilin_filter_m bilin_filter_m_sse2
     94 %define filter_idx_shift 5
     95 
     96 
     97 %if AOM_ARCH_X86_64
     98  %if %2 == 1 ; avg
     99    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
    100                                      x_offset, y_offset, \
    101                                      dst, dst_stride, \
    102                                      sec, sec_stride, height, sse
    103    %define sec_str sec_strideq
    104  %else
    105    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
    106                                  x_offset, y_offset, \
    107                                  dst, dst_stride, height, sse
    108  %endif
    109  %define block_height heightd
    110  %define bilin_filter sseq
    111 %else
    112  %if CONFIG_PIC=1
    113    %if %2 == 1 ; avg
    114      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
    115                                        x_offset, y_offset, \
    116                                        dst, dst_stride, \
    117                                        sec, sec_stride, height, sse
    118      %define block_height dword heightm
    119      %define sec_str sec_stridemp
    120    %else
    121      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
    122                                    x_offset, y_offset, \
    123                                    dst, dst_stride, height, sse
    124      %define block_height heightd
    125    %endif
    126 
    127    ; reuse argument stack space
    128    %define g_bilin_filterm x_offsetm
    129    %define g_pw_8m y_offsetm
    130 
    131    ; Store bilin_filter and pw_8 location in stack
    132    %if GET_GOT_DEFINED == 1
    133      GET_GOT eax
    134      add esp, 4                ; restore esp
    135    %endif
    136 
    137    lea ecx, [GLOBAL(bilin_filter_m)]
    138    mov g_bilin_filterm, ecx
    139 
    140    lea ecx, [GLOBAL(pw_8)]
    141    mov g_pw_8m, ecx
    142 
    143    LOAD_IF_USED 0, 1         ; load eax, ecx back
    144  %else
    145    %if %2 == 1 ; avg
    146      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
    147                                        x_offset, y_offset, \
    148                                        dst, dst_stride, \
    149                                        sec, sec_stride, height, sse
    150      %define block_height dword heightm
    151      %define sec_str sec_stridemp
    152    %else
    153      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
    154                                    x_offset, y_offset, \
    155                                    dst, dst_stride, height, sse
    156      %define block_height heightd
    157    %endif
    158 
    159    %define bilin_filter bilin_filter_m
    160  %endif
    161 %endif
    162 
    163  ASSERT               %1 <= 16         ; m6 overflows if w > 16
    164  pxor                 m6, m6           ; sum
    165  pxor                 m7, m7           ; sse
    166 
    167 %if %1 < 16
    168  sar                   block_height, 1
    169 %endif
    170 %if %2 == 1 ; avg
    171  shl             sec_str, 1
    172 %endif
    173 
    174  ; FIXME(rbultje) replace by jumptable?
    175  test          x_offsetd, x_offsetd
    176  jnz .x_nonzero
    177  ; x_offset == 0
    178  test          y_offsetd, y_offsetd
    179  jnz .x_zero_y_nonzero
    180 
    181  ; x_offset == 0 && y_offset == 0
    182 .x_zero_y_zero_loop:
    183 %if %1 == 16
    184  movu                 m0, [srcq]
    185  movu                 m2, [srcq + 16]
    186  mova                 m1, [dstq]
    187  mova                 m3, [dstq + 16]
    188 %if %2 == 1 ; avg
    189  pavgw                m0, [secq]
    190  pavgw                m2, [secq+16]
    191 %endif
    192  SUM_SSE              m0, m1, m2, m3, m6, m7
    193 
    194  lea                srcq, [srcq + src_strideq*2]
    195  lea                dstq, [dstq + dst_strideq*2]
    196 %if %2 == 1 ; avg
    197  add                secq, sec_str
    198 %endif
    199 %else ; %1 < 16
    200  movu                 m0, [srcq]
    201  movu                 m2, [srcq + src_strideq*2]
    202  mova                 m1, [dstq]
    203  mova                 m3, [dstq + dst_strideq*2]
    204 %if %2 == 1 ; avg
    205  pavgw                m0, [secq]
    206  add                secq, sec_str
    207  pavgw                m2, [secq]
    208 %endif
    209  SUM_SSE              m0, m1, m2, m3, m6, m7
    210 
    211  lea                srcq, [srcq + src_strideq*4]
    212  lea                dstq, [dstq + dst_strideq*4]
    213 %if %2 == 1 ; avg
    214  add                secq, sec_str
    215 %endif
    216 %endif
    217  dec                   block_height
    218  jg .x_zero_y_zero_loop
    219  STORE_AND_RET
    220 
    221 .x_zero_y_nonzero:
    222  cmp           y_offsetd, 8
    223  jne .x_zero_y_nonhalf
    224 
    225  ; x_offset == 0 && y_offset == 0.5
    226 .x_zero_y_half_loop:
    227 %if %1 == 16
    228  movu                 m0, [srcq]
    229  movu                 m1, [srcq+16]
    230  movu                 m4, [srcq+src_strideq*2]
    231  movu                 m5, [srcq+src_strideq*2+16]
    232  mova                 m2, [dstq]
    233  mova                 m3, [dstq+16]
    234  pavgw                m0, m4
    235  pavgw                m1, m5
    236 %if %2 == 1 ; avg
    237  pavgw                m0, [secq]
    238  pavgw                m1, [secq+16]
    239 %endif
    240  SUM_SSE              m0, m2, m1, m3, m6, m7
    241 
    242  lea                srcq, [srcq + src_strideq*2]
    243  lea                dstq, [dstq + dst_strideq*2]
    244 %if %2 == 1 ; avg
    245  add                secq, sec_str
    246 %endif
    247 %else ; %1 < 16
    248  movu                 m0, [srcq]
    249  movu                 m1, [srcq+src_strideq*2]
    250  movu                 m5, [srcq+src_strideq*4]
    251  mova                 m2, [dstq]
    252  mova                 m3, [dstq+dst_strideq*2]
    253  pavgw                m0, m1
    254  pavgw                m1, m5
    255 %if %2 == 1 ; avg
    256  pavgw                m0, [secq]
    257  add                secq, sec_str
    258  pavgw                m1, [secq]
    259 %endif
    260  SUM_SSE              m0, m2, m1, m3, m6, m7
    261 
    262  lea                srcq, [srcq + src_strideq*4]
    263  lea                dstq, [dstq + dst_strideq*4]
    264 %if %2 == 1 ; avg
    265  add                secq, sec_str
    266 %endif
    267 %endif
    268  dec                   block_height
    269  jg .x_zero_y_half_loop
    270  STORE_AND_RET
    271 
    272 .x_zero_y_nonhalf:
    273  ; x_offset == 0 && y_offset == bilin interpolation
    274 %if AOM_ARCH_X86_64
    275  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
    276 %endif
    277  shl           y_offsetd, filter_idx_shift
    278 %if AOM_ARCH_X86_64 && mmsize == 16
    279  mova                 m8, [bilin_filter+y_offsetq]
    280  mova                 m9, [bilin_filter+y_offsetq+16]
    281  mova                m10, [GLOBAL(pw_8)]
    282 %define filter_y_a m8
    283 %define filter_y_b m9
    284 %define filter_rnd m10
    285 %else ; x86-32 or mmx
    286 %if AOM_ARCH_X86=1 && CONFIG_PIC=1
    287 ; x_offset == 0, reuse x_offset reg
    288 %define tempq x_offsetq
    289  add y_offsetq, g_bilin_filterm
    290 %define filter_y_a [y_offsetq]
    291 %define filter_y_b [y_offsetq+16]
    292  mov tempq, g_pw_8m
    293 %define filter_rnd [tempq]
    294 %else
    295  add           y_offsetq, bilin_filter
    296 %define filter_y_a [y_offsetq]
    297 %define filter_y_b [y_offsetq+16]
    298 %define filter_rnd [GLOBAL(pw_8)]
    299 %endif
    300 %endif
    301 
    302 .x_zero_y_other_loop:
    303 %if %1 == 16
    304  movu                 m0, [srcq]
    305  movu                 m1, [srcq + 16]
    306  movu                 m4, [srcq+src_strideq*2]
    307  movu                 m5, [srcq+src_strideq*2+16]
    308  mova                 m2, [dstq]
    309  mova                 m3, [dstq+16]
    310  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
    311  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
    312  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
    313  ; slightly faster because of pmullw latency. It would also cut our rodata
    314  ; tables in half for this function, and save 1-2 registers on x86-64.
    315  pmullw               m1, filter_y_a
    316  pmullw               m5, filter_y_b
    317  paddw                m1, filter_rnd
    318  pmullw               m0, filter_y_a
    319  pmullw               m4, filter_y_b
    320  paddw                m0, filter_rnd
    321  paddw                m1, m5
    322  paddw                m0, m4
    323  psrlw                m1, 4
    324  psrlw                m0, 4
    325 %if %2 == 1 ; avg
    326  pavgw                m0, [secq]
    327  pavgw                m1, [secq+16]
    328 %endif
    329  SUM_SSE              m0, m2, m1, m3, m6, m7
    330 
    331  lea                srcq, [srcq + src_strideq*2]
    332  lea                dstq, [dstq + dst_strideq*2]
    333 %if %2 == 1 ; avg
    334  add                secq, sec_str
    335 %endif
    336 %else ; %1 < 16
    337  movu                 m0, [srcq]
    338  movu                 m1, [srcq+src_strideq*2]
    339  movu                 m5, [srcq+src_strideq*4]
    340  mova                 m4, m1
    341  mova                 m2, [dstq]
    342  mova                 m3, [dstq+dst_strideq*2]
    343  pmullw               m1, filter_y_a
    344  pmullw               m5, filter_y_b
    345  paddw                m1, filter_rnd
    346  pmullw               m0, filter_y_a
    347  pmullw               m4, filter_y_b
    348  paddw                m0, filter_rnd
    349  paddw                m1, m5
    350  paddw                m0, m4
    351  psrlw                m1, 4
    352  psrlw                m0, 4
    353 %if %2 == 1 ; avg
    354  pavgw                m0, [secq]
    355  add                secq, sec_str
    356  pavgw                m1, [secq]
    357 %endif
    358  SUM_SSE              m0, m2, m1, m3, m6, m7
    359 
    360  lea                srcq, [srcq + src_strideq*4]
    361  lea                dstq, [dstq + dst_strideq*4]
    362 %if %2 == 1 ; avg
    363  add                secq, sec_str
    364 %endif
    365 %endif
    366  dec                   block_height
    367  jg .x_zero_y_other_loop
    368 %undef filter_y_a
    369 %undef filter_y_b
    370 %undef filter_rnd
    371  STORE_AND_RET
    372 
    373 .x_nonzero:
    374  cmp           x_offsetd, 8
    375  jne .x_nonhalf
    376  ; x_offset == 0.5
    377  test          y_offsetd, y_offsetd
    378  jnz .x_half_y_nonzero
    379 
    380  ; x_offset == 0.5 && y_offset == 0
    381 .x_half_y_zero_loop:
    382 %if %1 == 16
    383  movu                 m0, [srcq]
    384  movu                 m1, [srcq + 16]
    385  movu                 m4, [srcq + 2]
    386  movu                 m5, [srcq + 18]
    387  mova                 m2, [dstq]
    388  mova                 m3, [dstq + 16]
    389  pavgw                m0, m4
    390  pavgw                m1, m5
    391 %if %2 == 1 ; avg
    392  pavgw                m0, [secq]
    393  pavgw                m1, [secq+16]
    394 %endif
    395  SUM_SSE              m0, m2, m1, m3, m6, m7
    396 
    397  lea                srcq, [srcq + src_strideq*2]
    398  lea                dstq, [dstq + dst_strideq*2]
    399 %if %2 == 1 ; avg
    400  add                secq, sec_str
    401 %endif
    402 %else ; %1 < 16
    403  movu                 m0, [srcq]
    404  movu                 m1, [srcq + src_strideq*2]
    405  movu                 m4, [srcq + 2]
    406  movu                 m5, [srcq + src_strideq*2 + 2]
    407  mova                 m2, [dstq]
    408  mova                 m3, [dstq + dst_strideq*2]
    409  pavgw                m0, m4
    410  pavgw                m1, m5
    411 %if %2 == 1 ; avg
    412  pavgw                m0, [secq]
    413  add                secq, sec_str
    414  pavgw                m1, [secq]
    415 %endif
    416  SUM_SSE              m0, m2, m1, m3, m6, m7
    417 
    418  lea                srcq, [srcq + src_strideq*4]
    419  lea                dstq, [dstq + dst_strideq*4]
    420 %if %2 == 1 ; avg
    421  add                secq, sec_str
    422 %endif
    423 %endif
    424  dec                   block_height
    425  jg .x_half_y_zero_loop
    426  STORE_AND_RET
    427 
    428 .x_half_y_nonzero:
    429  cmp           y_offsetd, 8
    430  jne .x_half_y_nonhalf
    431 
    432  ; x_offset == 0.5 && y_offset == 0.5
    433 %if %1 == 16
    434  movu                 m0, [srcq]
    435  movu                 m1, [srcq+16]
    436  movu                 m2, [srcq+2]
    437  movu                 m3, [srcq+18]
    438  lea                srcq, [srcq + src_strideq*2]
    439  pavgw                m0, m2
    440  pavgw                m1, m3
    441 .x_half_y_half_loop:
    442  movu                 m2, [srcq]
    443  movu                 m3, [srcq + 16]
    444  movu                 m4, [srcq + 2]
    445  movu                 m5, [srcq + 18]
    446  pavgw                m2, m4
    447  pavgw                m3, m5
    448  pavgw                m0, m2
    449  pavgw                m1, m3
    450  mova                 m4, [dstq]
    451  mova                 m5, [dstq + 16]
    452 %if %2 == 1 ; avg
    453  pavgw                m0, [secq]
    454  pavgw                m1, [secq+16]
    455 %endif
    456  SUM_SSE              m0, m4, m1, m5, m6, m7
    457  mova                 m0, m2
    458  mova                 m1, m3
    459 
    460  lea                srcq, [srcq + src_strideq*2]
    461  lea                dstq, [dstq + dst_strideq*2]
    462 %if %2 == 1 ; avg
    463  add                secq, sec_str
    464 %endif
    465 %else ; %1 < 16
    466  movu                 m0, [srcq]
    467  movu                 m2, [srcq+2]
    468  lea                srcq, [srcq + src_strideq*2]
    469  pavgw                m0, m2
    470 .x_half_y_half_loop:
    471  movu                 m2, [srcq]
    472  movu                 m3, [srcq + src_strideq*2]
    473  movu                 m4, [srcq + 2]
    474  movu                 m5, [srcq + src_strideq*2 + 2]
    475  pavgw                m2, m4
    476  pavgw                m3, m5
    477  pavgw                m0, m2
    478  pavgw                m2, m3
    479  mova                 m4, [dstq]
    480  mova                 m5, [dstq + dst_strideq*2]
    481 %if %2 == 1 ; avg
    482  pavgw                m0, [secq]
    483  add                secq, sec_str
    484  pavgw                m2, [secq]
    485 %endif
    486  SUM_SSE              m0, m4, m2, m5, m6, m7
    487  mova                 m0, m3
    488 
    489  lea                srcq, [srcq + src_strideq*4]
    490  lea                dstq, [dstq + dst_strideq*4]
    491 %if %2 == 1 ; avg
    492  add                secq, sec_str
    493 %endif
    494 %endif
    495  dec                   block_height
    496  jg .x_half_y_half_loop
    497  STORE_AND_RET
    498 
    499 .x_half_y_nonhalf:
    500  ; x_offset == 0.5 && y_offset == bilin interpolation
    501 %if AOM_ARCH_X86_64
    502  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
    503 %endif
    504  shl           y_offsetd, filter_idx_shift
    505 %if AOM_ARCH_X86_64 && mmsize == 16
    506  mova                 m8, [bilin_filter+y_offsetq]
    507  mova                 m9, [bilin_filter+y_offsetq+16]
    508  mova                m10, [GLOBAL(pw_8)]
    509 %define filter_y_a m8
    510 %define filter_y_b m9
    511 %define filter_rnd m10
    512 %else  ; x86_32
    513 %if AOM_ARCH_X86=1 && CONFIG_PIC=1
    514 ; x_offset == 0.5. We can reuse x_offset reg
    515 %define tempq x_offsetq
    516  add y_offsetq, g_bilin_filterm
    517 %define filter_y_a [y_offsetq]
    518 %define filter_y_b [y_offsetq+16]
    519  mov tempq, g_pw_8m
    520 %define filter_rnd [tempq]
    521 %else
    522  add           y_offsetq, bilin_filter
    523 %define filter_y_a [y_offsetq]
    524 %define filter_y_b [y_offsetq+16]
    525 %define filter_rnd [GLOBAL(pw_8)]
    526 %endif
    527 %endif
    528 
    529 %if %1 == 16
    530  movu                 m0, [srcq]
    531  movu                 m1, [srcq+16]
    532  movu                 m2, [srcq+2]
    533  movu                 m3, [srcq+18]
    534  lea                srcq, [srcq + src_strideq*2]
    535  pavgw                m0, m2
    536  pavgw                m1, m3
    537 .x_half_y_other_loop:
    538  movu                 m2, [srcq]
    539  movu                 m3, [srcq+16]
    540  movu                 m4, [srcq+2]
    541  movu                 m5, [srcq+18]
    542  pavgw                m2, m4
    543  pavgw                m3, m5
    544  mova                 m4, m2
    545  mova                 m5, m3
    546  pmullw               m1, filter_y_a
    547  pmullw               m3, filter_y_b
    548  paddw                m1, filter_rnd
    549  paddw                m1, m3
    550  pmullw               m0, filter_y_a
    551  pmullw               m2, filter_y_b
    552  paddw                m0, filter_rnd
    553  psrlw                m1, 4
    554  paddw                m0, m2
    555  mova                 m2, [dstq]
    556  psrlw                m0, 4
    557  mova                 m3, [dstq+16]
    558 %if %2 == 1 ; avg
    559  pavgw                m0, [secq]
    560  pavgw                m1, [secq+16]
    561 %endif
    562  SUM_SSE              m0, m2, m1, m3, m6, m7
    563  mova                 m0, m4
    564  mova                 m1, m5
    565 
    566  lea                srcq, [srcq + src_strideq*2]
    567  lea                dstq, [dstq + dst_strideq*2]
    568 %if %2 == 1 ; avg
    569  add                secq, sec_str
    570 %endif
    571 %else ; %1 < 16
    572  movu                 m0, [srcq]
    573  movu                 m2, [srcq+2]
    574  lea                srcq, [srcq + src_strideq*2]
    575  pavgw                m0, m2
    576 .x_half_y_other_loop:
    577  movu                 m2, [srcq]
    578  movu                 m3, [srcq+src_strideq*2]
    579  movu                 m4, [srcq+2]
    580  movu                 m5, [srcq+src_strideq*2+2]
    581  pavgw                m2, m4
    582  pavgw                m3, m5
    583  mova                 m4, m2
    584  mova                 m5, m3
    585  pmullw               m4, filter_y_a
    586  pmullw               m3, filter_y_b
    587  paddw                m4, filter_rnd
    588  paddw                m4, m3
    589  pmullw               m0, filter_y_a
    590  pmullw               m2, filter_y_b
    591  paddw                m0, filter_rnd
    592  psrlw                m4, 4
    593  paddw                m0, m2
    594  mova                 m2, [dstq]
    595  psrlw                m0, 4
    596  mova                 m3, [dstq+dst_strideq*2]
    597 %if %2 == 1 ; avg
    598  pavgw                m0, [secq]
    599  add                secq, sec_str
    600  pavgw                m4, [secq]
    601 %endif
    602  SUM_SSE              m0, m2, m4, m3, m6, m7
    603  mova                 m0, m5
    604 
    605  lea                srcq, [srcq + src_strideq*4]
    606  lea                dstq, [dstq + dst_strideq*4]
    607 %if %2 == 1 ; avg
    608  add                secq, sec_str
    609 %endif
    610 %endif
    611  dec                   block_height
    612  jg .x_half_y_other_loop
    613 %undef filter_y_a
    614 %undef filter_y_b
    615 %undef filter_rnd
    616  STORE_AND_RET
    617 
    618 .x_nonhalf:
    619  test          y_offsetd, y_offsetd
    620  jnz .x_nonhalf_y_nonzero
    621 
    622  ; x_offset == bilin interpolation && y_offset == 0
    623 %if AOM_ARCH_X86_64
    624  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
    625 %endif
    626  shl           x_offsetd, filter_idx_shift
    627 %if AOM_ARCH_X86_64 && mmsize == 16
    628  mova                 m8, [bilin_filter+x_offsetq]
    629  mova                 m9, [bilin_filter+x_offsetq+16]
    630  mova                m10, [GLOBAL(pw_8)]
    631 %define filter_x_a m8
    632 %define filter_x_b m9
    633 %define filter_rnd m10
    634 %else    ; x86-32
    635 %if AOM_ARCH_X86=1 && CONFIG_PIC=1
    636 ; y_offset == 0. We can reuse y_offset reg.
    637 %define tempq y_offsetq
    638  add x_offsetq, g_bilin_filterm
    639 %define filter_x_a [x_offsetq]
    640 %define filter_x_b [x_offsetq+16]
    641  mov tempq, g_pw_8m
    642 %define filter_rnd [tempq]
    643 %else
    644  add           x_offsetq, bilin_filter
    645 %define filter_x_a [x_offsetq]
    646 %define filter_x_b [x_offsetq+16]
    647 %define filter_rnd [GLOBAL(pw_8)]
    648 %endif
    649 %endif
    650 
    651 .x_other_y_zero_loop:
    652 %if %1 == 16
    653  movu                 m0, [srcq]
    654  movu                 m1, [srcq+16]
    655  movu                 m2, [srcq+2]
    656  movu                 m3, [srcq+18]
    657  mova                 m4, [dstq]
    658  mova                 m5, [dstq+16]
    659  pmullw               m1, filter_x_a
    660  pmullw               m3, filter_x_b
    661  paddw                m1, filter_rnd
    662  pmullw               m0, filter_x_a
    663  pmullw               m2, filter_x_b
    664  paddw                m0, filter_rnd
    665  paddw                m1, m3
    666  paddw                m0, m2
    667  psrlw                m1, 4
    668  psrlw                m0, 4
    669 %if %2 == 1 ; avg
    670  pavgw                m0, [secq]
    671  pavgw                m1, [secq+16]
    672 %endif
    673  SUM_SSE              m0, m4, m1, m5, m6, m7
    674 
    675  lea                srcq, [srcq+src_strideq*2]
    676  lea                dstq, [dstq+dst_strideq*2]
    677 %if %2 == 1 ; avg
    678  add                secq, sec_str
    679 %endif
    680 %else ; %1 < 16
    681  movu                 m0, [srcq]
    682  movu                 m1, [srcq+src_strideq*2]
    683  movu                 m2, [srcq+2]
    684  movu                 m3, [srcq+src_strideq*2+2]
    685  mova                 m4, [dstq]
    686  mova                 m5, [dstq+dst_strideq*2]
    687  pmullw               m1, filter_x_a
    688  pmullw               m3, filter_x_b
    689  paddw                m1, filter_rnd
    690  pmullw               m0, filter_x_a
    691  pmullw               m2, filter_x_b
    692  paddw                m0, filter_rnd
    693  paddw                m1, m3
    694  paddw                m0, m2
    695  psrlw                m1, 4
    696  psrlw                m0, 4
    697 %if %2 == 1 ; avg
    698  pavgw                m0, [secq]
    699  add                secq, sec_str
    700  pavgw                m1, [secq]
    701 %endif
    702  SUM_SSE              m0, m4, m1, m5, m6, m7
    703 
    704  lea                srcq, [srcq+src_strideq*4]
    705  lea                dstq, [dstq+dst_strideq*4]
    706 %if %2 == 1 ; avg
    707  add                secq, sec_str
    708 %endif
    709 %endif
    710  dec                   block_height
    711  jg .x_other_y_zero_loop
    712 %undef filter_x_a
    713 %undef filter_x_b
    714 %undef filter_rnd
    715  STORE_AND_RET
    716 
    717 .x_nonhalf_y_nonzero:
    718  cmp           y_offsetd, 8
    719  jne .x_nonhalf_y_nonhalf
    720 
    721  ; x_offset == bilin interpolation && y_offset == 0.5
    722 %if AOM_ARCH_X86_64
    723  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
    724 %endif
    725  shl           x_offsetd, filter_idx_shift
    726 %if AOM_ARCH_X86_64 && mmsize == 16
    727  mova                 m8, [bilin_filter+x_offsetq]
    728  mova                 m9, [bilin_filter+x_offsetq+16]
    729  mova                m10, [GLOBAL(pw_8)]
    730 %define filter_x_a m8
    731 %define filter_x_b m9
    732 %define filter_rnd m10
    733 %else    ; x86-32
    734 %if AOM_ARCH_X86=1 && CONFIG_PIC=1
    735 ; y_offset == 0.5. We can reuse y_offset reg.
    736 %define tempq y_offsetq
    737  add x_offsetq, g_bilin_filterm
    738 %define filter_x_a [x_offsetq]
    739 %define filter_x_b [x_offsetq+16]
    740  mov tempq, g_pw_8m
    741 %define filter_rnd [tempq]
    742 %else
    743  add           x_offsetq, bilin_filter
    744 %define filter_x_a [x_offsetq]
    745 %define filter_x_b [x_offsetq+16]
    746 %define filter_rnd [GLOBAL(pw_8)]
    747 %endif
    748 %endif
    749 
    750 %if %1 == 16
    751  movu                 m0, [srcq]
    752  movu                 m1, [srcq+16]
    753  movu                 m2, [srcq+2]
    754  movu                 m3, [srcq+18]
    755  pmullw               m0, filter_x_a
    756  pmullw               m2, filter_x_b
    757  paddw                m0, filter_rnd
    758  pmullw               m1, filter_x_a
    759  pmullw               m3, filter_x_b
    760  paddw                m1, filter_rnd
    761  paddw                m0, m2
    762  paddw                m1, m3
    763  psrlw                m0, 4
    764  psrlw                m1, 4
    765  lea                srcq, [srcq+src_strideq*2]
    766 .x_other_y_half_loop:
    767  movu                 m2, [srcq]
    768  movu                 m3, [srcq+16]
    769  movu                 m4, [srcq+2]
    770  movu                 m5, [srcq+18]
    771  pmullw               m2, filter_x_a
    772  pmullw               m4, filter_x_b
    773  paddw                m2, filter_rnd
    774  pmullw               m3, filter_x_a
    775  pmullw               m5, filter_x_b
    776  paddw                m3, filter_rnd
    777  paddw                m2, m4
    778  paddw                m3, m5
    779  mova                 m4, [dstq]
    780  mova                 m5, [dstq+16]
    781  psrlw                m2, 4
    782  psrlw                m3, 4
    783  pavgw                m0, m2
    784  pavgw                m1, m3
    785 %if %2 == 1 ; avg
    786  pavgw                m0, [secq]
    787  pavgw                m1, [secq+16]
    788 %endif
    789  SUM_SSE              m0, m4, m1, m5, m6, m7
    790  mova                 m0, m2
    791  mova                 m1, m3
    792 
    793  lea                srcq, [srcq+src_strideq*2]
    794  lea                dstq, [dstq+dst_strideq*2]
    795 %if %2 == 1 ; avg
    796  add                secq, sec_str
    797 %endif
    798 %else ; %1 < 16
    799  movu                 m0, [srcq]
    800  movu                 m2, [srcq+2]
    801  pmullw               m0, filter_x_a
    802  pmullw               m2, filter_x_b
    803  paddw                m0, filter_rnd
    804  paddw                m0, m2
    805  psrlw                m0, 4
    806  lea                srcq, [srcq+src_strideq*2]
    807 .x_other_y_half_loop:
    808  movu                 m2, [srcq]
    809  movu                 m3, [srcq+src_strideq*2]
    810  movu                 m4, [srcq+2]
    811  movu                 m5, [srcq+src_strideq*2+2]
    812  pmullw               m2, filter_x_a
    813  pmullw               m4, filter_x_b
    814  paddw                m2, filter_rnd
    815  pmullw               m3, filter_x_a
    816  pmullw               m5, filter_x_b
    817  paddw                m3, filter_rnd
    818  paddw                m2, m4
    819  paddw                m3, m5
    820  mova                 m4, [dstq]
    821  mova                 m5, [dstq+dst_strideq*2]
    822  psrlw                m2, 4
    823  psrlw                m3, 4
    824  pavgw                m0, m2
    825  pavgw                m2, m3
    826 %if %2 == 1 ; avg
    827  pavgw                m0, [secq]
    828  add                secq, sec_str
    829  pavgw                m2, [secq]
    830 %endif
    831  SUM_SSE              m0, m4, m2, m5, m6, m7
    832  mova                 m0, m3
    833 
    834  lea                srcq, [srcq+src_strideq*4]
    835  lea                dstq, [dstq+dst_strideq*4]
    836 %if %2 == 1 ; avg
    837  add                secq, sec_str
    838 %endif
    839 %endif
    840  dec                   block_height
    841  jg .x_other_y_half_loop
    842 %undef filter_x_a
    843 %undef filter_x_b
    844 %undef filter_rnd
    845  STORE_AND_RET
    846 
.x_nonhalf_y_nonhalf:
; Both x_offset and y_offset are arbitrary (non-zero, non-half) eighth-pel
; offsets: bilinearly filter horizontally, then blend vertically with a
; second bilinear filter.
;
; Load the filter tap pairs; the table layout is the same as in the 8-bit
; depth version of this file (one 16-byte a-tap row followed by one 16-byte
; b-tap row per offset, hence the 32-byte index stride below).
%if AOM_ARCH_X86_64
 lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
 shl           x_offsetd, filter_idx_shift ; filter_idx_shift = 5 (32 bytes per filter entry)
 shl           y_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && mmsize == 16
; x86-64 has enough xmm registers to keep both filters and the rounding
; constant resident for the whole loop.
 mova                 m8, [bilin_filter+x_offsetq]
 mova                 m9, [bilin_filter+x_offsetq+16]
 mova                m10, [bilin_filter+y_offsetq]
 mova                m11, [bilin_filter+y_offsetq+16]
 mova                m12, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else   ; x86-32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; In the 32-bit PIC case there is NO unused register, so the src_stride
; register is reused as a scratch pointer here; src_stride has to be
; reloaded from the stack later when it is needed.
%define tempq src_strideq
 mov tempq, g_bilin_filterm
 add           x_offsetq, tempq
 add           y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

 mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
; Non-PIC x86-32: address the filter table and rounding constant directly
; as memory operands.
 add           x_offsetq, bilin_filter
 add           y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif
; end of load filter

 ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
; 16-wide blocks: each row is two xmm registers of 8 words (16-bit pixels).
; Prime the pipeline: horizontally filter row 0 into m0/m1 as
; (p[x]*fa + p[x+1]*fb + 8) >> 4.
 movu                 m0, [srcq]
 movu                 m2, [srcq+2]
 movu                 m1, [srcq+16]
 movu                 m3, [srcq+18]
 pmullw               m0, filter_x_a
 pmullw               m2, filter_x_b
 paddw                m0, filter_rnd
 pmullw               m1, filter_x_a
 pmullw               m3, filter_x_b
 paddw                m1, filter_rnd
 paddw                m0, m2
 paddw                m1, m3
 psrlw                m0, 4
 psrlw                m1, 4

 INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
 ; Horizontally filter the current row into m2/m3.
 movu                 m2, [srcq]
 movu                 m4, [srcq+2]
 movu                 m3, [srcq+16]
 movu                 m5, [srcq+18]
 pmullw               m2, filter_x_a
 pmullw               m4, filter_x_b
 paddw                m2, filter_rnd
 pmullw               m3, filter_x_a
 pmullw               m5, filter_x_b
 paddw                m3, filter_rnd
 paddw                m2, m4
 paddw                m3, m5
 psrlw                m2, 4
 psrlw                m3, 4
 ; Save copies in m4/m5: after the vertical blend clobbers m2/m3, these
 ; become next iteration's "previous row".
 mova                 m4, m2
 mova                 m5, m3
 ; Vertical blend: prev row (m0/m1) * fy_a + cur row (m2/m3) * fy_b,
 ; rounded and shifted down by 4.
 pmullw               m0, filter_y_a
 pmullw               m2, filter_y_b
 paddw                m0, filter_rnd
 pmullw               m1, filter_y_a
 pmullw               m3, filter_y_b
 paddw                m0, m2
 paddw                m1, filter_rnd
 mova                 m2, [dstq]
 paddw                m1, m3
 psrlw                m0, 4
 psrlw                m1, 4
 mova                 m3, [dstq+16]
%if %2 == 1 ; avg
 ; Compound prediction: average with the second predictor before SSE/sum.
 pavgw                m0, [secq]
 pavgw                m1, [secq+16]
%endif
 ; Accumulate sum (m6) and SSE (m7) of prediction vs. dst.
 SUM_SSE              m0, m2, m1, m3, m6, m7
 mova                 m0, m4                 ; current row -> previous row
 mova                 m1, m5

 INC_SRC_BY_SRC_STRIDE
 lea                dstq, [dstq + dst_strideq * 2]
%if %2 == 1 ; avg
 add                secq, sec_str
%endif
%else ; %1 < 16
; Narrow blocks: one xmm register per row; two rows are processed per loop
; iteration. Prime with the horizontally filtered row 0 in m0.
 movu                 m0, [srcq]
 movu                 m2, [srcq+2]
 pmullw               m0, filter_x_a
 pmullw               m2, filter_x_b
 paddw                m0, filter_rnd
 paddw                m0, m2
 psrlw                m0, 4

 INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
 ; Horizontally filter the next two rows into m2 and m3.
 movu                 m2, [srcq]
 movu                 m4, [srcq+2]
 INC_SRC_BY_SRC_STRIDE
 movu                 m3, [srcq]
 movu                 m5, [srcq+2]
 pmullw               m2, filter_x_a
 pmullw               m4, filter_x_b
 paddw                m2, filter_rnd
 pmullw               m3, filter_x_a
 pmullw               m5, filter_x_b
 paddw                m3, filter_rnd
 paddw                m2, m4
 paddw                m3, m5
 psrlw                m2, 4
 psrlw                m3, 4
 ; m4 = copy of row n+1 (second blend input), m5 = copy of row n+2
 ; (becomes next iteration's previous row).
 mova                 m4, m2
 mova                 m5, m3
 ; Vertical blends: m0 = blend(prev, row n+1), m4 = blend(row n+1, row n+2).
 pmullw               m0, filter_y_a
 pmullw               m2, filter_y_b
 paddw                m0, filter_rnd
 pmullw               m4, filter_y_a
 pmullw               m3, filter_y_b
 paddw                m0, m2
 paddw                m4, filter_rnd
 mova                 m2, [dstq]
 paddw                m4, m3
 psrlw                m0, 4
 psrlw                m4, 4
 mova                 m3, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
 ; Compound prediction: average each of the two rows with the second
 ; predictor (secq is advanced once here and once again after the loop body).
 pavgw                m0, [secq]
 add                secq, sec_str
 pavgw                m4, [secq]
%endif
 ; Accumulate sum (m6) and SSE (m7) for both rows vs. dst.
 SUM_SSE              m0, m2, m4, m3, m6, m7
 mova                 m0, m5                 ; row n+2 -> previous row

 INC_SRC_BY_SRC_STRIDE
 lea                dstq, [dstq + dst_strideq * 4]  ; advance dst two rows
%if %2 == 1 ; avg
 add                secq, sec_str
%endif
%endif
 dec                   block_height
 jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
 STORE_AND_RET
   1016 %endmacro
   1017 
INIT_XMM sse2
; Plain sub-pixel variance kernels for 8- and 16-wide blocks
; (second macro argument omitted — presumably defaults to 0, i.e. no
; second-predictor averaging; confirm against the macro header).
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

INIT_XMM sse2
; "avg" variants (%2 == 1): the filtered prediction is averaged with a
; second predictor read from secq before the sum/SSE accumulation.
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1