tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_variance_impl_sse2.asm (10052B)


      1 ;
      2 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 ;
      4 ; This source code is subject to the terms of the BSD 2 Clause License and
      5 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 ; was not distributed with this source code in the LICENSE file, you can
      7 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 ; Media Patent License 1.0 was not distributed with this source code in the
      9 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 ;
     11 
     12 ;
     13 
     14 
     15 %include "aom_ports/x86_abi_support.asm"
     16 
     17 SECTION .text
     18 ; Accumulates, over 16 rows of 16 16-bit samples, the sum of (src - ref) and the sum of (src - ref)^2, and stores them through Sum and SSE.
     19 ;unsigned int aom_highbd_calc16x16var_sse2
     20 ;(
     21 ;    unsigned char   *  src_ptr,        ; arg(0): source rows; loaded and subtracted as 16-bit words
     22 ;    int             source_stride,     ; arg(1): source row pitch in 16-bit elements (doubled to bytes below)
     23 ;    unsigned char   *  ref_ptr,        ; arg(2): reference rows; read the same way as src_ptr
     24 ;    int             recon_stride,      ; arg(3): reference row pitch in 16-bit elements (doubled to bytes below)
     25 ;    unsigned int    *  SSE,            ; arg(4): out - low 32 bits of the sum of squared differences
     26 ;    int             *  Sum             ; arg(5): out - low 32 bits of the sum of differences
     27 ;)  NOTE(review): rax is never loaded with a result (it holds the Sum pointer at ret); presumably callers ignore the return value - confirm.
globalsym(aom_highbd_calc16x16var_sse2)
sym(aom_highbd_calc16x16var_sse2):
     30    push        rbp
     31    mov         rbp, rsp
     32    SHADOW_ARGS_TO_STACK 6
     33    SAVE_XMM 7
     34    push rbx
     35    push rsi
     36    push rdi
     37    ; end prolog (SHADOW_ARGS_TO_STACK, SAVE_XMM and arg() are macros from x86_abi_support.asm, not shown here)
     38    ; Register roles: rsi/rdi = src/ref row pointers, rax/rdx = byte strides, rcx = rows left, rbx = prefetch address scratch
     39        mov         rsi,            arg(0) ;[src_ptr]
     40        mov         rdi,            arg(2) ;[ref_ptr]
     41 
     42        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
     43        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
     44        add         rax,            rax ; source stride in bytes
     45        add         rdx,            rdx ; recon stride in bytes
     46 
     47        ; Prefetch data: byte offsets 0 and 16 of the first four rows of src, then of ref
     48        prefetcht0      [rsi]
     49        prefetcht0      [rsi+16]
     50        prefetcht0      [rsi+rax]
     51        prefetcht0      [rsi+rax+16]
     52        lea             rbx,    [rsi+rax*2]      ; rbx = src row 2
     53        prefetcht0      [rbx]
     54        prefetcht0      [rbx+16]
     55        prefetcht0      [rbx+rax]
     56        prefetcht0      [rbx+rax+16]
     57 
     58        prefetcht0      [rdi]
     59        prefetcht0      [rdi+16]
     60        prefetcht0      [rdi+rdx]
     61        prefetcht0      [rdi+rdx+16]
     62        lea             rbx,    [rdi+rdx*2]      ; rbx = ref row 2
     63        prefetcht0      [rbx]
     64        prefetcht0      [rbx+16]
     65        prefetcht0      [rbx+rdx]
     66        prefetcht0      [rbx+rdx+16]
     67 
     68        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
     69        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
     70 
     71        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
     72        mov         rcx,            16       ; rows remaining; each iteration consumes two rows
     73 
     74 .var16loop:
     75        movdqu      xmm1,           XMMWORD PTR [rsi]    ; src row 0, words 0-7
     76        movdqu      xmm2,           XMMWORD PTR [rdi]    ; ref row 0, words 0-7
     77        ; Prefetch the two rows after the current pair (the next iteration's rows) for src and ref
     78        lea             rbx,    [rsi+rax*2]
     79        prefetcht0      [rbx]
     80        prefetcht0      [rbx+16]
     81        prefetcht0      [rbx+rax]
     82        prefetcht0      [rbx+rax+16]
     83        lea             rbx,    [rdi+rdx*2]
     84        prefetcht0      [rbx]
     85        prefetcht0      [rbx+16]
     86        prefetcht0      [rbx+rdx]
     87        prefetcht0      [rbx+rdx+16]
     88        ; NOTE(review): four diffs per lane are summed in 16 bits before widening; assumes that cannot overflow a signed word - confirm against the max sample range
     89        pxor        xmm5,           xmm5     ; xmm5 = this iteration's 16-bit per-lane sum of diffs
     90 
     91        psubw       xmm1,           xmm2     ; xmm1 = src - ref per 16-bit lane
     92        movdqu      xmm3,           XMMWORD PTR [rsi+16] ; src row 0, words 8-15 (next load issued early)
     93        paddw       xmm5,           xmm1
     94        pmaddwd     xmm1,           xmm1     ; square each diff and add adjacent pairs -> 4 dwords
     95        movdqu      xmm2,           XMMWORD PTR [rdi+16] ; ref row 0, words 8-15
     96        paddd       xmm6,           xmm1
     97 
     98        psubw       xmm3,           xmm2
     99        movdqu      xmm1,           XMMWORD PTR [rsi+rax]    ; src row 1, words 0-7
    100        paddw       xmm5,           xmm3
    101        pmaddwd     xmm3,           xmm3
    102        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]    ; ref row 1, words 0-7
    103        paddd       xmm6,           xmm3
    104 
    105        psubw       xmm1,           xmm2
    106        movdqu      xmm3,           XMMWORD PTR [rsi+rax+16] ; src row 1, words 8-15
    107        paddw       xmm5,           xmm1
    108        pmaddwd     xmm1,           xmm1
    109        movdqu      xmm2,           XMMWORD PTR [rdi+rdx+16] ; ref row 1, words 8-15
    110        paddd       xmm6,           xmm1
    111 
    112        psubw       xmm3,           xmm2
    113        paddw       xmm5,           xmm3
    114        pmaddwd     xmm3,           xmm3
    115        paddd       xmm6,           xmm3
    116        ; Widen xmm5's eight signed words to dwords and add them into xmm7
    117        movdqa      xmm1,           xmm5
    118        movdqa      xmm2,           xmm5
    119        pcmpgtw     xmm1,           xmm0     ; xmm1 = all-ones where lane > 0
    120        pcmpeqw     xmm2,           xmm0     ; xmm2 = all-ones where lane == 0
    121        por         xmm1,           xmm2     ; xmm1 = all-ones where lane >= 0
    122        pcmpeqw     xmm1,           xmm0     ; invert: xmm1 = 0xFFFF where lane < 0 (the sign-extension word)
    123        movdqa      xmm2,           xmm5
    124        punpcklwd   xmm5,           xmm1     ; low four words paired with their sign words -> signed dwords
    125        punpckhwd   xmm2,           xmm1     ; high four words likewise
    126        paddd       xmm7,           xmm5
    127        paddd       xmm7,           xmm2
    128 
    129        lea         rsi,            [rsi + 2*rax]    ; advance src by two rows
    130        lea         rdi,            [rdi + 2*rdx]    ; advance ref by two rows
    131        sub         rcx,            2
    132        jnz         .var16loop
    133        ; Horizontal reduction: fold the four dword lanes of xmm6 (sse) and xmm7 (sum) into lane 0
    134        movdqa      xmm4,           xmm6
    135        punpckldq   xmm6,           xmm0     ; xmm6 = [s0, 0, s1, 0]
    136 
    137        punpckhdq   xmm4,           xmm0     ; xmm4 = [s2, 0, s3, 0]
    138        movdqa      xmm5,           xmm7
    139 
    140        paddd       xmm6,           xmm4     ; xmm6 = [s0+s2, 0, s1+s3, 0]
    141        punpckldq   xmm7,           xmm0     ; same spreading for the diff sums
    142 
    143        punpckhdq   xmm5,           xmm0
    144        paddd       xmm7,           xmm5
    145 
    146        movdqa      xmm4,           xmm6
    147        movdqa      xmm5,           xmm7
    148 
    149        psrldq      xmm4,           8        ; bring the upper 8 bytes down to lane 0
    150        psrldq      xmm5,           8
    151 
    152        paddd       xmm6,           xmm4     ; lane 0 = s0+s1+s2+s3 (32-bit wrap-around add)
    153        paddd       xmm7,           xmm5
    154 
    155        mov         rdi,            arg(4)   ; [SSE]
    156        mov         rax,            arg(5)   ; [Sum]
    157 
    158        movd DWORD PTR [rdi],       xmm6     ; *SSE = low dword of xmm6
    159        movd DWORD PTR [rax],       xmm7     ; *Sum = low dword of xmm7
    160 
    161 
    162    ; begin epilog (pops mirror the prolog pushes in reverse order)
    163    pop rdi
    164    pop rsi
    165    pop rbx
    166    RESTORE_XMM
    167    UNSHADOW_ARGS
    168    pop         rbp
    169    ret
    170 
    171 ; Accumulates, over 8 rows of 8 16-bit samples, the sum of (src - ref) and the sum of (src - ref)^2, and stores them through Sum and SSE.
    172 ;unsigned int aom_highbd_calc8x8var_sse2
    173 ;(
    174 ;    unsigned char   *  src_ptr,        ; arg(0): source rows; loaded and subtracted as 16-bit words
    175 ;    int             source_stride,     ; arg(1): source row pitch in 16-bit elements (doubled to bytes below)
    176 ;    unsigned char   *  ref_ptr,        ; arg(2): reference rows; read the same way as src_ptr
    177 ;    int             recon_stride,      ; arg(3): reference row pitch in 16-bit elements (doubled to bytes below)
    178 ;    unsigned int    *  SSE,            ; arg(4): out - low 32 bits of the sum of squared differences
    179 ;    int             *  Sum             ; arg(5): out - low 32 bits of the sum of differences
    180 ;)  NOTE(review): rax is never loaded with a result (it holds the Sum pointer at ret); presumably callers ignore the return value - confirm.
globalsym(aom_highbd_calc8x8var_sse2)
sym(aom_highbd_calc8x8var_sse2):
    183    push        rbp
    184    mov         rbp, rsp
    185    SHADOW_ARGS_TO_STACK 6
    186    SAVE_XMM 7
    187    push rbx
    188    push rsi
    189    push rdi
    190    ; end prolog (SHADOW_ARGS_TO_STACK, SAVE_XMM and arg() are macros from x86_abi_support.asm, not shown here)
    191    ; Register roles: rsi/rdi = src/ref row pointers, rax/rdx = byte strides, rcx = rows left, rbx = prefetch address scratch
    192        mov         rsi,            arg(0) ;[src_ptr]
    193        mov         rdi,            arg(2) ;[ref_ptr]
    194 
    195        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
    196        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
    197        add         rax,            rax ; source stride in bytes
    198        add         rdx,            rdx ; recon stride in bytes
    199 
    200        ; Prefetch data: start of the first four rows of src, then of ref
    201        prefetcht0      [rsi]
    202        prefetcht0      [rsi+rax]
    203        lea             rbx,    [rsi+rax*2]      ; rbx = src row 2
    204        prefetcht0      [rbx]
    205        prefetcht0      [rbx+rax]
    206 
    207        prefetcht0      [rdi]
    208        prefetcht0      [rdi+rdx]
    209        lea             rbx,    [rdi+rdx*2]      ; rbx = ref row 2
    210        prefetcht0      [rbx]
    211        prefetcht0      [rbx+rdx]
    212 
    213        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
    214        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
    215 
    216        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
    217        mov         rcx,            8        ; rows remaining; each iteration consumes four rows
    218 
    219 .var8loop:
    220        movdqu      xmm1,           XMMWORD PTR [rsi]    ; src row 0 of this group (8 words)
    221        movdqu      xmm2,           XMMWORD PTR [rdi]    ; ref row 0 of this group
    222        ; Prefetch the four rows that follow the current group (rsi/rdi + 4..7 strides)
    223        lea             rbx,    [rsi+rax*4]
    224        prefetcht0      [rbx]
    225        prefetcht0      [rbx+rax]
    226        lea             rbx,    [rbx+rax*2]
    227        prefetcht0      [rbx]
    228        prefetcht0      [rbx+rax]
    229        lea             rbx,    [rdi+rdx*4]
    230        prefetcht0      [rbx]
    231        prefetcht0      [rbx+rdx]
    232        lea             rbx,    [rbx+rdx*2]
    233        prefetcht0      [rbx]
    234        prefetcht0      [rbx+rdx]
    235        ; NOTE(review): four diffs per lane are summed in 16 bits before widening; assumes that cannot overflow a signed word - confirm against the max sample range
    236        pxor        xmm5,           xmm5     ; xmm5 = this iteration's 16-bit per-lane sum of diffs
    237 
    238        psubw       xmm1,           xmm2     ; xmm1 = src - ref per 16-bit lane
    239        movdqu      xmm3,           XMMWORD PTR [rsi+rax]    ; src row 1 (next load issued early)
    240        paddw       xmm5,           xmm1
    241        pmaddwd     xmm1,           xmm1     ; square each diff and add adjacent pairs -> 4 dwords
    242        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]    ; ref row 1
    243        paddd       xmm6,           xmm1
    244        ; Step both pointers to rows 2-3 of this group
    245        lea         rsi,            [rsi + 2*rax]
    246        lea         rdi,            [rdi + 2*rdx]
    247 
    248        psubw       xmm3,           xmm2
    249        movdqu      xmm1,           XMMWORD PTR [rsi]        ; src row 2
    250        paddw       xmm5,           xmm3
    251        pmaddwd     xmm3,           xmm3
    252        movdqu      xmm2,           XMMWORD PTR [rdi]        ; ref row 2
    253        paddd       xmm6,           xmm3
    254 
    255        psubw       xmm1,           xmm2
    256        movdqu      xmm3,           XMMWORD PTR [rsi+rax]    ; src row 3
    257        paddw       xmm5,           xmm1
    258        pmaddwd     xmm1,           xmm1
    259        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]    ; ref row 3
    260        paddd       xmm6,           xmm1
    261 
    262        psubw       xmm3,           xmm2
    263        paddw       xmm5,           xmm3
    264        pmaddwd     xmm3,           xmm3
    265        paddd       xmm6,           xmm3
    266        ; Widen xmm5's eight signed words to dwords and add them into xmm7
    267        movdqa      xmm1,           xmm5
    268        movdqa      xmm2,           xmm5
    269        pcmpgtw     xmm1,           xmm0     ; xmm1 = all-ones where lane > 0
    270        pcmpeqw     xmm2,           xmm0     ; xmm2 = all-ones where lane == 0
    271        por         xmm1,           xmm2     ; xmm1 = all-ones where lane >= 0
    272        pcmpeqw     xmm1,           xmm0     ; invert: xmm1 = 0xFFFF where lane < 0 (the sign-extension word)
    273        movdqa      xmm2,           xmm5
    274        punpcklwd   xmm5,           xmm1     ; low four words paired with their sign words -> signed dwords
    275        punpckhwd   xmm2,           xmm1     ; high four words likewise
    276        paddd       xmm7,           xmm5
    277        paddd       xmm7,           xmm2
    278 
    279        lea         rsi,            [rsi + 2*rax]    ; advance to the next group of four rows
    280        lea         rdi,            [rdi + 2*rdx]
    281        sub         rcx,            4
    282        jnz         .var8loop
    283        ; Horizontal reduction: fold the four dword lanes of xmm6 (sse) and xmm7 (sum) into lane 0
    284        movdqa      xmm4,           xmm6
    285        punpckldq   xmm6,           xmm0     ; xmm6 = [s0, 0, s1, 0]
    286 
    287        punpckhdq   xmm4,           xmm0     ; xmm4 = [s2, 0, s3, 0]
    288        movdqa      xmm5,           xmm7
    289 
    290        paddd       xmm6,           xmm4     ; xmm6 = [s0+s2, 0, s1+s3, 0]
    291        punpckldq   xmm7,           xmm0     ; same spreading for the diff sums
    292 
    293        punpckhdq   xmm5,           xmm0
    294        paddd       xmm7,           xmm5
    295 
    296        movdqa      xmm4,           xmm6
    297        movdqa      xmm5,           xmm7
    298 
    299        psrldq      xmm4,           8        ; bring the upper 8 bytes down to lane 0
    300        psrldq      xmm5,           8
    301 
    302        paddd       xmm6,           xmm4     ; lane 0 = s0+s1+s2+s3 (32-bit wrap-around add)
    303        paddd       xmm7,           xmm5
    304 
    305        mov         rdi,            arg(4)   ; [SSE]
    306        mov         rax,            arg(5)   ; [Sum]
    307 
    308        movd DWORD PTR [rdi],       xmm6     ; *SSE = low dword of xmm6
    309        movd DWORD PTR [rax],       xmm7     ; *Sum = low dword of xmm7
    310 
    311    ; begin epilog (pops mirror the prolog pushes in reverse order)
    312    pop rdi
    313    pop rsi
    314    pop rbx
    315    RESTORE_XMM
    316    UNSHADOW_ARGS
    317    pop         rbp
    318    ret