tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

aom_subpixel_bilinear_ssse3.asm (6292B)


      1 ;
      2 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 ;
      4 ; This source code is subject to the terms of the BSD 2 Clause License and
      5 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 ; was not distributed with this source code in the LICENSE file, you can
      7 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 ; Media Patent License 1.0 was not distributed with this source code in the
      9 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 ;
     11 
     12 ;
     13 
     14 %include "aom_ports/x86_abi_support.asm"
     15 
     16 %macro GET_PARAM_4 0
        ; Register setup shared by the 4-pixel-wide 2-tap filter routines.
        ; On exit:
        ;   rsi  = arg(0) source pointer
        ;   rdi  = arg(2) destination pointer
        ;   rax  = arg(1) source stride   (sign-extended from 32 bits)
        ;   rdx  = arg(3) output stride   (sign-extended from 32 bits)
        ;   rcx  = arg(4) row count       (sign-extended from 32 bits)
        ;   xmm3 = byte pair (k3, k4) replicated in each of the low 4 words
        ;   xmm2 = 0x0100 in every 16-bit lane (multiplier for pmulhrsw)
        ; rdx and ecx are used as scratch first and only afterwards receive
        ; their final values, so the order of the statements below matters.
        ; NOTE(review): arg() is defined in the included x86_abi_support.asm;
        ; the argument meanings above are taken from the inline comments --
        ; confirm against the C prototype.
     17    mov         rdx, arg(5)                 ;filter ptr
     18    mov         rsi, arg(0)                 ;src_ptr
     19    mov         rdi, arg(2)                 ;output_ptr
     20    mov         ecx, 0x01000100             ;two 16-bit lanes of 0x0100
     21 
           ; movdqa requires [rdx] to be 16-byte aligned.
     22    movdqa      xmm3, [rdx]                 ;load filters
           ; Drop the first 6 bytes so the entry at byte offset 6 lands in the
           ; low word (presumably taps 3 and 4 of an int16 filter table).
     23    psrldq      xmm3, 6
           ; Narrow signed words to signed bytes (with saturation): the low
           ; word now holds the two taps as adjacent bytes.
     24    packsswb    xmm3, xmm3
           ; Immediate 0 copies word 0 into words 0..3; the high 4 words are
           ; left as they were, which is enough for 4 output pixels.
     25    pshuflw     xmm3, xmm3, 0b              ;k3_k4
     26 
     27    movd        xmm2, ecx                   ;rounding_shift
     28    pshufd      xmm2, xmm2, 0               ;broadcast dword 0 to all 4 dwords
     29 
     30    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
     31    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
     32    movsxd      rcx, DWORD PTR arg(4)       ;output_height
     33 %endm
     34 
     35 %macro APPLY_FILTER_4 1
        ; Filter one row of 4 pixels and advance to the next row.
        ; Parameter %1: non-zero = average the result with the bytes already
        ;               at [rdi] before storing; 0 = plain store.
        ; In:   xmm0 = first-tap pixels, xmm1 = second-tap pixels (low bytes),
        ;       xmm3 = (k3, k4) byte pairs, xmm2 = 0x0100 words,
        ;       rsi/rdi = source/destination, rax/rdx = their strides,
        ;       rcx = rows remaining.
        ; Out:  4 bytes written at [rdi]; rsi += rax; rdi += rdx; rcx -= 1.
        ;       ZF reflects the decremented rcx and is consumed by the jnz
        ;       that follows each expansion of this macro.
        ; Clobbers xmm0 (and xmm1 when %1 is non-zero).
           ; Interleave bytes: xmm0 = a0,b0,a1,b1,... so each word holds the
           ; two pixels that feed one output sample.
     36    punpcklbw   xmm0, xmm1
           ; Unsigned pixel bytes * signed tap bytes, adjacent products summed
           ; into signed 16-bit lanes: a_i*k3 + b_i*k4.
     37    pmaddubsw   xmm0, xmm3
     38 
     39    pmulhrsw    xmm0, xmm2                  ;rounding(+64)+shift(>>7)
     40    packuswb    xmm0, xmm0                  ;pack to byte
     41 
     42 %if %1
     43    movd        xmm1, [rdi]                 ;current destination pixels
     44    pavgb       xmm0, xmm1                  ;rounded byte average with destination
     45 %endif
     46    movd        [rdi], xmm0                 ;store 4 result bytes
     47    lea         rsi, [rsi + rax]            ;next source row (lea leaves flags alone)
     48    lea         rdi, [rdi + rdx]            ;next destination row
     49    dec         rcx                         ;sets ZF for the caller's loop branch
     50 %endm
     51 
     52 %macro GET_PARAM 0
        ; Register setup shared by the 8- and 16-pixel-wide 2-tap filter
        ; routines. Same layout as GET_PARAM_4, but the taps go to xmm7 and the
        ; rounding multiplier to xmm6, and the taps fill all 8 words.
        ; On exit:
        ;   rsi  = arg(0) source pointer
        ;   rdi  = arg(2) destination pointer
        ;   rax  = arg(1) source stride   (sign-extended from 32 bits)
        ;   rdx  = arg(3) output stride   (sign-extended from 32 bits)
        ;   rcx  = arg(4) row count       (sign-extended from 32 bits)
        ;   xmm7 = byte pair (k3, k4) replicated in all 8 words
        ;   xmm6 = 0x0100 in every 16-bit lane (multiplier for pmulhrsw)
        ; rdx and ecx are scratch until the final three loads, so the order
        ; of the statements below matters.
        ; NOTE(review): arg() is defined in the included x86_abi_support.asm;
        ; argument meanings are taken from the inline comments -- confirm
        ; against the C prototype.
     53    mov         rdx, arg(5)                 ;filter ptr
     54    mov         rsi, arg(0)                 ;src_ptr
     55    mov         rdi, arg(2)                 ;output_ptr
     56    mov         ecx, 0x01000100             ;two 16-bit lanes of 0x0100
     57 
           ; movdqa requires [rdx] to be 16-byte aligned.
     58    movdqa      xmm7, [rdx]                 ;load filters
           ; Drop the first 6 bytes so the entry at byte offset 6 lands in the
           ; low word (presumably taps 3 and 4 of an int16 filter table).
     59    psrldq      xmm7, 6
     60    packsswb    xmm7, xmm7                  ;signed words -> signed bytes (saturating)
     61    pshuflw     xmm7, xmm7, 0b              ;k3_k4
           ; Words 0..3 are identical after the pshuflw, so interleaving the
           ; low half with itself spreads the pair to all 8 words.
     62    punpcklwd   xmm7, xmm7
     63 
     64    movd        xmm6, ecx                   ;rounding_shift
     65    pshufd      xmm6, xmm6, 0               ;broadcast dword 0 to all 4 dwords
     66 
     67    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
     68    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
     69    movsxd      rcx, DWORD PTR arg(4)       ;output_height
     70 %endm
     71 
     72 %macro APPLY_FILTER_8 1
        ; Filter one row of 8 pixels and advance to the next row.
        ; Parameter %1: non-zero = average the result with the bytes already
        ;               at [rdi] before storing; 0 = plain store.
        ; In:   xmm0 = first-tap pixels, xmm1 = second-tap pixels (low 8 bytes),
        ;       xmm7 = (k3, k4) byte pairs, xmm6 = 0x0100 words,
        ;       rsi/rdi = source/destination, rax/rdx = their strides,
        ;       rcx = rows remaining.
        ; Out:  8 bytes written at [rdi]; rsi += rax; rdi += rdx; rcx -= 1.
        ;       ZF reflects the decremented rcx and is consumed by the jnz
        ;       that follows each expansion of this macro.
        ; Clobbers xmm0 (and xmm1 when %1 is non-zero).
     73    punpcklbw   xmm0, xmm1                  ;interleave: a0,b0,a1,b1,...
     74    pmaddubsw   xmm0, xmm7                  ;a_i*k3 + b_i*k4 per 16-bit lane
     75 
     76    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)
     77    packuswb    xmm0, xmm0                  ;pack back to byte
     78 
     79 %if %1
     80    movq        xmm1, [rdi]                 ;current destination pixels
     81    pavgb       xmm0, xmm1                  ;rounded byte average with destination
     82 %endif
     83    movq        [rdi], xmm0                 ;store the result
     84 
     85    lea         rsi, [rsi + rax]            ;next source row (lea leaves flags alone)
     86    lea         rdi, [rdi + rdx]            ;next destination row
     87    dec         rcx                         ;sets ZF for the caller's loop branch
     88 %endm
     89 
     90 %macro APPLY_FILTER_16 1
        ; Filter one row of 16 pixels and advance to the next row.
        ; Parameter %1: non-zero = average the result with the bytes already
        ;               at [rdi] before storing; 0 = plain store.
        ; In:   xmm0 = first-tap pixels (16 bytes), xmm2 = copy of xmm0 (the
        ;       callers in this file create it with movdqa xmm2, xmm0),
        ;       xmm1 = second-tap pixels (16 bytes),
        ;       xmm7 = (k3, k4) byte pairs, xmm6 = 0x0100 words,
        ;       rsi/rdi = source/destination, rax/rdx = their strides,
        ;       rcx = rows remaining.
        ; Out:  16 bytes written at [rdi]; rsi += rax; rdi += rdx; rcx -= 1.
        ;       ZF reflects the decremented rcx and is consumed by the jnz
        ;       that follows each expansion of this macro.
        ; Clobbers xmm0, xmm2 (and xmm1 when %1 is non-zero).
     91    punpcklbw   xmm0, xmm1                  ;pixels 0..7:  a_i,b_i byte pairs
     92    punpckhbw   xmm2, xmm1                  ;pixels 8..15: a_i,b_i byte pairs
     93    pmaddubsw   xmm0, xmm7                  ;a_i*k3 + b_i*k4 per 16-bit lane
     94    pmaddubsw   xmm2, xmm7
     95 
     96    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)
     97    pmulhrsw    xmm2, xmm6
     98    packuswb    xmm0, xmm2                  ;pack back to byte
     99 
    100 %if %1
    101    movdqu      xmm1, [rdi]                 ;current destination pixels (unaligned load)
    102    pavgb       xmm0, xmm1                  ;rounded byte average with destination
    103 %endif
    104    movdqu      [rdi], xmm0                 ;store the result
    105 
    106    lea         rsi, [rsi + rax]            ;next source row (lea leaves flags alone)
    107    lea         rdi, [rdi + rdx]            ;next destination row
    108    dec         rcx                         ;sets ZF for the caller's loop branch
    109 %endm
    110 
    111 SECTION .text
    112 
    113 globalsym(aom_filter_block1d4_v2_ssse3)
        ; Vertical 2-tap filter, 4 pixels per row, plain store (APPLY_FILTER_4
        ; is expanded with 0, so the destination is not averaged in).
        ; Arguments, per the comments in GET_PARAM_4: arg(0) src_ptr,
        ; arg(1) pixels_per_line, arg(2) output_ptr, arg(3) out_pitch,
        ; arg(4) output_height, arg(5) filter ptr.
        ; Each iteration combines the row at [rsi] with the row one source
        ; stride further on, so output_height rows of output read
        ; output_height + 1 rows of source.
        ; The loop is bottom-tested: output_height is assumed to be >= 1.
        ; Only xmm0-xmm3 are touched; there is no SAVE_XMM/RESTORE_XMM pair
        ; here (presumably those guard xmm6/xmm7 -- confirm in
        ; x86_abi_support.asm).
    114 sym(aom_filter_block1d4_v2_ssse3):
    115    push        rbp
    116    mov         rbp, rsp
    117    SHADOW_ARGS_TO_STACK 6
    118    push        rsi
    119    push        rdi
    120    ; end prolog
    121 
    122    GET_PARAM_4
    123 .loop:
    124    movd        xmm0, [rsi]                 ;load src
    125    movd        xmm1, [rsi + rax]           ;same 4 columns, next source row
    126 
    127    APPLY_FILTER_4 0
    128    jnz         .loop                       ;ZF from dec rcx inside APPLY_FILTER_4
    129 
    130    ; begin epilog
    131    pop         rdi
    132    pop         rsi
    133    UNSHADOW_ARGS
    134    pop         rbp
    135    ret
    136 
    137 globalsym(aom_filter_block1d8_v2_ssse3)
        ; Vertical 2-tap filter, 8 pixels per row, plain store (APPLY_FILTER_8
        ; is expanded with 0, so the destination is not averaged in).
        ; Arguments, per the comments in GET_PARAM: arg(0) src_ptr,
        ; arg(1) pixels_per_line, arg(2) output_ptr, arg(3) out_pitch,
        ; arg(4) output_height, arg(5) filter ptr.
        ; Each iteration combines the row at [rsi] with the row one source
        ; stride further on, so output_height rows of output read
        ; output_height + 1 rows of source.
        ; The loop is bottom-tested: output_height is assumed to be >= 1.
        ; Uses xmm6/xmm7 via GET_PARAM, hence the SAVE_XMM 7 / RESTORE_XMM
        ; pair (their exact behaviour lives in x86_abi_support.asm).
    138 sym(aom_filter_block1d8_v2_ssse3):
    139    push        rbp
    140    mov         rbp, rsp
    141    SHADOW_ARGS_TO_STACK 6
    142    SAVE_XMM 7
    143    push        rsi
    144    push        rdi
    145    ; end prolog
    146 
    147    GET_PARAM
    148 .loop:
    149    movq        xmm0, [rsi]                 ;row 0: 8 source bytes
    150    movq        xmm1, [rsi + rax]           ;row 1: same columns, next source row
    151 
    152    APPLY_FILTER_8 0
    153    jnz         .loop                       ;ZF from dec rcx inside APPLY_FILTER_8
    154 
    155    ; begin epilog
    156    pop         rdi
    157    pop         rsi
    158    RESTORE_XMM
    159    UNSHADOW_ARGS
    160    pop         rbp
    161    ret
    162 
    163 globalsym(aom_filter_block1d16_v2_ssse3)
        ; Vertical 2-tap filter, 16 pixels per row, plain store
        ; (APPLY_FILTER_16 is expanded with 0, so the destination is not
        ; averaged in).
        ; Arguments, per the comments in GET_PARAM: arg(0) src_ptr,
        ; arg(1) pixels_per_line, arg(2) output_ptr, arg(3) out_pitch,
        ; arg(4) output_height, arg(5) filter ptr.
        ; Each iteration combines the row at [rsi] with the row one source
        ; stride further on, so output_height rows of output read
        ; output_height + 1 rows of source.
        ; The loop is bottom-tested: output_height is assumed to be >= 1.
        ; Uses xmm6/xmm7 via GET_PARAM, hence the SAVE_XMM 7 / RESTORE_XMM
        ; pair (their exact behaviour lives in x86_abi_support.asm).
    164 sym(aom_filter_block1d16_v2_ssse3):
    165    push        rbp
    166    mov         rbp, rsp
    167    SHADOW_ARGS_TO_STACK 6
    168    SAVE_XMM 7
    169    push        rsi
    170    push        rdi
    171    ; end prolog
    172 
    173    GET_PARAM
    174 .loop:
    175    movdqu        xmm0, [rsi]               ;row 0: 16 source bytes
    176    movdqu        xmm1, [rsi + rax]         ;row 1: same columns, next source row
    177    movdqa        xmm2, xmm0                ;copy of row 0 for the high-half interleave
    178 
    179    APPLY_FILTER_16 0
    180    jnz         .loop                       ;ZF from dec rcx inside APPLY_FILTER_16
    181 
    182    ; begin epilog
    183    pop         rdi
    184    pop         rsi
    185    RESTORE_XMM
    186    UNSHADOW_ARGS
    187    pop         rbp
    188    ret
    189 
    190 globalsym(aom_filter_block1d4_h2_ssse3)
        ; Horizontal 2-tap filter, 4 pixels per row, plain store
        ; (APPLY_FILTER_4 is expanded with 0, so the destination is not
        ; averaged in).
        ; Arguments, per the comments in GET_PARAM_4: arg(0) src_ptr,
        ; arg(1) pixels_per_line, arg(2) output_ptr, arg(3) out_pitch,
        ; arg(4) output_height, arg(5) filter ptr.
        ; Each output pixel i is built from source bytes i and i + 1 of the
        ; same row. The row is fetched with a 16-byte unaligned load even
        ; though only bytes 0..4 contribute to the 4 stored pixels, so 16
        ; bytes at [rsi] must be readable on every row.
        ; The loop is bottom-tested: output_height is assumed to be >= 1.
        ; Only xmm0-xmm3 are touched; there is no SAVE_XMM/RESTORE_XMM pair
        ; here (presumably those guard xmm6/xmm7 -- confirm in
        ; x86_abi_support.asm).
    191 sym(aom_filter_block1d4_h2_ssse3):
    192    push        rbp
    193    mov         rbp, rsp
    194    SHADOW_ARGS_TO_STACK 6
    195    push        rsi
    196    push        rdi
    197    ; end prolog
    198 
    199    GET_PARAM_4
    200 .loop:
    201    movdqu      xmm0, [rsi]                 ;load src
    202    movdqa      xmm1, xmm0
    203    psrldq      xmm1, 1                     ;xmm1 byte i = source byte i + 1
    204 
    205    APPLY_FILTER_4 0
    206    jnz         .loop                       ;ZF from dec rcx inside APPLY_FILTER_4
    207 
    208    ; begin epilog
    209    pop         rdi
    210    pop         rsi
    211    UNSHADOW_ARGS
    212    pop         rbp
    213    ret
    214 
    215 globalsym(aom_filter_block1d8_h2_ssse3)
        ; Horizontal 2-tap filter, 8 pixels per row, plain store
        ; (APPLY_FILTER_8 is expanded with 0, so the destination is not
        ; averaged in).
        ; Arguments, per the comments in GET_PARAM: arg(0) src_ptr,
        ; arg(1) pixels_per_line, arg(2) output_ptr, arg(3) out_pitch,
        ; arg(4) output_height, arg(5) filter ptr.
        ; Each output pixel i is built from source bytes i and i + 1 of the
        ; same row. The row is fetched with a 16-byte unaligned load even
        ; though only bytes 0..8 contribute to the 8 stored pixels, so 16
        ; bytes at [rsi] must be readable on every row.
        ; The loop is bottom-tested: output_height is assumed to be >= 1.
        ; Uses xmm6/xmm7 via GET_PARAM, hence the SAVE_XMM 7 / RESTORE_XMM
        ; pair (their exact behaviour lives in x86_abi_support.asm).
    216 sym(aom_filter_block1d8_h2_ssse3):
    217    push        rbp
    218    mov         rbp, rsp
    219    SHADOW_ARGS_TO_STACK 6
    220    SAVE_XMM 7
    221    push        rsi
    222    push        rdi
    223    ; end prolog
    224 
    225    GET_PARAM
    226 .loop:
    227    movdqu      xmm0, [rsi]                 ;load src
    228    movdqa      xmm1, xmm0
    229    psrldq      xmm1, 1                     ;xmm1 byte i = source byte i + 1
    230 
    231    APPLY_FILTER_8 0
    232    jnz         .loop                       ;ZF from dec rcx inside APPLY_FILTER_8
    233 
    234    ; begin epilog
    235    pop         rdi
    236    pop         rsi
    237    RESTORE_XMM
    238    UNSHADOW_ARGS
    239    pop         rbp
    240    ret
    241 
    242 globalsym(aom_filter_block1d16_h2_ssse3)
        ; Horizontal 2-tap filter, 16 pixels per row, plain store
        ; (APPLY_FILTER_16 is expanded with 0, so the destination is not
        ; averaged in).
        ; Arguments, per the comments in GET_PARAM: arg(0) src_ptr,
        ; arg(1) pixels_per_line, arg(2) output_ptr, arg(3) out_pitch,
        ; arg(4) output_height, arg(5) filter ptr.
        ; Each output pixel i is built from source bytes i and i + 1 of the
        ; same row. A byte shift of one register cannot supply byte 16, so
        ; the second operand is loaded separately from [rsi + 1]; 17 source
        ; bytes are read per row.
        ; The loop is bottom-tested: output_height is assumed to be >= 1.
        ; Uses xmm6/xmm7 via GET_PARAM, hence the SAVE_XMM 7 / RESTORE_XMM
        ; pair (their exact behaviour lives in x86_abi_support.asm).
    243 sym(aom_filter_block1d16_h2_ssse3):
    244    push        rbp
    245    mov         rbp, rsp
    246    SHADOW_ARGS_TO_STACK 6
    247    SAVE_XMM 7
    248    push        rsi
    249    push        rdi
    250    ; end prolog
    251 
    252    GET_PARAM
    253 .loop:
    254    movdqu      xmm0,   [rsi]               ;load src
    255    movdqu      xmm1,   [rsi + 1]           ;same row, one byte to the right
    256    movdqa      xmm2, xmm0                  ;copy of src for the high-half interleave
    257 
    258    APPLY_FILTER_16 0
    259    jnz         .loop                       ;ZF from dec rcx inside APPLY_FILTER_16
    260 
    261    ; begin epilog
    262    pop         rdi
    263    pop         rsi
    264    RESTORE_XMM
    265    UNSHADOW_ARGS
    266    pop         rbp
    267    ret