tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

aom_high_subpixel_bilinear_sse2.asm (9141B)


      1 ;
      2 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 ;
      4 ; This source code is subject to the terms of the BSD 2 Clause License and
      5 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 ; was not distributed with this source code in the LICENSE file, you can
      7 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 ; Media Patent License 1.0 was not distributed with this source code in the
      9 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 ;
     11 
     12 ;
     13 
     14 %include "aom_ports/x86_abi_support.asm"
     15 
     16 %macro HIGH_GET_PARAM_4 0
     17    mov         rdx, arg(5)                 ;filter ptr
     18    mov         rsi, arg(0)                 ;src_ptr
     19    mov         rdi, arg(2)                 ;output_ptr
     20    mov         rcx, 0x00000040
     21 
     22    movdqa      xmm3, [rdx]                 ;load filters
     23    pshuflw     xmm4, xmm3, 11111111b       ;k3
     24    psrldq      xmm3, 8
     25    pshuflw     xmm3, xmm3, 0b              ;k4
     26    punpcklwd   xmm4, xmm3                  ;k3k4
     27 
     28    movq        xmm3, rcx                   ;rounding
     29    pshufd      xmm3, xmm3, 0
     30 
     31    mov         rdx, 0x00010001
     32    movsxd      rcx, DWORD PTR arg(6)       ;bps
     33    movq        xmm5, rdx
     34    movq        xmm2, rcx
     35    pshufd      xmm5, xmm5, 0b
     36    movdqa      xmm1, xmm5
     37    psllw       xmm5, xmm2
     38    psubw       xmm5, xmm1                  ;max value (for clamping)
     39    pxor        xmm2, xmm2                  ;min value (for clamping)
     40 
     41    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
     42    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
     43    movsxd      rcx, DWORD PTR arg(4)       ;output_height
     44 %endm
     45 
     46 %macro HIGH_APPLY_FILTER_4 1
     47 
     48    punpcklwd   xmm0, xmm1                  ;two row in one register
     49    pmaddwd     xmm0, xmm4                  ;multiply the filter factors
     50 
     51    paddd       xmm0, xmm3                  ;rounding
     52    psrad       xmm0, 7                     ;shift
     53    packssdw    xmm0, xmm0                  ;pack to word
     54 
     55    ;clamp the values
     56    pminsw      xmm0, xmm5
     57    pmaxsw      xmm0, xmm2
     58 
     59 %if %1
     60    movq        xmm1, [rdi]
     61    pavgw       xmm0, xmm1
     62 %endif
     63 
     64    movq        [rdi], xmm0
     65    lea         rsi, [rsi + 2*rax]
     66    lea         rdi, [rdi + 2*rdx]
     67    dec         rcx
     68 %endm
     69 
     70 %macro HIGH_GET_PARAM 0
     71    mov         rdx, arg(5)                 ;filter ptr
     72    mov         rsi, arg(0)                 ;src_ptr
     73    mov         rdi, arg(2)                 ;output_ptr
     74    mov         rcx, 0x00000040
     75 
     76    movdqa      xmm6, [rdx]                 ;load filters
     77 
     78    pshuflw     xmm7, xmm6, 11111111b       ;k3
     79    pshufhw     xmm6, xmm6, 0b              ;k4
     80    psrldq      xmm6, 8
     81    punpcklwd   xmm7, xmm6                  ;k3k4k3k4k3k4k3k4
     82 
     83    movq        xmm4, rcx                   ;rounding
     84    pshufd      xmm4, xmm4, 0
     85 
     86    mov         rdx, 0x00010001
     87    movsxd      rcx, DWORD PTR arg(6)       ;bps
     88    movq        xmm3, rdx
     89    movq        xmm5, rcx
     90    pshufd      xmm3, xmm3, 0b
     91    movdqa      xmm1, xmm3
     92    psllw       xmm3, xmm5
     93    psubw       xmm3, xmm1                  ;max value (for clamping)
     94    pxor        xmm5, xmm5                  ;min value (for clamping)
     95 
     96    movdqa      max, xmm3
     97    movdqa      min, xmm5
     98 
     99    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    100    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    101    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    102 %endm
    103 
    104 %macro HIGH_APPLY_FILTER_8 1
    105    movdqa      xmm6, xmm0
    106    punpckhwd   xmm6, xmm1
    107    punpcklwd   xmm0, xmm1
    108    pmaddwd     xmm6, xmm7
    109    pmaddwd     xmm0, xmm7
    110 
    111    paddd       xmm6, xmm4                  ;rounding
    112    paddd       xmm0, xmm4                  ;rounding
    113    psrad       xmm6, 7                     ;shift
    114    psrad       xmm0, 7                     ;shift
    115    packssdw    xmm0, xmm6                  ;pack back to word
    116 
    117    ;clamp the values
    118    pminsw      xmm0, max
    119    pmaxsw      xmm0, min
    120 
    121 %if %1
    122    movdqu      xmm1, [rdi]
    123    pavgw       xmm0, xmm1
    124 %endif
    125    movdqu      [rdi], xmm0                 ;store the result
    126 
    127    lea         rsi, [rsi + 2*rax]
    128    lea         rdi, [rdi + 2*rdx]
    129    dec         rcx
    130 %endm
    131 
    132 %macro HIGH_APPLY_FILTER_16 1
    133    movdqa      xmm5, xmm0
    134    movdqa      xmm6, xmm2
    135    punpckhwd   xmm5, xmm1
    136    punpckhwd   xmm6, xmm3
    137    punpcklwd   xmm0, xmm1
    138    punpcklwd   xmm2, xmm3
    139 
    140    pmaddwd     xmm5, xmm7
    141    pmaddwd     xmm6, xmm7
    142    pmaddwd     xmm0, xmm7
    143    pmaddwd     xmm2, xmm7
    144 
    145    paddd       xmm5, xmm4                  ;rounding
    146    paddd       xmm6, xmm4
    147    paddd       xmm0, xmm4
    148    paddd       xmm2, xmm4
    149 
    150    psrad       xmm5, 7                     ;shift
    151    psrad       xmm6, 7
    152    psrad       xmm0, 7
    153    psrad       xmm2, 7
    154 
    155    packssdw    xmm0, xmm5                  ;pack back to word
    156    packssdw    xmm2, xmm6                  ;pack back to word
    157 
    158    ;clamp the values
    159    pminsw      xmm0, max
    160    pmaxsw      xmm0, min
    161    pminsw      xmm2, max
    162    pmaxsw      xmm2, min
    163 
    164 %if %1
    165    movdqu      xmm1, [rdi]
    166    movdqu      xmm3, [rdi + 16]
    167    pavgw       xmm0, xmm1
    168    pavgw       xmm2, xmm3
    169 %endif
    170    movdqu      [rdi], xmm0               ;store the result
    171    movdqu      [rdi + 16], xmm2          ;store the result
    172 
    173    lea         rsi, [rsi + 2*rax]
    174    lea         rdi, [rdi + 2*rdx]
    175    dec         rcx
    176 %endm
    177 
    178 SECTION .text
    179 
    180 globalsym(aom_highbd_filter_block1d4_v2_sse2)
    181 sym(aom_highbd_filter_block1d4_v2_sse2):
    182    push        rbp
    183    mov         rbp, rsp
    184    SHADOW_ARGS_TO_STACK 7
    185    push        rsi
    186    push        rdi
    187    ; end prolog
    188 
    189    HIGH_GET_PARAM_4
    190 .loop:
    191    movq        xmm0, [rsi]                 ;load src
    192    movq        xmm1, [rsi + 2*rax]
    193 
    194    HIGH_APPLY_FILTER_4 0
    195    jnz         .loop
    196 
    197    ; begin epilog
    198    pop         rdi
    199    pop         rsi
    200    UNSHADOW_ARGS
    201    pop         rbp
    202    ret
    203 
    204 globalsym(aom_highbd_filter_block1d8_v2_sse2)
    205 sym(aom_highbd_filter_block1d8_v2_sse2):
    206    push        rbp
    207    mov         rbp, rsp
    208    SHADOW_ARGS_TO_STACK 7
    209    SAVE_XMM 8
    210    push        rsi
    211    push        rdi
    212    ; end prolog
    213 
    214    ALIGN_STACK 16, rax
    215    sub         rsp, 16 * 2
    216    %define max [rsp + 16 * 0]
    217    %define min [rsp + 16 * 1]
    218 
    219    HIGH_GET_PARAM
    220 .loop:
    221    movdqu      xmm0, [rsi]                 ;0
    222    movdqu      xmm1, [rsi + 2*rax]         ;1
    223 
    224    HIGH_APPLY_FILTER_8 0
    225    jnz         .loop
    226 
    227    add rsp, 16 * 2
    228    pop rsp
    229 
    230    ; begin epilog
    231    pop         rdi
    232    pop         rsi
    233    RESTORE_XMM
    234    UNSHADOW_ARGS
    235    pop         rbp
    236    ret
    237 
    238 globalsym(aom_highbd_filter_block1d16_v2_sse2)
    239 sym(aom_highbd_filter_block1d16_v2_sse2):
    240    push        rbp
    241    mov         rbp, rsp
    242    SHADOW_ARGS_TO_STACK 7
    243    SAVE_XMM 9
    244    push        rsi
    245    push        rdi
    246    ; end prolog
    247 
    248    ALIGN_STACK 16, rax
    249    sub         rsp, 16 * 2
    250    %define max [rsp + 16 * 0]
    251    %define min [rsp + 16 * 1]
    252 
    253    HIGH_GET_PARAM
    254 .loop:
    255    movdqu        xmm0, [rsi]               ;0
    256    movdqu        xmm2, [rsi + 16]
    257    movdqu        xmm1, [rsi + 2*rax]       ;1
    258    movdqu        xmm3, [rsi + 2*rax + 16]
    259 
    260    HIGH_APPLY_FILTER_16 0
    261    jnz         .loop
    262 
    263    add rsp, 16 * 2
    264    pop rsp
    265 
    266    ; begin epilog
    267    pop         rdi
    268    pop         rsi
    269    RESTORE_XMM
    270    UNSHADOW_ARGS
    271    pop         rbp
    272    ret
    273 
    274 globalsym(aom_highbd_filter_block1d4_h2_sse2)
    275 sym(aom_highbd_filter_block1d4_h2_sse2):
    276    push        rbp
    277    mov         rbp, rsp
    278    SHADOW_ARGS_TO_STACK 7
    279    push        rsi
    280    push        rdi
    281    ; end prolog
    282 
    283    HIGH_GET_PARAM_4
    284 .loop:
    285    movdqu      xmm0, [rsi]                 ;load src
    286    movdqa      xmm1, xmm0
    287    psrldq      xmm1, 2
    288 
    289    HIGH_APPLY_FILTER_4 0
    290    jnz         .loop
    291 
    292    ; begin epilog
    293    pop         rdi
    294    pop         rsi
    295    UNSHADOW_ARGS
    296    pop         rbp
    297    ret
    298 
    299 globalsym(aom_highbd_filter_block1d8_h2_sse2)
    300 sym(aom_highbd_filter_block1d8_h2_sse2):
    301    push        rbp
    302    mov         rbp, rsp
    303    SHADOW_ARGS_TO_STACK 7
    304    SAVE_XMM 8
    305    push        rsi
    306    push        rdi
    307    ; end prolog
    308 
    309    ALIGN_STACK 16, rax
    310    sub         rsp, 16 * 2
    311    %define max [rsp + 16 * 0]
    312    %define min [rsp + 16 * 1]
    313 
    314    HIGH_GET_PARAM
    315 .loop:
    316    movdqu      xmm0, [rsi]                 ;load src
    317    movdqu      xmm1, [rsi + 2]
    318 
    319    HIGH_APPLY_FILTER_8 0
    320    jnz         .loop
    321 
    322    add rsp, 16 * 2
    323    pop rsp
    324 
    325    ; begin epilog
    326    pop         rdi
    327    pop         rsi
    328    RESTORE_XMM
    329    UNSHADOW_ARGS
    330    pop         rbp
    331    ret
    332 
    333 globalsym(aom_highbd_filter_block1d16_h2_sse2)
    334 sym(aom_highbd_filter_block1d16_h2_sse2):
    335    push        rbp
    336    mov         rbp, rsp
    337    SHADOW_ARGS_TO_STACK 7
    338    SAVE_XMM 9
    339    push        rsi
    340    push        rdi
    341    ; end prolog
    342 
    343    ALIGN_STACK 16, rax
    344    sub         rsp, 16 * 2
    345    %define max [rsp + 16 * 0]
    346    %define min [rsp + 16 * 1]
    347 
    348    HIGH_GET_PARAM
    349 .loop:
    350    movdqu      xmm0,   [rsi]               ;load src
    351    movdqu      xmm1,   [rsi + 2]
    352    movdqu      xmm2,   [rsi + 16]
    353    movdqu      xmm3,   [rsi + 18]
    354 
    355    HIGH_APPLY_FILTER_16 0
    356    jnz         .loop
    357 
    358    add rsp, 16 * 2
    359    pop rsp
    360 
    361    ; begin epilog
    362    pop         rdi
    363    pop         rsi
    364    RESTORE_XMM
    365    UNSHADOW_ARGS
    366    pop         rbp
    367    ret