tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

av1_ssim_opt_x86_64.asm (6397B)


      1 ;
      2 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 ;
      4 ; This source code is subject to the terms of the BSD 2 Clause License and
      5 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 ; was not distributed with this source code in the LICENSE file, you can
      7 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 ; Media Patent License 1.0 was not distributed with this source code in the
      9 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 ;
     11 
     12 ;
     13 
     14 %include "aom_ports/x86_abi_support.asm"
     15 
     16 ; TABULATE_SSIM - add the word lanes of xmm3 (s) and xmm4 (r) into the five running sums; clobbers xmm1-xmm3
     17 %macro TABULATE_SSIM 0
     18        movdqa          xmm1,  xmm3   ; private copy of s, squared below
     19        movdqa          xmm2,  xmm4   ; private copy of r, squared below
     20        paddusw         xmm15, xmm3   ; sum_s    += s   (word lanes, unsigned saturating add)
     21        paddusw         xmm14, xmm4   ; sum_r    += r   (word lanes, unsigned saturating add)
     22        pmaddwd         xmm1,  xmm1   ; s*s, neighbouring products added into dword lanes
     23        pmaddwd         xmm2,  xmm2   ; r*r, neighbouring products added into dword lanes
     24        pmaddwd         xmm3,  xmm4   ; s*r, overwrites xmm3 (s is no longer needed)
     25        paddd           xmm13, xmm1   ; sum_sq_s += s*s (dword lanes)
     26        paddd           xmm12, xmm2   ; sum_sq_r += r*r (dword lanes)
     27        paddd           xmm11, xmm3   ; sum_sxr  += s*r (dword lanes)
     28 %endmacro
     29 
     30 ; SUM_ACROSS_Q - add the four dwords of %1 together; the total ends up in the low qword of %1 (expects xmm0 == 0, clobbers xmm2)
     31 %macro SUM_ACROSS_Q 1
     32        movdqa          xmm2, %1      ; keep a copy for the upper half
     33        punpckhdq       xmm2, xmm0    ; xmm2 = dwords 2,3 interleaved with xmm0 -> two qwords
     34        punpckldq       %1,   xmm0    ; %1   = dwords 0,1 interleaved with xmm0 -> two qwords
     35        paddq           %1,   xmm2    ; two partial qword sums
     36        movdqa          xmm2, %1      ; copy again to fold the halves
     37        punpckhqdq      xmm2, xmm0    ; xmm2 = (high qword of the partial sums, xmm0 low qword)
     38        punpcklqdq      %1,   xmm0    ; %1   = (low qword of the partial sums,  xmm0 low qword)
     39        paddq           %1,   xmm2    ; low qword of %1 = grand total
     40 %endmacro
     41 
     42 ; SUM_ACROSS_W - add the eight words of %1 together, starting from word lanes (expects xmm0 == 0, clobbers xmm1 and xmm2)
     43 %macro SUM_ACROSS_W 1
     44        movdqa          xmm1, %1      ; keep a copy for the upper four words
     45        punpckhwd       xmm1, xmm0    ; xmm1 = words 4..7 interleaved with xmm0 -> four dwords
     46        punpcklwd       %1,   xmm0    ; %1   = words 0..3 interleaved with xmm0 -> four dwords
     47        paddd           %1,   xmm1    ; four partial dword sums
     48        SUM_ACROSS_Q    %1            ; finish the reduction on the dword lanes
     49 %endmacro
     50 
     51 SECTION .text
     52 
     53 ;void av1_ssim_parms_16x16_sse2(
     54 ;    unsigned char *s,          arg(0)  source pixels
     55 ;    int sp,                    arg(1)  byte step between source rows
     56 ;    unsigned char *r,          arg(2)  reference pixels
     57 ;    int rp,                    arg(3)  byte step between reference rows
     58 ;    unsigned long *sum_s,      arg(4)  out: sum of s
     59 ;    unsigned long *sum_r,      arg(5)  out: sum of r
     60 ;    unsigned long *sum_sq_s,   arg(6)  out: sum of s*s
     61 ;    unsigned long *sum_sq_r,   arg(7)  out: sum of r*r
     62 ;    unsigned long *sum_sxr);   arg(8)  out: sum of s*r
     63 ;
     64 ; Sums 16 rows of 16 pixels; each total is stored as 32 bits (movd).
     65 ; NOTE(review): "unsigned long *" is 64-bit on LP64 - confirm the C prototype.
     66 ; TODO: Use parm passing through structure, probably don't need the pxors
     67 ; (calling app will initialize to 0), could easily fit everything in sse2
     68 ; without too much hassle, and psadbw or pavgb may give better estimates.
     69 ; First pass at the 16x16 ssim parms, for trying dssim in mode selection.
     70 globalsym(av1_ssim_parms_16x16_sse2)
     71 sym(av1_ssim_parms_16x16_sse2):
     72    push        rbp
     73    mov         rbp, rsp
     74    SHADOW_ARGS_TO_STACK 9      ; macro from the include; the nine arguments are read through arg(n) below
     75    SAVE_XMM 15                 ; macro from the include; presumably preserves the xmm registers used here
     76    push        rsi
     77    push        rdi
     78    ; end prolog
     79 
     80    mov             rsi,        arg(0)       ; rsi = s
     81    movsxd          rcx,        dword arg(1) ; rcx = sp: an int, so sign-extend its low 32 bits (upper half of the slot is not defined)
     82    mov             rdi,        arg(2)       ; rdi = r
     83    movsxd          rax,        dword arg(3) ; rax = rp: an int, so sign-extend its low 32 bits (upper half of the slot is not defined)
     84 
     85    pxor            xmm0, xmm0   ; all-zero partner for every unpack below
     86    pxor            xmm15,xmm15  ;sum_s    (word lanes)
     87    pxor            xmm14,xmm14  ;sum_r    (word lanes)
     88    pxor            xmm13,xmm13  ;sum_sq_s (dword lanes)
     89    pxor            xmm12,xmm12  ;sum_sq_r (dword lanes)
     90    pxor            xmm11,xmm11  ;sum_sxr  (dword lanes)
     91 
     92    mov             rdx, 16      ;row counter
     93 .NextRow:
     94 
     95    ;grab 16 source and 16 reference pixels (unaligned loads)
     96    movdqu          xmm5, [rsi]
     97    movdqu          xmm6, [rdi]
     98    movdqa          xmm3, xmm5   ; xmm5/xmm6 keep the raw bytes for the second half
     99    movdqa          xmm4, xmm6
    100    punpckhbw       xmm3, xmm0 ; high_s: bytes 8..15 widened to words
    101    punpckhbw       xmm4, xmm0 ; high_r: bytes 8..15 widened to words
    102 
    103    TABULATE_SSIM              ; accumulate the high halves (clobbers xmm1-xmm3)
    104 
    105    movdqa          xmm3, xmm5
    106    movdqa          xmm4, xmm6
    107    punpcklbw       xmm3, xmm0 ; low_s: bytes 0..7 widened to words
    108    punpcklbw       xmm4, xmm0 ; low_r: bytes 0..7 widened to words
    109 
    110    TABULATE_SSIM              ; accumulate the low halves
    111 
    112    add             rsi, rcx   ; next s row
    113    add             rdi, rax   ; next r row
    114 
    115    dec             rdx        ; counter
    116    jnz .NextRow
    117 
    118    SUM_ACROSS_W    xmm15      ; word-lane sums  -> single total
    119    SUM_ACROSS_W    xmm14
    120    SUM_ACROSS_Q    xmm13      ; dword-lane sums -> single total
    121    SUM_ACROSS_Q    xmm12
    122    SUM_ACROSS_Q    xmm11
    123 
    124    mov             rdi,arg(4)
    125    movd            [rdi], xmm15 ; *sum_s    = low 32 bits of the total
    126    mov             rdi,arg(5)
    127    movd            [rdi], xmm14 ; *sum_r
    128    mov             rdi,arg(6)
    129    movd            [rdi], xmm13 ; *sum_sq_s
    130    mov             rdi,arg(7)
    131    movd            [rdi], xmm12 ; *sum_sq_r
    132    mov             rdi,arg(8)
    133    movd            [rdi], xmm11 ; *sum_sxr
    134 
    135    ; begin epilog
    136    pop         rdi
    137    pop         rsi
    138    RESTORE_XMM
    139    UNSHADOW_ARGS
    140    pop         rbp
    141    ret
    142 
    143 ;void av1_ssim_parms_8x8_sse2(
    144 ;    unsigned char *s,          arg(0)  source pixels
    145 ;    int sp,                    arg(1)  byte step between source rows
    146 ;    unsigned char *r,          arg(2)  reference pixels
    147 ;    int rp,                    arg(3)  byte step between reference rows
    148 ;    unsigned long *sum_s,      arg(4)  out: sum of s
    149 ;    unsigned long *sum_r,      arg(5)  out: sum of r
    150 ;    unsigned long *sum_sq_s,   arg(6)  out: sum of s*s
    151 ;    unsigned long *sum_sq_r,   arg(7)  out: sum of r*r
    152 ;    unsigned long *sum_sxr);   arg(8)  out: sum of s*r
    153 ;
    154 ; Sums 8 rows of 8 pixels; each total is stored as 32 bits (movd).
    155 ; NOTE(review): "unsigned long *" is 64-bit on LP64 - confirm the C prototype.
    156 ; TODO: Use parm passing through structure, probably don't need the pxors
    157 ; (calling app will initialize to 0), could easily fit everything in sse2
    158 ; without too much hassle, and psadbw or pavgb may give better estimates.
    159 ; First pass at the 8x8 ssim parms, for trying dssim in mode selection.
    160 globalsym(av1_ssim_parms_8x8_sse2)
    161 sym(av1_ssim_parms_8x8_sse2):
    162    push        rbp
    163    mov         rbp, rsp
    164    SHADOW_ARGS_TO_STACK 9      ; macro from the include; the nine arguments are read through arg(n) below
    165    SAVE_XMM 15                 ; macro from the include; presumably preserves the xmm registers used here
    166    push        rsi
    167    push        rdi
    168    ; end prolog
    169 
    170    mov             rsi,        arg(0)       ; rsi = s
    171    movsxd          rcx,        dword arg(1) ; rcx = sp: an int, so sign-extend its low 32 bits (upper half of the slot is not defined)
    172    mov             rdi,        arg(2)       ; rdi = r
    173    movsxd          rax,        dword arg(3) ; rax = rp: an int, so sign-extend its low 32 bits (upper half of the slot is not defined)
    174 
    175    pxor            xmm0, xmm0   ; all-zero partner for every unpack below
    176    pxor            xmm15,xmm15  ;sum_s    (word lanes)
    177    pxor            xmm14,xmm14  ;sum_r    (word lanes)
    178    pxor            xmm13,xmm13  ;sum_sq_s (dword lanes)
    179    pxor            xmm12,xmm12  ;sum_sq_r (dword lanes)
    180    pxor            xmm11,xmm11  ;sum_sxr  (dword lanes)
    181 
    182    mov             rdx, 8      ;row counter
    183 .NextRow:
    184 
    185    ;grab 8 source and 8 reference pixels (64-bit loads)
    186    movq            xmm3, [rsi]
    187    movq            xmm4, [rdi]
    188    punpcklbw       xmm3, xmm0 ; low_s: the 8 bytes widened to words
    189    punpcklbw       xmm4, xmm0 ; low_r: the 8 bytes widened to words
    190 
    191    TABULATE_SSIM              ; accumulate this row (clobbers xmm1-xmm3)
    192 
    193    add             rsi, rcx   ; next s row
    194    add             rdi, rax   ; next r row
    195 
    196    dec             rdx        ; counter
    197    jnz .NextRow
    198 
    199    SUM_ACROSS_W    xmm15      ; word-lane sums  -> single total
    200    SUM_ACROSS_W    xmm14
    201    SUM_ACROSS_Q    xmm13      ; dword-lane sums -> single total
    202    SUM_ACROSS_Q    xmm12
    203    SUM_ACROSS_Q    xmm11
    204 
    205    mov             rdi,arg(4)
    206    movd            [rdi], xmm15 ; *sum_s    = low 32 bits of the total
    207    mov             rdi,arg(5)
    208    movd            [rdi], xmm14 ; *sum_r
    209    mov             rdi,arg(6)
    210    movd            [rdi], xmm13 ; *sum_sq_s
    211    mov             rdi,arg(7)
    212    movd            [rdi], xmm12 ; *sum_sq_r
    213    mov             rdi,arg(8)
    214    movd            [rdi], xmm11 ; *sum_sxr
    215 
    216    ; begin epilog
    217    pop         rdi
    218    pop         rsi
    219    RESTORE_XMM
    220    UNSHADOW_ARGS
    221    pop         rbp
    222    ret