tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ssim_sse2_x86_64.asm (3901B)


      1 ;
      2 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 ;
      4 ; This source code is subject to the terms of the BSD 2 Clause License and
      5 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 ; was not distributed with this source code in the LICENSE file, you can
      7 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 ; Media Patent License 1.0 was not distributed with this source code in the
      9 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 ;
     11 
     12 ;
     13 
     14 %include "aom_ports/x86_abi_support.asm"
     15 
     16 ; TABULATE_SSIM - fold one row (xmm3 = s, xmm4 = r, as words) into the five running sums
     17 %macro TABULATE_SSIM 0
     18        movdqa          xmm1, xmm3           ; scratch copy of s (xmm1 is clobbered)
     19        movdqa          xmm2, xmm4           ; scratch copy of r (xmm2 is clobbered)
     20        paddusw         xmm15, xmm3          ; sum_s    += s   (saturating word add)
     21        paddusw         xmm14, xmm4          ; sum_r    += r
     22        pmaddwd         xmm1, xmm1           ; s*s, adjacent products paired into dwords
     23        pmaddwd         xmm2, xmm2           ; r*r
     24        pmaddwd         xmm3, xmm4           ; s*r (overwrites xmm3; xmm4 is left intact)
     25        paddd           xmm13, xmm1          ; sum_sq_s += s*s
     26        paddd           xmm12, xmm2          ; sum_sq_r += r*r
     27        paddd           xmm11, xmm3          ; sum_sxr  += s*r
     28 %endmacro
     29 
     30 ; SUM_ACROSS_Q %1 - add the four dwords of %1; the total ends up in the low qword of %1
     31 %macro SUM_ACROSS_Q 1
     32        movdqa          xmm2, %1             ; scratch copy (xmm2 is clobbered)
     33        punpckhdq       xmm2, xmm0           ; dwords 2,3 interleaved with xmm0 (expected to be 0) -> 2 qwords
     34        punpckldq       %1, xmm0             ; dwords 0,1 widened the same way
     35        paddq           %1, xmm2             ; two partial qword sums
     36        movdqa          xmm2, %1
     37        punpckhqdq      xmm2, xmm0           ; upper partial sum moved down to the low qword
     38        punpcklqdq      %1, xmm0             ; lower partial sum kept, high qword taken from xmm0
     39        paddq           %1, xmm2             ; low qword of %1 = total
     40 %endmacro
     41 
     42 ; SUM_ACROSS_W %1 - add the eight words of %1; the total ends up in the low qword of %1
     43 %macro SUM_ACROSS_W 1
     44        movdqa          xmm1, %1             ; scratch copy (xmm1 is clobbered)
     45        punpckhwd       xmm1, xmm0           ; words 4..7 interleaved with xmm0 (expected to be 0) -> dwords
     46        punpcklwd       %1, xmm0             ; words 0..3 widened the same way
     47        paddd           %1, xmm1             ; four partial dword sums
     48        SUM_ACROSS_Q    %1                   ; finish the reduction (also clobbers xmm2)
     49 %endmacro
     50 
     51 SECTION .text
     52 
     53 ;void aom_ssim_parms_8x8_sse2(
     54 ;    unsigned char *s,       // source block
     55 ;    int sp,                 // byte step between source rows
     56 ;    unsigned char *r,       // reference block
     57 ;    int rp,                 // byte step between reference rows
     58 ;    uint32_t *sum_s,        // out: sum of s
     59 ;    uint32_t *sum_r,        // out: sum of r
     60 ;    uint32_t *sum_sq_s,     // out: sum of s*s
     61 ;    uint32_t *sum_sq_r,     // out: sum of r*r
     62 ;    uint32_t *sum_sxr);     // out: sum of s*r
     63 ; Walks 8 rows of 8 pixels from s and r and stores the five totals listed above.
     64 ; sp and rp are C ints, so only their low 32 bits are read and sign-extended.
     65 ; TODO: Use parm passing through a structure. The pxors are probably not needed
     66 ; (the calling app will initialize to 0). Everything could easily fit in sse2
     67 ; without too much hassle, and psadbw or pavgb can probably give better
     68 ; estimates. For now this is just a first pass at calculating all the parms
     69 ; needed for 16x16 ssim so we can play with dssim as distortion in mode selection.
     70 globalsym(aom_ssim_parms_8x8_sse2)
     71 sym(aom_ssim_parms_8x8_sse2):
     72    push        rbp
     73    mov         rbp, rsp
     74    SHADOW_ARGS_TO_STACK 9
     75    SAVE_XMM 15
     76    push        rsi
     77    push        rdi
     78    ; end prolog
     79 
     80    mov             rsi,        arg(0)        ; rsi = s
     81    movsxd          rcx,        dword arg(1)  ; rcx = sp, sign-extended: the ABI leaves the upper half of an int arg undefined
     82    mov             rdi,        arg(2)        ; rdi = r
     83    movsxd          rax,        dword arg(3)  ; rax = rp, sign-extended for the same reason
     84 
     85    pxor            xmm0, xmm0   ; all-zero operand for the unpacks here and in SUM_ACROSS_*
     86    pxor            xmm15,xmm15  ;sum_s
     87    pxor            xmm14,xmm14  ;sum_r
     88    pxor            xmm13,xmm13  ;sum_sq_s
     89    pxor            xmm12,xmm12  ;sum_sq_r
     90    pxor            xmm11,xmm11  ;sum_sxr
     91 
     92    mov             rdx, 8      ;row counter
     93 .NextRow:
     94 
     95    ;grab 8 source and 8 reference pixels, then widen bytes to words
     96    movq            xmm3, [rsi]
     97    movq            xmm4, [rdi]
     98    punpcklbw       xmm3, xmm0 ; low_s
     99    punpcklbw       xmm4, xmm0 ; low_r
    100 
    101    TABULATE_SSIM              ; accumulate this row; clobbers xmm1-xmm3
    102 
    103    add             rsi, rcx   ; next s row
    104    add             rdi, rax   ; next r row
    105 
    106    dec             rdx        ; counter
    107    jnz .NextRow
    108 
    109    SUM_ACROSS_W    xmm15      ; word lanes  -> single total
    110    SUM_ACROSS_W    xmm14
    111    SUM_ACROSS_Q    xmm13      ; dword lanes -> single total
    112    SUM_ACROSS_Q    xmm12
    113    SUM_ACROSS_Q    xmm11
    114 
    115    mov             rdi,arg(4)
    116    movd            [rdi], xmm15 ; *sum_s    = low 32 bits of the total
    117    mov             rdi,arg(5)
    118    movd            [rdi], xmm14 ; *sum_r
    119    mov             rdi,arg(6)
    120    movd            [rdi], xmm13 ; *sum_sq_s
    121    mov             rdi,arg(7)
    122    movd            [rdi], xmm12 ; *sum_sq_r
    123    mov             rdi,arg(8)
    124    movd            [rdi], xmm11 ; *sum_sxr
    125 
    126    ; begin epilog
    127    pop         rdi
    128    pop         rsi
    129    RESTORE_XMM
    130    UNSHADOW_ARGS
    131    pop         rbp
    132    ret