ssim_sse2_x86_64.asm (3901B)
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "aom_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; TABULATE_SSIM
;   Folds one batch of samples into the five running accumulators.
;   In:    xmm3 = source (s) samples, xmm4 = reference (r) samples,
;          both treated as packed 16-bit words
;   Out:   xmm15 += s        (saturating word add)    sum_s
;          xmm14 += r        (saturating word add)    sum_r
;          xmm13 += s*s      (pairwise dword sums)    sum_sq_s
;          xmm12 += r*r      (pairwise dword sums)    sum_sq_r
;          xmm11 += s*r      (pairwise dword sums)    sum_sxr
;   Clobb: xmm1, xmm2, xmm3
;-----------------------------------------------------------------------
%macro TABULATE_SSIM 0
        ; Take private copies first; xmm3 itself is overwritten by the
        ; cross product below, so it is consumed last.
        movdqa          xmm1,  xmm3
        movdqa          xmm2,  xmm4
        pmaddwd         xmm1,  xmm1             ; s*s, adjacent pairs summed
        pmaddwd         xmm2,  xmm2             ; r*r, adjacent pairs summed

        paddusw         xmm15, xmm3             ; sum_s
        paddusw         xmm14, xmm4             ; sum_r

        pmaddwd         xmm3,  xmm4             ; s*r, adjacent pairs summed

        paddd           xmm13, xmm1             ; sum_sq_s
        paddd           xmm12, xmm2             ; sum_sq_r
        paddd           xmm11, xmm3             ; sum_sxr
%endmacro

;-----------------------------------------------------------------------
; SUM_ACROSS_Q reg
;   Horizontally adds the four dwords held in %1.
;   In:    %1   = four packed dwords
;          xmm0 = interleaved in as the upper halves, so it must be all
;                 zero for the dwords to be zero-extended
;   Out:   low qword of %1 = total of the four dwords
;   Clobb: xmm2
;-----------------------------------------------------------------------
%macro SUM_ACROSS_Q 1
        ; dwords -> qwords, then add the low pair to the high pair
        movdqa          xmm2, %1
        punpckldq       %1,   xmm0
        punpckhdq       xmm2, xmm0
        paddq           %1,   xmm2

        ; add the high qword onto the low qword
        movdqa          xmm2, %1
        punpcklqdq      %1,   xmm0
        punpckhqdq      xmm2, xmm0
        paddq           %1,   xmm2
%endmacro

;-----------------------------------------------------------------------
; SUM_ACROSS_W reg
;   Horizontally adds the eight words held in %1.
;   In:    %1   = eight packed words
;          xmm0 = interleaved in as the upper halves, so it must be all
;                 zero for the words to be zero-extended
;   Out:   low qword of %1 = total of the eight words
;   Clobb: xmm1, xmm2 (xmm2 via SUM_ACROSS_Q)
;-----------------------------------------------------------------------
%macro SUM_ACROSS_W 1
        ; words -> dwords, add the two halves, then finish as dwords
        movdqa          xmm1, %1
        punpcklwd       %1,   xmm0
        punpckhwd       xmm1, xmm0
        paddd           %1,   xmm1
        SUM_ACROSS_Q    %1
%endmacro

SECTION .text

;void aom_ssim_parms_8x8_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    uint32_t *sum_s,
;    uint32_t *sum_r,
;    uint32_t *sum_sq_s,
;    uint32_t *sum_sq_r,
;    uint32_t *sum_sxr);
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb At this point this is just meant to be first pass for calculating
; all the
; parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
;
;-----------------------------------------------------------------------
; aom_ssim_parms_8x8_sse2
;   Walks 8 rows of 8 bytes from s and from r and stores five 32-bit
;   totals: sum(s), sum(r), sum(s*s), sum(r*r) and sum(s*r).
;
;   Arguments are fetched with arg(n); arg(), SHADOW_ARGS_TO_STACK,
;   SAVE_XMM, RESTORE_XMM and UNSHADOW_ARGS are supplied by the included
;   ABI support file (their expansion is not shown in this file).
;
;   arg(0) s         source pixels
;   arg(1) sp        source stride in bytes (C int)
;   arg(2) r         reference pixels
;   arg(3) rp        reference stride in bytes (C int)
;   arg(4)..arg(8)   uint32_t* outputs: sum_s, sum_r, sum_sq_s,
;                    sum_sq_r, sum_sxr
;
;   Register roles inside the body:
;     rsi / rdi     current row of s / r
;     rcx / rax     sign-extended strides sp / rp
;     rdx           rows remaining
;     xmm0          constant zero (used for all zero-extensions)
;     xmm15..xmm11  accumulators: sum_s, sum_r, sum_sq_s, sum_sq_r,
;                   sum_sxr
;     xmm1..xmm4    scratch
;-----------------------------------------------------------------------
globalsym(aom_ssim_parms_8x8_sse2)
sym(aom_ssim_parms_8x8_sse2):
        push            rbp
        mov             rbp, rsp
        SHADOW_ARGS_TO_STACK 9
        SAVE_XMM 15
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi, arg(0)             ; s
        ; sp and rp are C ints: only the low 32 bits of their argument
        ; slots are defined, so sign-extend them instead of loading the
        ; raw 64-bit slot (keeps negative strides and dirty upper bits
        ; from corrupting the pointer arithmetic below).
        movsxd          rcx, dword arg(1)       ; sp
        mov             rdi, arg(2)             ; r
        movsxd          rax, dword arg(3)       ; rp

        pxor            xmm0,  xmm0             ; zero for unpacking
        pxor            xmm15, xmm15            ; sum_s
        pxor            xmm14, xmm14            ; sum_r
        pxor            xmm13, xmm13            ; sum_sq_s
        pxor            xmm12, xmm12            ; sum_sq_r
        pxor            xmm11, xmm11            ; sum_sxr

        mov             rdx, 8                  ; row counter
.NextRow:

        ; grab 8 source and 8 reference pixels, widen bytes to words
        movq            xmm3, [rsi]
        movq            xmm4, [rdi]
        punpcklbw       xmm3, xmm0              ; low_s
        punpcklbw       xmm4, xmm0              ; low_r

        TABULATE_SSIM

        add             rsi, rcx                ; next s row
        add             rdi, rax                ; next r row

        dec             rdx                     ; counter
        jnz             .NextRow

        ; reduce every accumulator to a single total in its low lane
        SUM_ACROSS_W    xmm15
        SUM_ACROSS_W    xmm14
        SUM_ACROSS_Q    xmm13
        SUM_ACROSS_Q    xmm12
        SUM_ACROSS_Q    xmm11

        ; store the low 32 bits of each total through the output pointers
        mov             rdi, arg(4)
        movd            [rdi], xmm15            ; *sum_s
        mov             rdi, arg(5)
        movd            [rdi], xmm14            ; *sum_r
        mov             rdi, arg(6)
        movd            [rdi], xmm13            ; *sum_sq_s
        mov             rdi, arg(7)
        movd            [rdi], xmm12            ; *sum_sq_r
        mov             rdi, arg(8)
        movd            [rdi], xmm11            ; *sum_sxr

        ; begin epilog
        pop             rdi
        pop             rsi
        RESTORE_XMM
        UNSHADOW_ARGS
        pop             rbp
        ret