aom_subpixel_bilinear_ssse3.asm (6292B)
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "aom_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; 2-tap (bilinear) sub-pixel filters, SSSE3.
;
; Syntax: NASM/YASM, Intel operand order.  Argument access, symbol decoration
; and register save/restore all go through the macros of the included
; x86_abi_support.asm (arg, sym, globalsym, SHADOW_ARGS_TO_STACK, SAVE_XMM...).
;
; Every exported function reads the same six arguments:
;   arg(0)  src_ptr          pointer to the first source row
;   arg(1)  pixels_per_line  source stride, read as a sign-extended DWORD
;   arg(2)  output_ptr       pointer to the first destination row
;   arg(3)  out_pitch        destination stride, read as a sign-extended DWORD
;   arg(4)  output_height    row count, read as a sign-extended DWORD
;   arg(5)  filter ptr       16 bytes of 16-bit taps, loaded with movdqa and
;                            therefore required to be 16-byte aligned; only
;                            the words at index 3 and 4 (k3, k4) are used
;
; NOTE(review): k3/k4 are narrowed with packsswb, which saturates to
; [-128, 127]; callers are presumably expected not to pass a tap of 128 -
; confirm against the dispatch code.
;
; Register roles inside the row loops (set up by GET_PARAM / GET_PARAM_4):
;   rsi = current source row        rax = source stride
;   rdi = current destination row   rdx = destination stride
;   rcx = rows remaining
;
; A row count <= 0 returns without touching memory.
;-----------------------------------------------------------------------------

; Two packed words of 0x0100.  pmulhrsw computes ((a * b >> 14) + 1) >> 1,
; so multiplying by 0x0100 yields (a + 64) >> 7: round-to-nearest, shift by 7.
%define BILINEAR_ROUND_SHIFT 0x01000100

; GET_PARAM_4: load arguments for the 4-pixel-wide filters.
; Out:   xmm3 = k3,k4 as signed byte pairs repeated across the low 8 bytes
;        xmm2 = rounding/shift multiplier in every word
;        rsi/rdi/rax/rdx/rcx as described in the file header
; Uses only xmm2/xmm3 because the 4-wide functions do not SAVE_XMM.
%macro GET_PARAM_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         ecx, BILINEAR_ROUND_SHIFT

    movdqa      xmm3, [rdx]                 ;load all eight 16-bit taps
    psrldq      xmm3, 6                     ;drop taps 0..2: word0=k3, word1=k4
    packsswb    xmm3, xmm3                  ;narrow to signed bytes: k3,k4,...
    pshuflw     xmm3, xmm3, 0b              ;repeat the k3_k4 pair in low qword

    movd        xmm2, ecx                   ;rounding_shift
    pshufd      xmm2, xmm2, 0               ;broadcast to all four dwords

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

; APPLY_FILTER_4 avg: filter one row of 4 pixels and advance to the next row.
; In:    xmm0 = first-tap pixels, xmm1 = second-tap pixels (low bytes)
; Out:   4 bytes stored at [rdi]; rsi += rax; rdi += rdx; rcx -= 1
;        ZF reflects the final dec, so the caller can follow with jnz.
; Clobb: xmm0, xmm1, flags
; avg != 0 averages the result with the bytes already at [rdi] (pavgb).
%macro APPLY_FILTER_4 1
    punpcklbw   xmm0, xmm1                  ;interleave into (a,b) byte pairs
    pmaddubsw   xmm0, xmm3                  ;a*k3 + b*k4 per pair -> words

    pmulhrsw    xmm0, xmm2                  ;rounding(+64)+shift(>>7)
    packuswb    xmm0, xmm0                  ;pack to byte

%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
%endm

; GET_PARAM: load arguments for the 8- and 16-pixel-wide filters.
; Out:   xmm7 = k3,k4 as signed byte pairs repeated across all 16 bytes
;        xmm6 = rounding/shift multiplier in every word
;        rsi/rdi/rax/rdx/rcx as described in the file header
; Callers bracket their use of xmm6/xmm7 with SAVE_XMM 7 / RESTORE_XMM.
%macro GET_PARAM 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         ecx, BILINEAR_ROUND_SHIFT

    movdqa      xmm7, [rdx]                 ;load all eight 16-bit taps
    psrldq      xmm7, 6                     ;drop taps 0..2: word0=k3, word1=k4
    packsswb    xmm7, xmm7                  ;narrow to signed bytes: k3,k4,...
    pshuflw     xmm7, xmm7, 0b              ;repeat the k3_k4 pair in low qword
    punpcklwd   xmm7, xmm7                  ;widen the pattern to 16 bytes

    movd        xmm6, ecx                   ;rounding_shift
    pshufd      xmm6, xmm6, 0               ;broadcast to all four dwords

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

; APPLY_FILTER_8 avg: filter one row of 8 pixels and advance to the next row.
; In:    xmm0 = first-tap pixels, xmm1 = second-tap pixels (low 8 bytes)
; Out:   8 bytes stored at [rdi]; rsi += rax; rdi += rdx; rcx -= 1
;        ZF reflects the final dec, so the caller can follow with jnz.
; Clobb: xmm0, xmm1, flags
; avg != 0 averages the result with the bytes already at [rdi] (pavgb).
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm1                  ;interleave into (a,b) byte pairs
    pmaddubsw   xmm0, xmm7                  ;a*k3 + b*k4 per pair -> words

    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)
    packuswb    xmm0, xmm0                  ;pack back to byte

%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
%endm

; APPLY_FILTER_16 avg: filter one row of 16 pixels and advance to the next row.
; In:    xmm0 = xmm2 = first-tap pixels, xmm1 = second-tap pixels
; Out:   16 bytes stored at [rdi] (unaligned store); rsi += rax; rdi += rdx;
;        rcx -= 1.  ZF reflects the final dec, so the caller can use jnz.
; Clobb: xmm0, xmm1, xmm2, flags
; avg != 0 averages the result with the bytes already at [rdi] (pavgb).
%macro APPLY_FILTER_16 1
    punpcklbw   xmm0, xmm1                  ;pairs for pixels 0..7
    punpckhbw   xmm2, xmm1                  ;pairs for pixels 8..15
    pmaddubsw   xmm0, xmm7
    pmaddubsw   xmm2, xmm7

    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)
    pmulhrsw    xmm2, xmm6
    packuswb    xmm0, xmm2                  ;pack back to byte

%if %1
    movdqu      xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movdqu      [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
%endm

SECTION .text

;-----------------------------------------------------------------------------
; aom_filter_block1d4_v2_ssse3: vertical 2-tap filter, 4 pixels per row.
; Each output row combines the source row at rsi with the row one stride below.
;-----------------------------------------------------------------------------
globalsym(aom_filter_block1d4_v2_ssse3)
sym(aom_filter_block1d4_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
    test        rcx, rcx                    ;the loop below is do-while:
    jle         .done                       ;skip it when there are no rows
.loop:
    movd        xmm0, [rsi]                 ;current row
    movd        xmm1, [rsi + rax]           ;next row

    APPLY_FILTER_4 0
    jnz         .loop

.done:
    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d8_v2_ssse3: vertical 2-tap filter, 8 pixels per row.
; Each output row combines the source row at rsi with the row one stride below.
;-----------------------------------------------------------------------------
globalsym(aom_filter_block1d8_v2_ssse3)
sym(aom_filter_block1d8_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
    test        rcx, rcx                    ;the loop below is do-while:
    jle         .done                       ;skip it when there are no rows
.loop:
    movq        xmm0, [rsi]                 ;current row
    movq        xmm1, [rsi + rax]           ;next row

    APPLY_FILTER_8 0
    jnz         .loop

.done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d16_v2_ssse3: vertical 2-tap filter, 16 pixels per row.
; Each output row combines the source row at rsi with the row one stride below.
;-----------------------------------------------------------------------------
globalsym(aom_filter_block1d16_v2_ssse3)
sym(aom_filter_block1d16_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
    test        rcx, rcx                    ;the loop below is do-while:
    jle         .done                       ;skip it when there are no rows
.loop:
    movdqu      xmm0, [rsi]                 ;current row
    movdqu      xmm1, [rsi + rax]           ;next row
    movdqa      xmm2, xmm0                  ;copy for the high-half interleave

    APPLY_FILTER_16 0
    jnz         .loop

.done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d4_h2_ssse3: horizontal 2-tap filter, 4 pixels per row.
; Each output pixel combines source byte i with source byte i + 1.
; Loads 16 source bytes per row although only the first 5 affect the output.
;-----------------------------------------------------------------------------
globalsym(aom_filter_block1d4_h2_ssse3)
sym(aom_filter_block1d4_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
    test        rcx, rcx                    ;the loop below is do-while:
    jle         .done                       ;skip it when there are no rows
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ;same row shifted left one pixel

    APPLY_FILTER_4 0
    jnz         .loop

.done:
    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d8_h2_ssse3: horizontal 2-tap filter, 8 pixels per row.
; Each output pixel combines source byte i with source byte i + 1.
; Loads 16 source bytes per row although only the first 9 affect the output.
;-----------------------------------------------------------------------------
globalsym(aom_filter_block1d8_h2_ssse3)
sym(aom_filter_block1d8_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
    test        rcx, rcx                    ;the loop below is do-while:
    jle         .done                       ;skip it when there are no rows
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ;same row shifted left one pixel

    APPLY_FILTER_8 0
    jnz         .loop

.done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d16_h2_ssse3: horizontal 2-tap filter, 16 pixels per row.
; Each output pixel combines source byte i with source byte i + 1.
; Reads 17 source bytes per row (two overlapping 16-byte unaligned loads).
;-----------------------------------------------------------------------------
globalsym(aom_filter_block1d16_h2_ssse3)
sym(aom_filter_block1d16_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
    test        rcx, rcx                    ;the loop below is do-while:
    jle         .done                       ;skip it when there are no rows
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqu      xmm1, [rsi + 1]             ;same row, one pixel to the right
    movdqa      xmm2, xmm0                  ;copy for the high-half interleave

    APPLY_FILTER_16 0
    jnz         .loop

.done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret