flacdsp.asm (9431B)
;******************************************************************************
;* FLAC DSP SIMD optimizations
;*
;* Copyright (C) 2014 Loren Merritt
;* Copyright (C) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; pshufb control bytes used by the indep2 variant of FLAC_DECORRELATE_16.
; Indices 0,1,4,5,8,9,12,13 pick the low 16-bit word of each dword; the -1
; entries have their top bit set, which makes pshufb write a zero byte.
vector: db 0,1,4,5,8,9,12,13,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,1,4,5,8,9,12,13

SECTION .text

;------------------------------------------------------------------------------
; PMACSDQL acc, a, b
; acc += (signed low dword of each qword of a) * (same of b), as 64-bit lanes.
; %1 = accumulator register
; %2 = first factor  (overwritten with the product on the non-XOP path)
; %3 = second factor (left untouched)
;------------------------------------------------------------------------------
%macro PMACSDQL 3
%if cpuflag(xop)
    pmacsdql %1, %2, %3, %1
%else
    pmuldq   %2, %3
    paddq    %1, %2
%endif
%endmacro

;------------------------------------------------------------------------------
; LPC_32 cpu, suffix, shift_insn
; Emits flac_lpc_<suffix> for instruction set <cpu>.
; %1 = cpu flag name handed to INIT_XMM
; %2 = suffix of the generated function name (16 or 32 below)
; %3 = shift instruction applied to each accumulator, with the count in m4
;      (m4 is loaded from qlevel)
;
; Register roles inside the function:
;   decodedq    -> sample currently being reconstructed (after the add of 8)
;   coeffsq     -> one past the last coefficient
;   pred_orderq =  -pred_order, used as a negative index into both arrays
;   jq          =  negative tap index, counts up towards zero
;   m2 / m3     =  accumulators for the first / second sample of the pair
;   lend        =  len - pred_order, decremented by 2 per outer iteration
;------------------------------------------------------------------------------
%macro LPC_32 3
INIT_XMM %1
cglobal flac_lpc_%2, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
    sub    lend, pred_orderd
    jle .ret                                ; nothing to predict
    movsxdifnidn pred_orderq, pred_orderd
    lea    decodedq, [decodedq+pred_orderq*4-8]
    lea    coeffsq, [coeffsq+pred_orderq*4]
    neg    pred_orderq
    movd   m4, qlevelm
ALIGN 16
.loop_sample:
    ; Two output samples are produced per pass of this loop.
    movd   m0, [decodedq+pred_orderq*4+8]   ; oldest history sample
    add    decodedq, 8
    movd   m1, [coeffsq+pred_orderq*4]      ; first coefficient
    pxor   m2, m2
    pxor   m3, m3
    lea    jq, [pred_orderq+1]
    test   jq, jq
    jz .end_order                           ; pred_order == 1: single tap
.loop_order:
    ; Each loaded sample feeds both accumulators: with the current
    ; coefficient for m2 and with the previous one for m3.
    PMACSDQL m2, m0, m1
    movd   m0, [decodedq+jq*4]
    PMACSDQL m3, m1, m0
    movd   m1, [coeffsq+jq*4]
    inc    jq
    jl .loop_order
.end_order:
    PMACSDQL m2, m0, m1                     ; last tap of the first sample
    %3     m2, m4
    movd   m0, [decodedq]
    paddd  m0, m2
    movd   [decodedq], m0
    sub    lend, 2
    jl .ret                                 ; odd count: no second sample
    ; The last tap of the second sample uses the value just written above.
    PMACSDQL m3, m1, m0
    %3     m3, m4
    movd   m1, [decodedq+4]
    paddd  m1, m3
    movd   [decodedq+4], m1
    jg .loop_sample                         ; flags still come from "sub lend, 2"
.ret:
    RET
%endmacro

LPC_32 sse4, 16, psrad
LPC_32 sse4, 32, psrlq
%if HAVE_XOP_EXTERNAL
LPC_32 xop,  32, psrlq
%endif

;------------------------------------------------------------------------------
; flac_wasted_32(decoded, wasted, len)
; Shifts every 32-bit element of decoded left by "wasted" bits, in place.
; Each iteration handles 4 vectors (mmsize*4 bytes) with aligned accesses and
; the loop body runs before the count is tested.
; NOTE(review): this presumably relies on the caller providing an aligned
; buffer padded to a multiple of 16 elements - confirm against the C caller.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal flac_wasted_32, 3,3,5, decoded, wasted, len
    shl    lend, 2                          ; element count -> byte count
    add    decodedq, lenq                   ; point at the end of the buffer
    neg    lenq                             ; and index it with a negative offset
    movd   m4, wastedd
ALIGN 16
.loop:
    mova   m0, [decodedq+lenq+mmsize*0]
    mova   m1, [decodedq+lenq+mmsize*1]
    mova   m2, [decodedq+lenq+mmsize*2]
    mova   m3, [decodedq+lenq+mmsize*3]
    pslld  m0, m4
    pslld  m1, m4
    pslld  m2, m4
    pslld  m3, m4
    mova   [decodedq+lenq+mmsize*0], m0
    mova   [decodedq+lenq+mmsize*1], m1
    mova   [decodedq+lenq+mmsize*2], m2
    mova   [decodedq+lenq+mmsize*3], m3
    add    lenq, mmsize * 4
    jl .loop
    RET

;------------------------------------------------------------------------------
; flac_wasted_33(decoded, residuals, wasted, len)
; Sign-extends the 32-bit elements of residuals to 64 bits, shifts them left
; by "wasted" bits and stores the 64-bit results to decoded.
; Each iteration consumes mmsize*2 bytes of residuals (8 elements) and writes
; mmsize*4 bytes of decoded; the stores are aligned (mova).
;------------------------------------------------------------------------------
INIT_XMM sse4
cglobal flac_wasted_33, 4,4,5, decoded, residuals, wasted, len
    shl    lend, 2                          ; element count -> input byte count
    lea    decodedq, [decodedq+lenq*2]      ; output elements are twice as wide
    add    residualsq, lenq
    neg    lenq
    movd   m4, wastedd
ALIGN 16
.loop:
    pmovsxdq m0, [residualsq+lenq+mmsize*0]
    pmovsxdq m1, [residualsq+lenq+mmsize/2]
    pmovsxdq m2, [residualsq+lenq+mmsize*1]
    pmovsxdq m3, [residualsq+lenq+mmsize*1+mmsize/2]
    psllq  m0, m4
    psllq  m1, m4
    psllq  m2, m4
    psllq  m3, m4
    mova   [decodedq+lenq*2+mmsize*0], m0
    mova   [decodedq+lenq*2+mmsize*1], m1
    mova   [decodedq+lenq*2+mmsize*2], m2
    mova   [decodedq+lenq*2+mmsize*3], m3
    add    lenq, mmsize * 2
    jl .loop
    RET

;----------------------------------------------------------------------------------
;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
;                                                   int len, int shift);
;----------------------------------------------------------------------------------
;%1 = mode (ls, rs, ms or indep2)
;%2 = register index of the channel written first in each output pair
;%3 = register index of the channel written second in each output pair
;%4 = add/sub, selects the p<op>d that rebuilds the missing channel
;     (not used for indep2)
%macro FLAC_DECORRELATE_16 3-4
cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
%ifidn %1, indep2
    VBROADCASTI128 m2, [vector]             ; shuffle mask, see "vector"
%endif
%if ARCH_X86_32
    mov      lend, lenm
%endif
    movd       m3, r4m                      ; shift count (5th argument)
    shl      lend, 2                        ; samples -> bytes of 32-bit input
    mov      in1q, [in0q + gprsize]
    mov      in0q, [in0q]
    mov      outq, [outq]
    ; All three pointers are moved to the end and indexed by a negative
    ; byte offset; the 16-bit stereo output advances at the same byte rate
    ; as one 32-bit input channel.
    add      in1q, lenq
    add      in0q, lenq
    add      outq, lenq
    neg      lenq

align 16
.loop:
    mova       m0, [in0q + lenq]
    mova       m1, [in1q + lenq]
%ifidn %1, ms
    psrad      m2, m1, 1
    psubd      m0, m2
%endif
%ifnidn %1, indep2
    p%4d       m2, m0, m1
    ; Narrow to 16 bits with signed saturation, interleave, then shift.
    packssdw  m%2, m%2
    packssdw  m%3, m%3
    punpcklwd m%2, m%3
    psllw     m%2, m3
%else
    ; Shift as 32-bit values first, then keep the low word of each dword.
    pslld     m%2, m3
    pslld     m%3, m3
    pshufb    m%2, m%2, m2
    pshufb    m%3, m%3, m2
    punpcklwd m%2, m%3
%endif
    mova [outq + lenq], m%2
    add      lenq, 16
    jl .loop
    RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_16 ls, 0, 2, sub
FLAC_DECORRELATE_16 rs, 2, 1, add
FLAC_DECORRELATE_16 ms, 2, 0, add

;----------------------------------------------------------------------------------
;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
;                                        int len, int shift);
;----------------------------------------------------------------------------------
;%1 = mode (ls, rs or ms)
;%2 = register index of the channel written first in each output pair
;%3 = register index of the channel written second in each output pair
;%4 = scratch register index for SBUTTERFLY
;%5 = add/sub, selects the p<op>d that rebuilds the missing channel
%macro FLAC_DECORRELATE_32 5
cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
%if ARCH_X86_32
    mov      lend, lenm
%endif
    movd       m3, r4m                      ; shift count (5th argument)
    mov      in1q, [in0q + gprsize]
    mov      in0q, [in0q]
    mov      outq, [outq]
    sub      in1q, in0q                     ; in1q becomes an offset from in0q

align 16
.loop:
    mova       m0, [in0q]
    mova       m1, [in0q + in1q]
%ifidn %1, ms
    psrad      m2, m1, 1
    psubd      m0, m2
%endif
    p%5d       m2, m0, m1
    pslld     m%2, m3
    pslld     m%3, m3

    SBUTTERFLY dq, %2, %3, %4               ; interleave the two channels

    mova  [outq         ], m%2
    mova  [outq + mmsize], m%3

    add      in0q, mmsize
    add      outq, mmsize*2
    sub      lend, mmsize/4                 ; mmsize/4 samples per channel done
    jg .loop
    RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
FLAC_DECORRELATE_32 rs, 2, 1, 0, add
FLAC_DECORRELATE_32 ms, 2, 0, 1, add

;-----------------------------------------------------------------------------------------
;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
;                                            int len, int shift);
;-----------------------------------------------------------------------------------------
;%1 = bps
;%2 = channels
;%3 = last xmm reg used
;%4 = word/dword (shift instruction)
%macro FLAC_DECORRELATE_INDEP 4
%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
%if ARCH_X86_32
%if %2 == 6
    ; Six channel pointers are kept in registers here and len is used
    ; directly from its stack slot instead.
    DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
    %define  lend  dword r3m
%else
    mov      lend, lenm
%endif
%endif
    movd      m%3, r4m                      ; shift count (5th argument)

    ; Fetch the remaining channel pointers from the in[] array.
%assign %%i 1
%rep %2-1
    mov      in %+ %%i %+ q, [in0q+%%i*gprsize]
%assign %%i %%i+1
%endrep

    mov      in0q, [in0q]
    mov      outq, [outq]

    ; Turn in1..inN into offsets from in0 so only in0q has to advance.
%assign %%i 1
%rep %2-1
    sub      in %+ %%i %+ q, in0q
%assign %%i %%i+1
%endrep

align 16
.loop:
    mova       m0, [in0q]

%assign %%i 1
%rep REPCOUNT-1
    mova     m %+ %%i, [in0q + in %+ %%i %+ q]
%assign %%i %%i+1
%endrep

%if %1 == 32

    ; 32-bit output: transpose so the samples of all channels are interleaved.
%if %2 == 8
    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
%elif %2 == 6
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6

    punpcklqdq m6, m0, m2
    punpckhqdq m2, m4
    shufps     m4, m0, 0xe4
    punpcklqdq m0, m1, m3
    punpckhqdq m3, m5
    shufps     m5, m1, 0xe4
    SWAP 0,6,1,4,5,3
%elif %2 == 4
    TRANSPOSE4x4D 0, 1, 2, 3, 4
%else ; %2 == 2
    SBUTTERFLY dq, 0, 1, 2
%endif

%else ; %1 == 16

    ; 16-bit output: the upper half of the channels is packed in straight
    ; from memory (signed saturation) next to the lower half already loaded.
%if %2 == 8
    packssdw   m0, [in0q + in4q]
    packssdw   m1, [in0q + in5q]
    packssdw   m2, [in0q + in6q]
    packssdw   m3, [in0q + in7q]
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
%elif %2 == 6
    packssdw   m0, [in0q + in3q]
    packssdw   m1, [in0q + in4q]
    packssdw   m2, [in0q + in5q]
    pshufd     m3, m0, q1032
    punpcklwd  m0, m1
    punpckhwd  m1, m2
    punpcklwd  m2, m3

    shufps     m3, m0, m2, q2020
    shufps     m0, m1, q2031
    shufps     m2, m1, q3131
    shufps     m1, m2, m3, q3120
    shufps     m3, m0,     q0220
    shufps     m0, m2,     q3113
    SWAP 2, 0, 3
%else ; %2 == 4
    packssdw   m0, [in0q + in2q]
    packssdw   m1, [in0q + in3q]
    SBUTTERFLY wd, 0, 1, 2
    SBUTTERFLY dq, 0, 1, 2
%endif

%endif

    ; Apply the shift to every output register.
%assign %%i 0
%rep REPCOUNT
    psll%4   m %+ %%i, m%3
%assign %%i %%i+1
%endrep

%assign %%i 0
%rep REPCOUNT
    mova [outq + %%i*mmsize], m %+ %%i
%assign %%i %%i+1
%endrep

    add      in0q, mmsize
    add      outq, mmsize*REPCOUNT
    sub      lend, mmsize/4                 ; mmsize/4 samples per channel done
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
FLAC_DECORRELATE_INDEP 32, 2, 3, d
FLAC_DECORRELATE_INDEP 16, 4, 3, w
FLAC_DECORRELATE_INDEP 32, 4, 5, d
FLAC_DECORRELATE_INDEP 16, 6, 4, w
FLAC_DECORRELATE_INDEP 32, 6, 7, d
%if ARCH_X86_64
FLAC_DECORRELATE_INDEP 16, 8, 5, w
FLAC_DECORRELATE_INDEP 32, 8, 9, d
%endif

INIT_XMM avx
FLAC_DECORRELATE_INDEP 32, 4, 5, d
FLAC_DECORRELATE_INDEP 32, 6, 7, d
%if ARCH_X86_64
FLAC_DECORRELATE_INDEP 16, 8, 5, w
FLAC_DECORRELATE_INDEP 32, 8, 9, d
%endif