; videodsp.asm
;******************************************************************************
;* Core video DSP functions
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; slow vertical extension loop function. Works with variable-width, and
; does per-line reading/writing of source data

; Copy one set of rows of $mmsize-wide chunks from src to dst.
; w counts upward from a negative value to 0 (it was negated by the
; caller), so the x loop walks left-to-right and finishes with one
; overlapping store covering the final partial chunk.
; %1 = row type (top/body/bottom); src only advances for "body" rows,
;      so "top"/"bottom" replicate a single source line %2 times.
; %2 = register holding the row count (destroyed)
%macro V_COPY_ROW 2 ; type (top/body/bottom), h
.%1_y_loop:                                     ; do {
    mov              wq, r7mp                   ;   initialize w (r7mp = wmp)
.%1_x_loop:                                     ;   do {
    movu             m0, [srcq+wq]              ;     m0 = read($mmsize)
    movu      [dstq+wq], m0                     ;     write(m0, $mmsize)
    add              wq, mmsize                 ;     w -= $mmsize
    cmp              wq, -mmsize                ;   } while (w > $mmsize);
    jl .%1_x_loop
    movu             m0, [srcq-mmsize]          ;   m0 = read($mmsize)
    movu [dstq-mmsize], m0                      ;   write(m0, $mmsize)
%ifidn %1, body                                 ;   if ($type == body) {
    add            srcq, src_strideq            ;     src += src_stride
%endif                                          ;   }
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec              %2                         ; } while (--$h);
    jnz .%1_y_loop
%endmacro

; .----. <- zero
; |    | <- top is copied from first line in body of source
; |----| <- start_y
; |    | <- body is copied verbatim (line-by-line) from source
; |----| <- end_y
; |    | <- bottom is copied from last line in body of source
; '----' <- bh

; emu_edge_vvar(dst, dst_stride, src, src_stride, start_y, end_y, bh, w)
; Variable-width vertical edge extension. The width argument is always
; read from its stack slot r7mp (= wmp), even on x86-64, since the
; function takes 8 arguments but only 7 are register-loaded here.
INIT_XMM sse
%if ARCH_X86_64
cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
                                start_y, end_y, bh, w
%else ; x86-32
cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
%define src_strideq r3mp
%define dst_strideq r1mp
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%endif
    sub             bhq, end_yq                 ; bh    -= end_y
    sub          end_yq, start_yq               ; end_y -= start_y
    add            srcq, r7mp                   ; (r7mp = wmp)
    add            dstq, r7mp                   ; (r7mp = wmp)
    neg            r7mp                         ; (r7mp = wmp)
    test       start_yq, start_yq               ; if (start_y) {
    jz .body
    V_COPY_ROW      top, start_yq               ;   v_copy_row(top, start_y)
.body:                                          ; }
    V_COPY_ROW     body, end_yq                 ; v_copy_row(body, end_y)
    test            bhq, bhq                    ; if (bh) {
    jz .end
    sub            srcq, src_strideq            ;   src -= src_stride
    V_COPY_ROW   bottom, bhq                    ;   v_copy_row(bottom, bh)
.end:                                           ; }
    RET

; emu_edge_hvar(dst, dst_stride, start_x, n_words, h)
; Variable-width horizontal edge extension: on each of h rows, splat the
; byte at dst[start_x] across n_words 16-bit words. The 6th name (w) is
; a scratch register only (5 arguments, 6 registers in cglobal).
%macro hvar_fn 0
cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
    lea            dstq, [dstq+n_wordsq*2]
    neg        n_wordsq
    lea        start_xq, [start_xq+n_wordsq*2]
.y_loop:                                        ; do {
%if cpuflag(avx2)
    vpbroadcastb     m0, [dstq+start_xq]
    mov              wq, n_wordsq               ;   initialize w
%else
    movzx            wd, byte [dstq+start_xq]   ;   w = read(1)
    imul             wd, 0x01010101             ;   w *= 0x01010101
    movd             m0, wd
    mov              wq, n_wordsq               ;   initialize w
    pshufd           m0, m0, q0000              ;   splat
%endif ; avx2
.x_loop:                                        ;   do {
    movu    [dstq+wq*2], m0                     ;     write($reg, $mmsize)
    add              wq, mmsize/2               ;     w -= $mmsize/2
    cmp              wq, -(mmsize/2)            ;   } while (w > $mmsize/2)
    jl .x_loop
    movu  [dstq-mmsize], m0                     ;   write($reg, $mmsize)
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec              hq                         ; } while (h--)
    jnz .y_loop
    RET
%endmacro

INIT_XMM sse2
hvar_fn

%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
hvar_fn
%endif

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on sse,  - fills xmm0-15 for consecutive sets of 16 pixels
;          - if (%2 & 8)  fills 8 bytes into xmm$next
;          - if (%2 & 4)  fills 4 bytes into xmm$next
;          - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
; writing data out is in the same way
%macro READ_NUM_BYTES 2
%assign %%off 0     ; offset in source buffer
%assign %%xmm_idx 0 ; xmm register index

; full $mmsize-byte chunks
%rep %2/mmsize
    movu  xmm %+ %%xmm_idx, [srcq+%%off]
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %%off+mmsize
%endrep ; %2/mmsize

%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
    ; reload the last 16 bytes (overlapping previous data) rather than
    ; issuing a separate movq + movd pair for the 9..15-byte tail
    movu  xmm %+ %%xmm_idx, [srcq+%2-16]
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
    movq  xmm %+ %%xmm_idx, [srcq+%%off]
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8

%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
    movq  xmm %+ %%xmm_idx, [srcq+%2-8]
%assign %%off %2
%else
    movd  xmm %+ %%xmm_idx, [srcq+%%off]
%assign %%off %%off+4
%endif
%assign %%xmm_idx %%xmm_idx+1
%endif ; (%2-%%off) >= 4

%if (%2-%%off) >= 1
%if %2 >= 4
    movd  xmm %+ %%xmm_idx, [srcq+%2-4]
%elif (%2-%%off) == 1
    mov            valb, [srcq+%2-1]
%elif (%2-%%off) == 2
    mov            valw, [srcq+%2-2]
%else
    ; 3 trailing bytes: pack them into vald as (last byte << 16) | word
    mov            valb, [srcq+%2-1]
    ror            vald, 16
    mov            valw, [srcq+%2-3]
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; READ_NUM_BYTES

; store the registers filled by READ_NUM_BYTES %2 back to [dstq];
; mirrors the read macro's layout exactly (same offsets, same tails)
%macro WRITE_NUM_BYTES 2
%assign %%off 0     ; offset in destination buffer
%assign %%xmm_idx 0 ; xmm register index

%rep %2/mmsize
    movu  [dstq+%%off], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %%off+mmsize
%endrep ; %2/mmsize

%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
    movu  [dstq+%2-16], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
    movq  [dstq+%%off], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8

%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
    movq   [dstq+%2-8], xmm %+ %%xmm_idx
%assign %%off %2
%else
    movd  [dstq+%%off], xmm %+ %%xmm_idx
%assign %%off %%off+4
%endif
%assign %%xmm_idx %%xmm_idx+1
%endif ; (%2-%%off) >= 4

%if (%2-%%off) >= 1
%if %2 >= 4
    movd   [dstq+%2-4], xmm %+ %%xmm_idx
%elif (%2-%%off) == 1
    mov    [dstq+%2-1], valb
%elif (%2-%%off) == 2
    mov    [dstq+%2-2], valw
%else
    mov    [dstq+%2-3], valw
    ror            vald, 16
    mov    [dstq+%2-1], valb
%ifnidn %1, body
    ; top/bottom rows rewrite the same pixels every iteration, so undo
    ; the rotate to keep vald intact for the next row; body rows reload
    ; vald each iteration and don't need this
    ror            vald, 16
%endif
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
%macro VERTICAL_EXTEND 2
%assign %%n %1
%rep 1+%2-%1
%if %%n <= 3
; widths 1-3 keep the pixels in vald, so an extra GPR (val) is needed
; and no xmm register is used
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, val, bh
    mov             bhq, r6mp                   ; r6mp = bhmp
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
    mov            dstq, r0mp
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%else
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, bh
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%endif

    ; FIXME move this to c wrapper?
    sub             bhq, end_yq                 ; bh    -= end_y
    sub          end_yq, start_yq               ; end_y -= start_y

    ; extend pixels above body
    test       start_yq, start_yq               ; if (start_y) {
    jz .body_loop
    READ_NUM_BYTES  top, %%n                    ;   $variable_regs = read($n)
.top_loop:                                      ;   do {
    WRITE_NUM_BYTES top, %%n                    ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += linesize
    dec        start_yq                         ;   } while (--start_y)
    jnz .top_loop                               ; }

    ; copy body pixels
.body_loop:                                     ; do {
    READ_NUM_BYTES  body, %%n                   ;   $variable_regs = read($n)
    WRITE_NUM_BYTES body, %%n                   ;   write($variable_regs, $n)
    add            dstq, dst_strideq            ;   dst += dst_stride
    add            srcq, src_strideq            ;   src += src_stride
    dec          end_yq                         ; } while (--end_y)
    jnz .body_loop

    ; copy bottom pixels
    test            bhq, bhq                    ; if (block_h) {
    jz .end
    sub            srcq, src_strideq            ;   src -= linesize
    READ_NUM_BYTES  bottom, %%n                 ;   $variable_regs = read($n)
.bottom_loop:                                   ;   do {
    WRITE_NUM_BYTES bottom, %%n                 ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += linesize
    dec             bhq                         ;   } while (--bh)
    jnz .bottom_loop                            ; }

.end:
    RET
%assign %%n %%n+1
%endrep ; 1+%2-%1
%endmacro ; VERTICAL_EXTEND

INIT_XMM sse2
VERTICAL_EXTEND 1, 22

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.

; splat the byte at address %2 across m0 (when width %1 >= 8) and,
; on the non-avx2 path, across vald as well (used for small widths)
%macro READ_V_PIXEL 2
%if cpuflag(avx2)
    vpbroadcastb     m0, %2
%else
    movzx          vald, byte %2
    imul           vald, 0x01010101
%if %1 >= 8
    movd             m0, vald
    pshufd           m0, m0, q0000
%endif ; %1 >= 8
%endif ; avx2
%endmacro ; READ_V_PIXEL

; store %1 splatted bytes at %2, using overlapping stores for the tail
%macro WRITE_V_PIXEL 2
%assign %%off 0

%if %1 >= 8

%rep %1/mmsize
    movu   [%2+%%off], m0
%assign %%off %%off+mmsize
%endrep ; %1/mmsize

%if %1-%%off >= 8
%if %1 > 16 && %1-%%off > 8
    ; overlapping full-register store covers the 9..15-byte tail
    movu   [%2+%1-16], m0
%assign %%off %1
%else
    movq   [%2+%%off], m0
%assign %%off %%off+8
%endif
%endif ; %1-%%off >= 8

%if %1-%%off >= 4
%if %1 > 8 && %1-%%off > 4
    movq    [%2+%1-8], m0
%assign %%off %1
%else
    movd   [%2+%%off], m0
%assign %%off %%off+4
%endif
%endif ; %1-%%off >= 4

%else ; %1 < 8

%rep %1/4
    mov    [%2+%%off], vald
%assign %%off %%off+4
%endrep ; %1/4

%endif ; %1 >=/< 8

%if %1-%%off == 2
%if cpuflag(avx2)
    ; avx2 path has no GPR copy of the pixel, so use an overlapping movd
    movd [%2+%%off-2], m0
%else
    mov    [%2+%%off], valw
%endif ; avx2
%endif ; %1-%%off == 2
%endmacro ; WRITE_V_PIXEL

; emu_edge_hfix<n>(dst, dst_stride, start_x, bh)
; fixed-width horizontal edge extension, one function per even width n;
; val is only needed as scratch on the non-avx2 path (see READ_V_PIXEL)
%macro H_EXTEND 2
%assign %%n %1
%rep 1+(%2-%1)/2
%if cpuflag(avx2)
cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh
%else
cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
%endif
.loop_y:                                        ; do {
    READ_V_PIXEL    %%n, [dstq+start_xq]        ;   $variable_regs = read($n)
    WRITE_V_PIXEL   %%n, dstq                   ;   write($variable_regs, $n)
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec             bhq                         ; } while (--bh)
    jnz .loop_y
    RET
%assign %%n %%n+2
%endrep ; 1+(%2-%1)/2
%endmacro ; H_EXTEND

INIT_XMM sse2
H_EXTEND 2, 22

%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
H_EXTEND 8, 22
%endif

; prefetch(buf, stride, h): issue a prefetcht0 for each of h rows,
; stepping buf by stride; h is treated as a signed 32-bit count (jg)
INIT_MMX mmxext
cglobal prefetch, 3, 3, 0, buf, stride, h
.loop:
    prefetcht0 [bufq]
    add            bufq, strideq
    dec              hd
    jg .loop
    RET