tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

videodsp.asm (12959B)


      1 ;******************************************************************************
      2 ;* Core video DSP functions
      3 ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
      4 ;*
      5 ;* This file is part of FFmpeg.
      6 ;*
      7 ;* FFmpeg is free software; you can redistribute it and/or
      8 ;* modify it under the terms of the GNU Lesser General Public
      9 ;* License as published by the Free Software Foundation; either
     10 ;* version 2.1 of the License, or (at your option) any later version.
     11 ;*
     12 ;* FFmpeg is distributed in the hope that it will be useful,
     13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     15 ;* Lesser General Public License for more details.
     16 ;*
     17 ;* You should have received a copy of the GNU Lesser General Public
     18 ;* License along with FFmpeg; if not, write to the Free Software
     19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     20 ;******************************************************************************
     21 
     22 %include "libavutil/x86/x86util.asm"
     23 
     24 SECTION .text
     25 
     26 ; slow vertical extension loop function. Works with variable-width, and
     27 ; does per-line reading/writing of source data
     28 
     29 %macro V_COPY_ROW 2 ; %1 = type (top/body/bottom), %2 = row-count register
     30 .%1_y_loop:                                     ; do {
     31    mov              wq, r7mp                   ;   w = -width (r7mp = wmp, negated by caller)
     32 .%1_x_loop:                                     ;   do {
     33    movu             m0, [srcq+wq]              ;     m0 = read($mmsize)
     34    movu      [dstq+wq], m0                     ;     write(m0, $mmsize)
     35    add              wq, mmsize                 ;     w += $mmsize (w is a negative offset)
     36    cmp              wq, -mmsize                ;   } while (w < -$mmsize);
     37    jl .%1_x_loop
     38    movu             m0, [srcq-mmsize]          ;   tail: copy the final $mmsize bytes of the
     39    movu  [dstq-mmsize], m0                     ;   row (may overlap the last loop store)
     40 %ifidn %1, body                                 ;   if ($type == body) {
     41    add            srcq, src_strideq            ;     src += src_stride
     42 %endif                                          ;   } (top/bottom re-copy the same src row)
     43    add            dstq, dst_strideq            ;   dst += dst_stride
     44    dec              %2                         ; } while (--$h);
     45    jnz .%1_y_loop
     46 %endmacro
     47 
     48 ; .----. <- zero
     49 ; |    |    <- top is copied from first line in body of source
     50 ; |----| <- start_y
     51 ; |    |    <- body is copied verbatim (line-by-line) from source
     52 ; |----| <- end_y
     53 ; |    |    <- bottom is copied from last line in body of source
     54 ; '----' <- bh
     55 INIT_XMM sse
     56 %if ARCH_X86_64
     57 cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
     58                                start_y, end_y, bh, w
     59 %else ; x86-32
     60 cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
     61 %define src_strideq r3mp
     62 %define dst_strideq r1mp
     63    mov            srcq, r2mp
     64    mov        start_yq, r4mp
     65    mov          end_yq, r5mp
     66    mov             bhq, r6mp
     67 %endif
     68    sub             bhq, end_yq                 ; bh    -= end_y (rows to extend below body)
     69    sub          end_yq, start_yq               ; end_y -= start_y (body height in rows)
     70    add            srcq, r7mp                   ; point src at end of row (r7mp = wmp)
     71    add            dstq, r7mp                   ; point dst at end of row (r7mp = wmp)
     72    neg            r7mp                         ; w = -w, so [ptr+w] walks the row forward
     73    test       start_yq, start_yq               ; if (start_y) {
     74    jz .body
     75    V_COPY_ROW      top, start_yq               ;   v_copy_row(top, start_y)
     76 .body:                                          ; }
     77    V_COPY_ROW     body, end_yq                 ; v_copy_row(body, end_y)
     78    test            bhq, bhq                    ; if (bh) {
     79    jz .end
     80    sub            srcq, src_strideq            ;   src -= src_stride (re-read last body row)
     81    V_COPY_ROW   bottom, bhq                    ;   v_copy_row(bottom, bh)
     82 .end:                                           ; }
     83    RET
     84 
     85 %macro hvar_fn 0 ; horizontal variable-width edge extend: splat edge byte across row
     86 cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
     87    lea            dstq, [dstq+n_wordsq*2]      ; point dst at end of row
     88    neg        n_wordsq                         ; use negative word offsets, counting up to 0
     89    lea        start_xq, [start_xq+n_wordsq*2]  ; rebase start_x against the moved dst
     90 .y_loop:                                        ; do {
     91 %if cpuflag(avx2)
     92    vpbroadcastb     m0, [dstq+start_xq]        ;   splat edge byte into every lane of m0
     93    mov              wq, n_wordsq               ;   initialize w
     94 %else
     95    movzx            wd, byte [dstq+start_xq]   ;   w = read(1)
     96    imul             wd, 0x01010101             ;   w *= 0x01010101 (byte -> dword splat)
     97    movd             m0, wd
     98    mov              wq, n_wordsq               ;   initialize w
     99    pshufd           m0, m0, q0000              ;   splat dword into all 4 lanes
    100 %endif ; avx2
    101 .x_loop:                                        ;   do {
    102    movu    [dstq+wq*2], m0                     ;     write($reg, $mmsize)
    103    add              wq, mmsize/2               ;     w += $mmsize/2 (w is negative)
    104    cmp              wq, -(mmsize/2)            ;   } while (w < -$mmsize/2)
    105    jl .x_loop
    106    movu  [dstq-mmsize], m0                     ;   tail: final $mmsize bytes (may overlap)
    107    add            dstq, dst_strideq            ;   dst += dst_stride
    108    dec              hq                         ; } while (--h)
    109    jnz .y_loop
    110    RET
    111 %endmacro
    112 
    113 INIT_XMM sse2
    114 hvar_fn                                         ; emu_edge_hvar, sse2 version
    115 
    116 %if HAVE_AVX2_EXTERNAL
    117 INIT_XMM avx2
    118 hvar_fn                                         ; emu_edge_hvar, avx2 version (xmm-sized)
    119 %endif
    120 
    121 ; macro to read/write a horizontal number of pixels (%2) to/from registers
    122 ; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
    123 ;         - if (%2 & 8)  fills 8 bytes into xmm$next
    124 ;         - if (%2 & 4)  fills 4 bytes into xmm$next
    125 ;         - if (%2 & 3)  fills 1, 2 or 3 bytes in val
    126 ; writing data out is in the same way
    127 %macro READ_NUM_BYTES 2 ; %1 = type tag (top/body/bottom), %2 = byte count
    128 %assign %%off 0     ; offset in source buffer
    129 %assign %%xmm_idx 0 ; xmm register index
    130 
    131 %rep %2/mmsize                                  ; full mmsize-wide loads
    132    movu   xmm %+ %%xmm_idx, [srcq+%%off]
    133 %assign %%xmm_idx %%xmm_idx+1
    134 %assign %%off %%off+mmsize
    135 %endrep ; %2/mmsize
    136 
    137 %if (%2-%%off) >= 8
    138 %if %2 > 16 && (%2-%%off) > 8
    139    movu   xmm %+ %%xmm_idx, [srcq+%2-16]      ; overlapping tail load covers 9..15 left
    140 %assign %%xmm_idx %%xmm_idx+1
    141 %assign %%off %2
    142 %else
    143    movq   xmm %+ %%xmm_idx, [srcq+%%off]
    144 %assign %%xmm_idx %%xmm_idx+1
    145 %assign %%off %%off+8
    146 %endif
    147 %endif ; (%2-%%off) >= 8
    148 
    149 %if (%2-%%off) >= 4
    150 %if %2 > 8 && (%2-%%off) > 4
    151    movq   xmm %+ %%xmm_idx, [srcq+%2-8]       ; overlapping tail load covers 5..7 left
    152 %assign %%off %2
    153 %else
    154    movd   xmm %+ %%xmm_idx, [srcq+%%off]
    155 %assign %%off %%off+4
    156 %endif
    157 %assign %%xmm_idx %%xmm_idx+1
    158 %endif ; (%2-%%off) >= 4
    159 
    160 %if (%2-%%off) >= 1
    161 %if %2 >= 4
    162    movd xmm %+ %%xmm_idx, [srcq+%2-4]         ; overlapping 4-byte tail load
    163 %elif (%2-%%off) == 1
    164    mov            valb, [srcq+%2-1]
    165 %elif (%2-%%off) == 2
    166    mov            valw, [srcq+%2-2]
    167 %else                                           ; 3 leftover bytes: byte + word via rotate
    168    mov            valb, [srcq+%2-1]
    169    ror            vald, 16
    170    mov            valw, [srcq+%2-3]
    171 %endif
    172 %endif ; (%2-%%off) >= 1
    173 %endmacro ; READ_NUM_BYTES
    174 
    175 %macro WRITE_NUM_BYTES 2 ; %1 = type tag (top/body/bottom), %2 = byte count
    176 %assign %%off 0     ; offset in destination buffer
    177 %assign %%xmm_idx 0 ; xmm register index
    178 
    179 %rep %2/mmsize                                  ; full mmsize-wide stores
    180    movu   [dstq+%%off], xmm %+ %%xmm_idx
    181 %assign %%xmm_idx %%xmm_idx+1
    182 %assign %%off %%off+mmsize
    183 %endrep ; %2/mmsize
    184 
    185 %if (%2-%%off) >= 8
    186 %if %2 > 16 && (%2-%%off) > 8
    187    movu   [dstq+%2-16], xmm %+ %%xmm_idx      ; overlapping tail store covers 9..15 left
    188 %assign %%xmm_idx %%xmm_idx+1
    189 %assign %%off %2
    190 %else
    191    movq   [dstq+%%off], xmm %+ %%xmm_idx
    192 %assign %%xmm_idx %%xmm_idx+1
    193 %assign %%off %%off+8
    194 %endif
    195 %endif ; (%2-%%off) >= 8
    196 
    197 %if (%2-%%off) >= 4
    198 %if %2 > 8 && (%2-%%off) > 4
    199    movq    [dstq+%2-8], xmm %+ %%xmm_idx      ; overlapping tail store covers 5..7 left
    200 %assign %%off %2
    201 %else
    202    movd   [dstq+%%off], xmm %+ %%xmm_idx
    203 %assign %%off %%off+4
    204 %endif
    205 %assign %%xmm_idx %%xmm_idx+1
    206 %endif ; (%2-%%off) >= 4
    207 
    208 %if (%2-%%off) >= 1
    209 %if %2 >= 4
    210    movd    [dstq+%2-4], xmm %+ %%xmm_idx      ; overlapping 4-byte tail store
    211 %elif (%2-%%off) == 1
    212    mov     [dstq+%2-1], valb
    213 %elif (%2-%%off) == 2
    214    mov     [dstq+%2-2], valw
    215 %else                                           ; 3 leftover bytes: word + byte via rotate
    216    mov     [dstq+%2-3], valw
    217    ror            vald, 16
    218    mov     [dstq+%2-1], valb
    219 %ifnidn %1, body                                ; undo the rotate so val can be written
    220    ror            vald, 16                     ; again by the repeating top/bottom loops
    221 %endif
    222 %endif
    223 %endif ; (%2-%%off) >= 1
    224 %endmacro ; WRITE_NUM_BYTES
    225 
    226 ; vertical top/bottom extend and body copy fast loops
    227 ; these generate a family of set-width line copy functions, i.e.
    228 ; they read a fixed number of pixels into set registers, and write
    229 ; those out into the destination buffer
    230 %macro VERTICAL_EXTEND 2                        ; %1 = min width, %2 = max width
    231 %assign %%n %1
    232 %rep 1+%2-%1                                    ; one function per width %%n in %1..%2
    233 %if %%n <= 3                                    ; widths 1-3 live in gpr "val", no xmm regs
    234 %if ARCH_X86_64
    235 cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
    236                                       start_y, end_y, val, bh
    237    mov             bhq, r6mp                   ; r6mp = bhmp
    238 %else ; x86-32
    239 cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
    240    mov            dstq, r0mp
    241    mov            srcq, r2mp
    242    mov        start_yq, r4mp
    243    mov          end_yq, r5mp
    244    mov             bhq, r6mp
    245 %define dst_strideq r1mp
    246 %define src_strideq r3mp
    247 %endif ; x86-64/32
    248 %else                                           ; widths >= 4 use one xmm register
    249 %if ARCH_X86_64
    250 cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
    251                                       start_y, end_y, bh
    252 %else ; x86-32
    253 cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
    254    mov            srcq, r2mp
    255    mov        start_yq, r4mp
    256    mov          end_yq, r5mp
    257    mov             bhq, r6mp
    258 %define dst_strideq r1mp
    259 %define src_strideq r3mp
    260 %endif ; x86-64/32
    261 %endif
    262    ; FIXME move this to c wrapper?
    263    sub             bhq, end_yq                 ; bh    -= end_y (rows to extend below body)
    264    sub          end_yq, start_yq               ; end_y -= start_y (body height in rows)
    265 
    266    ; extend pixels above body
    267    test       start_yq, start_yq               ; if (start_y) {
    268    jz .body_loop
    269    READ_NUM_BYTES  top, %%n                    ;   $variable_regs = read($n)
    270 .top_loop:                                      ;   do {
    271    WRITE_NUM_BYTES top, %%n                    ;     write($variable_regs, $n)
    272    add            dstq, dst_strideq            ;     dst += linesize
    273    dec        start_yq                         ;   } while (--start_y)
    274    jnz .top_loop                               ; }
    275 
    276    ; copy body pixels
    277 .body_loop:                                     ; do {
    278    READ_NUM_BYTES  body, %%n                   ;   $variable_regs = read($n)
    279    WRITE_NUM_BYTES body, %%n                   ;   write($variable_regs, $n)
    280    add            dstq, dst_strideq            ;   dst += dst_stride
    281    add            srcq, src_strideq            ;   src += src_stride
    282    dec          end_yq                         ; } while (--end_y)
    283    jnz .body_loop
    284 
    285    ; copy bottom pixels
    286    test            bhq, bhq                    ; if (bh) {
    287    jz .end
    288    sub            srcq, src_strideq            ;   src -= linesize (back to last body row)
    289    READ_NUM_BYTES  bottom, %%n                 ;   $variable_regs = read($n)
    290 .bottom_loop:                                   ;   do {
    291    WRITE_NUM_BYTES bottom, %%n                 ;     write($variable_regs, $n)
    292    add            dstq, dst_strideq            ;     dst += linesize
    293    dec             bhq                         ;   } while (--bh)
    294    jnz .bottom_loop                            ; }
    295 
    296 .end:
    297    RET
    298 %assign %%n %%n+1
    299 %endrep ; 1+%2-%1
    300 %endmacro ; VERTICAL_EXTEND
    301 
    302 INIT_XMM sse2
    303 VERTICAL_EXTEND 1, 22                           ; emu_edge_vfix1..vfix22, sse2
    304 
    305 ; left/right (horizontal) fast extend functions
    306 ; these are essentially identical to the vertical extend ones above,
    307 ; just left/right separated because number of pixels to extend is
    308 ; obviously not the same on both sides.
    309 
    310 %macro READ_V_PIXEL 2 ; %1 = width, %2 = source byte address
    311 %if cpuflag(avx2)
    312    vpbroadcastb     m0, %2                     ; splat edge byte into every lane of m0
    313 %else
    314    movzx          vald, byte %2                ; val = edge byte, zero-extended
    315    imul           vald, 0x01010101             ; byte -> dword splat
    316 %if %1 >= 8
    317    movd             m0, vald
    318    pshufd           m0, m0, q0000              ; dword -> full-register splat
    319 %endif ; %1 >= 8
    320 %endif ; avx2
    321 %endmacro ; READ_V_PIXEL
    322 
    323 %macro WRITE_V_PIXEL 2 ; %1 = width, %2 = destination base address
    324 %assign %%off 0
    325 
    326 %if %1 >= 8
    327 
    328 %rep %1/mmsize                                  ; full mmsize-wide stores
    329    movu     [%2+%%off], m0
    330 %assign %%off %%off+mmsize
    331 %endrep ; %1/mmsize
    332 
    333 %if %1-%%off >= 8
    334 %if %1 > 16 && %1-%%off > 8
    335    movu     [%2+%1-16], m0                     ; overlapping tail store covers 9..15 left
    336 %assign %%off %1
    337 %else
    338    movq     [%2+%%off], m0
    339 %assign %%off %%off+8
    340 %endif
    341 %endif ; %1-%%off >= 8
    342 
    343 %if %1-%%off >= 4
    344 %if %1 > 8 && %1-%%off > 4
    345    movq      [%2+%1-8], m0                     ; overlapping tail store covers 5..7 left
    346 %assign %%off %1
    347 %else
    348    movd     [%2+%%off], m0
    349 %assign %%off %%off+4
    350 %endif
    351 %endif ; %1-%%off >= 4
    352 
    353 %else ; %1 < 8
    354 
    355 %rep %1/4                                       ; small widths: dword stores from val
    356    mov      [%2+%%off], vald
    357 %assign %%off %%off+4
    358 %endrep ; %1/4
    359 
    360 %endif ; %1 >=/< 8
    361 
    362 %if %1-%%off == 2
    363 %if cpuflag(avx2)
    364    movd     [%2+%%off-2], m0                   ; no gpr val on avx2; overlapping 4B store
    365 %else
    366    mov      [%2+%%off], valw
    367 %endif ; avx2
    368 %endif ; %1-%%off == 2
    369 %endmacro ; WRITE_V_PIXEL
    370 
    371 %macro H_EXTEND 2                               ; %1 = min width, %2 = max width (step 2)
    372 %assign %%n %1
    373 %rep 1+(%2-%1)/2                                ; one function per even width %%n
    374 %if cpuflag(avx2)
    375 cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh
    376 %else
    377 cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
    378 %endif
    379 .loop_y:                                        ; do {
    380    READ_V_PIXEL    %%n, [dstq+start_xq]        ;   $variable_regs = read($n)
    381    WRITE_V_PIXEL   %%n, dstq                   ;   write($variable_regs, $n)
    382    add            dstq, dst_strideq            ;   dst += dst_stride
    383    dec             bhq                         ; } while (--bh)
    384    jnz .loop_y
    385    RET
    386 %assign %%n %%n+2
    387 %endrep ; 1+(%2-%1)/2
    388 %endmacro ; H_EXTEND
    389 
    390 INIT_XMM sse2
    391 H_EXTEND 2, 22                                  ; emu_edge_hfix2..22, even widths, sse2
    392 
    393 %if HAVE_AVX2_EXTERNAL
    394 INIT_XMM avx2
    395 H_EXTEND 8, 22                                  ; emu_edge_hfix8..22, even widths, avx2
    396 %endif
    397 
    398 INIT_MMX mmxext
    399 cglobal prefetch, 3, 3, 0, buf, stride, h       ; prefetch(buf, stride, h)
    400 .loop:                                          ; do {
    401    prefetcht0 [bufq]                           ;   hint-prefetch start of this row
    402    add      bufq, strideq                      ;   buf += stride
    403    dec        hd                               ;   h--
    404    jg .loop                                    ; } while (h > 0)
    405    RET