tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

vp9mc_16bpp.asm (10737B)


      1 ;******************************************************************************
      2 ;* VP9 MC SIMD optimizations
      3 ;*
      4 ;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
      5 ;*
      6 ;* This file is part of FFmpeg.
      7 ;*
      8 ;* FFmpeg is free software; you can redistribute it and/or
      9 ;* modify it under the terms of the GNU Lesser General Public
     10 ;* License as published by the Free Software Foundation; either
     11 ;* version 2.1 of the License, or (at your option) any later version.
     12 ;*
     13 ;* FFmpeg is distributed in the hope that it will be useful,
     14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     16 ;* Lesser General Public License for more details.
     17 ;*
     18 ;* You should have received a copy of the GNU Lesser General Public
     19 ;* License along with FFmpeg; if not, write to the Free Software
     20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     21 ;******************************************************************************
     22 
     23 %include "libavutil/x86/x86util.asm"
     24 
     25 SECTION_RODATA 32
     26 
     27 pd_64: times 8 dd 64                ; rounding bias: sums are shifted right by 7, 64 = 1<<6
     28 
     29 cextern pw_1023                     ; 10-bit pixel max, clamp constant for the _10 entry points
     30 cextern pw_4095                     ; 12-bit pixel max, clamp constant for the _12 entry points
     31 
     32 SECTION .text
     33 
     34 %macro filter_h4_fn 1-2 12
        ; Horizontal 8-tap 1-D subpel filter for 4-pixel-wide blocks, 16bpp.
        ;   %1 = op: put (store) or avg (average with existing dst via pavgw)
        ;   %2 = number of SIMD registers used (default 12)
        ; Generates a 10-bit entry point plus a 12-bit one that only loads a
        ; different clamp constant into m5 and jumps into the shared .body.
        ; filteryq points at 4 blocks of interleaved coefficient pairs,
        ; 32 bytes apart.
     35 cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
     36    mova        m5, [pw_1023]        ; m5 = max pixel value (10 bit)
     37 .body:
     38 %if notcpuflag(sse4) && ARCH_X86_64
     39    pxor       m11, m11              ; m11 = 0, lower clamp bound (pre-SSE4 path)
     40 %endif
     41    mova        m6, [pd_64]          ; rounding bias for the >>7 below
     42    mova        m7, [filteryq+ 0]    ; coefficient pairs 0/1
     43 %if ARCH_X86_64 && mmsize > 8
     44    mova        m8, [filteryq+32]    ; pairs 2/3
     45    mova        m9, [filteryq+64]    ; pairs 4/5
     46    mova       m10, [filteryq+96]    ; pairs 6/7
     47 %endif
     48 .loop:
        ; Samples are 2 bytes wide, so [srcq-6] is the sample 3 to the left.
        ; punpcklwd interleaves two neighbouring taps so each pmaddwd lane
        ; computes coef[k]*s[x+k] + coef[k+1]*s[x+k+1] for one output pixel.
     49    movh        m0, [srcq-6]         ; taps -3/-2
     50    movh        m1, [srcq-4]
     51    movh        m2, [srcq-2]         ; taps -1/0
     52    movh        m3, [srcq+0]
     53    movh        m4, [srcq+2]         ; taps +1/+2
     54    punpcklwd   m0, m1
     55    punpcklwd   m2, m3
     56    pmaddwd     m0, m7
     57 %if ARCH_X86_64 && mmsize > 8
     58    pmaddwd     m2, m8
     59 %else
     60    pmaddwd     m2, [filteryq+32]    ; x86-32: no spare regs, use memory operand
     61 %endif
     62    movu        m1, [srcq+4]
     63    movu        m3, [srcq+6]         ; taps +3/+4
     64    paddd       m0, m2
     65    movu        m2, [srcq+8]
     66    add       srcq, sstrideq
     67    punpcklwd   m4, m1
     68    punpcklwd   m3, m2
     69 %if ARCH_X86_64 && mmsize > 8
     70    pmaddwd     m4, m9
     71    pmaddwd     m3, m10
     72 %else
     73    pmaddwd     m4, [filteryq+64]
     74    pmaddwd     m3, [filteryq+96]
     75 %endif
     76    paddd       m0, m4
     77    paddd       m0, m3               ; m0 = full 8-tap sums (dwords)
     78    paddd       m0, m6               ; + 64 for rounding
     79    psrad       m0, 7                ; >> 7
     80 %if cpuflag(sse4)
     81    packusdw    m0, m0               ; unsigned saturation also clamps negatives to 0
     82 %else
     83    packssdw    m0, m0               ; signed saturation; negatives fixed up below
     84 %endif
     85 %ifidn %1, avg
     86    movh        m1, [dstq]
     87 %endif
     88    pminsw      m0, m5               ; clamp to pixel max
     89 %if notcpuflag(sse4)
     90 %if ARCH_X86_64
     91    pmaxsw      m0, m11              ; clamp to 0
     92 %else
     93    pxor        m2, m2
     94    pmaxsw      m0, m2
     95 %endif
     96 %endif
     97 %ifidn %1, avg
     98    pavgw       m0, m1               ; average with existing dst pixels
     99 %endif
    100    movh    [dstq], m0
    101    add       dstq, dstrideq
    102    dec         hd
    103    jg .loop
    104    RET
    105 
        ; 12-bit variant: identical except for the clamp constant in m5.
    106 cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    107    mova        m5, [pw_4095]
    108    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
    109 %endmacro
    110 
    111 INIT_XMM sse2                       ; SSE2 put/avg instantiations of the 4-wide horizontal filter
    112 filter_h4_fn put
    113 filter_h4_fn avg
    114 
    115 %macro filter_h_fn 1-2 12
        ; Horizontal 8-tap 1-D subpel filter for mmsize/2-pixel-wide rows
        ; (8 pixels with XMM, 16 with YMM), 16bpp.
        ;   %1 = op: put or avg, %2 = number of SIMD registers (default 12)
        ; Even output pixels accumulate in m0 and odd ones in m1; the final
        ; punpcklwd re-interleaves them into pixel order.  The 12-bit entry
        ; point only swaps the clamp constant and jumps into .body.
    116 %assign %%px mmsize/2
    117 cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    118    mova        m5, [pw_1023]        ; pixel max (10 bit)
    119 .body:
    120 %if notcpuflag(sse4) && ARCH_X86_64
    121    pxor       m11, m11              ; zero, lower clamp bound (pre-SSE4 path)
    122 %endif
    123    mova        m6, [pd_64]          ; rounding bias for the >>7 below
    124    mova        m7, [filteryq+ 0]    ; coefficient pairs 0/1
    125 %if ARCH_X86_64 && mmsize > 8
    126    mova        m8, [filteryq+32]    ; pairs 2/3
    127    mova        m9, [filteryq+64]    ; pairs 4/5
    128    mova       m10, [filteryq+96]    ; pairs 6/7
    129 %endif
    130 .loop:
        ; Unaligned loads at sample offsets -3..+4 (bytes -6..+8).  pmaddwd
        ; on the unshuffled data pairs neighbouring samples, so each dword
        ; lane of m0 collects taps for an even output pixel and each lane of
        ; m1 for the following odd one.
    131    movu        m0, [srcq-6]
    132    movu        m1, [srcq-4]
    133    movu        m2, [srcq-2]
    134    movu        m3, [srcq+0]
    135    movu        m4, [srcq+2]
    136    pmaddwd     m0, m7
    137    pmaddwd     m1, m7
    138 %if ARCH_X86_64 && mmsize > 8
    139    pmaddwd     m2, m8
    140    pmaddwd     m3, m8
    141    pmaddwd     m4, m9
    142 %else
    143    pmaddwd     m2, [filteryq+32]    ; x86-32: no spare regs, use memory operands
    144    pmaddwd     m3, [filteryq+32]
    145    pmaddwd     m4, [filteryq+64]
    146 %endif
    147    paddd       m0, m2
    148    paddd       m1, m3
    149    paddd       m0, m4
    150    movu        m2, [srcq+4]
    151    movu        m3, [srcq+6]
    152    movu        m4, [srcq+8]
    153    add       srcq, sstrideq
    154 %if ARCH_X86_64 && mmsize > 8
    155    pmaddwd     m2, m9
    156    pmaddwd     m3, m10
    157    pmaddwd     m4, m10
    158 %else
    159    pmaddwd     m2, [filteryq+64]
    160    pmaddwd     m3, [filteryq+96]
    161    pmaddwd     m4, [filteryq+96]
    162 %endif
    163    paddd       m1, m2
    164    paddd       m0, m3
    165    paddd       m1, m4                ; m0/m1 = full 8-tap sums (even/odd pixels)
    166    paddd       m0, m6                ; + 64 for rounding
    167    paddd       m1, m6
    168    psrad       m0, 7                 ; >> 7
    169    psrad       m1, 7
    170 %if cpuflag(sse4)
    171    packusdw    m0, m0                ; unsigned saturation also clamps negatives
    172    packusdw    m1, m1
    173 %else
    174    packssdw    m0, m0                ; signed saturation; negatives fixed up below
    175    packssdw    m1, m1
    176 %endif
    177    punpcklwd   m0, m1                ; interleave even/odd back into pixel order
    178    pminsw      m0, m5                ; clamp to pixel max
    179 %if notcpuflag(sse4)
    180 %if ARCH_X86_64
    181    pmaxsw      m0, m11               ; clamp to 0
    182 %else
    183    pxor        m2, m2
    184    pmaxsw      m0, m2
    185 %endif
    186 %endif
    187 %ifidn %1, avg
    188    pavgw       m0, [dstq]            ; average with existing dst pixels
    189 %endif
    190    mova    [dstq], m0
    191    add       dstq, dstrideq
    192    dec         hd
    193    jg .loop
    194    RET
    195 
        ; 12-bit variant: identical except for the clamp constant in m5.
    196 cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    197    mova        m5, [pw_4095]
    198    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
    199 %endmacro
    200 
    201 INIT_XMM sse2                       ; SSE2: 8-pixel-wide rows
    202 filter_h_fn put
    203 filter_h_fn avg
    204 %if HAVE_AVX2_EXTERNAL
    205 INIT_YMM avx2                       ; AVX2: 16-pixel-wide rows
    206 filter_h_fn put
    207 filter_h_fn avg
    208 %endif
    209 
    210 %macro filter_v4_fn 1-2 12
        ; Vertical 8-tap 1-D subpel filter for 4-pixel-wide columns, 16bpp.
        ;   %1 = op: put or avg, %2 = number of SIMD registers (default 12)
        ; srcq is rewound 3 rows so the loop reads rows -3..0 through it,
        ; while src4q walks rows +1..+4.  On x86-32 there are not enough
        ; register args, so filtery is loaded from the stack (r5mp) and h is
        ; addressed in memory as r4mp.
    211 %if ARCH_X86_64
    212 cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
    213 %else
    214 cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    215    mov   filteryq, r5mp
    216 %define hd r4mp
    217 %endif
    218    mova        m5, [pw_1023]        ; pixel max (10 bit)
    219 .body:
    220 %if notcpuflag(sse4) && ARCH_X86_64
    221    pxor       m11, m11              ; zero, lower clamp bound (pre-SSE4 path)
    222 %endif
    223    mova        m6, [pd_64]          ; rounding bias for the >>7 below
    224    lea  sstride3q, [sstrideq*3]
    225    lea      src4q, [srcq+sstrideq]  ; src4q -> row +1
    226    sub       srcq, sstride3q        ; srcq  -> row -3
    227    mova        m7, [filteryq+  0]   ; coefficient pairs 0/1
    228 %if ARCH_X86_64 && mmsize > 8
    229    mova        m8, [filteryq+ 32]   ; pairs 2/3
    230    mova        m9, [filteryq+ 64]   ; pairs 4/5
    231    mova       m10, [filteryq+ 96]   ; pairs 6/7
    232 %endif
    233 .loop:
    234    ; FIXME maybe reuse loads from previous rows, or just
    235    ; more generally unroll this to prevent multiple loads of
    236    ; the same data?
    237    movh        m0, [srcq]           ; rows -3..0
    238    movh        m1, [srcq+sstrideq]
    239    movh        m2, [srcq+sstrideq*2]
    240    movh        m3, [srcq+sstride3q]
    241    add       srcq, sstrideq
    242    movh        m4, [src4q]          ; rows +1..+4
    243    punpcklwd   m0, m1               ; pair consecutive rows so pmaddwd applies a coef pair
    244    punpcklwd   m2, m3
    245    pmaddwd     m0, m7
    246 %if ARCH_X86_64 && mmsize > 8
    247    pmaddwd     m2, m8
    248 %else
    249    pmaddwd     m2, [filteryq+ 32]   ; x86-32: no spare regs, use memory operand
    250 %endif
    251    movh        m1, [src4q+sstrideq]
    252    movh        m3, [src4q+sstrideq*2]
    253    paddd       m0, m2
    254    movh        m2, [src4q+sstride3q]
    255    add      src4q, sstrideq
    256    punpcklwd   m4, m1
    257    punpcklwd   m3, m2
    258 %if ARCH_X86_64 && mmsize > 8
    259    pmaddwd     m4, m9
    260    pmaddwd     m3, m10
    261 %else
    262    pmaddwd     m4, [filteryq+ 64]
    263    pmaddwd     m3, [filteryq+ 96]
    264 %endif
    265    paddd       m0, m4
    266    paddd       m0, m3               ; m0 = full 8-tap sums (dwords)
    267    paddd       m0, m6               ; + 64 for rounding
    268    psrad       m0, 7                ; >> 7
    269 %if cpuflag(sse4)
    270    packusdw    m0, m0               ; unsigned saturation also clamps negatives
    271 %else
    272    packssdw    m0, m0               ; signed saturation; negatives fixed up below
    273 %endif
    274 %ifidn %1, avg
    275    movh        m1, [dstq]
    276 %endif
    277    pminsw      m0, m5               ; clamp to pixel max
    278 %if notcpuflag(sse4)
    279 %if ARCH_X86_64
    280    pmaxsw      m0, m11              ; clamp to 0
    281 %else
    282    pxor        m2, m2
    283    pmaxsw      m0, m2
    284 %endif
    285 %endif
    286 %ifidn %1, avg
    287    pavgw       m0, m1               ; average with existing dst pixels
    288 %endif
    289    movh    [dstq], m0
    290    add       dstq, dstrideq
    291    dec         hd
    292    jg .loop
    293    RET
    294 
        ; 12-bit variant: identical except for the clamp constant in m5.
    295 %if ARCH_X86_64
    296 cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
    297 %else
    298 cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    299    mov   filteryq, r5mp
    300 %endif
    301    mova        m5, [pw_4095]
    302    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
    303 %endmacro
    304 
    305 INIT_XMM sse2                       ; SSE2 put/avg instantiations of the 4-wide vertical filter
    306 filter_v4_fn put
    307 filter_v4_fn avg
    308 
    309 %macro filter_v_fn 1-2 13
        ; Vertical 8-tap 1-D subpel filter for mmsize/2-pixel-wide columns
        ; (8 pixels with XMM, 16 with YMM), 16bpp.
        ;   %1 = op: put or avg, %2 = number of SIMD registers (default 13)
        ; Unlike filter_v4_fn, m6 is needed as SBUTTERFLY scratch, so on
        ; x86-64 the rounding bias lives in m11 and the zero clamp in m12;
        ; x86-32 falls back to memory operands for both.
    310 %assign %%px mmsize/2
    311 %if ARCH_X86_64
    312 cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
    313 %else
    314 cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    315    mov   filteryq, r5mp              ; x86-32: filtery arrives on the stack
    316 %define hd r4mp
    317 %endif
    318    mova        m5, [pw_1023]        ; pixel max (10 bit)
    319 .body:
    320 %if notcpuflag(sse4) && ARCH_X86_64
    321    pxor       m12, m12              ; zero, lower clamp bound (pre-SSE4 path)
    322 %endif
    323 %if ARCH_X86_64
    324    mova       m11, [pd_64]          ; rounding bias for the >>7 below
    325 %endif
    326    lea  sstride3q, [sstrideq*3]
    327    lea      src4q, [srcq+sstrideq]  ; src4q -> row +1
    328    sub       srcq, sstride3q        ; srcq  -> row -3
    329    mova        m7, [filteryq+  0]   ; coefficient pairs 0/1
    330 %if ARCH_X86_64 && mmsize > 8
    331    mova        m8, [filteryq+ 32]   ; pairs 2/3
    332    mova        m9, [filteryq+ 64]   ; pairs 4/5
    333    mova       m10, [filteryq+ 96]   ; pairs 6/7
    334 %endif
    335 .loop:
    336    ; FIXME maybe reuse loads from previous rows, or just
    337    ; more generally unroll this to prevent multiple loads of
    338    ; the same data?
    339    movu        m0, [srcq]           ; rows -3..0
    340    movu        m1, [srcq+sstrideq]
    341    movu        m2, [srcq+sstrideq*2]
    342    movu        m3, [srcq+sstride3q]
    343    add       srcq, sstrideq
    344    movu        m4, [src4q]          ; row +1
    345    SBUTTERFLY  wd, 0, 1, 6          ; interleave row pair: lo half -> m0, hi half -> m1 (m6 scratch)
    346    SBUTTERFLY  wd, 2, 3, 6
    347    pmaddwd     m0, m7
    348    pmaddwd     m1, m7
    349 %if ARCH_X86_64 && mmsize > 8
    350    pmaddwd     m2, m8
    351    pmaddwd     m3, m8
    352 %else
    353    pmaddwd     m2, [filteryq+ 32]   ; x86-32: no spare regs, use memory operands
    354    pmaddwd     m3, [filteryq+ 32]
    355 %endif
    356    paddd       m0, m2
    357    paddd       m1, m3
    358    movu        m2, [src4q+sstrideq]
    359    movu        m3, [src4q+sstrideq*2]
    360    SBUTTERFLY  wd, 4, 2, 6
    361 %if ARCH_X86_64 && mmsize > 8
    362    pmaddwd     m4, m9
    363    pmaddwd     m2, m9
    364 %else
    365    pmaddwd     m4, [filteryq+ 64]
    366    pmaddwd     m2, [filteryq+ 64]
    367 %endif
    368    paddd       m0, m4
    369    paddd       m1, m2
    370    movu        m4, [src4q+sstride3q]
    371    add      src4q, sstrideq
    372    SBUTTERFLY  wd, 3, 4, 6
    373 %if ARCH_X86_64 && mmsize > 8
    374    pmaddwd     m3, m10
    375    pmaddwd     m4, m10
    376 %else
    377    pmaddwd     m3, [filteryq+ 96]
    378    pmaddwd     m4, [filteryq+ 96]
    379 %endif
    380    paddd       m0, m3
    381    paddd       m1, m4                ; m0/m1 = full 8-tap sums (low/high pixel halves)
    382 %if ARCH_X86_64
    383    paddd       m0, m11               ; + 64 for rounding
    384    paddd       m1, m11
    385 %else
    386    paddd       m0, [pd_64]           ; x86-32: no spare reg for the bias
    387    paddd       m1, [pd_64]
    388 %endif
    389    psrad       m0, 7                 ; >> 7
    390    psrad       m1, 7
    391 %if cpuflag(sse4)
    392    packusdw    m0, m1                ; unsigned saturation also clamps negatives
    393 %else
    394    packssdw    m0, m1                ; signed saturation; negatives fixed up below
    395 %endif
    396    pminsw      m0, m5                ; clamp to pixel max
    397 %if notcpuflag(sse4)
    398 %if ARCH_X86_64
    399    pmaxsw      m0, m12               ; clamp to 0
    400 %else
    401    pxor        m2, m2
    402    pmaxsw      m0, m2
    403 %endif
    404 %endif
    405 %ifidn %1, avg
    406    pavgw       m0, [dstq]            ; average with existing dst pixels
    407 %endif
    408    mova    [dstq], m0
    409    add       dstq, dstrideq
    410    dec         hd
    411    jg .loop
    412    RET
    413 
        ; 12-bit variant: identical except for the clamp constant in m5.
    414 %if ARCH_X86_64
    415 cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
    416 %else
    417 cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    418    mov   filteryq, r5mp
    419 %endif
    420    mova        m5, [pw_4095]
    421    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
    422 %endmacro
    423 
    424 INIT_XMM sse2                       ; SSE2: 8-pixel-wide columns
    425 filter_v_fn put
    426 filter_v_fn avg
    427 %if HAVE_AVX2_EXTERNAL
    428 INIT_YMM avx2                       ; AVX2: 16-pixel-wide columns
    429 filter_v_fn put
    430 filter_v_fn avg
    431 %endif
    431 %endif