tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

h264cmc_neon.S (15575B)


      1 /*
      2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
      3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
      4 *
      5 * This file is part of FFmpeg.
      6 *
      7 * FFmpeg is free software; you can redistribute it and/or
      8 * modify it under the terms of the GNU Lesser General Public
      9 * License as published by the Free Software Foundation; either
     10 * version 2.1 of the License, or (at your option) any later version.
     11 *
     12 * FFmpeg is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     15 * Lesser General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU Lesser General Public
     18 * License along with FFmpeg; if not, write to the Free Software
     19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     20 */
     21 
     22 #include "config_components.h"
     23 
     24 #include "libavutil/aarch64/asm.S"
     25 
     26 /* chroma_mc8(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
     27 .macro  h264_chroma_mc8 type, codec=h264
        // chroma_mc8(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h, int x, int y)
        // Registers: x0 = dst, x1 = src, x2 = stride, w3 = h (rows, processed 2
        // per iteration), w4 = x, w5 = y (sub-pel offsets, 0..7).
        // 8-pixel-wide bilinear chroma interpolation:
        //   dst[i] = (A*s[i] + B*s[i+1] + C*s[i+stride] + D*s[i+stride+1]) >> 6
        // with weights A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=x*y (A+B+C+D = 64).
        // Rounding: h264 rounds to nearest via rshrn (#6); rv40/vc1 instead add
        // a codec-specific bias held in v22 and truncate with shrn.
        // \type is "put" (store result) or "avg" (average with existing dst).
     28 function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
     29  .ifc \type,avg
     30        mov             x8,  x0         // avg: second dst pointer, used for loading old pixels
     31  .endif
     32        prfm            pldl1strm, [x1]
     33        prfm            pldl1strm, [x1, x2]
     34  .ifc \codec,rv40
        // rv40: fetch rounding bias from the rv40bias table and splat it into
        // v22.8h.  Byte offset = (y>>1)*8 + (x>>1)*2, i.e. entry [y>>1][x>>1]
        // of a 4x4 table of .short values.
     35        movrel          x6,  rv40bias
     36        lsr             w9,  w5,  #1
     37        lsr             w10, w4,  #1
     38        lsl             w9,  w9,  #3
     39        lsl             w10, w10, #1
     40        add             w9,  w9,  w10
     41        add             x6,  x6,  w9, uxtw
     42        ld1r            {v22.8h}, [x6]
     43  .endif
     44  .ifc \codec,vc1
     45        movi            v22.8h,   #28   // vc1 uses a fixed bias of 28
     46  .endif
        // Compute the four bilinear weights from x (w4) and y (w5):
     47        mul             w7,  w4,  w5    // w7  = D = x*y
     48        lsl             w14, w5,  #3    // w14 = 8*y
     49        lsl             w13, w4,  #3    // w13 = 8*x
     50        cmp             w7,  #0
     51        sub             w6,  w14, w7    // w6  = C = (8-x)*y
     52        sub             w12, w13, w7    // w12 = B = x*(8-y)
     53        sub             w4,  w7,  w13
     54        sub             w4,  w4,  w14
     55        add             w4,  w4,  #64   // w4  = A = 64 - 8x - 8y + xy = (8-x)*(8-y)
     56        b.eq            2f              // D == 0: at most a 1-D filter is needed
     57 
        // Full 2-D case (x != 0 && y != 0): all four taps contribute.
        // Software-pipelined: the row loaded at the bottom of the loop is the
        // "current" row of the next iteration; two output rows per pass.
     58        dup             v0.8b,  w4      // v0 = A
     59        dup             v1.8b,  w12     // v1 = B
     60        ld1             {v4.8b, v5.8b}, [x1], x2
     61        dup             v2.8b,  w6      // v2 = C
     62        dup             v3.8b,  w7      // v3 = D
     63        ext             v5.8b,  v4.8b,  v5.8b,  #1      // v5 = row shifted left 1 (s[i+1])
     64 1:      ld1             {v6.8b, v7.8b}, [x1], x2
     65        umull           v16.8h, v4.8b,  v0.8b           // row n:   A*s[i]
     66        umlal           v16.8h, v5.8b,  v1.8b           //        + B*s[i+1]
     67        ext             v7.8b,  v6.8b,  v7.8b,  #1
     68        ld1             {v4.8b, v5.8b}, [x1], x2
     69        umlal           v16.8h, v6.8b,  v2.8b           //        + C*s[i+stride]
     70        prfm            pldl1strm, [x1]
     71        ext             v5.8b,  v4.8b,  v5.8b,  #1
     72        umlal           v16.8h, v7.8b,  v3.8b           //        + D*s[i+stride+1]
     73        umull           v17.8h, v6.8b,  v0.8b           // row n+1, same four taps
     74        subs            w3,  w3,  #2    // two output rows per iteration
     75        umlal           v17.8h, v7.8b, v1.8b
     76        umlal           v17.8h, v4.8b, v2.8b
     77        umlal           v17.8h, v5.8b, v3.8b
     78        prfm            pldl1strm, [x1, x2]
     79  .ifc \codec,h264
     80        rshrn           v16.8b, v16.8h, #6      // round to nearest and narrow
     81        rshrn           v17.8b, v17.8h, #6
     82  .else
     83        add             v16.8h, v16.8h, v22.8h  // add codec bias, then truncate
     84        add             v17.8h, v17.8h, v22.8h
     85        shrn            v16.8b, v16.8h, #6
     86        shrn            v17.8b, v17.8h, #6
     87  .endif
     88  .ifc \type,avg
     89        ld1             {v20.8b}, [x8], x2
     90        ld1             {v21.8b}, [x8], x2
     91        urhadd          v16.8b, v16.8b, v20.8b  // rounding average with old dst
     92        urhadd          v17.8b, v17.8b, v21.8b
     93  .endif
     94        st1             {v16.8b}, [x0], x2
     95        st1             {v17.8b}, [x0], x2
     96        b.gt            1b
     97        ret
     98 
        // D == 0: x == 0 and/or y == 0.  Since D = 0, B+C = 8x + 8y here.
     99 2:      adds            w12, w12, w6    // w12 = B+C; zero only if x == 0 && y == 0
    100        dup             v0.8b, w4       // v0 = A
    101        b.eq            5f              // x == 0 && y == 0: copy path
    102        tst             w6,  w6         // C == 0  <=>  y == 0
    103        dup             v1.8b, w12      // v1 = B+C (the single remaining tap)
    104        b.eq            4f              // y == 0: horizontal-only filter
    105 
        // Vertical-only filter (x == 0): blend each row with the row below it.
    106        ld1             {v4.8b}, [x1], x2
    107 3:      ld1             {v6.8b}, [x1], x2
    108        umull           v16.8h, v4.8b,  v0.8b
    109        umlal           v16.8h, v6.8b,  v1.8b
    110        ld1             {v4.8b}, [x1], x2
    111        umull           v17.8h, v6.8b,  v0.8b
    112        umlal           v17.8h, v4.8b,  v1.8b
    113        prfm            pldl1strm, [x1]
    114  .ifc \codec,h264
    115        rshrn           v16.8b, v16.8h, #6
    116        rshrn           v17.8b, v17.8h, #6
    117  .else
    118        add             v16.8h, v16.8h, v22.8h
    119        add             v17.8h, v17.8h, v22.8h
    120        shrn            v16.8b, v16.8h, #6
    121        shrn            v17.8b, v17.8h, #6
    122  .endif
    123        prfm            pldl1strm, [x1, x2]
    124  .ifc \type,avg
    125        ld1             {v20.8b}, [x8], x2
    126        ld1             {v21.8b}, [x8], x2
    127        urhadd          v16.8b, v16.8b, v20.8b
    128        urhadd          v17.8b, v17.8b, v21.8b
    129  .endif
    130        subs            w3,  w3,  #2
    131        st1             {v16.8b}, [x0], x2
    132        st1             {v17.8b}, [x0], x2
    133        b.gt            3b
    134        ret
    135 
        // Horizontal-only filter (y == 0): blend each pixel with its right neighbour.
    136 4:      ld1             {v4.8b, v5.8b}, [x1], x2
    137        ld1             {v6.8b, v7.8b}, [x1], x2
    138        ext             v5.8b,  v4.8b,  v5.8b,  #1
    139        ext             v7.8b,  v6.8b,  v7.8b,  #1
    140        prfm            pldl1strm, [x1]
    141        subs            w3,  w3,  #2
    142        umull           v16.8h, v4.8b, v0.8b
    143        umlal           v16.8h, v5.8b, v1.8b
    144        umull           v17.8h, v6.8b, v0.8b
    145        umlal           v17.8h, v7.8b, v1.8b
    146        prfm            pldl1strm, [x1, x2]
    147  .ifc \codec,h264
    148        rshrn           v16.8b, v16.8h, #6
    149        rshrn           v17.8b, v17.8h, #6
    150  .else
    151        add             v16.8h, v16.8h, v22.8h
    152        add             v17.8h, v17.8h, v22.8h
    153        shrn            v16.8b, v16.8h, #6
    154        shrn            v17.8b, v17.8h, #6
    155  .endif
    156  .ifc \type,avg
    157        ld1             {v20.8b}, [x8], x2
    158        ld1             {v21.8b}, [x8], x2
    159        urhadd          v16.8b, v16.8b, v20.8b
    160        urhadd          v17.8b, v17.8b, v21.8b
    161  .endif
    162        st1             {v16.8b}, [x0], x2
    163        st1             {v17.8b}, [x0], x2
    164        b.gt            4b
    165        ret
    166 
        // No sub-pel offset (x == 0 && y == 0): single tap A = 64, so the
        // multiply + shift just renormalizes (plus the bias for rv40/vc1).
    167 5:      ld1             {v4.8b}, [x1], x2
    168        ld1             {v5.8b}, [x1], x2
    169        prfm            pldl1strm, [x1]
    170        subs            w3,  w3,  #2
    171        umull           v16.8h, v4.8b, v0.8b
    172        umull           v17.8h, v5.8b, v0.8b
    173        prfm            pldl1strm, [x1, x2]
    174  .ifc \codec,h264
    175        rshrn           v16.8b, v16.8h, #6
    176        rshrn           v17.8b, v17.8h, #6
    177  .else
    178        add             v16.8h, v16.8h, v22.8h
    179        add             v17.8h, v17.8h, v22.8h
    180        shrn            v16.8b, v16.8h, #6
    181        shrn            v17.8b, v17.8h, #6
    182  .endif
    183  .ifc \type,avg
    184        ld1             {v20.8b}, [x8], x2
    185        ld1             {v21.8b}, [x8], x2
    186        urhadd          v16.8b, v16.8b, v20.8b
    187        urhadd          v17.8b, v17.8b, v21.8b
    188  .endif
    189        st1             {v16.8b}, [x0], x2
    190        st1             {v17.8b}, [x0], x2
    191        b.gt            5b
    192        ret
    193 endfunc
    194 .endm
    195 
    196 /* chroma_mc4(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
    197 .macro  h264_chroma_mc4 type, codec=h264
        // chroma_mc4(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h, int x, int y)
        // Same bilinear filter as mc8 (weights A=(8-x)(8-y), B=x(8-y), C=(8-x)y,
        // D=x*y) but for 4-pixel-wide blocks.  Because a row is only 4 bytes,
        // two taps are packed into one 8-byte vector (via trn1 on .2s lanes) so
        // a single umull computes both products; trn1/trn2 on .2d followed by an
        // add then folds the two partial sums back together.
        // x0 = dst, x1 = src, x2 = stride, w3 = h (2 rows/iteration), w4 = x, w5 = y.
    198 function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
    199  .ifc \type,avg
    200        mov             x8,  x0         // avg: second dst pointer for loading old pixels
    201  .endif
    202        prfm            pldl1strm, [x1]
    203        prfm            pldl1strm, [x1, x2]
    204  .ifc \codec,rv40
        // rv40 bias: entry [y>>1][x>>1] of the 4x4 .short table, splat into v22.8h.
    205        movrel          x6,  rv40bias
    206        lsr             w9,  w5,  #1
    207        lsr             w10, w4,  #1
    208        lsl             w9,  w9,  #3
    209        lsl             w10, w10, #1
    210        add             w9,  w9,  w10
    211        add             x6,  x6,  w9, uxtw
    212        ld1r            {v22.8h}, [x6]
    213  .endif
    214  .ifc \codec,vc1
    215        movi            v22.8h,   #28   // vc1 uses a fixed bias of 28
    216  .endif
        // Weight computation, identical to mc8:
    217        mul             w7,  w4,  w5    // w7  = D = x*y
    218        lsl             w14, w5,  #3    // w14 = 8*y
    219        lsl             w13, w4,  #3    // w13 = 8*x
    220        cmp             w7,  #0
    221        sub             w6,  w14, w7    // w6  = C = (8-x)*y
    222        sub             w12, w13, w7    // w12 = B = x*(8-y)
    223        sub             w4,  w7,  w13
    224        sub             w4,  w4,  w14
    225        add             w4,  w4,  #64   // w4  = A = (8-x)*(8-y)
    226        b.eq            2f              // D == 0: 1-D filter (or copy) is enough
    227 
        // Full 2-D case.  Pack weight pairs into 32-bit lanes:
        // v0 = {A | B}, v2 = {C | D}; each source vector is likewise packed as
        // {row | row shifted left 1}, so one umull yields A*s[i] and B*s[i+1].
    228        dup             v24.8b,  w4
    229        dup             v25.8b,  w12
    230        ld1             {v4.8b}, [x1], x2
    231        dup             v26.8b,  w6
    232        dup             v27.8b,  w7
    233        ext             v5.8b,  v4.8b,  v5.8b, #1
    234        trn1            v0.2s,  v24.2s, v25.2s
    235        trn1            v2.2s,  v26.2s, v27.2s
    236        trn1            v4.2s,  v4.2s,  v5.2s   // v4 = {row | row+1 pixels}
    237 1:      ld1             {v6.8b}, [x1], x2
    238        ext             v7.8b,  v6.8b,  v7.8b, #1
    239        trn1            v6.2s,  v6.2s,  v7.2s
    240        umull           v18.8h, v4.8b,  v0.8b   // row n:   A,B taps
    241        umlal           v18.8h, v6.8b,  v2.8b   //        + C,D taps (next row)
    242        ld1             {v4.8b}, [x1], x2
    243        ext             v5.8b,  v4.8b,  v5.8b, #1
    244        trn1            v4.2s,  v4.2s,  v5.2s
    245        prfm            pldl1strm, [x1]
    246        umull           v19.8h, v6.8b,  v0.8b   // row n+1, same pattern
    247        umlal           v19.8h, v4.8b,  v2.8b
    248        trn1            v30.2d, v18.2d, v19.2d  // gather low halves of both rows
    249        trn2            v31.2d, v18.2d, v19.2d  // gather high halves
    250        add             v18.8h, v30.8h, v31.8h  // fold: full sums for 2 output rows
    251  .ifc \codec,h264
    252        rshrn           v16.8b, v18.8h, #6      // round to nearest and narrow
    253  .else
    254        add             v18.8h, v18.8h, v22.8h  // add codec bias, then truncate
    255        shrn            v16.8b, v18.8h, #6
    256  .endif
    257        subs            w3,  w3,  #2    // two output rows per iteration
    258        prfm            pldl1strm, [x1, x2]
    259  .ifc \type,avg
    260        ld1             {v20.s}[0], [x8], x2
    261        ld1             {v20.s}[1], [x8], x2
    262        urhadd          v16.8b, v16.8b, v20.8b  // rounding average with old dst
    263  .endif
    264        st1             {v16.s}[0], [x0], x2    // 4 bytes per output row
    265        st1             {v16.s}[1], [x0], x2
    266        b.gt            1b
    267        ret
    268 
        // D == 0: B+C = 8x + 8y, zero only when x == 0 && y == 0.
    269 2:      adds            w12, w12, w6    // w12 = B+C
    270        dup             v30.8b, w4      // v30 = A
    271        b.eq            5f              // x == 0 && y == 0: copy path
    272        tst             w6,  w6         // C == 0  <=>  y == 0
    273        dup             v31.8b, w12     // v31 = B+C (single remaining tap)
    274        trn1            v0.2s,  v30.2s, v31.2s  // v0 = {A | B+C}
    275        trn2            v1.2s,  v30.2s, v31.2s
    276        b.eq            4f              // y == 0: horizontal-only filter
    277 
        // Vertical-only filter (x == 0).  v1 becomes {B+C | A}, the swap of v0,
        // so alternating loads into v4.s[0]/v4.s[1] keep both output rows'
        // partial sums in flight per umull.
    278        ext             v1.8b,  v0.8b,  v1.8b, #4
    279        ld1             {v4.s}[0], [x1], x2
    280 3:      ld1             {v4.s}[1], [x1], x2
    281        umull           v18.8h, v4.8b,  v0.8b
    282        ld1             {v4.s}[0], [x1], x2
    283        umull           v19.8h, v4.8b,  v1.8b
    284        trn1            v30.2d, v18.2d, v19.2d
    285        trn2            v31.2d, v18.2d, v19.2d
    286        add             v18.8h, v30.8h, v31.8h  // fold into 2 output rows
    287        prfm            pldl1strm, [x1]
    288  .ifc \codec,h264
    289        rshrn           v16.8b, v18.8h, #6
    290  .else
    291        add             v18.8h, v18.8h, v22.8h
    292        shrn            v16.8b, v18.8h, #6
    293  .endif
    294  .ifc \type,avg
    295        ld1             {v20.s}[0], [x8], x2
    296        ld1             {v20.s}[1], [x8], x2
    297        urhadd          v16.8b, v16.8b, v20.8b
    298  .endif
    299        subs            w3,  w3,  #2
    300        prfm            pldl1strm, [x1, x2]
    301        st1             {v16.s}[0], [x0], x2
    302        st1             {v16.s}[1], [x0], x2
    303        b.gt            3b
    304        ret
    305 
        // Horizontal-only filter (y == 0): taps {A | B+C} on {row | row+1 pixels}.
    306 4:      ld1             {v4.8b}, [x1], x2
    307        ld1             {v6.8b}, [x1], x2
    308        ext             v5.8b,  v4.8b,  v5.8b, #1
    309        ext             v7.8b,  v6.8b,  v7.8b, #1
    310        trn1            v4.2s,  v4.2s,  v5.2s
    311        trn1            v6.2s,  v6.2s,  v7.2s
    312        umull           v18.8h, v4.8b,  v0.8b
    313        umull           v19.8h, v6.8b,  v0.8b
    314        subs            w3,  w3,  #2
    315        trn1            v30.2d, v18.2d, v19.2d
    316        trn2            v31.2d, v18.2d, v19.2d
    317        add             v18.8h, v30.8h, v31.8h
    318        prfm            pldl1strm, [x1]
    319  .ifc \codec,h264
    320        rshrn           v16.8b, v18.8h, #6
    321  .else
    322        add             v18.8h, v18.8h, v22.8h
    323        shrn            v16.8b, v18.8h, #6
    324  .endif
    325  .ifc \type,avg
    326        ld1             {v20.s}[0], [x8], x2
    327        ld1             {v20.s}[1], [x8], x2
    328        urhadd          v16.8b, v16.8b, v20.8b
    329  .endif
        // NOTE(review): prefetch below is [x1] where the mc8 path uses [x1, x2];
        // harmless (prefetches are hints) but possibly unintended — worth confirming.
    330        prfm            pldl1strm, [x1]
    331        st1             {v16.s}[0], [x0], x2
    332        st1             {v16.s}[1], [x0], x2
    333        b.gt            4b
    334        ret
    335 
        // No sub-pel offset: single tap A = 64, multiply + shift just renormalizes
        // (plus the bias for rv40/vc1).
    336 5:      ld1             {v4.s}[0], [x1], x2
    337        ld1             {v4.s}[1], [x1], x2
    338        umull           v18.8h, v4.8b,  v30.8b
    339        subs            w3,  w3,  #2
    340        prfm            pldl1strm, [x1]
    341  .ifc \codec,h264
    342        rshrn           v16.8b, v18.8h, #6
    343  .else
    344        add             v18.8h, v18.8h, v22.8h
    345        shrn            v16.8b, v18.8h, #6
    346  .endif
    347  .ifc \type,avg
    348        ld1             {v20.s}[0], [x8], x2
    349        ld1             {v20.s}[1], [x8], x2
    350        urhadd          v16.8b, v16.8b, v20.8b
    351  .endif
    352        prfm            pldl1strm, [x1]
    353        st1             {v16.s}[0], [x0], x2
    354        st1             {v16.s}[1], [x0], x2
    355        b.gt            5b
    356        ret
    357 endfunc
    358 .endm
    359 
    360 .macro  h264_chroma_mc2 type
        // chroma_mc2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h, int x, int y)
        // 2-pixel-wide bilinear chroma filter, h264 rounding only (no codec
        // parameter).  With 2-byte rows, both output rows and both horizontal
        // taps are packed into single vectors: weights are interleaved per
        // halfword (trn1 .4h) and the final rev64 .4s + add folds the cross
        // terms.  x0 = dst, x1 = src, x2 = stride, w3 = h, w4 = x, w5 = y.
    361 function ff_\type\()_h264_chroma_mc2_neon, export=1
    362        prfm            pldl1strm, [x1]
    363        prfm            pldl1strm, [x1, x2]
    364        orr             w7,  w4,  w5    // x | y
    365        cbz             w7,  2f         // x == 0 && y == 0: plain copy path
    366 
        // Weights, as in mc8/mc4: A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=x*y.
    367        mul             w7,  w4,  w5    // w7  = D
    368        lsl             w14, w5,  #3    // w14 = 8*y
    369        lsl             w13, w4,  #3    // w13 = 8*x
    370        sub             w6,  w14, w7    // w6  = C
    371        sub             w12, w13, w7    // w12 = B
    372        sub             w4,  w7,  w13
    373        sub             w4,  w4,  w14
    374        add             w4,  w4,  #64   // w4  = A
    375        dup             v0.8b,  w4
    376        dup             v2.8b,  w12
    377        dup             v1.8b,  w6
    378        dup             v3.8b,  w7
    379        trn1            v0.4h,  v0.4h,  v2.4h   // v0 = A/B interleaved per halfword
    380        trn1            v1.4h,  v1.4h,  v3.4h   // v1 = C/D interleaved per halfword
    381 1:
        // Load three consecutive rows: v4 = {row n | row n+1},
        // v5 = {row n+1 | row n+2} (rev64 swaps, then the third row is loaded
        // into the upper lane).  x1 is NOT advanced past row n+2, so it becomes
        // row "n" of the next iteration (rows overlap by one, as the filter needs).
    382        ld1             {v4.s}[0],  [x1], x2
    383        ld1             {v4.s}[1],  [x1], x2
    384        rev64           v5.2s,  v4.2s
    385        ld1             {v5.s}[1],  [x1]
    386        ext             v6.8b,  v4.8b,  v5.8b,  #1      // left-shifted variants for the
    387        ext             v7.8b,  v5.8b,  v4.8b,  #1      // s[i+1] taps
    388        trn1            v4.4h,  v4.4h,  v6.4h   // interleave pixel / pixel+1 pairs
    389        trn1            v5.4h,  v5.4h,  v7.4h
    390        umull           v16.8h, v4.8b,  v0.8b   // A,B taps for both rows
    391        umlal           v16.8h, v5.8b,  v1.8b   // + C,D taps
    392  .ifc \type,avg
    393        ld1             {v18.h}[0], [x0], x2    // avg: fetch the two old dst pairs
    394        ld1             {v18.h}[2], [x0]
    395        sub             x0,  x0,  x2            // rewind dst for the stores below
    396  .endif
    397        rev64           v17.4s, v16.4s          // fold the partial sums
    398        add             v16.8h, v16.8h, v17.8h
    399        rshrn           v16.8b, v16.8h, #6      // (+32) >> 6, round to nearest
    400  .ifc \type,avg
    401        urhadd          v16.8b, v16.8b, v18.8b  // rounding average with old dst
    402  .endif
    403        st1             {v16.h}[0], [x0], x2    // 2 bytes per output row
    404        st1             {v16.h}[2], [x0], x2
    405        subs            w3,  w3,  #2
    406        b.gt            1b
    407        ret
    408 
        // No sub-pel offset: straight 2-byte copy (or copy + average for avg),
        // two rows per iteration.
    409 2:
    410        ld1             {v16.h}[0], [x1], x2
    411        ld1             {v16.h}[1], [x1], x2
    412  .ifc \type,avg
    413        ld1             {v18.h}[0], [x0], x2
    414        ld1             {v18.h}[1], [x0]
    415        sub             x0,  x0,  x2
    416        urhadd          v16.8b, v16.8b, v18.8b
    417  .endif
    418        st1             {v16.h}[0], [x0], x2
    419        st1             {v16.h}[1], [x0], x2
    420        subs            w3,  w3,  #2
    421        b.gt            2b
    422        ret
    423 endfunc
    424 .endm
    425 
        // Instantiate the H.264 variants (rounding via rshrn, no bias table).
    426        h264_chroma_mc8 put
    427        h264_chroma_mc8 avg
    428        h264_chroma_mc4 put
    429        h264_chroma_mc4 avg
    430        h264_chroma_mc2 put
    431        h264_chroma_mc2 avg
    432 
    433 #if CONFIG_RV40_DECODER
        // rv40 rounding-bias table: 4 rows of 4 .short entries, indexed by the
        // macros as rv40bias[y>>1][x>>1] (byte offset (y>>1)*8 + (x>>1)*2).
    434 const   rv40bias
    435        .short           0, 16, 32, 16
    436        .short          32, 28, 32, 28
    437        .short           0, 32, 16, 32
    438        .short          32, 28, 32, 28
    439 endconst
    440 
        // rv40 variants: add the table bias, then truncating shift (shrn).
    441        h264_chroma_mc8 put, rv40
    442        h264_chroma_mc8 avg, rv40
    443        h264_chroma_mc4 put, rv40
    444        h264_chroma_mc4 avg, rv40
    445 #endif
    446 
    447 #if CONFIG_VC1DSP
        // vc1 variants: fixed bias of 28, then truncating shift (shrn).
    448        h264_chroma_mc8 put, vc1
    449        h264_chroma_mc8 avg, vc1
    450        h264_chroma_mc4 put, vc1
    451        h264_chroma_mc4 avg, vc1
    452 #endif