tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

hpeldsp_neon.S (13962B)


      1 /*
      2 * ARM NEON optimised DSP functions
      3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
      4 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
      5 *
      6 * This file is part of FFmpeg.
      7 *
      8 * FFmpeg is free software; you can redistribute it and/or
      9 * modify it under the terms of the GNU Lesser General Public
     10 * License as published by the Free Software Foundation; either
     11 * version 2.1 of the License, or (at your option) any later version.
     12 *
     13 * FFmpeg is distributed in the hope that it will be useful,
     14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     16 * Lesser General Public License for more details.
     17 *
     18 * You should have received a copy of the GNU Lesser General Public
     19 * License along with FFmpeg; if not, write to the Free Software
     20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     21 */
     22 
     23 #include "libavutil/aarch64/asm.S"
     24 
/*
 * Full-pel copy of a 16-byte-wide block, four rows per iteration.
 * x0 = dst, x1 = src, x2 = row stride (bytes), w3 = height
 * (w3 is assumed to be a multiple of 4 — TODO confirm with callers).
 * avg=1: each output row is the byte-wise rounding average of the
 * source row and the bytes already in dst (urhadd: (a+b+1)>>1).
 * \rnd is accepted for interface uniformity but unused here: a plain
 * copy involves no subsampling, hence nothing to round.
 */
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             x12, x0                         // x12: read cursor into dst
  .endif
1:      ld1             {v0.16b},  [x1], x2             // load four source rows
        ld1             {v1.16b},  [x1], x2
        ld1             {v2.16b},  [x1], x2
        ld1             {v3.16b},  [x1], x2
  .if \avg
        ld1             {v4.16b},  [x12], x2            // read back current dst rows
        urhadd          v0.16b,  v0.16b,  v4.16b        // and blend: (src+dst+1)>>1
        ld1             {v5.16b},  [x12], x2
        urhadd          v1.16b,  v1.16b,  v5.16b
        ld1             {v6.16b},  [x12], x2
        urhadd          v2.16b,  v2.16b,  v6.16b
        ld1             {v7.16b},  [x12], x2
        urhadd          v3.16b,  v3.16b,  v7.16b
  .endif
        subs            w3,  w3,  #4                    // four rows consumed per pass
        st1             {v0.16b},  [x0], x2
        st1             {v1.16b},  [x0], x2
        st1             {v2.16b},  [x0], x2
        st1             {v3.16b},  [x0], x2
        b.ne            1b
        ret
.endm
     51 
/*
 * Horizontal half-pel interpolation, 16 bytes wide, two rows per pass.
 * Each output byte is avg(src[x], src[x+1]); the second load register
 * supplies the 17th byte, and ext #1 builds the one-byte-shifted copy.
 * `avg` is the pixfunc-defined helper: urhadd (rounding) when \rnd=1,
 * uhadd (truncating) when \rnd=0.
 * avg=1: result is further blended with the existing dst bytes.
 * w3 = height, assumed even — TODO confirm with callers.
 */
.macro  pixels16_x2     rnd=1, avg=0
1:      ld1             {v0.16b, v1.16b}, [x1], x2      // row 0: 17+ source bytes
        ld1             {v2.16b, v3.16b}, [x1], x2      // row 1
        subs            w3,  w3,  #2
        ext             v1.16b,  v0.16b,  v1.16b,  #1   // v1 = row 0 shifted left by one byte
        avg             v0.16b,  v0.16b,  v1.16b        // avg(src[x], src[x+1])
        ext             v3.16b,  v2.16b,  v3.16b,  #1
        avg             v2.16b,  v2.16b,  v3.16b
  .if \avg
        ld1             {v1.16b}, [x0], x2              // read back both dst rows
        ld1             {v3.16b}, [x0]
        urhadd          v0.16b,  v0.16b,  v1.16b
        urhadd          v2.16b,  v2.16b,  v3.16b
        sub             x0,  x0,  x2                    // rewind dst to the first row
  .endif
        st1             {v0.16b}, [x0], x2
        st1             {v2.16b}, [x0], x2
        b.ne            1b
        ret
.endm
     72 
/*
 * Vertical half-pel interpolation, 16 bytes wide, two rows per pass.
 * Output row n = avg(src row n, src row n+1); consecutive source rows
 * are kept rotating through v0/v1 so each row is loaded only once.
 * `avg` is urhadd (\rnd=1) or uhadd (\rnd=0) — see pixfunc.
 * avg=1: result is further blended with the existing dst bytes.
 * The loop runs h-2 rows; the final two rows are produced after the
 * loop since they need only one more source row.
 */
.macro  pixels16_y2     rnd=1, avg=0
        sub             w3,  w3,  #2                    // reserve last 2 rows for the tail
        ld1             {v0.16b}, [x1], x2              // prime the two-row pipeline
        ld1             {v1.16b}, [x1], x2
1:      subs            w3,  w3,  #2
        avg             v2.16b,  v0.16b,  v1.16b        // out n   = avg(row n, row n+1)
        ld1             {v0.16b}, [x1], x2              // row n+2 replaces row n
        avg             v3.16b,  v0.16b,  v1.16b        // out n+1 = avg(row n+1, row n+2)
        ld1             {v1.16b}, [x1], x2
  .if \avg
        ld1             {v4.16b}, [x0], x2              // read back both dst rows
        ld1             {v5.16b}, [x0]
        urhadd          v2.16b,  v2.16b,  v4.16b
        urhadd          v3.16b,  v3.16b,  v5.16b
        sub             x0,  x0,  x2                    // rewind dst to the first row
  .endif
        st1             {v2.16b}, [x0], x2
        st1             {v3.16b}, [x0], x2
        b.ne            1b

        // Tail: final two output rows; only one more source row needed.
        avg             v2.16b,  v0.16b,  v1.16b
        ld1             {v0.16b}, [x1], x2
        avg             v3.16b,  v0.16b,  v1.16b
  .if \avg
        ld1             {v4.16b}, [x0], x2
        ld1             {v5.16b}, [x0]
        urhadd          v2.16b,  v2.16b,  v4.16b
        urhadd          v3.16b,  v3.16b,  v5.16b
        sub             x0,  x0,  x2
  .endif
        st1             {v2.16b},     [x0], x2
        st1             {v3.16b},     [x0], x2

        ret
.endm
    108 
/*
 * 2-D half-pel interpolation (x and y), 16 bytes wide, two rows per pass.
 * out[x] = (a + b + c + d + 2) >> 2 when \rnd=1 (mshrn = rshrn adds the 2)
 * out[x] = (a + b + c + d + 1) >> 2 when \rnd=0 (NRND lines add the +1,
 *          mshrn = shrn truncates)
 * where a,b are horizontally adjacent bytes of one source row and c,d of
 * the row below.  Per-row horizontal pair sums (widened to 16 bits with
 * uaddl/uaddl2) are cached across iterations — current row in v16/v20
 * (low/high half), next row in v18/v22 — so each row is summed once.
 */
.macro  pixels16_xy2    rnd=1, avg=0
        sub             w3,  w3,  #2                    // reserve last 2 rows for the tail
        ld1             {v0.16b, v1.16b}, [x1], x2      // 17+ bytes of rows 0 and 1
        ld1             {v4.16b, v5.16b}, [x1], x2
NRND    movi            v26.8H, #1                      // no-rnd bias, added before >>2
        ext             v1.16b,  v0.16b,  v1.16b,  #1   // rows shifted left by one byte
        ext             v5.16b,  v4.16b,  v5.16b,  #1
        uaddl           v16.8h,  v0.8b,   v1.8b         // pair sums, row 0 low/high
        uaddl2          v20.8h,  v0.16b,  v1.16b
        uaddl           v18.8h,  v4.8b,   v5.8b         // pair sums, row 1 low/high
        uaddl2          v22.8h,  v4.16b,  v5.16b
1:      subs            w3,  w3,  #2
        ld1             {v0.16b, v1.16b}, [x1], x2      // fetch the next source row
        add             v24.8h,  v16.8h,  v18.8h        // 4-pixel sums, low half
NRND    add             v24.8H,  v24.8H,  v26.8H
        ext             v30.16b, v0.16b,  v1.16b,  #1
        add             v1.8h,   v20.8h,  v22.8h        // 4-pixel sums, high half
        mshrn           v28.8b,  v24.8h,  #2            // narrow with >>2 (round/trunc)
NRND    add             v1.8H,   v1.8H,   v26.8H
        mshrn2          v28.16b, v1.8h,   #2
  .if \avg
        ld1             {v16.16b},        [x0]          // blend with existing dst row
        urhadd          v28.16b, v28.16b, v16.16b
  .endif
        uaddl           v16.8h,  v0.8b,   v30.8b        // pair sums of the new row
        ld1             {v2.16b, v3.16b}, [x1], x2      // prefetch the row after it
        uaddl2          v20.8h,  v0.16b,  v30.16b
        st1             {v28.16b},        [x0], x2
        add             v24.8h,  v16.8h,  v18.8h        // second output row of the pass
NRND    add             v24.8H,  v24.8H,  v26.8H
        ext             v3.16b,  v2.16b,  v3.16b,  #1
        add             v0.8h,   v20.8h,  v22.8h
        mshrn           v30.8b,  v24.8h,  #2
NRND    add             v0.8H,   v0.8H,   v26.8H
        mshrn2          v30.16b, v0.8h,   #2
  .if \avg
        ld1             {v18.16b},        [x0]
        urhadd          v30.16b, v30.16b, v18.16b
  .endif
        uaddl           v18.8h,   v2.8b,  v3.8b         // pair sums of the prefetched row
        uaddl2          v22.8h,   v2.16b, v3.16b
        st1             {v30.16b},        [x0], x2
        b.gt            1b

        // Tail: final two output rows; only one more source row needed.
        ld1             {v0.16b, v1.16b}, [x1], x2
        add             v24.8h,  v16.8h,  v18.8h
NRND    add             v24.8H,  v24.8H,  v26.8H
        ext             v30.16b, v0.16b,  v1.16b,  #1
        add             v1.8h,   v20.8h,  v22.8h
        mshrn           v28.8b,  v24.8h,  #2
NRND    add             v1.8H,   v1.8H,   v26.8H
        mshrn2          v28.16b, v1.8h,   #2
  .if \avg
        ld1             {v16.16b},        [x0]
        urhadd          v28.16b, v28.16b, v16.16b
  .endif
        uaddl           v16.8h,  v0.8b,   v30.8b
        uaddl2          v20.8h,  v0.16b,  v30.16b
        st1             {v28.16b},        [x0], x2
        add             v24.8h,  v16.8h,  v18.8h
NRND    add             v24.8H,  v24.8H,  v26.8H
        add             v0.8h,   v20.8h,  v22.8h
        mshrn           v30.8b,  v24.8h,  #2
NRND    add             v0.8H,   v0.8H,   v26.8H
        mshrn2          v30.16b, v0.8h,   #2
  .if \avg
        ld1             {v18.16b},        [x0]
        urhadd          v30.16b, v30.16b, v18.16b
  .endif
        st1             {v30.16b},        [x0], x2

        ret
.endm
    182 
/*
 * Full-pel copy of an 8-byte-wide block, four rows per iteration.
 * x0 = dst, x1 = src, x2 = row stride, w3 = height (multiple of 4
 * assumed — TODO confirm with callers).
 * avg=1: dst rows are read back through x0 itself and blended with
 * urhadd; x0 is then rewound by 4 rows (x2 << 2) before the stores.
 * \rnd is unused: a plain copy has nothing to round.
 */
.macro  pixels8         rnd=1, avg=0
1:      ld1             {v0.8b}, [x1], x2               // load four source rows
        ld1             {v1.8b}, [x1], x2
        ld1             {v2.8b}, [x1], x2
        ld1             {v3.8b}, [x1], x2
  .if \avg
        ld1             {v4.8b}, [x0], x2               // read back current dst rows
        urhadd          v0.8b,  v0.8b,  v4.8b           // blend: (src+dst+1)>>1
        ld1             {v5.8b}, [x0], x2
        urhadd          v1.8b,  v1.8b,  v5.8b
        ld1             {v6.8b}, [x0], x2
        urhadd          v2.8b,  v2.8b,  v6.8b
        ld1             {v7.8b}, [x0], x2
        urhadd          v3.8b,  v3.8b,  v7.8b
        sub             x0,  x0,  x2,  lsl #2           // rewind dst by the 4 rows just read
  .endif
        subs            w3,  w3,  #4
        st1             {v0.8b}, [x0], x2
        st1             {v1.8b}, [x0], x2
        st1             {v2.8b}, [x0], x2
        st1             {v3.8b}, [x0], x2
        b.ne            1b
        ret
.endm
    207 
/*
 * Horizontal half-pel interpolation, 8 bytes wide, two rows per pass.
 * Each output byte is avg(src[x], src[x+1]); the pair-load brings in the
 * 9th byte and ext #1 forms the shifted copy.  `avg` = urhadd (\rnd=1)
 * or uhadd (\rnd=0) — see pixfunc.
 * avg=1: result is further blended with the existing dst bytes.
 * w3 = height, assumed even — TODO confirm with callers.
 */
.macro  pixels8_x2      rnd=1, avg=0
1:      ld1             {v0.8b, v1.8b}, [x1], x2        // row 0: 9+ source bytes
        ext             v1.8b,  v0.8b,  v1.8b,  #1      // row 0 shifted left by one byte
        ld1             {v2.8b, v3.8b}, [x1], x2        // row 1
        ext             v3.8b,  v2.8b,  v3.8b,  #1
        subs            w3,  w3,  #2
        avg             v0.8b,   v0.8b,   v1.8b         // avg(src[x], src[x+1])
        avg             v2.8b,   v2.8b,   v3.8b
  .if \avg
        ld1             {v4.8b},     [x0], x2           // read back both dst rows
        ld1             {v5.8b},     [x0]
        urhadd          v0.8b,   v0.8b,   v4.8b
        urhadd          v2.8b,   v2.8b,   v5.8b
        sub             x0,  x0,  x2                    // rewind dst to the first row
  .endif
        st1             {v0.8b}, [x0], x2
        st1             {v2.8b}, [x0], x2
        b.ne            1b
        ret
.endm
    228 
/*
 * Vertical half-pel interpolation, 8 bytes wide, two rows per pass.
 * Output row n = avg(src row n, src row n+1); source rows rotate through
 * v0/v1 so each is loaded once.  `avg` = urhadd (\rnd=1) / uhadd (\rnd=0).
 * avg=1: result is further blended with the existing dst bytes.
 * Loop covers h-2 rows; the last two are produced after the loop since
 * they need only one more source row.
 */
.macro  pixels8_y2      rnd=1, avg=0
        sub             w3,  w3,  #2                    // reserve last 2 rows for the tail
        ld1             {v0.8b},  [x1], x2              // prime the two-row pipeline
        ld1             {v1.8b},  [x1], x2
1:      subs            w3,  w3,  #2
        avg             v4.8b,  v0.8b,  v1.8b           // out n   = avg(row n, row n+1)
        ld1             {v0.8b},  [x1], x2              // row n+2 replaces row n
        avg             v5.8b,  v0.8b,  v1.8b           // out n+1 = avg(row n+1, row n+2)
        ld1             {v1.8b},  [x1], x2
  .if \avg
        ld1             {v2.8b},     [x0], x2           // read back both dst rows
        ld1             {v3.8b},     [x0]
        urhadd          v4.8b,  v4.8b,  v2.8b
        urhadd          v5.8b,  v5.8b,  v3.8b
        sub             x0,  x0,  x2                    // rewind dst to the first row
  .endif
        st1             {v4.8b},     [x0], x2
        st1             {v5.8b},     [x0], x2
        b.ne            1b

        // Tail: final two output rows; only one more source row needed.
        avg             v4.8b,  v0.8b,  v1.8b
        ld1             {v0.8b},  [x1], x2
        avg             v5.8b,  v0.8b,  v1.8b
  .if \avg
        ld1             {v2.8b},     [x0], x2
        ld1             {v3.8b},     [x0]
        urhadd          v4.8b,  v4.8b,  v2.8b
        urhadd          v5.8b,  v5.8b,  v3.8b
        sub             x0,  x0,  x2
  .endif
        st1             {v4.8b},     [x0], x2
        st1             {v5.8b},     [x0], x2

        ret
.endm
    264 
/*
 * 2-D half-pel interpolation (x and y), 8 bytes wide, two rows per pass.
 * out[x] = (a+b+c+d+2)>>2 with \rnd=1 (mshrn = rshrn), or (a+b+c+d+1)>>2
 * with \rnd=0 (NRND adds the +1, mshrn = shrn truncates).
 * Full 16-byte loads fetch the 9 source bytes each row needs; the ext #1
 * results use only their low 8 lanes, so the undefined upper half of the
 * second ext operand (v4/v6 on first use) never reaches an output.
 * Pair sums of the current/next row are cached in v16/v17 across
 * iterations.  avg=1: each output row is blended with existing dst bytes.
 */
.macro  pixels8_xy2     rnd=1, avg=0
        sub             w3,  w3,  #2                    // reserve last 2 rows for the tail
        ld1             {v0.16b},     [x1], x2          // rows 0 and 1 (9 bytes used)
        ld1             {v1.16b},     [x1], x2
NRND    movi            v19.8H, #1                      // no-rnd bias, added before >>2
        ext             v4.16b,  v0.16b,  v4.16b,  #1   // rows shifted left by one byte
        ext             v6.16b,  v1.16b,  v6.16b,  #1
        uaddl           v16.8h,  v0.8b,  v4.8b          // pair sums, row 0
        uaddl           v17.8h,  v1.8b,  v6.8b          // pair sums, row 1
1:      subs            w3,  w3,  #2
        ld1             {v0.16b},     [x1], x2          // next source row
        add             v18.8h, v16.8h,  v17.8h         // 4-pixel sums for output row n
        ext             v4.16b,  v0.16b,  v4.16b,  #1
NRND    add             v18.8H, v18.8H, v19.8H
        uaddl           v16.8h,  v0.8b,  v4.8b          // pair sums of the new row
        mshrn           v5.8b,  v18.8h, #2              // narrow with >>2 (round/trunc)
        ld1             {v1.16b},     [x1], x2          // row after that
        add             v18.8h, v16.8h,  v17.8h         // sums for output row n+1
  .if \avg
        ld1             {v7.8b},     [x0]               // blend with existing dst row
        urhadd          v5.8b,  v5.8b,  v7.8b
  .endif
NRND    add             v18.8H, v18.8H, v19.8H
        st1             {v5.8b},     [x0], x2
        mshrn           v7.8b,  v18.8h, #2
  .if \avg
        ld1             {v5.8b},     [x0]
        urhadd          v7.8b,  v7.8b,  v5.8b
  .endif
        ext             v6.16b,  v1.16b,  v6.16b,  #1
        uaddl           v17.8h,  v1.8b,   v6.8b         // pair sums of the prefetched row
        st1             {v7.8b},     [x0], x2
        b.gt            1b

        // Tail: final two output rows; only one more source row needed.
        ld1             {v0.16b},     [x1], x2
        add             v18.8h, v16.8h, v17.8h
        ext             v4.16b, v0.16b, v4.16b,  #1
NRND    add             v18.8H, v18.8H, v19.8H
        uaddl           v16.8h,  v0.8b, v4.8b
        mshrn           v5.8b,  v18.8h, #2
        add             v18.8h, v16.8h, v17.8h
  .if \avg
        ld1             {v7.8b},     [x0]
        urhadd          v5.8b,  v5.8b,  v7.8b
  .endif
NRND    add             v18.8H, v18.8H, v19.8H
        st1             {v5.8b},     [x0], x2
        mshrn           v7.8b,  v18.8h, #2
  .if \avg
        ld1             {v5.8b},     [x0]
        urhadd          v7.8b,  v7.8b,  v5.8b
  .endif
        st1             {v7.8b},     [x0], x2

        ret
.endm
    321 
/*
 * Instantiate one exported function ff_\pfx\name\suf\()_neon whose body
 * is the pixel macro \name, specialised by \rnd and \avg.
 * Before expansion it (re)defines the rounding-dependent helpers the
 * pixel macros use:
 *   avg    - byte average:      urhadd (rounding) / uhadd (truncating)
 *   mshrn  - narrowing shift:   rshrn  (rounding) / shrn  (truncating)
 *   mshrn2 - same, upper half:  rshrn2 / shrn2
 *   NRND   - line prefix that emits its operand only in the no-round
 *            case (\rnd=0), and expands to nothing otherwise
 * and purges all four afterwards so the next pixfunc invocation can
 * define them again (gas forbids redefining a live macro).
 */
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        urhadd          \rd, \rn, \rm
    .endm
    .macro mshrn rd, rn, rm
        rshrn           \rd, \rn, \rm
    .endm
    .macro mshrn2 rd, rn, rm
        rshrn2          \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        uhadd           \rd, \rn, \rm
    .endm
    .macro mshrn rd, rn, rm
        shrn            \rd, \rn, \rm
    .endm
    .macro mshrn2 rd, rn, rm
        shrn2           \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         mshrn
        .purgem         mshrn2
        .purgem         NRND
.endm
    357 
/*
 * Instantiate both rounding variants of a pixel macro: the plain
 * rounding version (no suffix) and the truncating "_no_rnd" version.
 */
.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm
    362 
/*
 * H.264 qpel mc00 (full-pel) entry point: force h = 16 and fall through
 * (no ret here) into ff_put_pixels16_neon, which pixfunc emits
 * immediately below.
 */
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             w3,  #16
endfunc

        // put variants, 16 wide; plain pixels16 needs no no-rnd version
        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

/*
 * As above: set h = 16 and fall through into ff_avg_pixels16_neon.
 */
function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             w3,  #16
endfunc

        // avg variants, 16 wide
        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

/*
 * Set h = 8 and fall through into ff_put_pixels8_neon.
 */
function ff_put_h264_qpel8_mc00_neon, export=1
        mov             w3,  #8
endfunc

        // put variants, 8 wide
        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

/*
 * Set h = 8 and fall through into ff_avg_pixels8_neon.
 */
function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             w3,  #8
endfunc

        // avg variants, 8 wide; only rounded versions are instantiated
        // here (pixfunc, not pixfunc2 — no _no_rnd avg 8-wide functions)
        pixfunc         avg_, pixels8,     avg=1
        pixfunc         avg_, pixels8_x2,  avg=1
        pixfunc         avg_, pixels8_y2,  avg=1
        pixfunc         avg_, pixels8_xy2, avg=1