tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

simple_idct_neon.S (12659B)


      1 /*
      2 * ARM NEON IDCT
      3 *
      4 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
      5 * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
      6 *
      7 * Based on Simple IDCT
      8 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
      9 *
     10 * This file is part of FFmpeg.
     11 *
     12 * FFmpeg is free software; you can redistribute it and/or
     13 * modify it under the terms of the GNU Lesser General Public
     14 * License as published by the Free Software Foundation; either
     15 * version 2.1 of the License, or (at your option) any later version.
     16 *
     17 * FFmpeg is distributed in the hope that it will be useful,
     18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     20 * Lesser General Public License for more details.
     21 *
     22 * You should have received a copy of the GNU Lesser General Public
     23 * License along with FFmpeg; if not, write to the Free Software
     24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     25 */
     26 
     27 #include "libavutil/aarch64/asm.S"
     28 
     29 #define Z1  22725  //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
     30 #define Z2  21407  //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
     31 #define Z3  19266  //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
     32 #define Z4  16383  //cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5 evaluates to 16384; the value used here is one less
     33 #define Z5  12873  //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
     34 #define Z6  8867   //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
     35 #define Z7  4520   //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        // Z4c * Z4 approximates 1 << (COL_SHIFT - 1). The idct_col4_neon helpers add
        // Z4c to their first input before multiplying by z4, which folds the
        // column-pass rounding term into that product. With the values in this
        // file the integer division yields 32.
     36 #define Z4c ((1<<(COL_SHIFT-1))/Z4)
        // Right shift applied by the shrn instructions that end the row pass. The
        // matching rounding term 1 << (ROW_SHIFT - 1) is the movi constant at the
        // top of idct_row4_neon.
     37 #define ROW_SHIFT 11
        // Total right shift of the column pass: addhn/subhn keep the high 16 bits
        // of each sum (a shift by 16) and the entry points then shift by the
        // remaining COL_SHIFT-16 bits.
     38 #define COL_SHIFT 20
     39 
     40 #define z1 v0.H[0]   // lane of v0 holding Z1 (v0 is loaded from idct_coeff_neon by idct_start)
     41 #define z2 v0.H[1]   // lane holding Z2
     42 #define z3 v0.H[2]   // lane holding Z3
     43 #define z4 v0.H[3]   // lane holding Z4
     44 #define z5 v0.H[4]   // lane holding Z5
     45 #define z6 v0.H[5]   // lane holding Z6
     46 #define z7 v0.H[6]   // lane holding Z7
     47 #define z4c v0.H[7]  // lane holding Z4c, the column-pass rounding helper
     48 
     49 const   idct_coeff_neon, align=4
               // Eight 16-bit entries (16 bytes) in the lane order expected by the
               // z1..z7 and z4c aliases; idct_start loads them into v0 with one ld1.
               // NOTE(review): const/endconst are macros from asm.S, which is not
               // shown here - the exact meaning of align=4 should be confirmed there.
     50        .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
     51 endconst
     52 
     53 .macro idct_start data
               // Common prologue of the three exported entry points.
               //   data: register holding the address to prefetch. The entry points
               //         pass the pointer their row pass reads from (x2 for the put
               //         and add variants, x0 for the in-place variant).
               // Clobbers x3, x10 and v0.
     54        prfm            pldl1keep, [\data]
     55        mov             x10, x30                 // keep the return address: the later bl calls overwrite x30
     56        movrel          x3, idct_coeff_neon      // movrel is a macro from asm.S; presumably puts the symbol address in x3
     57        ld1             {v0.2d}, [x3]            // v0 = Z1..Z7, Z4c, read through the z1..z4c aliases
     58 .endm
     59 
     60 .macro idct_end
               // Common epilogue: return through x10, where idct_start saved the
               // return address of the exported function.
     61        ret             x10
     62 .endm
     63 
     64 .macro smull1 a, b, c
               // idct_col4_top builds its mnemonics as smull<i>, smlal<i>, smlsl<i>
               // with i = 1 or 2. For i = 2 that names the real smull2/smlal2/smlsl2
               // instructions (upper-half sources). The three wrappers below give the
               // i = 1 spelling a meaning: the plain lower-half instruction.
     65        smull           \a, \b, \c
     66 .endm
     67 
     68 .macro smlal1 a, b, c
               // i = 1 form of smlal<i>: multiply-accumulate on the lower-half lanes.
     69        smlal           \a, \b, \c
     70 .endm
     71 
     72 .macro smlsl1 a, b, c
               // i = 1 form of smlsl<i>: multiply-subtract on the lower-half lanes.
     73        smlsl           \a, \b, \c
     74 .endm
     75 
     76 .macro idct_col4_top y1, y2, y3, y4, i, l
               // First part of a 1-D IDCT evaluated for four lanes at once, covering
               // the terms of the inputs held in y2, y3 and y4 (called in1, in2, in3
               // in the comments below).
               //   y1     not referenced in this body; the caller folds its term
               //          (times z4, plus rounding) into v23 beforehand
               //   i, l   1 with .4h uses the low halves of the sources through the
               //          smull1/smlal1/smlsl1 wrappers; 2 with .8h uses the high
               //          halves through smull2/smlal2/smlsl2
               // In:  v23.4s = common even term, v0 = coefficient table
               // Out: v19, v20, v21, v22 = even sums a0..a3
               //      v17, v18, v5,  v6  = odd sums  b0..b3
               // Also clobbers v7 and v16.
     77        smull\i         v7.4s,  \y3\l, z2
     78        smull\i         v16.4s, \y3\l, z6
     79        smull\i         v17.4s, \y2\l, z1        // b0 = in1*z1
     80        add             v19.4s, v23.4s, v7.4s    // a0 = v23 + in2*z2
     81        smull\i         v18.4s, \y2\l, z3        // b1 = in1*z3
     82        add             v20.4s, v23.4s, v16.4s   // a1 = v23 + in2*z6
     83        smull\i         v5.4s,  \y2\l, z5        // b2 = in1*z5
     84        sub             v21.4s, v23.4s, v16.4s   // a2 = v23 - in2*z6
     85        smull\i         v6.4s,  \y2\l, z7        // b3 = in1*z7
     86        sub             v22.4s, v23.4s, v7.4s    // a3 = v23 - in2*z2
     87 
     88        smlal\i         v17.4s, \y4\l, z3        // b0 += in3*z3
     89        smlsl\i         v18.4s, \y4\l, z7        // b1 -= in3*z7
     90        smlsl\i         v5.4s,  \y4\l, z1        // b2 -= in3*z1
     91        smlsl\i         v6.4s,  \y4\l, z5        // b3 -= in3*z5
     92 .endm
     93 
     94 .macro idct_row4_neon y1, y2, y3, y4, pass
               // First pass: loads 64 bytes from [x2] (post-incremented) into y1..y4
               // and runs four 1-D IDCTs in parallel, one per lane position.
               //   y1..y4  vector register names without arrangement suffix (v24 style)
               //   pass    number used as a local label, so every expansion inside
               //           one function needs a different value
               // Judging by the multipliers applied, lanes 0..3 of y1, y2, y3, y4 act
               // as inputs 0, 1, 2, 3 and lanes 4..7 as inputs 4, 5, 6, 7 of each
               // transform.
               // NOTE(review): the coefficient order in memory that this implies is
               // arranged by the caller and is not visible in this file.
               // On exit y1, y2, y3, y4 each hold the eight 16-bit outputs of one
               // transform (lane positions 0, 1, 2, 3 respectively).
               // Clobbers x3, the flags, v5..v7 and v16..v23; x2 advances by 64.
     95        ld1             {\y1\().2d,\y2\().2d}, [x2], #32
     96        movi            v23.4s, #1<<2, lsl #8    // 1 << 10, the rounding term 1 << (ROW_SHIFT - 1)
     97        orr             v5.16b, \y1\().16b, \y2\().16b
     98        ld1             {\y3\().2d,\y4\().2d}, [x2], #32
     99        orr             v6.16b, \y3\().16b, \y4\().16b
    100        orr             v5.16b, v5.16b, v6.16b
    101        mov             x3, v5.d[1]              // x3 = OR of lanes 4..7 of all four registers
    102        smlal           v23.4s, \y1\().4h, z4    // v23 = rounding + in0*z4
    103 
    104        idct_col4_top   \y1, \y2, \y3, \y4, 1, .4h
    105 
    106        cmp             x3, #0
    107        b.eq            \pass\()f                // inputs 4..7 all zero: their terms add nothing
    108 
               // Terms of inputs 4..7 (the high halves). v7 first holds in4*z4 and is
               // reused for in6*z6 once the four a-sums have consumed it.
    109        smull2          v7.4s, \y1\().8h, z4
    110        smlal2          v17.4s, \y2\().8h, z5
    111        smlsl2          v18.4s, \y2\().8h, z1
    112        smull2          v16.4s, \y3\().8h, z2
    113        smlal2          v5.4s, \y2\().8h, z7
    114        add             v19.4s, v19.4s, v7.4s
    115        sub             v20.4s, v20.4s, v7.4s
    116        sub             v21.4s, v21.4s, v7.4s
    117        add             v22.4s, v22.4s, v7.4s
    118        smlal2          v6.4s, \y2\().8h, z3
    119        smull2          v7.4s, \y3\().8h, z6
    120        smlal2          v17.4s, \y4\().8h, z7
    121        smlsl2          v18.4s, \y4\().8h, z5
    122        smlal2          v5.4s, \y4\().8h, z3
    123        smlsl2          v6.4s, \y4\().8h, z1
    124        add             v19.4s, v19.4s, v7.4s
    125        sub             v20.4s, v20.4s, v16.4s
    126        add             v21.4s, v21.4s, v16.4s
    127        sub             v22.4s, v22.4s, v7.4s
    128 
               // Butterfly and narrowing. The inputs are no longer needed, so y3 and
               // y4 serve as 32-bit temporaries before receiving their final halves.
               // Result before the transpose, as low half | high half:
               //   y1 = out0 | out4, y2 = out1 | out5, y3 = out2 | out6, y4 = out3 | out7
    129 \pass:  add             \y3\().4S, v19.4S, v17.4S
    130        add             \y4\().4s, v20.4s, v18.4s
    131        shrn            \y1\().4h, \y3\().4s, #ROW_SHIFT    // out0 = (a0 + b0) >> ROW_SHIFT
    132        shrn            \y2\().4h, \y4\().4s, #ROW_SHIFT    // out1 = (a1 + b1) >> ROW_SHIFT
    133        add             v7.4s, v21.4s, v5.4s
    134        add             v16.4s, v22.4s, v6.4s
    135        shrn            \y3\().4h, v7.4s, #ROW_SHIFT        // out2 = (a2 + b2) >> ROW_SHIFT
    136        shrn            \y4\().4h, v16.4s, #ROW_SHIFT       // out3 = (a3 + b3) >> ROW_SHIFT
    137        sub             v22.4s, v22.4s, v6.4s
    138        sub             v19.4s, v19.4s, v17.4s
    139        sub             v21.4s, v21.4s, v5.4s
    140        shrn2           \y1\().8h, v22.4s, #ROW_SHIFT       // out4 = (a3 - b3) >> ROW_SHIFT
    141        sub             v20.4s, v20.4s, v18.4s
    142        shrn2           \y2\().8h, v21.4s, #ROW_SHIFT       // out5 = (a2 - b2) >> ROW_SHIFT
    143        shrn2           \y3\().8h, v20.4s, #ROW_SHIFT       // out6 = (a1 - b1) >> ROW_SHIFT
    144        shrn2           \y4\().8h, v19.4s, #ROW_SHIFT       // out7 = (a0 - b0) >> ROW_SHIFT
    145 
               // Transpose the 4x4 halfword tile in each 64-bit half so that every
               // register ends up with out0..out7 of a single transform.
    146        trn1            v16.8h, \y1\().8h, \y2\().8h
    147        trn2            v17.8h, \y1\().8h, \y2\().8h
    148        trn1            v18.8h, \y3\().8h, \y4\().8h
    149        trn2            v19.8h, \y3\().8h, \y4\().8h
    150        trn1            \y1\().4s, v16.4s, v18.4s
    151        trn1            \y2\().4s, v17.4s, v19.4s
    152        trn2            \y3\().4s, v16.4s, v18.4s
    153        trn2            \y4\().4s, v17.4s, v19.4s
    154 .endm
    155 
    156 .macro declare_idct_col4_neon i, l
    157 function idct_col4_neon\i
    158        dup             v23.4h, z4c
    159 .if \i == 1
    160        add             v23.4h, v23.4h, v24.4h
    161 .else
    162        mov             v5.d[0], v24.d[1]
    163        add             v23.4h, v23.4h, v5.4h
    164 .endif
    165        smull           v23.4s, v23.4h, z4
    166 
    167        idct_col4_top   v24, v25, v26, v27, \i, \l
    168 
    169        mov             x4, v28.d[\i - 1]
    170        mov             x5, v29.d[\i - 1]
    171        cmp             x4, #0
    172        b.eq            1f
    173 
    174        smull\i         v7.4s,  v28\l,  z4
    175        add             v19.4s, v19.4s, v7.4s
    176        sub             v20.4s, v20.4s, v7.4s
    177        sub             v21.4s, v21.4s, v7.4s
    178        add             v22.4s, v22.4s, v7.4s
    179 
    180 1:      mov             x4, v30.d[\i - 1]
    181        cmp             x5, #0
    182        b.eq            2f
    183 
    184        smlal\i         v17.4s, v29\l, z5
    185        smlsl\i         v18.4s, v29\l, z1
    186        smlal\i         v5.4s,  v29\l, z7
    187        smlal\i         v6.4s,  v29\l, z3
    188 
    189 2:      mov             x5, v31.d[\i - 1]
    190        cmp             x4, #0
    191        b.eq            3f
    192 
    193        smull\i         v7.4s,  v30\l, z6
    194        smull\i         v16.4s, v30\l, z2
    195        add             v19.4s, v19.4s, v7.4s
    196        sub             v22.4s, v22.4s, v7.4s
    197        sub             v20.4s, v20.4s, v16.4s
    198        add             v21.4s, v21.4s, v16.4s
    199 
    200 3:      cmp             x5, #0
    201        b.eq            4f
    202 
    203        smlal\i         v17.4s, v31\l, z7
    204        smlsl\i         v18.4s, v31\l, z5
    205        smlal\i         v5.4s,  v31\l, z3
    206        smlsl\i         v6.4s,  v31\l, z1
    207 
    208 4:      addhn           v7.4h, v19.4s, v17.4s
    209        addhn2          v7.8h, v20.4s, v18.4s
    210        subhn           v18.4h, v20.4s, v18.4s
    211        subhn2          v18.8h, v19.4s, v17.4s
    212 
    213        addhn           v16.4h, v21.4s, v5.4s
    214        addhn2          v16.8h, v22.4s, v6.4s
    215        subhn           v17.4h, v22.4s, v6.4s
    216        subhn2          v17.8h, v21.4s, v5.4s
    217 
    218        ret
    219 endfunc
    220 .endm
    221 
    222 declare_idct_col4_neon 1, .4H
    223 declare_idct_col4_neon 2, .8H
    224 
    225 function ff_simple_idct_put_neon, export=1
               // Inverse transform of the 64 coefficients read from x2. The result is
               // saturated to unsigned bytes and written as eight 8-byte lines that
               // start at x0, with x0 advanced by x1 after each line.
               // NOTE(review): presumably x0 = destination pixels, x1 = line stride
               // in bytes, x2 = int16 coefficient block - confirm with the C prototype.
               // Uses x3..x5, x10 and v0..v7, v16..v31; nothing is saved on the stack.
    226        idct_start      x2
    227 
    228        idct_row4_neon  v24, v25, v26, v27, 1
    229        idct_row4_neon  v28, v29, v30, v31, 2
    230        bl              idct_col4_neon1
    231 
               // Lanes 0..3: finish the column shift and saturate to unsigned bytes.
               // v1 = out0 out1 out2 out3, v3 = out4 out5 out6 out7 (4 bytes each).
    232        sqshrun         v1.8b,  v7.8h, #COL_SHIFT-16
    233        sqshrun2        v1.16b, v16.8h, #COL_SHIFT-16
    234        sqshrun         v3.8b,  v17.8h, #COL_SHIFT-16
    235        sqshrun2        v3.16b, v18.8h, #COL_SHIFT-16
    236 
    237        bl              idct_col4_neon2
    238 
               // Lanes 4..7, same packing in v2 and v4.
    239        sqshrun         v2.8b,  v7.8h, #COL_SHIFT-16
    240        sqshrun2        v2.16b, v16.8h, #COL_SHIFT-16
    241        sqshrun         v4.8b,  v17.8h, #COL_SHIFT-16
    242        sqshrun2        v4.16b, v18.8h, #COL_SHIFT-16
    243 
               // Interleave the 4-byte groups of both halves so that each 64-bit
               // element is one complete output line: v16 = lines 0 and 1,
               // v17 = lines 2 and 3, v18 = lines 4 and 5, v19 = lines 6 and 7.
    244        zip1            v16.4s, v1.4s, v2.4s
    245        zip2            v17.4s, v1.4s, v2.4s
    246 
    247        st1             {v16.d}[0], [x0], x1
    248        st1             {v16.d}[1], [x0], x1
    249 
    250        zip1            v18.4s, v3.4s, v4.4s
    251        zip2            v19.4s, v3.4s, v4.4s
    252 
    253        st1             {v17.d}[0], [x0], x1
    254        st1             {v17.d}[1], [x0], x1
    255        st1             {v18.d}[0], [x0], x1
    256        st1             {v18.d}[1], [x0], x1
    257        st1             {v19.d}[0], [x0], x1
    258        st1             {v19.d}[1], [x0], x1
    259 
    260        idct_end
    261 endfunc
    262 
    263 function ff_simple_idct_add_neon, export=1
               // Inverse transform of the 64 coefficients read from x2, added to the
               // eight 8-byte lines found at x0 (line step x1). Each sum is saturated
               // to an unsigned byte and stored back to the same lines.
               // NOTE(review): presumably x0 = destination pixels, x1 = line stride
               // in bytes, x2 = int16 coefficient block - confirm with the C prototype.
               // Uses x3..x5, x9, x10 and v0..v7, v16..v31; nothing is saved on the stack.
    264        idct_start      x2
    265 
    266        idct_row4_neon  v24, v25, v26, v27, 1
    267        idct_row4_neon  v28, v29, v30, v31, 2
    268        bl              idct_col4_neon1
    269 
               // Lanes 0..3: apply the rest of the column shift and park the results
               // in v1..v4, because the second helper call reuses v7 and v16..v18.
    270        sshr            v1.8h, v7.8h, #COL_SHIFT-16
    271        sshr            v2.8h, v16.8h, #COL_SHIFT-16
    272        sshr            v3.8h, v17.8h, #COL_SHIFT-16
    273        sshr            v4.8h, v18.8h, #COL_SHIFT-16
    274 
    275        bl              idct_col4_neon2
    276 
               // Lanes 4..7, shifted in place.
    277        sshr            v7.8h, v7.8h, #COL_SHIFT-16
    278        sshr            v16.8h, v16.8h, #COL_SHIFT-16
    279        sshr            v17.8h, v17.8h, #COL_SHIFT-16
    280        sshr            v18.8h, v18.8h, #COL_SHIFT-16
    281 
               // From here loads (through x0), arithmetic and stores (through x9) are
               // interleaved. The zips join lanes 0..3 and 4..7 into whole lines:
               // v23..v30 = lines 0..7 as 16-bit values. v19..v22 receive the
               // existing pixels, two lines per register.
    282        mov             x9,  x0                  // x9 = store pointer; x0 keeps walking the lines for the loads
    283        ld1             {v19.d}[0], [x0], x1
    284        zip1            v23.2d, v1.2d, v7.2d
    285        zip2            v24.2d, v1.2d, v7.2d
    286        ld1             {v19.d}[1], [x0], x1
    287        zip1            v25.2d, v2.2d, v16.2d
    288        zip2            v26.2d, v2.2d, v16.2d
    289        ld1             {v20.d}[0], [x0], x1
    290        zip1            v27.2d, v3.2d, v17.2d
    291        zip2            v28.2d, v3.2d, v17.2d
    292        ld1             {v20.d}[1], [x0], x1
    293        zip1            v29.2d, v4.2d, v18.2d
    294        zip2            v30.2d, v4.2d, v18.2d
    295        ld1             {v21.d}[0], [x0], x1
    296        uaddw           v23.8h, v23.8h, v19.8b   // line 0 + zero-extended pixels
    297        uaddw2          v24.8h, v24.8h, v19.16b  // line 1 + zero-extended pixels
    298        ld1             {v21.d}[1], [x0], x1
    299        sqxtun          v23.8b, v23.8h           // saturate to unsigned bytes: v23 = lines 0 and 1
    300        sqxtun2         v23.16b, v24.8h
    301        ld1             {v22.d}[0], [x0], x1
    302        uaddw           v24.8h, v25.8h, v20.8b
    303        uaddw2          v25.8h, v26.8h, v20.16b
    304        ld1             {v22.d}[1], [x0], x1
    305        sqxtun          v24.8b, v24.8h           // v24 = lines 2 and 3
    306        sqxtun2         v24.16b, v25.8h
    307        st1             {v23.d}[0], [x9], x1
    308        uaddw           v25.8h, v27.8h, v21.8b
    309        uaddw2          v26.8h, v28.8h, v21.16b
    310        st1             {v23.d}[1], [x9], x1
    311        sqxtun          v25.8b, v25.8h           // v25 = lines 4 and 5
    312        sqxtun2         v25.16b, v26.8h
    313        st1             {v24.d}[0], [x9], x1
    314        uaddw           v26.8h, v29.8h, v22.8b
    315        uaddw2          v27.8h, v30.8h, v22.16b
    316        st1             {v24.d}[1], [x9], x1
    317        sqxtun          v26.8b, v26.8h           // v26 = lines 6 and 7
    318        sqxtun2         v26.16b, v27.8h
    319        st1             {v25.d}[0], [x9], x1
    320        st1             {v25.d}[1], [x9], x1
    321        st1             {v26.d}[0], [x9], x1
    322        st1             {v26.d}[1], [x9], x1
    323 
    324        idct_end
    325 endfunc
    326 
    327 function ff_simple_idct_neon, export=1
               // In-place variant: reads 64 16-bit coefficients (128 bytes) from x0
               // and overwrites them with the 16-bit transform results, stored as
               // eight 16-byte lines.
               // NOTE(review): presumably x0 = int16 coefficient block - confirm
               // with the C prototype.
               // Uses x2..x5, x10 and v0..v7, v16..v31; nothing is saved on the stack.
    328        idct_start      x0
    329 
    330        mov             x2,  x0                  // idct_row4_neon reads through x2
    331        idct_row4_neon  v24, v25, v26, v27, 1
    332        idct_row4_neon  v28, v29, v30, v31, 2
    333        sub             x2, x2, #128             // rewind past the two 64-byte row loads for the stores below
    334        bl              idct_col4_neon1
    335 
               // Lanes 0..3: apply the rest of the column shift and park the results
               // in v1..v4, because the second helper call reuses v7 and v16..v18.
    336        sshr            v1.8h, v7.8h, #COL_SHIFT-16
    337        sshr            v2.8h, v16.8h, #COL_SHIFT-16
    338        sshr            v3.8h, v17.8h, #COL_SHIFT-16
    339        sshr            v4.8h, v18.8h, #COL_SHIFT-16
    340 
    341        bl              idct_col4_neon2
    342 
               // Lanes 4..7, shifted in place.
    343        sshr            v7.8h, v7.8h, #COL_SHIFT-16
    344        sshr            v16.8h, v16.8h, #COL_SHIFT-16
    345        sshr            v17.8h, v17.8h, #COL_SHIFT-16
    346        sshr            v18.8h, v18.8h, #COL_SHIFT-16
    347 
               // Join lanes 0..3 and 4..7 into whole lines (v23..v30 = lines 0..7)
               // and store two lines per st1.
    348        zip1            v23.2d, v1.2d, v7.2d
    349        zip2            v24.2d, v1.2d, v7.2d
    350        st1             {v23.2d,v24.2d}, [x2], #32
    351        zip1            v25.2d, v2.2d, v16.2d
    352        zip2            v26.2d, v2.2d, v16.2d
    353        st1             {v25.2d,v26.2d}, [x2], #32
    354        zip1            v27.2d, v3.2d, v17.2d
    355        zip2            v28.2d, v3.2d, v17.2d
    356        st1             {v27.2d,v28.2d}, [x2], #32
    357        zip1            v29.2d, v4.2d, v18.2d
    358        zip2            v30.2d, v4.2d, v18.2d
    359        st1             {v29.2d,v30.2d}, [x2], #32
    360 
    361        idct_end
    362 endfunc