tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mpegaudiodsp_neon.S (7479B)


      1 /*
      2 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
      3 *
      4 * This file is part of FFmpeg.
      5 *
      6 * FFmpeg is free software; you can redistribute it and/or
      7 * modify it under the terms of the GNU Lesser General Public
      8 * License as published by the Free Software Foundation; either
      9 * version 2.1 of the License, or (at your option) any later version.
     10 *
     11 * FFmpeg is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14 * Lesser General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU Lesser General Public
     17 * License along with FFmpeg; if not, write to the Free Software
     18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     19 */
     20 
     21 #include "libavutil/aarch64/asm.S"
     22 
     23 #define FRAC_BITS   23   // fractional bits for sb_samples and dct
     24 #define WFRAC_BITS  16   // fractional bits for window
     25 #define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15) // = 24 with the values above; right-shift count used when the fixed-point path narrows its 64-bit sums
     26 
     27 const   tbl_rev128_s, align=4
               // TBL index vector: output byte i is taken from source byte
               // table[i], so the four 32-bit words of a 128-bit register come
               // out in reverse order while the bytes inside each word keep
               // their order.
     28        .byte           0x0c, 0x0d, 0x0e, 0x0f
     29        .byte           0x08, 0x09, 0x0a, 0x0b
     30        .byte           0x04, 0x05, 0x06, 0x07
     31        .byte           0x00, 0x01, 0x02, 0x03
     32 endconst
     33 
     34 .macro   apply_window   type, st
        // Expands to the exported function ff_mpadsp_apply_window_<type>_neon.
        // The MLA, MLA2, MLS, MLS2 and round_sample macros must already be
        // defined when this macro is expanded; they are purged at its end.
        //   \type  "fixed" selects the integer path (64-bit accumulators, dither
        //          carry, narrowing to 16 bit); any other value takes the .else
        //          branches.
        //   \st    lane size suffix for the output stores (h and s at the two
        //          expansion sites in this file).
        // Arguments as used by the code below:
        //   x0  input buffer of 32-bit words (NOTE(review): presumably the
        //       synthesis buffer of the C prototype - confirm against the caller)
        //   x1  window coefficients (derived pointers are called w / w2 below)
        //   x2  pointer to the 32-bit dither state (fixed path only)
        //   x3  output samples
        //   x4  output stride in elements (incr)
        // The instructions written here modify only x0-x15, v0-v7 and v16-v31.
     35 function ff_mpadsp_apply_window_\type\()_neon, export=1
               // Copy the first 32 words (2 x 64 bytes) of the input buffer to
               // word offset 512 of the same buffer.
     36        mov             x7,  x0
     37        add             x8,  x0,  #512<<2
     38        ld1             {v0.4s,v1.4s,v2.4s,v3.4s},  [x7],  #64
     39        ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x7],  #64
     40        st1             {v0.4s,v1.4s,v2.4s,v3.4s},  [x8],  #64
     41        st1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x8],  #64
               // v27 = TBL indices that reverse the order of four 32-bit words.
     42        movrel          x15, tbl_rev128_s
     43        ld1             {v27.4s}, [x15]
               // Turn the element stride into a byte stride: 2-byte outputs on
               // the fixed path, 4-byte outputs otherwise.
     44 .ifc \type, fixed
     45        lsl             x4,  x4,  #1
     46 .else
     47        lsl             x4,  x4,  #2
     48 .endif
     49        add             x10, x0,  #45<<2        // x10 = input + 45 words (second input pointer)
     50        add             x0,  x0,  #16<<2        // x0  = input + 16 words
     51        add             x1,  x1,  #16<<2        // x1  = window + 16 words
     52        add             x5,  x3,  x4,  lsl #5
     53        sub             x5,  x5,  x4            // samples2 = samples + 31 * stride
     54        neg             x13, x4                 // -incr (samples2 is walked backwards)
     55        mov             x9,  #64<<2             // 64 words: post-index step of all loop loads
     56 .ifc \type, fixed
     57        ld1r            {v16.2s}, [x2]          // dither_state
     58        sxtl            v16.2d, v16.2s          // sign-extend it to 64 bit
     59        movi            v29.16b, #0             // v29 = 0
     60        movi            v30.2d, #(1<<OUT_SHIFT)-1 // both lanes = bits dropped by a shift of OUT_SHIFT
     61        trn1            v31.2d, v29.2d, v30.2d  // v31 = { 0, mask }: selects the upper 64-bit lane
     62        trn2            v30.2d, v30.2d, v29.2d  // v30 = { mask, 0 }: selects the lower 64-bit lane
     63        trn1            v16.2d, v16.2d, v29.2d  // v16 = { dither_state, 0 }
     64 .else
     65        movi            v16.4s, #0
     66        movi            v28.4s, #0              // v28 stays zero; it re-initialises v16 after each pass
     67 .endif
     68        mov             x14, #4                 // outer loop: 4 passes
     69 1:
               // Per pass: x8 = forward input pointer, x6 = w, x7 = w2.
               // x6 = x1 - 16 * x14 bytes, x7 = x1 - 3 words + 16 * x14 bytes.
     70        mov             x8,  x0
     71        sub             x7,  x1,  #3<<2
     72        sub             x6,  x1,  x14, lsl #4
     73        add             x7,  x7,  x14, lsl #4
     74        add             x11, x6, #(32)<<2      // w  + 32
     75        add             x12, x7, #(32)<<2      // w2 + 32
     76        mov             x15, #8                // inner loop: 8 accumulation steps
               // v16 keeps the value carried in (dither remainder or zero);
               // the other accumulators start from zero.
     77        movi            v17.16b, #0
     78        movi            v18.16b, #0
     79        movi            v19.16b, #0
     80 2:
     81        subs            x15, x15, #1            // flags consumed by b.gt at the loop end
     82        ld1             {v0.4s},  [x8],  x9
     83        ld1             {v1.4s},  [x10], x9
     84        ld1             {v2.4s},  [x6],  x9
     85        ld1             {v3.4s},  [x7],  x9
     86        tbl             v6.16b, {v0.16b}, v27.16b // v6 = v0 with its words reversed
     87        tbl             v7.16b, {v1.16b}, v27.16b // v7 = v1 with its words reversed
     88        ld1             {v4.4s},  [x11], x9
     89        ld1             {v5.4s},  [x12], x9
               // v16/v17 += w  * in          v18/v19 -= w2 * reversed(in)
               // v16/v17 -= (w + 32)  * reversed(in2)
               // v18/v19 -= (w2 + 32) * in2
               // (the *2 variants cover the upper lanes on the fixed path)
     90        MLA             v16, v2, v0
     91        MLA2            v17, v2, v0
     92        MLS             v18, v3, v6
     93        MLS2            v19, v3, v6
     94        MLS             v16, v4, v7
     95        MLS2            v17, v4, v7
     96        MLS             v18, v5, v1
     97        MLS2            v19, v5, v1
     98        b.gt            2b
     99 
               // Z is set only on the first pass (x14 == 4). No instruction up
               // to the second b.eq below changes the condition flags, so both
               // b.eq branches test this compare.
    100        cmp             x14, #4
    101        sub             x10, x10, #64<<5        // 64 * 8 * sizeof(int32_t)
    102 
    103 .ifc \type, fixed
               // Take the bits of v16 lane 0 that the shift will drop and move
               // them to the upper lane as carry for the next sample.
    104        and             v28.16b, v16.16b, v30.16b
    105        ext             v28.16b, v29.16b, v28.16b, #8
    106 
    107        b.eq            4f                      // first pass: skip the sample that is not stored below
    108        round_sample    v19, 1, 1
    109 4:
               // The remainder is chained through the 64-bit sums in the same
               // order in which the narrowed samples are stored below; shrn
               // then drops OUT_SHIFT bits of each sum.
    110        round_sample    v16, 1, 0
    111        shrn            v16.2s, v16.2d,  #OUT_SHIFT
    112        round_sample    v19, 0, 0
    113        shrn            v19.2s, v19.2d,  #OUT_SHIFT
    114        round_sample    v17, 0, 1
    115        round_sample    v18, 1, 1
    116        round_sample    v17, 1, 0
    117        shrn2           v16.4s, v17.2d,  #OUT_SHIFT
    118        round_sample    v18, 0, 0
    119        shrn2           v19.4s, v18.2d,  #OUT_SHIFT
    120        sqxtn           v16.4h, v16.4s          // saturate to 16 bit
    121        sqxtn           v18.4h, v19.4s          // lanes: v19 sums first, then v18 sums
    122 .else
    123        ext             v18.16b, v18.16b, v18.16b, #8 // swap the 64-bit halves of v18
    124 .endif
    125 
               // Stores alternate between the forward pointer x3 (+stride) and
               // the backward pointer x5 (-stride).
    126        st1             {v16.\st\()}[0], [x3], x4
    127        b.eq            4f                      // first pass: no store through x5 for this lane
    128        st1             {v18.\st\()}[1], [x5], x13
    129 4:
    130        st1             {v16.\st\()}[1], [x3], x4
    131        st1             {v18.\st\()}[0], [x5], x13
    132        st1             {v16.\st\()}[2], [x3], x4
    133        st1             {v18.\st\()}[3], [x5], x13
    134        st1             {v16.\st\()}[3], [x3], x4
    135        st1             {v18.\st\()}[2], [x5], x13
    136 
    137        mov             v16.16b, v28.16b        // seed next sum: last remainder (fixed) or zero
    138 
    139        subs            x14, x14, #1
    140        add             x0,  x0,  #4<<2         // forward input pointer advances 4 words
    141        sub             x10, x10, #4<<2         // second input pointer retreats 4 words
    142        b.gt            1b
    143 
    144 // computing samples[16]
               // Eight products of (x1 + 32 words) and x0, both stepping by x9,
               // are subtracted from v16; only lane 0 is stored afterwards.
    145        add             x6,  x1,  #32<<2
    146        ld1             {v0.2s},  [x6],  x9
    147        ld1             {v1.2s},  [x0],  x9
    148 .rept   3
    149        ld1             {v2.2s},  [x6],  x9
    150        ld1             {v3.2s},  [x0],  x9
    151        MLS             v16, v0,  v1
    152        ld1             {v0.2s},  [x6],  x9
    153        ld1             {v1.2s},  [x0],  x9
    154        MLS             v16, v2,  v3
    155 .endr
    156        ld1             {v2.2s},  [x6],  x9
    157        ld1             {v3.2s},  [x0],  x9
    158        MLS             v16, v0,  v1
    159        MLS             v16, v2,  v3
    160 
    161 .ifc \type, fixed
    162        and             v28.16b, v16.16b, v30.16b // remainder bits of the final sum
    163        shrn            v20.2s,  v16.2d,  #OUT_SHIFT
    164        xtn             v28.2s,  v28.2d
    165        sqxtn           v20.4h,  v20.4s
    166        st1             {v28.s}[0], [x2]        // save dither_state
    167        st1             {v20.h}[0], [x3]
    168 .else
    169        st1             {v16.s}[0], [x3]
    170 .endif
    171 
    172        ret
    173 endfunc
        // Drop the type-specific helper macros so that they can be defined
        // again before the next expansion of apply_window.
    174 .purgem round_sample
    175 .purgem MLA
    176 .purgem MLA2
    177 .purgem MLS
    178 .purgem MLS2
    179 .endm
    180 
    181 
    182 .macro  round_sample    r, idx, next
        // Fixed-point dither step. Adds the carry held in v28 to the 64-bit
        // accumulator pair \r, then reloads v28 with the masked bits of lane
        // \idx of the result (apply_window sets up v30 as the mask for lane 0
        // and v31 as the mask for lane 1). If the following sample lives in
        // the other lane (\next), the carry is moved there, using the zero
        // register v29 as filler.
    183        add             \r\().2d, \r\().2d, v28.2d
    184 .if \idx != 0
    185        and             v28.16b,  \r\().16b,  v31.16b
    186 .else // \idx == 0
    187        and             v28.16b,  \r\().16b,  v30.16b
    188 .endif
    189 .if \next != \idx
    190  .if \next != 0
    191        ext             v28.16b, v29.16b, v28.16b, #8
    192  .else
    193        ext             v28.16b, v28.16b, v29.16b, #8
    194  .endif
    195 .endif
    196 .endm
    197 .macro  MLA             d, s1, s2
               // Fixed-point variant: signed widening multiply of the two low
               // 32-bit lanes of \s1 and \s2, added to the 64-bit lanes of \d.
    198        smlal           \d\().2d, \s1\().2s, \s2\().2s
    199 .endm
    200 .macro  MLA2            d, s1, s2
               // Fixed-point variant: as MLA, but multiplies the two upper
               // 32-bit lanes of \s1 and \s2.
    201        smlal2          \d\().2d, \s1\().4s, \s2\().4s
    202 .endm
    203 .macro  MLS             d, s1, s2
               // Fixed-point variant: signed widening multiply of the two low
               // 32-bit lanes of \s1 and \s2, subtracted from the 64-bit lanes
               // of \d.
    204        smlsl           \d\().2d, \s1\().2s, \s2\().2s
    205 .endm
    206 .macro  MLS2            d, s1, s2
               // Fixed-point variant: as MLS, but multiplies the two upper
               // 32-bit lanes of \s1 and \s2.
    207        smlsl2          \d\().2d, \s1\().4s, \s2\().4s
    208 .endm
    209 apply_window fixed, h // emits ff_mpadsp_apply_window_fixed_neon; output stores use 16-bit (h) lanes
    210 
    211 
    212 // nothing to do for round_sample and ML{A,S}2
    213 .macro  round_sample    r, idx, next
        // Float variant: intentionally empty. apply_window only invokes
        // round_sample inside its fixed-only branch.
    214 .endm
    215 .macro  MLA2            d, s1, s2
        // Float variant: intentionally empty. The float MLA already operates
        // on all four 32-bit lanes.
    216 .endm
    217 .macro  MLS2            d, s1, s2
        // Float variant: intentionally empty. The float MLS already operates
        // on all four 32-bit lanes.
    218 .endm
    219 .macro  MLA             d, s1, s2
               // Float variant: \d += \s1 * \s2 on four single-precision lanes.
    220        fmla            \d\().4s, \s1\().4s, \s2\().4s
    221 .endm
    222 .macro  MLS             d, s1, s2
               // Float variant: \d -= \s1 * \s2 on four single-precision lanes.
    223        fmls            \d\().4s, \s1\().4s, \s2\().4s
    224 .endm
    225 apply_window float, s // emits ff_mpadsp_apply_window_float_neon; output stores use 32-bit (s) lanes