tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

float_dsp_vfp.S (15252B)


      1 /*
      2 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
      3 *
      4 * This file is part of FFmpeg
      5 *
      6 * FFmpeg is free software; you can redistribute it and/or
      7 * modify it under the terms of the GNU Lesser General Public
      8 * License as published by the Free Software Foundation; either
      9 * version 2.1 of the License, or (at your option) any later version.
     10 *
     11 * FFmpeg is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14 * Lesser General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU Lesser General Public
     17 * License along with FFmpeg; if not, write to the Free Software
     18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     19 */
     20 
     21 #include "config.h"
     22 #include "asm.S"
     23 
     24 /**
     25 * Assume that len is a positive number and is a multiple of 8
     26 */
     27 @ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
     28 function ff_vector_fmul_vfp, export=1
               /*
                * Element-wise product of two float arrays:
                *     dst[i] = src0[i] * src1[i]
                * Going by the prototype comment above and the AAPCS argument
                * order: r0 = dst, r1 = src0, r2 = src1, r3 = len.
                * r12 holds the FPSCR value written on entry; nothing in the loop
                * writes r12, so the exit code derives the restored value from it.
                * Each loop pass handles up to 16 floats: one group of 8 kept in
                * s0-s15 and, when enough input remains, a second group of 8 kept
                * in s16-s31.
                */
               /* s16-s31 (d8-d15) are used as scratch below; they are callee-saved under the AAPCS */
     29        vpush           {d8-d15}
     30        fmrx            r12, fpscr
     31        orr             r12, r12, #(3 << 16) /* set vector size to 4 */
     32        fmxr            fpscr, r12
     33 
               /*
                * Preload the first 8 floats of each source. With the vector
                * length set to 4, a single vmul covers four consecutive
                * registers, e.g. s8-s11 = s0-s3 * s8-s11.
                */
     34        vldmia          r1!, {s0-s3}
     35        vldmia          r2!, {s8-s11}
     36        vldmia          r1!, {s4-s7}
     37        vldmia          r2!, {s12-s15}
     38        vmul.f32        s8,  s0,  s8
     39 1:
               /*
                * r3 -= 16. After this:
                *   ge: at least 16 floats were left, so a second group of 8 is
                *       loaded, multiplied and stored in this pass;
                *   gt: more input follows, so the next first group is preloaded.
                * The it/ittt/itttt lines are IT blocks covering the conditional
                * instructions that follow them (needed for Thumb-2 encodings).
                */
     40        subs            r3,  r3,  #16
     41        vmul.f32        s12, s4,  s12
     42        itttt           ge
     43        vldmiage        r1!, {s16-s19}
     44        vldmiage        r2!, {s24-s27}
     45        vldmiage        r1!, {s20-s23}
     46        vldmiage        r2!, {s28-s31}
     47        it              ge
     48        vmulge.f32      s24, s16, s24
               /* the first group is always stored */
     49        vstmia          r0!, {s8-s11}
     50        vstmia          r0!, {s12-s15}
     51        it              ge
     52        vmulge.f32      s28, s20, s28
               /* s0-s15 are free again: preload the first group of the next pass */
     53        itttt           gt
     54        vldmiagt        r1!, {s0-s3}
     55        vldmiagt        r2!, {s8-s11}
     56        vldmiagt        r1!, {s4-s7}
     57        vldmiagt        r2!, {s12-s15}
               /*
                * ge also covers r3 == 0, where nothing new was loaded into
                * s0-s3/s8-s11; that product is never stored because the loop
                * exits right after.
                */
     58        ittt            ge
     59        vmulge.f32      s8,  s0,  s8
     60        vstmiage        r0!, {s24-s27}
     61        vstmiage        r0!, {s28-s31}
     62        bgt             1b
     63 
               /* clear the whole vector-length field before returning to the caller */
     64        bic             r12, r12, #(7 << 16) /* set vector size back to 1 */
     65        fmxr            fpscr, r12
     66        vpop            {d8-d15}
     67        bx              lr
     68 endfunc
     69 
     70 /**
     71 * ARM VFP implementation of 'vector_fmul_window_c' function
     72 * Assume that len is a positive non-zero number
     73 */
     74 @ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
     75 @                                const float *src1, const float *win, int len)
     76 function ff_vector_fmul_window_vfp, export=1
        /*
         * Windowed overlap of two float arrays. For i in [0, len) and
         * j = 2*len - 1 - i, the code below computes
         *     dst[i] = win[j] * src0[i] - win[i] * src1[len - 1 - i]
         *     dst[j] = win[i] * src0[i] + win[j] * src1[len - 1 - i]
         * dst and win are therefore accessed over 2*len floats, src0 and src1
         * over len floats.
         *
         * Register aliases (a1-a4 = r0-r3, v1-v3 = r4-r6, ip = r12):
         *   DST0  ascending output pointer, starts at dst
         *   SRC0  ascending pointer into src0
         *   SRC1  descending pointer into src1, starts at src1 + len
         *   WIN0  ascending pointer into win
         *   LEN   element count, fetched from the stack (fifth argument)
         *   DST1  descending output pointer, starts at dst + 2*len
         *   WIN1  descending pointer into win, starts at win + 2*len
         *   OLDFPSCR  FPSCR value on entry, restored before returning
         */
     77 DST0    .req    a1
     78 SRC0    .req    a2
     79 SRC1    .req    a3
     80 WIN0    .req    a4
     81 LEN     .req    v1
     82 DST1    .req    v2
     83 WIN1    .req    v3
     84 OLDFPSCR .req   ip
     85 
               /* four words are pushed, so the first stack argument (len) sits at sp + 4*4 */
     86        push    {v1-v3,lr}
     87        ldr     LEN, [sp, #4*4+0]
     88        vpush   {s16-s31}
     89        fmrx    OLDFPSCR, FPSCR
               /* lsl #3 = 2*len floats in bytes, lsl #2 = len floats in bytes */
     90        add     DST1, DST0, LEN, lsl #3
     91        add     SRC1, SRC1, LEN, lsl #2
     92        add     WIN1, WIN0, LEN, lsl #3
     93 
     94        tst     LEN, #7
     95        beq     4f                          @ common case: len is a multiple of 8
     96 
               /* lr was saved by the push above, so it is free to use as a scratch register */
     97        ldr     lr, =0x03000000             @ RunFast mode, scalar mode
     98        fmxr    FPSCR, lr
     99 
               /*
                * Leftover handling in scalar mode: bits 0, 1 and 2 of LEN select
                * blocks that process 1, 2 and 4 elements from each end.
                */
    100        tst     LEN, #1
    101        beq     1f
    102        vldmdb  WIN1!, {s0}
    103        vldmia  SRC0!, {s8}
    104        vldmia  WIN0!, {s16}
    105        vmul.f  s24, s0, s8
    106        vldmdb  SRC1!, {s20}
    107        vmul.f  s8, s16, s8
    108        vmls.f  s24, s16, s20
    109        vmla.f  s8, s0, s20
    110        vstmia  DST0!, {s24}
    111        vstmdb  DST1!, {s8}
    112 1:
    113        tst     LEN, #2
    114        beq     2f
               /*
                * The descending streams are moved one register at a time so that
                * the element at the highest address lands in the lowest-numbered
                * register, lining it up with the ascending streams.
                */
    115        vldmdb  WIN1!, {s0}
    116        vldmdb  WIN1!, {s1}
    117        vldmia  SRC0!, {s8-s9}
    118        vldmia  WIN0!, {s16-s17}
    119        vmul.f  s24, s0, s8
    120        vmul.f  s25, s1, s9
    121        vldmdb  SRC1!, {s20}
    122        vldmdb  SRC1!, {s21}
    123        vmul.f  s8, s16, s8
    124        vmul.f  s9, s17, s9
    125        vmls.f  s24, s16, s20
    126        vmls.f  s25, s17, s21
    127        vmla.f  s8, s0, s20
    128        vmla.f  s9, s1, s21
    129        vstmia  DST0!, {s24-s25}
    130        vstmdb  DST1!, {s8}
    131        vstmdb  DST1!, {s9}
    132 2:
    133        tst     LEN, #4
    134        beq     3f
    135        vldmdb  WIN1!, {s0}
    136        vldmdb  WIN1!, {s1}
    137        vldmdb  WIN1!, {s2}
    138        vldmdb  WIN1!, {s3}
    139        vldmia  SRC0!, {s8-s11}
    140        vldmia  WIN0!, {s16-s19}
    141        vmul.f  s24, s0, s8
    142        vmul.f  s25, s1, s9
    143        vmul.f  s26, s2, s10
    144        vmul.f  s27, s3, s11
    145        vldmdb  SRC1!, {s20}
    146        vldmdb  SRC1!, {s21}
    147        vldmdb  SRC1!, {s22}
    148        vldmdb  SRC1!, {s23}
    149        vmul.f  s8, s16, s8
    150        vmul.f  s9, s17, s9
    151        vmul.f  s10, s18, s10
    152        vmul.f  s11, s19, s11
    153        vmls.f  s24, s16, s20
    154        vmls.f  s25, s17, s21
    155        vmls.f  s26, s18, s22
    156        vmls.f  s27, s19, s23
    157        vmla.f  s8, s0, s20
    158        vmla.f  s9, s1, s21
    159        vmla.f  s10, s2, s22
    160        vmla.f  s11, s3, s23
    161        vstmia  DST0!, {s24-s27}
    162        vstmdb  DST1!, {s8}
    163        vstmdb  DST1!, {s9}
    164        vstmdb  DST1!, {s10}
    165        vstmdb  DST1!, {s11}
    166 3:
               /* drop the leftover bits; if nothing remains, skip the vector loop */
    167        bics    LEN, LEN, #7
    168        beq     7f
    169 4:
    170        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
    171        fmxr    FPSCR, lr
    172 
               /*
                * Software-pipelined main loop working on groups of four floats
                * from each end. Two groups are in flight at a time:
                *   first group:  WIN1 in s0-s3, SRC0 in s8-s11, WIN0 in s16-s19,
                *                 SRC1 in s20-s23, results in s24-s27 and s8-s11
                *   second group: WIN1 in s4-s7, SRC0 in s12-s15, WIN0 in s20-s23,
                *                 SRC1 in s16-s19, results in s28-s31 and s12-s15
                * The indentation depth shows which in-flight group an instruction
                * belongs to. The code before label 5 starts the first two groups.
                */
    173        vldmdb  WIN1!, {s0}
    174        vldmdb  WIN1!, {s1}
    175        vldmdb  WIN1!, {s2}
    176        vldmdb  WIN1!, {s3}
    177        vldmia  SRC0!, {s8-s11}
    178        vldmia  WIN0!, {s16-s19}
    179        vmul.f  s24, s0, s8                     @ vector * vector
    180        vldmdb  SRC1!, {s20}
    181        vldmdb  SRC1!, {s21}
    182        vldmdb  SRC1!, {s22}
    183        vldmdb  SRC1!, {s23}
    184        vmul.f  s8, s16, s8                     @ vector * vector
    185        vmls.f  s24, s16, s20                   @ vector * vector
    186            vldmdb  WIN1!, {s4}
    187            vldmdb  WIN1!, {s5}
    188            vldmdb  WIN1!, {s6}
    189            vldmdb  WIN1!, {s7}
    190            vldmia  SRC0!, {s12-s13}
    191        vmla.f  s8, s0, s20                     @ vector * vector
    192            vldmia  SRC0!, {s14-s15}
               /* exactly 8 elements in total: go straight to the drain code */
    193        subs    LEN, LEN, #8
    194        beq     6f
               /* each pass of the loop retires 8 elements (two groups) from each end */
    195 5:          vldmia  WIN0!, {s20-s23}
    196            vmul.f  s28, s4, s12                @ vector * vector
    197        vstmia  DST0!, {s24-s25}
    198            vldmdb  SRC1!, {s16}
    199            vldmdb  SRC1!, {s17}
    200            vldmdb  SRC1!, {s18}
    201            vldmdb  SRC1!, {s19}
    202            vmul.f  s12, s20, s12               @ vector * vector
    203        vstmia  DST0!, {s26-s27}
    204        vstmdb  DST1!, {s8}
    205        vstmdb  DST1!, {s9}
    206        vstmdb  DST1!, {s10}
    207        vstmdb  DST1!, {s11}
    208            vmls.f  s28, s20, s16               @ vector * vector
    209                vldmdb  WIN1!, {s0}
    210                vldmdb  WIN1!, {s1}
    211                vldmdb  WIN1!, {s2}
    212                vldmdb  WIN1!, {s3}
    213                vldmia  SRC0!, {s8-s9}
    214            vmla.f  s12, s4, s16                @ vector * vector
    215                vldmia  SRC0!, {s10-s11}
               /*
                * The flags set here are consumed by the bne at the end of the
                * loop; none of the VFP instructions in between writes the
                * condition flags.
                */
    216        subs    LEN, LEN, #8
    217                vldmia  WIN0!, {s16-s19}
    218                vmul.f  s24, s0, s8             @ vector * vector
    219            vstmia  DST0!, {s28-s29}
    220                vldmdb  SRC1!, {s20}
    221                vldmdb  SRC1!, {s21}
    222                vldmdb  SRC1!, {s22}
    223                vldmdb  SRC1!, {s23}
    224                vmul.f  s8, s16, s8             @ vector * vector
    225            vstmia  DST0!, {s30-s31}
    226            vstmdb  DST1!, {s12}
    227            vstmdb  DST1!, {s13}
    228            vstmdb  DST1!, {s14}
    229            vstmdb  DST1!, {s15}
    230                vmls.f  s24, s16, s20           @ vector * vector
    231                    vldmdb  WIN1!, {s4}
    232                    vldmdb  WIN1!, {s5}
    233                    vldmdb  WIN1!, {s6}
    234                    vldmdb  WIN1!, {s7}
    235                    vldmia  SRC0!, {s12-s13}
    236                vmla.f  s8, s0, s20             @ vector * vector
    237                    vldmia  SRC0!, {s14-s15}
    238        bne     5b
               /* drain: finish the second group and store both pending groups */
    239 6:                  vldmia  WIN0!, {s20-s23}
    240                    vmul.f  s28, s4, s12        @ vector * vector
    241                vstmia  DST0!, {s24-s25}
    242                    vldmdb  SRC1!, {s16}
    243                    vldmdb  SRC1!, {s17}
    244                    vldmdb  SRC1!, {s18}
    245                    vldmdb  SRC1!, {s19}
    246                    vmul.f  s12, s20, s12       @ vector * vector
    247                vstmia  DST0!, {s26-s27}
    248                vstmdb  DST1!, {s8}
    249                vstmdb  DST1!, {s9}
    250                vstmdb  DST1!, {s10}
    251                vstmdb  DST1!, {s11}
    252                    vmls.f  s28, s20, s16       @ vector * vector
    253                    vmla.f  s12, s4, s16        @ vector * vector
    254                    vstmia  DST0!, {s28-s31}
    255                    vstmdb  DST1!, {s12}
    256                    vstmdb  DST1!, {s13}
    257                    vstmdb  DST1!, {s14}
    258                    vstmdb  DST1!, {s15}
    259 7:
               /* restore the FPSCR seen on entry; popping into pc returns to the caller */
    260        fmxr    FPSCR, OLDFPSCR
    261        vpop    {s16-s31}
    262        pop     {v1-v3,pc}
    263 
    264        .unreq  DST0
    265        .unreq  SRC0
    266        .unreq  SRC1
    267        .unreq  WIN0
    268        .unreq  LEN
    269        .unreq  OLDFPSCR
    270        .unreq  DST1
    271        .unreq  WIN1
    272 endfunc
    273 
    274 /**
    275 * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
    276 * Assume that len is a positive number and is a multiple of 8
    277 */
    278 @ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
    279 @                                 const float *src1, int len)
    280 function ff_vector_fmul_reverse_vfp, export=1
               /*
                * Product of src0 with src1 read back to front:
                *     dst[i] = src0[i] * src1[len - 1 - i]
                * Going by the prototype comment above and the AAPCS argument
                * order: r0 = dst, r1 = src0, r2 = src1, r3 = len.
                * FPSCR is not touched here; every vmul below names its own
                * register pair, pairing the ascending src0 registers with the
                * src1 registers in reverse order.
                * Each loop pass handles up to 16 floats: one group of 8 kept in
                * s0-s15 and, when enough input remains, a second group of 8 kept
                * in s16-s31.
                */
               /* s16-s31 (d8-d15) are used as scratch below; they are callee-saved under the AAPCS */
    281        vpush           {d8-d15}
               /* r2 = src1 + len, read downwards from here on */
    282        add             r2,  r2,  r3, lsl #2
               /* after vldmdb, s3 holds the float at the highest address, i.e. the partner of s8 */
    283        vldmdb          r2!, {s0-s3}
    284        vldmia          r1!, {s8-s11}
    285        vldmdb          r2!, {s4-s7}
    286        vldmia          r1!, {s12-s15}
    287        vmul.f32        s8,  s3,  s8
    288        vmul.f32        s9,  s2,  s9
    289        vmul.f32        s10, s1,  s10
    290        vmul.f32        s11, s0,  s11
    291 1:
               /*
                * r3 -= 16. After this:
                *   ge: at least 16 floats were left, so a second group of 8 is
                *       loaded into s16-s31, multiplied and stored in this pass;
                *   gt: more input follows, so s0-s15 are reloaded for the next pass.
                * The it/itt/ittt/itttt lines are IT blocks covering the conditional
                * instructions that follow them (needed for Thumb-2 encodings).
                */
    292        subs            r3,  r3,  #16
    293        it              ge
    294        vldmdbge        r2!, {s16-s19}
    295        vmul.f32        s12, s7,  s12
    296        it              ge
    297        vldmiage        r1!, {s24-s27}
    298        vmul.f32        s13, s6,  s13
    299        it              ge
    300        vldmdbge        r2!, {s20-s23}
    301        vmul.f32        s14, s5,  s14
    302        it              ge
    303        vldmiage        r1!, {s28-s31}
    304        vmul.f32        s15, s4,  s15
    305        it              ge
    306        vmulge.f32      s24, s19, s24
    307        it              gt
    308        vldmdbgt        r2!, {s0-s3}
    309        it              ge
    310        vmulge.f32      s25, s18, s25
               /* store the first six results of the first group; s8-s11 are reloaded below */
    311        vstmia          r0!, {s8-s13}
    312        it              ge
    313        vmulge.f32      s26, s17, s26
    314        it              gt
    315        vldmiagt        r1!, {s8-s11}
    316        itt             ge
    317        vmulge.f32      s27, s16, s27
    318        vmulge.f32      s28, s23, s28
    319        it              gt
    320        vldmdbgt        r2!, {s4-s7}
    321        it              ge
    322        vmulge.f32      s29, s22, s29
               /* store the remaining two results of the first group */
    323        vstmia          r0!, {s14-s15}
    324        ittt            ge
    325        vmulge.f32      s30, s21, s30
    326        vmulge.f32      s31, s20, s31
    327        vmulge.f32      s8,  s3,  s8
    328        it              gt
    329        vldmiagt        r1!, {s12-s15}
               /*
                * The products into s8-s11 start the next pass. ge also covers
                * r3 == 0, where no new data was loaded; those results are never
                * stored because the loop exits.
                */
    330        itttt           ge
    331        vmulge.f32      s9,  s2,  s9
    332        vmulge.f32      s10, s1,  s10
    333        vstmiage        r0!, {s24-s27}
    334        vmulge.f32      s11, s0,  s11
    335        it              ge
    336        vstmiage        r0!, {s28-s31}
    337        bgt             1b
    338 
    339        vpop            {d8-d15}
    340        bx              lr
    341 endfunc
    342 
    343 /**
    344 * ARM VFP implementation of 'butterflies_float_c' function
    345 * Assume that len is a positive non-zero number
    346 */
    347 @ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len)
    348 function ff_butterflies_float_vfp, export=1
        /*
         * In-place butterfly over two float arrays. For each i in [0, len):
         *     sum  = v1[i] + v2[i]
         *     diff = v1[i] - v2[i]
         *     v1[i] = sum
         *     v2[i] = diff
         *
         * Register aliases (a1-a4 = r0-r3):
         *   BASE1     pointer into v1, post-incremented by the loads
         *   BASE2     pointer into v2, post-incremented by the loads
         *   LEN       number of elements still to process
         *   OLDFPSCR  FPSCR value on entry, restored before returning
         * ip is used as a scratch register for the new FPSCR values.
         *
         * Results are written back with vstr at negative offsets because the
         * pointers have already moved past the data when the store is issued.
         */
    349 BASE1   .req    a1
    350 BASE2   .req    a2
    351 LEN     .req    a3
    352 OLDFPSCR .req   a4
    353 
    354        vpush   {s16-s31}
    355        fmrx    OLDFPSCR, FPSCR
    356 
    357        tst     LEN, #7
    358        beq     4f                          @ common case: len is a multiple of 8
    359 
    360        ldr     ip, =0x03000000             @ RunFast mode, scalar mode
    361        fmxr    FPSCR, ip
    362 
               /*
                * Leftover handling in scalar mode: bits 0, 1 and 2 of LEN select
                * blocks that process 1, 2 and 4 elements.
                */
    363        tst     LEN, #1
    364        beq     1f
    365        vldmia  BASE1!, {s0}
    366        vldmia  BASE2!, {s8}
    367        vadd.f  s16, s0, s8
    368        vsub.f  s24, s0, s8
    369        vstr    s16, [BASE1, #0-4*1]
    370        vstr    s24, [BASE2, #0-4*1]
    371 1:
    372        tst     LEN, #2
    373        beq     2f
    374        vldmia  BASE1!, {s0-s1}
    375        vldmia  BASE2!, {s8-s9}
    376        vadd.f  s16, s0, s8
    377        vadd.f  s17, s1, s9
    378        vsub.f  s24, s0, s8
    379        vsub.f  s25, s1, s9
               /* one 64-bit vstr of a d register writes two adjacent floats */
    380        vstr    d8, [BASE1, #0-8*1]    @ s16,s17
    381        vstr    d12, [BASE2, #0-8*1]   @ s24,s25
    382 2:
    383        tst     LEN, #4
    384        beq     3f
    385        vldmia  BASE1!, {s0-s1}
    386        vldmia  BASE2!, {s8-s9}
    387        vldmia  BASE1!, {s2-s3}
    388        vldmia  BASE2!, {s10-s11}
    389        vadd.f  s16, s0, s8
    390        vadd.f  s17, s1, s9
    391        vsub.f  s24, s0, s8
    392        vsub.f  s25, s1, s9
    393        vadd.f  s18, s2, s10
    394        vadd.f  s19, s3, s11
    395        vsub.f  s26, s2, s10
    396        vsub.f  s27, s3, s11
    397        vstr    d8, [BASE1, #0-16*1]    @ s16,s17
    398        vstr    d12, [BASE2, #0-16*1]   @ s24,s25
    399        vstr    d9, [BASE1, #8-16*1]    @ s18,s19
    400        vstr    d13, [BASE2, #8-16*1]   @ s26,s27
    401 3:
               /* drop the leftover bits; if nothing remains, skip the vector loop */
    402        bics    LEN, LEN, #7
    403        beq     7f
    404 4:
    405        ldr     ip, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
    406        fmxr    FPSCR, ip
    407 
               /*
                * Software-pipelined main loop working on groups of four floats.
                * Two groups are in flight at a time:
                *   first group:  v1 in s0-s3, v2 in s8-s11,
                *                 sums in s16-s19, differences in s24-s27
                *   second group: v1 in s4-s7, v2 in s12-s15,
                *                 sums in s20-s23, differences in s28-s31
                * With the vector length set to 4, each vadd/vsub covers four
                * consecutive registers. The indentation depth shows which
                * in-flight group an instruction belongs to.
                */
    408        vldmia  BASE1!, {s0-s1}
    409        vldmia  BASE2!, {s8-s9}
    410        vldmia  BASE1!, {s2-s3}
    411        vldmia  BASE2!, {s10-s11}
    412        vadd.f  s16, s0, s8
    413            vldmia  BASE1!, {s4-s5}
    414            vldmia  BASE2!, {s12-s13}
    415            vldmia  BASE1!, {s6-s7}
    416            vldmia  BASE2!, {s14-s15}
    417        vsub.f  s24, s0, s8
    418            vadd.f  s20, s4, s12
               /* exactly 8 elements in total: go straight to the drain code */
    419        subs    LEN, LEN, #8
    420        beq     6f
               /*
                * Each pass retires 8 elements. The pointers are three 16-byte
                * groups past the group being stored, hence the -16*3 offsets.
                */
    421 5:              vldmia  BASE1!, {s0-s3}
    422                vldmia  BASE2!, {s8-s11}
    423            vsub.f  s28, s4, s12
    424        vstr    d8, [BASE1, #0-16*3]    @ s16,s17
    425        vstr    d9, [BASE1, #8-16*3]    @ s18,s19
    426        vstr    d12, [BASE2, #0-16*3]   @ s24,s25
    427        vstr    d13, [BASE2, #8-16*3]   @ s26,s27
    428                vadd.f  s16, s0, s8
    429                    vldmia  BASE1!, {s4-s7}
    430                    vldmia  BASE2!, {s12-s15}
    431                vsub.f  s24, s0, s8
    432            vstr    d10, [BASE1, #0-16*3]   @ s20,s21
    433            vstr    d11, [BASE1, #8-16*3]   @ s22,s23
    434            vstr    d14, [BASE2, #0-16*3]   @ s28,s29
    435            vstr    d15, [BASE2, #8-16*3]   @ s30,s31
    436                    vadd.f  s20, s4, s12
    437        subs    LEN, LEN, #8
    438        bne     5b
               /*
                * Drain: finish the second group and store both pending groups.
                * No further loads were issued, so they sit 2 and 1 groups
                * behind the pointers.
                */
    439 6:                   vsub.f  s28, s4, s12
    440                vstr    d8, [BASE1, #0-16*2]    @ s16,s17
    441                vstr    d9, [BASE1, #8-16*2]    @ s18,s19
    442                vstr    d12, [BASE2, #0-16*2]   @ s24,s25
    443                vstr    d13, [BASE2, #8-16*2]   @ s26,s27
    444                    vstr    d10, [BASE1, #0-16*1]   @ s20,s21
    445                    vstr    d11, [BASE1, #8-16*1]   @ s22,s23
    446                    vstr    d14, [BASE2, #0-16*1]   @ s28,s29
    447                    vstr    d15, [BASE2, #8-16*1]   @ s30,s31
    448 7:
               /* restore the FPSCR seen on entry before returning */
    449        fmxr    FPSCR, OLDFPSCR
    450        vpop    {s16-s31}
    451        bx      lr
    452 
    453        .unreq  BASE1
    454        .unreq  BASE2
    455        .unreq  LEN
    456        .unreq  OLDFPSCR
    457 endfunc