tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

float_dsp_neon.S (9779B)


      1 /*
      2 * ARM NEON optimised Float DSP functions
      3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
      4 *
      5 * This file is part of FFmpeg.
      6 *
      7 * FFmpeg is free software; you can redistribute it and/or
      8 * modify it under the terms of the GNU Lesser General Public
      9 * License as published by the Free Software Foundation; either
     10 * version 2.1 of the License, or (at your option) any later version.
     11 *
     12 * FFmpeg is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     15 * Lesser General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU Lesser General Public
     18 * License along with FFmpeg; if not, write to the Free Software
     19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     20 */
     21 
     22 #include "config.h"
     23 #include "asm.S"
     24 
     25 function ff_vector_fmul_neon, export=1  /* element-wise product: [r0][i] = [r1][i] * [r2][i], r3 = float count; presumably a non-zero multiple of 8 - TODO confirm against callers */
     26        subs            r3,  r3,  #8  /* r3 = floats left after the first group of 8; Z is set if that group is everything */
     27        vld1.32         {d0-d3},  [r1,:128]!  /* first 8 floats from r1 (:128 = address given as 128-bit aligned), post-increment */
     28        vld1.32         {d4-d7},  [r2,:128]!  /* first 8 floats from r2 */
     29        vmul.f32        q8,  q0,  q2  /* q8/q9 hold the 8 products that are still waiting to be stored */
     30        vmul.f32        q9,  q1,  q3
     31        beq             3f  /* flags from the subs above: only 8 elements, just store and return */
     32        bics            ip,  r3,  #15  /* ip = remaining count rounded down to a multiple of 16 */
     33        beq             2f  /* fewer than 16 left: skip the main loop */
     34 1:      subs            ip,  ip,  #16  /* main loop, 16 floats per pass; the bne at the bottom uses these flags */
     35        vld1.32         {d0-d1},  [r1,:128]!
     36        vld1.32         {d4-d5},  [r2,:128]!
     37        vmul.f32        q10, q0,  q2  /* q10/q11 = first 8 products of this pass */
     38        vld1.32         {d2-d3},  [r1,:128]!
     39        vld1.32         {d6-d7},  [r2,:128]!
     40        vmul.f32        q11, q1,  q3
     41        vst1.32         {d16-d19},[r0,:128]!  /* store the pending q8/q9 before they are overwritten below */
     42        vld1.32         {d0-d1},  [r1,:128]!
     43        vld1.32         {d4-d5},  [r2,:128]!
     44        vmul.f32        q8,  q0,  q2  /* q8/q9 = second 8 products of this pass, left pending */
     45        vld1.32         {d2-d3},  [r1,:128]!
     46        vld1.32         {d6-d7},  [r2,:128]!
     47        vmul.f32        q9,  q1,  q3
     48        vst1.32         {d20-d23},[r0,:128]!  /* store q10/q11 */
     49        bne             1b  /* loop while ip != 0 */
     50        ands            r3,  r3,  #15  /* anything left beyond the multiple of 16? */
     51        beq             3f  /* no: only the pending q8/q9 remain to be stored */
     52 2:      vld1.32         {d0-d1},  [r1,:128]!  /* tail: one further group of 8 floats */
     53        vld1.32         {d4-d5},  [r2,:128]!
     54        vst1.32         {d16-d17},[r0,:128]!  /* flush pending q8 before recomputing it */
     55        vmul.f32        q8,  q0,  q2
     56        vld1.32         {d2-d3},  [r1,:128]!
     57        vld1.32         {d6-d7},  [r2,:128]!
     58        vst1.32         {d18-d19},[r0,:128]!  /* flush pending q9 before recomputing it */
     59        vmul.f32        q9,  q1,  q3
     60 3:      vst1.32         {d16-d19},[r0,:128]!  /* common exit: store the last 8 products */
     61        bx              lr
     62 endfunc
     63 
     64 function ff_vector_fmac_scalar_neon, export=1  /* multiply-accumulate by a scalar, in place: [r0][i] += [r1][i] * scalar */
     65 VFP     len .req r2  /* VFP/NOVFP are defined outside this file; presumably they select hard-float vs soft-float argument passing - TODO confirm in asm.S */
     66 VFP     acc .req r3  /* acc = second pointer used to read the destination; NOTE(review): acc is never released with .unreq below */
     67 NOVFP   len .req r3
     68 NOVFP   acc .req r2
     69 VFP     vdup.32         q15, d0[0]  /* VFP variant: broadcast the scalar from d0[0] into all 4 lanes of q15 */
     70 NOVFP   vdup.32         q15, r2  /* NOVFP variant: broadcast the scalar from r2 (read before r2 is reused as acc) */
     71        bics            r12, len, #15  /* r12 = len rounded down to a multiple of 16; Z set if that is 0 */
     72        mov             acc, r0  /* reads of the destination go through acc, writes through r0 */
     73        beq             3f  /* flags still from bics: fewer than 16 elements, go straight to the 4-wide loop */
     74        vld1.32         {q0},     [r1,:128]!  /* prime the pipeline: first 8 source floats and 8 destination floats */
     75        vld1.32         {q8},     [acc,:128]!
     76        vld1.32         {q1},     [r1,:128]!
     77        vld1.32         {q9},     [acc,:128]!
     78 1:      vmla.f32        q8,  q0,  q15  /* main loop, 16 floats per pass; q8-q11 accumulate, q0-q3 hold source data */
     79        vld1.32         {q2},     [r1,:128]!
     80        vld1.32         {q10},    [acc,:128]!
     81        vmla.f32        q9,  q1,  q15
     82        vld1.32         {q3},     [r1,:128]!
     83        vld1.32         {q11},    [acc,:128]!
     84        vmla.f32        q10, q2,  q15
     85        vst1.32         {q8},     [r0,:128]!
     86        vmla.f32        q11, q3,  q15
     87        vst1.32         {q9},     [r0,:128]!
     88        subs            r12, r12, #16
     89        beq             2f  /* last full pass: skip the preload for the next one */
     90        vld1.32         {q0},     [r1,:128]!  /* preload the next pass while storing q10/q11 */
     91        vld1.32         {q8},     [acc,:128]!
     92        vst1.32         {q10},    [r0,:128]!
     93        vld1.32         {q1},     [r1,:128]!
     94        vld1.32         {q9},     [acc,:128]!
     95        vst1.32         {q11},    [r0,:128]!
     96        b               1b
     97 2:      vst1.32         {q10},    [r0,:128]!  /* drain the pipeline */
     98        vst1.32         {q11},    [r0,:128]!
     99        ands            len, len, #15  /* leftover element count (0..15) */
    100        it              eq
    101        bxeq            lr  /* nothing left: return */
    102 3:      vld1.32         {q0},     [r1,:128]!  /* tail loop, 4 floats per pass; always runs at least once, so len is presumably > 0 and a multiple of 4 - TODO confirm */
    103        vld1.32         {q8},     [acc,:128]!
    104        vmla.f32        q8,  q0,  q15
    105        vst1.32         {q8},     [r0,:128]!
    106        subs            len, len, #4
    107        bgt             3b
    108        bx              lr
    109        .unreq          len
    110 endfunc
    111 
    112 function ff_vector_fmul_scalar_neon, export=1  /* scale by a scalar: [r0][i] = [r1][i] * scalar */
    113 VFP     len .req r2  /* VFP/NOVFP are defined outside this file; presumably they select hard-float vs soft-float argument passing - TODO confirm in asm.S */
    114 NOVFP   len .req r3
    115 VFP     vdup.32         q8,  d0[0]  /* VFP variant: broadcast the scalar from d0[0] into all 4 lanes of q8 */
    116 NOVFP   vdup.32         q8,  r2  /* NOVFP variant: broadcast the scalar from r2 */
    117        bics            r12, len, #15  /* r12 = len rounded down to a multiple of 16 */
    118        beq             3f  /* fewer than 16 elements: go straight to the 4-wide loop */
    119        vld1.32         {q0},[r1,:128]!  /* prime the pipeline with the first 8 source floats */
    120        vld1.32         {q1},[r1,:128]!
    121 1:      vmul.f32        q0,  q0,  q8  /* main loop, 16 floats per pass, scaled in place in q0-q3 */
    122        vld1.32         {q2},[r1,:128]!
    123        vmul.f32        q1,  q1,  q8
    124        vld1.32         {q3},[r1,:128]!
    125        vmul.f32        q2,  q2,  q8
    126        vst1.32         {q0},[r0,:128]!
    127        vmul.f32        q3,  q3,  q8
    128        vst1.32         {q1},[r0,:128]!
    129        subs            r12, r12, #16
    130        beq             2f  /* last full pass: skip the preload for the next one */
    131        vld1.32         {q0},[r1,:128]!  /* preload the next pass, interleaved with the stores of q2/q3 */
    132        vst1.32         {q2},[r0,:128]!
    133        vld1.32         {q1},[r1,:128]!
    134        vst1.32         {q3},[r0,:128]!
    135        b               1b
    136 2:      vst1.32         {q2},[r0,:128]!  /* drain the pipeline */
    137        vst1.32         {q3},[r0,:128]!
    138        ands            len, len, #15  /* leftover element count (0..15) */
    139        it              eq
    140        bxeq            lr  /* nothing left: return */
    141 3:      vld1.32         {q0},[r1,:128]!  /* tail loop, 4 floats per pass; always runs at least once, so len is presumably > 0 and a multiple of 4 - TODO confirm */
    142        vmul.f32        q0,  q0,  q8
    143        vst1.32         {q0},[r0,:128]!
    144        subs            len, len, #4
    145        bgt             3b
    146        bx              lr
    147        .unreq          len
    148 endfunc
    149 
    150 function ff_vector_fmul_window_neon, export=1  /* windowed combine: reads [r1] forwards and [r2] backwards, weights them with the window at [r3], writes [r0] from both ends towards the middle */
    151        push            {r4,r5,lr}  /* r4, r5 and lr are used as scratch below */
    152        ldr             lr,  [sp, #12]  /* lr = word that was at [sp] on entry (3 words were just pushed); used as the element count len */
    153        sub             r2,  r2,  #8
    154        sub             r5,  lr,  #2  /* r5 = len - 2 */
    155        add             r2,  r2,  r5, lsl #2  /* net effect: r2 += 4*len - 16 bytes, i.e. the last 4 floats of len floats at r2 */
    156        add             r4,  r3,  r5, lsl #3  /* r4 = r3 + 8*len - 16 bytes: last 4 floats of 2*len window floats */
    157        add             ip,  r0,  r5, lsl #3  /* ip = r0 + 8*len - 16 bytes: last 4 floats of 2*len output floats */
    158        mov             r5,  #-16  /* post-index step for the pointers that walk backwards (r2, r4, ip) */
    159        vld1.32         {d0,d1},  [r1,:128]!  /* q0 = 4 floats from r1, ascending */
    160        vld1.32         {d2,d3},  [r2,:128], r5  /* q1 = 4 floats from r2, descending */
    161        vld1.32         {d4,d5},  [r3,:128]!  /* q2 = 4 window floats from the front */
    162        vld1.32         {d6,d7},  [r4,:128], r5  /* q3 = 4 window floats from the back */
    163 1:      subs            lr,  lr,  #4  /* 4 element pairs per pass; flags consumed by the beq below */
    164        vmul.f32        d22, d0,  d4  /* q11 (d22,d23) starts as q0 * front window */
    165        vrev64.32       q3,  q3  /* swap floats within each d register; with d7 used before d6 this reverses all 4 */
    166        vmul.f32        d23, d1,  d5
    167        vrev64.32       q1,  q1  /* same reversal for the descending input (d3 is then used before d2) */
    168        vmul.f32        d20, d0,  d7  /* q10 (d20,d21) starts as q0 * reversed back window */
    169        vmul.f32        d21, d1,  d6
    170        beq             2f  /* last group: finish without loading further data */
    171        vmla.f32        d22, d3,  d7  /* q11 += reversed q1 * reversed back window */
    172        vld1.32         {d0,d1},  [r1,:128]!  /* q0 is dead after the vmuls above, reload it directly */
    173        vmla.f32        d23, d2,  d6
    174        vld1.32         {d18,d19},[r2,:128], r5  /* next q1 goes to q9: the old q1 is still read by the vmls below */
    175        vmls.f32        d20, d3,  d4  /* q10 -= reversed q1 * front window */
    176        vld1.32         {d24,d25},[r3,:128]!  /* next q2 goes to q12: the old q2 is still read by the vmls below */
    177        vmls.f32        d21, d2,  d5
    178        vld1.32         {d6,d7},  [r4,:128], r5  /* q3 is dead after the vmla above, reload it directly */
    179        vmov            q1,  q9  /* move the temporaries into place for the next pass */
    180        vrev64.32       q11, q11  /* vrev64 + vswp reverse the 4 floats of q11 for the store at the descending address */
    181        vmov            q2,  q12
    182        vswp            d22, d23
    183        vst1.32         {d20,d21},[r0,:128]!  /* q10 goes to the ascending output pointer */
    184        vst1.32         {d22,d23},[ip,:128], r5  /* q11 goes to the descending output pointer */
    185        b               1b
    186 2:      vmla.f32        d22, d3,  d7  /* final group: same arithmetic as in the loop, without the loads */
    187        vmla.f32        d23, d2,  d6
    188        vmls.f32        d20, d3,  d4
    189        vmls.f32        d21, d2,  d5
    190        vrev64.32       q11, q11
    191        vswp            d22, d23
    192        vst1.32         {d20,d21},[r0,:128]!
    193        vst1.32         {d22,d23},[ip,:128], r5
    194        pop             {r4,r5,pc}  /* restore r4/r5 and return by loading the saved lr into pc */
    195 endfunc
    196 
    197 function ff_vector_fmul_add_neon, export=1  /* multiply-add: [r0][i] = [r1][i] * [r2][i] + [r3][i] */
    198        ldr             r12, [sp]  /* r12 = first stack word, used as the element count; presumably a non-zero multiple of 8 - TODO confirm */
    199        vld1.32         {q0-q1},  [r1,:128]!  /* first 8 floats of each of the three inputs */
    200        vld1.32         {q8-q9},  [r2,:128]!
    201        vld1.32         {q2-q3},  [r3,:128]!
    202        vmul.f32        q10, q0,  q8  /* q10/q11 = products for the current group of 8 */
    203        vmul.f32        q11, q1,  q9
    204 1:      vadd.f32        q12, q2,  q10  /* q12/q13 = finished results for the current group */
    205        vadd.f32        q13, q3,  q11
    206        pld             [r1, #16]  /* prefetch hints for the three input streams */
    207        pld             [r2, #16]
    208        pld             [r3, #16]
    209        subs            r12, r12, #8
    210        beq             2f  /* that was the last group: store and return */
    211        vld1.32         {q0},     [r1,:128]!  /* start the next group before storing the current one */
    212        vld1.32         {q8},     [r2,:128]!
    213        vmul.f32        q10, q0,  q8
    214        vld1.32         {q1},     [r1,:128]!
    215        vld1.32         {q9},     [r2,:128]!
    216        vmul.f32        q11, q1,  q9
    217        vld1.32         {q2-q3},  [r3,:128]!
    218        vst1.32         {q12-q13},[r0,:128]!
    219        b               1b
    220 2:      vst1.32         {q12-q13},[r0,:128]!
    221        bx              lr
    222 endfunc
    223 
    224 function ff_vector_fmul_reverse_neon, export=1  /* product with the second input read back to front: [r0] ascending = [r1] ascending * [r2] descending, r3 = float count */
    225        add             r2,  r2,  r3,  lsl #2  /* r2 = one past the end of r3 floats */
    226        sub             r2,  r2,  #32  /* step back to the last 8 floats */
    227        mov             r12, #-32  /* post-index step: r2 walks backwards 8 floats at a time */
    228        vld1.32         {q0-q1},  [r1,:128]!  /* q0,q1 = 8 floats from r1, ascending */
    229        vld1.32         {q2-q3},  [r2,:128], r12  /* q2,q3 = 8 floats from the tail of r2 */
    230 1:      pld             [r1, #32]  /* prefetch hint ahead of the forward pointer */
    231        vrev64.32       q3,  q3  /* swap floats within each d register; using d7,d6,d5,d4 in that order then reverses all 8 */
    232        vmul.f32        d16, d0,  d7
    233        vmul.f32        d17, d1,  d6
    234        pld             [r2, #-32]  /* prefetch hint below the backward pointer */
    235        vrev64.32       q2,  q2
    236        vmul.f32        d18, d2,  d5
    237        vmul.f32        d19, d3,  d4
    238        subs            r3,  r3,  #8  /* 8 floats per pass; presumably the count is a non-zero multiple of 8 - TODO confirm */
    239        beq             2f  /* last group: store and return without loading more */
    240        vld1.32         {q0-q1},  [r1,:128]!  /* load the next group before storing the current results */
    241        vld1.32         {q2-q3},  [r2,:128], r12
    242        vst1.32         {q8-q9},  [r0,:128]!
    243        b               1b
    244 2:      vst1.32         {q8-q9},  [r0,:128]!
    245        bx              lr
    246 endfunc
    247 
    248 function ff_butterflies_float_neon, export=1  /* in-place butterfly over two float arrays: [r0][i] becomes the sum, [r1][i] the difference ([r0][i] - [r1][i]); r2 = float count */
    249 1:      vld1.32         {q0},[r0,:128]  /* 4 floats from each array; no writeback here, the stores below advance the pointers */
    250        vld1.32         {q1},[r1,:128]
    251        vsub.f32        q2,  q0,  q1  /* difference goes to q2 so q1 stays available for the sum */
    252        vadd.f32        q1,  q0,  q1
    253        vst1.32         {q2},[r1,:128]!  /* difference back to the r1 array */
    254        vst1.32         {q1},[r0,:128]!  /* sum back to the r0 array */
    255        subs            r2,  r2,  #4  /* 4 floats per pass; the body always runs at least once */
    256        bgt             1b
    257        bx              lr
    258 endfunc
    259 
    260 function ff_scalarproduct_float_neon, export=1  /* dot product of the float arrays at r0 and r1, r2 = float count */
    261        vmov.f32        q2,  #0.0  /* q2 = four running partial sums */
    262 1:      vld1.32         {q0},[r0,:128]!  /* 4 floats from each array per pass */
    263        vld1.32         {q1},[r1,:128]!
    264        vmla.f32        q2,  q0,  q1  /* lane-wise multiply-accumulate */
    265        subs            r2,  r2,  #4  /* the body always runs at least once */
    266        bgt             1b
    267        vadd.f32        d0,  d4,  d5  /* fold the 4 partial sums down to 2 */
    268        vpadd.f32       d0,  d0,  d0  /* pairwise add: both lanes of d0 now hold the total */
    269 NOVFP   vmov.32         r0,  d0[0]  /* NOVFP variant only: also copy the result to r0; otherwise it is left in d0[0] */
    270        bx              lr
    271 endfunc