tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

h264pred_neon.S (12010B)


      1 /*
      2 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
      3 *
      4 * This file is part of FFmpeg.
      5 *
      6 * FFmpeg is free software; you can redistribute it and/or
      7 * modify it under the terms of the GNU Lesser General Public
      8 * License as published by the Free Software Foundation; either
      9 * version 2.1 of the License, or (at your option) any later version.
     10 *
     11 * FFmpeg is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14 * Lesser General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU Lesser General Public
     17 * License along with FFmpeg; if not, write to the Free Software
     18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     19 */
     20 
     21 #include "libavutil/arm/asm.S"
     22 
     23        .macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
               /*
                * Load single bytes into consecutive lanes of D register rd.
                * Each load reads one byte from [rs] and then post-increments
                * rs by rt, so with rt = row step this gathers a column.
                *   n == 8 (default): lanes 0-7 are loaded (8 bytes).
                *   n != 8, hi == 0 : only lanes 0-3 are loaded.
                *   n != 8, hi == 1 : only lanes 4-7 are loaded.
                * Lanes that are not loaded keep their previous contents.
                * rs is left advanced by (number of loads) * rt.
                */
     24 .if \n == 8 || \hi == 0
     25        vld1.8          {\rd[0]}, [\rs], \rt
     26        vld1.8          {\rd[1]}, [\rs], \rt
     27        vld1.8          {\rd[2]}, [\rs], \rt
     28        vld1.8          {\rd[3]}, [\rs], \rt
     29 .endif
     30 .if \n == 8 || \hi == 1
     31        vld1.8          {\rd[4]}, [\rs], \rt
     32        vld1.8          {\rd[5]}, [\rs], \rt
     33        vld1.8          {\rd[6]}, [\rs], \rt
     34        vld1.8          {\rd[7]}, [\rs], \rt
     35 .endif
     36        .endm
     37 
     38        .macro add16x8  dq,  dl,  dh,  rl,  rh
               /*
                * Horizontal sum of the 16 unsigned bytes held in rl and rh.
                * dq receives the eight widened u16 sums rl[i] + rh[i]; dl/dh
                * are then folded together.  When dl and dh are the low and
                * high halves of dq (as at every call site in this file),
                * every u16 lane of dl ends up holding the total of all 16
                * bytes.
                */
     39        vaddl.u8        \dq, \rl, \rh          /* 8 x u16: rl[i] + rh[i] */
     40        vadd.u16        \dl, \dl, \dh          /* 8 partial sums -> 4 */
     41        vpadd.u16       \dl, \dl, \dl          /* 4 -> 2 (duplicated) */
     42        vpadd.u16       \dl, \dl, \dl          /* 2 -> 1, total in every lane */
     43        .endm
     44 
     45 function ff_pred16x16_128_dc_neon, export=1
               /*
                * Fill a 16x16 block with the constant byte 128.
                * Sets all 16 bytes of q0 to 128, then branches to the shared
                * store loop .L_pred16x16_dc_end (located inside
                * ff_pred16x16_dc_neon), which writes q0 to 16 rows starting
                * at [r0] with row step r1 and returns to the caller.
                */
     46        vmov.i8         q0,  #128
     47        b               .L_pred16x16_dc_end
     48 endfunc
     49 
     50 function ff_pred16x16_top_dc_neon, export=1
               /*
                * 16x16 DC prediction from the row above only.
                * In: r0 = address of the first output row, r1 = row step.
                * Reads the 16 bytes at r0 - r1 (with a :128 alignment hint),
                * computes (sum + 8) >> 4, replicates that byte across q0 and
                * branches to the shared store loop .L_pred16x16_dc_end
                * (inside ff_pred16x16_dc_neon), which also returns.
                * Scratch: r2, q0 (r3 and flags are used by the shared tail).
                */
     51        sub             r2,  r0,  r1           /* r2 = row above the block */
     52        vld1.8          {q0},     [r2,:128]
     53        add16x8         q0,  d0,  d1,  d0,  d1 /* d0 lanes = sum of 16 bytes */
     54        vrshrn.u16      d0,  q0,  #4           /* rounded >> 4, narrowed to u8 */
     55        vdup.8          q0,  d0[0]
     56        b               .L_pred16x16_dc_end
     57 endfunc
     58 
     59 function ff_pred16x16_left_dc_neon, export=1
               /*
                * 16x16 DC prediction from the left column only.
                * In: r0 = address of the first output row, r1 = row step.
                * Gathers the 16 bytes at r0 - 1 + y*r1 (y = 0..15), computes
                * (sum + 8) >> 4, replicates that byte across q0 and branches
                * to the shared store loop .L_pred16x16_dc_end (inside
                * ff_pred16x16_dc_neon), which also returns.
                * Scratch: r2, q0 (r3 and flags are used by the shared tail).
                */
     60        sub             r2,  r0,  #1           /* r2 = byte left of row 0 */
     61        ldcol.8         d0,  r2,  r1           /* rows 0-7 */
     62        ldcol.8         d1,  r2,  r1           /* rows 8-15 (r2 already advanced) */
     63        add16x8         q0,  d0,  d1,  d0,  d1 /* d0 lanes = sum of 16 bytes */
     64        vrshrn.u16      d0,  q0,  #4           /* rounded >> 4, narrowed to u8 */
     65        vdup.8          q0,  d0[0]
     66        b               .L_pred16x16_dc_end
     67 endfunc
     68 
     69 function ff_pred16x16_dc_neon, export=1
               /*
                * 16x16 DC prediction from both the row above and the left
                * column.
                * In: r0 = address of the first output row (stores use a :128
                *     alignment hint), r1 = row step.
                * Sums the 16 bytes at r0 - r1 and the 16 bytes at
                * r0 - 1 + y*r1 (y = 0..15), computes (sum + 16) >> 5 and
                * fills the 16x16 block with that byte.
                * The label .L_pred16x16_dc_end below is also the branch
                * target of the other 16x16 DC entry points in this file;
                * it expects the fill pattern in q0.
                * Scratch: r2, r3, q0, q1, flags.  r0 is advanced by 16 rows.
                */
     70        sub             r2,  r0,  r1           /* r2 = row above the block */
     71        vld1.8          {q0},     [r2,:128]
     72        sub             r2,  r0,  #1           /* r2 = byte left of row 0 */
     73        ldcol.8         d2,  r2,  r1           /* left column, rows 0-7 */
     74        ldcol.8         d3,  r2,  r1           /* left column, rows 8-15 */
     75        vaddl.u8        q0,  d0,  d1           /* widen + pair-add top bytes */
     76        vaddl.u8        q1,  d2,  d3           /* widen + pair-add left bytes */
     77        vadd.u16        q0,  q0,  q1
     78        vadd.u16        d0,  d0,  d1
     79        vpadd.u16       d0,  d0,  d0
     80        vpadd.u16       d0,  d0,  d0           /* d0 lanes = sum of 32 bytes */
     81        vrshrn.u16      d0,  q0,  #5           /* rounded >> 5, narrowed to u8 */
     82        vdup.8          q0,  d0[0]
     83 .L_pred16x16_dc_end:
     84        mov             r3,  #8                /* 8 iterations x 2 rows = 16 rows */
     85 6:      vst1.8          {q0},     [r0,:128], r1
     86        vst1.8          {q0},     [r0,:128], r1
     87        subs            r3,  r3,  #1
     88        bne             6b
     89        bx              lr
     90 endfunc
     91 
     92 function ff_pred16x16_hor_neon, export=1
               /*
                * 16x16 horizontal prediction: each of the 16 rows is filled
                * with the byte immediately to its left.
                * In: r0 = address of the first output row (stores use a :128
                *     alignment hint), r1 = row step.
                * Scratch: r2, r3, q0, flags.  r0 is advanced by 16 rows.
                */
     93        sub             r2,  r0,  #1           /* r2 = byte left of row 0 */
     94        mov             r3,  #16               /* row counter */
     95 1:      vld1.8          {d0[],d1[]},[r2],      r1   /* replicate one byte to all 16 lanes */
     96        vst1.8          {q0},       [r0,:128], r1
     97        subs            r3,  r3,  #1
     98        bne             1b
     99        bx              lr
    100 endfunc
    101 
    102 function ff_pred16x16_vert_neon, export=1
               /*
                * 16x16 vertical prediction: the 16 bytes of the row above are
                * copied into each of the 16 rows of the block.
                * In: r0 = address of the first output row (accesses use a
                *     :128 alignment hint), r1 = row step.
                * Scratch: r3, q0, flags.  r0 is advanced by 16 rows.
                */
    103        sub             r0,  r0,  r1           /* step back to the row above */
    104        vld1.8          {q0},     [r0,:128], r1 /* load it; r0 returns to row 0 */
    105        mov             r3,  #8                /* 8 iterations x 2 rows = 16 rows */
    106 1:      vst1.8          {q0},     [r0,:128], r1
    107        vst1.8          {q0},     [r0,:128], r1
    108        subs            r3,  r3,  #1
    109        bne             1b
    110        bx              lr
    111 endfunc
    112 
    113 function ff_pred16x16_plane_neon, export=1
               /*
                * 16x16 plane (linear gradient) prediction.
                * In: r0 = address of the first output row (stores use a :128
                *     alignment hint), r1 = row step.
                * Notation: top[x]  = byte at r0 - r1 + x,
                *           left[y] = byte at r0 + y*r1 - 1,
                *           so top[-1] and left[-1] are the same corner byte.
                * Computes, in 16-bit lanes unless noted:
                *   H = sum over i = 0..7 of (i+1) * (top[8+i]  - top[6-i])
                *   V = sum over i = 0..7 of (i+1) * (left[8+i] - left[6-i])
                *   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6   (32-bit)
                * and writes for x, y in 0..15 the byte
                *   sat_u8((16*(top[15] + left[15] + 1) - 7*(b + c)
                *           + b*x + c*y) >> 5)
                * Scratch: r2, r3, q0-q3, q8, flags.  r0 is advanced by 16 rows.
                * NOTE(review): this looks like the H.264 Intra 16x16 plane
                * mode; confirm against the C reference implementation.
                */
    114        sub             r3,  r0,  r1
    115        add             r2,  r3,  #8           /* r2 -> top[8] */
    116        sub             r3,  r3,  #1           /* r3 -> corner byte top[-1] */
    117        vld1.8          {d0},     [r3]         /* d0 = top[-1..6] */
    118        vld1.8          {d2},     [r2,:64], r1 /* d2 = top[8..15] */
    119        ldcol.8         d1,  r3,  r1           /* d1 = left[-1..6] */
    120        add             r3,  r3,  r1           /* skip left[7] */
    121        ldcol.8         d3,  r3,  r1           /* d3 = left[8..15] */
    122        vrev64.8        q0,  q0                /* d0 = top[6..-1], d1 = left[6..-1] */
    123        vaddl.u8        q8,  d2,  d3           /* q8[i] = top[8+i] + left[8+i] */
    124        vsubl.u8        q2,  d2,  d0           /* top[8+i]  - top[6-i] */
    125        vsubl.u8        q3,  d3,  d1           /* left[8+i] - left[6-i] */
    126        movrel          r3,  p16weight
    127        vld1.8          {q0},     [r3,:128]    /* q0 = 1..8 as 16-bit lanes */
    128        vmul.s16        q2,  q2,  q0
    129        vmul.s16        q3,  q3,  q0
    130        vadd.i16        d4,  d4,  d5
    131        vadd.i16        d5,  d6,  d7
    132        vpadd.i16       d4,  d4,  d5
    133        vpadd.i16       d4,  d4,  d4           /* d4 = [H, V, H, V] */
    134        vshll.s16       q3,  d4,  #2           /* 4*H, 4*V widened to 32 bits */
    135        vaddw.s16       q2,  q3,  d4           /* 5*H, 5*V */
    136        vrshrn.s32      d4,  q2,  #6           /* d4 = [b, c, b, c] */
    137        mov             r3,  #0
    138        vtrn.16         d4,  d5                /* d4[0] = b, d5[0] = c */
    139        vadd.i16        d2,  d4,  d5           /* lane 0: b + c */
    140        vshl.i16        d3,  d2,  #3
    141        vrev64.16       d16, d17               /* d16[0] = top[15] + left[15] */
    142        vsub.i16        d3,  d3,  d2           /* lane 0: 7*(b + c) */
    143        vadd.i16        d16, d16, d0           /* lane 0: + 1 (first weight) */
    144        vshl.i16        d2,  d16, #4
    145        vsub.i16        d2,  d2,  d3           /* lane 0: value for x = 0, y = 0 */
    146        vshl.i16        d3,  d4,  #4           /* lane 0: 16*b */
    147        vext.16         q0,  q0,  q0,  #7      /* rotate: q0 = 8,1,2,...,7 */
    148        vsub.i16        d6,  d5,  d3           /* lane 0: c - 16*b */
    149        vmov.16         d0[0], r3              /* q0 = 0,1,2,...,7 */
    150        vmul.i16        q0,  q0,  d4[0]        /* q0[x] = b*x */
    151        vdup.16         q1,  d2[0]
    152        vdup.16         q2,  d4[0]
    153        vdup.16         q3,  d6[0]
    154        vshl.i16        q2,  q2,  #3           /* q2 = 8*b: step to columns 8-15 */
    155        vadd.i16        q1,  q1,  q0           /* q1 = row 0, columns 0-7 */
    156        vadd.i16        q3,  q3,  q2           /* q3 = c - 8*b: step to next row */
    157        mov             r3,  #16               /* row counter */
    158 1:
    159        vqshrun.s16     d0,  q1,  #5           /* columns 0-7, saturated to u8 */
    160        vadd.i16        q1,  q1,  q2
    161        vqshrun.s16     d1,  q1,  #5           /* columns 8-15 */
    162        vadd.i16        q1,  q1,  q3
    163        vst1.8          {q0},     [r0,:128], r1
    164        subs            r3,  r3,  #1
    165        bne             1b
    166        bx              lr
    167 endfunc
    168 
    169 const   p16weight, align=4
               /*
                * Eight 16-bit values 1..8.  Both plane predictors in this
                * file load them into q0 as the multipliers for the top/left
                * pixel differences; after a rotate and zeroing lane 0 the same
                * register becomes the column index 0..7.
                * NOTE(review): the const/endconst macros come from asm.S;
                * align=4 presumably means 2^4 = 16 bytes, which would match
                * the :128 hint used when loading this table - confirm.
                */
    170        .short          1,2,3,4,5,6,7,8
    171 endconst
    172 
    173 function ff_pred8x8_hor_neon, export=1
               /*
                * 8x8 horizontal prediction: each of the 8 rows is filled with
                * the byte immediately to its left.
                * In: r0 = address of the first output row (stores use a :64
                *     alignment hint), r1 = row step.
                * Scratch: r2, r3, d0, flags.  r0 is advanced by 8 rows.
                */
    174        sub             r2,  r0,  #1           /* r2 = byte left of row 0 */
    175        mov             r3,  #8                /* row counter */
    176 1:      vld1.8          {d0[]},   [r2],     r1 /* replicate one byte to all 8 lanes */
    177        vst1.8          {d0},     [r0,:64], r1
    178        subs            r3,  r3,  #1
    179        bne             1b
    180        bx              lr
    181 endfunc
    182 
    183 function ff_pred8x8_vert_neon, export=1
               /*
                * 8x8 vertical prediction: the 8 bytes of the row above are
                * copied into each of the 8 rows of the block.
                * In: r0 = address of the first output row (accesses use a
                *     :64 alignment hint), r1 = row step.
                * Scratch: r3, d0, flags.  r0 is advanced by 8 rows.
                */
    184        sub             r0,  r0,  r1           /* step back to the row above */
    185        vld1.8          {d0},     [r0,:64], r1 /* load it; r0 returns to row 0 */
    186        mov             r3,  #4                /* 4 iterations x 2 rows = 8 rows */
    187 1:      vst1.8          {d0},     [r0,:64], r1
    188        vst1.8          {d0},     [r0,:64], r1
    189        subs            r3,  r3,  #1
    190        bne             1b
    191        bx              lr
    192 endfunc
    193 
    194 function ff_pred8x8_plane_neon, export=1
               /*
                * 8x8 plane (linear gradient) prediction.
                * In: r0 = address of the first output row (stores use a :64
                *     alignment hint), r1 = row step.
                * Notation: top[x]  = byte at r0 - r1 + x,
                *           left[y] = byte at r0 + y*r1 - 1,
                *           so top[-1] and left[-1] are the same corner byte.
                * Computes, in 16-bit lanes unless noted:
                *   H = sum over i = 0..3 of (i+1) * (top[4+i]  - top[2-i])
                *   V = sum over i = 0..3 of (i+1) * (left[4+i] - left[2-i])
                *   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5   (32-bit)
                * and writes for x, y in 0..7 the byte
                *   sat_u8((16*(top[7] + left[7] + 1) - 3*(b + c)
                *           + b*x + c*y) >> 5)
                * Scratch: r2, r3, q0-q3, q8, flags.  r0 is advanced by 8 rows.
                * NOTE(review): this looks like the H.264 chroma plane mode;
                * confirm against the C reference implementation.
                */
    195        sub             r3,  r0,  r1
    196        add             r2,  r3,  #4           /* r2 -> top[4] */
    197        sub             r3,  r3,  #1           /* r3 -> corner byte top[-1] */
    198        vld1.32         {d0[0]},  [r3]         /* d0 lanes 0-3 = top[-1..2] */
    199        vld1.32         {d2[0]},  [r2,:32], r1 /* d2 lanes 0-3 = top[4..7] */
    200        ldcol.8         d0,  r3,  r1,  4,  hi=1 /* d0 lanes 4-7 = left[-1..2] */
    201        add             r3,  r3,  r1           /* skip left[3] */
    202        ldcol.8         d3,  r3,  r1,  4       /* d3 lanes 0-3 = left[4..7] */
    203        vaddl.u8        q8,  d2,  d3           /* d16[i] = top[4+i] + left[4+i] */
    204        vrev32.8        d0,  d0                /* d0 = top[2..-1], left[2..-1] */
    205        vtrn.32         d2,  d3                /* d2 = top[4..7], left[4..7] */
    206        vsubl.u8        q2,  d2,  d0           /* d4: top diffs, d5: left diffs */
    207        movrel          r3,  p16weight
    208        vld1.16         {q0},     [r3,:128]    /* q0 = 1..8 */
    209        vmul.s16        d4,  d4,  d0           /* weight by 1..4 */
    210        vmul.s16        d5,  d5,  d0
    211        vpadd.i16       d4,  d4,  d5
    212        vpaddl.s16      d4,  d4                /* d4 = [H, V] as 32-bit */
    213        vshl.i32        d5,  d4,  #4
    214        vadd.s32        d4,  d4,  d5           /* 17*H, 17*V */
    215        vrshrn.s32      d4,  q2,  #5           /* d4[0] = b, d4[1] = c */
    216        mov             r3,  #0
    217        vtrn.16         d4,  d5                /* d4[0] = b, d5[0] = c */
    218        vadd.i16        d2,  d4,  d5           /* lane 0: b + c */
    219        vshl.i16        d3,  d2,  #2
    220        vrev64.16       d16, d16               /* d16[0] = top[7] + left[7] */
    221        vsub.i16        d3,  d3,  d2           /* lane 0: 3*(b + c) */
    222        vadd.i16        d16, d16, d0           /* lane 0: + 1 (first weight) */
    223        vshl.i16        d2,  d16, #4
    224        vsub.i16        d2,  d2,  d3           /* lane 0: value for x = 0, y = 0 */
    225        vshl.i16        d3,  d4,  #3           /* lane 0: 8*b */
    226        vext.16         q0,  q0,  q0,  #7      /* rotate: q0 = 8,1,2,...,7 */
    227        vsub.i16        d6,  d5,  d3           /* lane 0: c - 8*b */
    228        vmov.16         d0[0], r3              /* q0 = 0,1,2,...,7 */
    229        vmul.i16        q0,  q0,  d4[0]        /* q0[x] = b*x */
    230        vdup.16         q1,  d2[0]
    231        vdup.16         q2,  d4[0]
    232        vdup.16         q3,  d6[0]
    233        vshl.i16        q2,  q2,  #3           /* q2 = 8*b */
    234        vadd.i16        q1,  q1,  q0           /* q1 = row 0, columns 0-7 */
    235        vadd.i16        q3,  q3,  q2           /* q3 = c: step to next row */
    236        mov             r3,  #8                /* row counter */
    237 1:
    238        vqshrun.s16     d0,  q1,  #5           /* >> 5, saturated to u8 */
    239        vadd.i16        q1,  q1,  q3
    240        vst1.8          {d0},     [r0,:64], r1
    241        subs            r3,  r3,  #1
    242        bne             1b
    243        bx              lr
    244 endfunc
    245 
    246 function ff_pred8x8_128_dc_neon, export=1
               /*
                * Fill an 8x8 block with the constant byte 128.
                * Sets all bytes of q0 (d0 and d1) to 128, then branches to
                * the shared store loop .L_pred8x8_dc_end (located inside
                * ff_pred8x8_dc_neon), which writes d0 to rows 0-3 and d1 to
                * rows 4-7 starting at [r0] with row step r1, and returns.
                */
    247        vmov.i8         q0,  #128
    248        b               .L_pred8x8_dc_end
    249 endfunc
    250 
    251 function ff_pred8x8_top_dc_neon, export=1
               /*
                * 8x8 DC prediction from the row above only, computed
                * separately for the left and right 4-column halves.
                * In: r0 = address of the first output row, r1 = row step.
                * With top[x] = byte at r0 - r1 + x:
                *   columns 0-3 of every row = (top[0]+..+top[3] + 2) >> 2
                *   columns 4-7 of every row = (top[4]+..+top[7] + 2) >> 2
                * Leaves the row pattern in both d0 and d1 and branches to
                * .L_pred8x8_dc_end (inside ff_pred8x8_dc_neon), which stores
                * d0 to rows 0-3 and d1 to rows 4-7 and returns.
                * Scratch: r2, q0 (r3 and flags are used by the shared tail).
                */
    252        sub             r2,  r0,  r1           /* r2 = row above the block */
    253        vld1.8          {d0},     [r2,:64]
    254        vpaddl.u8       d0,  d0
    255        vpadd.u16       d0,  d0,  d0           /* lanes: [sum 0-3, sum 4-7, ...] */
    256        vrshrn.u16      d0,  q0,  #2           /* byte 0 = left avg, byte 1 = right avg */
    257        vdup.8          d1,  d0[1]
    258        vdup.8          d0,  d0[0]
    259        vtrn.32         d0,  d1                /* d0 = d1 = [left x4, right x4] */
    260        b               .L_pred8x8_dc_end
    261 endfunc
    262 
    263 function ff_pred8x8_left_dc_neon, export=1
               /*
                * 8x8 DC prediction from the left column only, computed
                * separately for the upper and lower 4-row halves.
                * In: r0 = address of the first output row, r1 = row step.
                * With left[y] = byte at r0 + y*r1 - 1:
                *   rows 0-3 = (left[0]+..+left[3] + 2) >> 2 in every column
                *   rows 4-7 = (left[4]+..+left[7] + 2) >> 2 in every column
                * Leaves the upper value in d0 and the lower value in d1 and
                * branches to .L_pred8x8_dc_end (inside ff_pred8x8_dc_neon),
                * which stores d0 to rows 0-3 and d1 to rows 4-7 and returns.
                * Scratch: r2, q0 (r3 and flags are used by the shared tail).
                */
    264        sub             r2,  r0,  #1           /* r2 = byte left of row 0 */
    265        ldcol.8         d0,  r2,  r1           /* d0 = left[0..7] */
    266        vpaddl.u8       d0,  d0
    267        vpadd.u16       d0,  d0,  d0           /* lanes: [sum 0-3, sum 4-7, ...] */
    268        vrshrn.u16      d0,  q0,  #2           /* byte 0 = upper avg, byte 1 = lower avg */
    269        vdup.8          d1,  d0[1]
    270        vdup.8          d0,  d0[0]
    271        b               .L_pred8x8_dc_end
    272 endfunc
    273 
    274 function ff_pred8x8_dc_neon, export=1
               /*
                * 8x8 DC prediction from the row above and the left column,
                * computed per 4x4 quadrant.
                * In: r0 = address of the first output row (stores use a :64
                *     alignment hint), r1 = row step.
                * With top[x] = byte at r0 - r1 + x, left[y] = byte at
                * r0 + y*r1 - 1, T0 = top[0..3] sum, T1 = top[4..7] sum,
                * L0 = left[0..3] sum, L1 = left[4..7] sum:
                *   rows 0-3, columns 0-3 = (T0 + L0 + 4) >> 3
                *   rows 0-3, columns 4-7 = (T1 + 2) >> 2
                *   rows 4-7, columns 0-3 = (L1 + 2) >> 2
                *   rows 4-7, columns 4-7 = (T1 + L1 + 4) >> 3
                * The label .L_pred8x8_dc_end below is also the branch target
                * of the other 8x8 DC entry points in this file; it expects
                * the pattern for rows 0-3 in d0 and for rows 4-7 in d1.
                * Scratch: r2, r3, q0-q2, flags.  r0 is advanced by 4 rows.
                */
    275        sub             r2,  r0,  r1           /* r2 = row above the block */
    276        vld1.8          {d0},     [r2,:64]     /* d0 = top[0..7] */
    277        sub             r2,  r0,  #1           /* r2 = byte left of row 0 */
    278        ldcol.8         d1,  r2,  r1           /* d1 = left[0..7] */
    279        vtrn.32         d0,  d1                /* d0 = top 0-3, left 0-3; d1 = top 4-7, left 4-7 */
    280        vpaddl.u8       q0,  q0
    281        vpadd.u16       d0,  d0,  d1           /* d0 = [T0, L0, T1, L1] */
    282        vpadd.u16       d1,  d0,  d0           /* d1 = [T0+L0, T1+L1, ...] */
    283        vrshrn.u16      d2,  q0,  #3           /* bytes 4,5 = 8-sample averages */
    284        vrshrn.u16      d3,  q0,  #2           /* bytes 0-3 = 4-sample averages */
    285        vdup.8          d0,  d2[4]             /* (T0 + L0) average */
    286        vdup.8          d1,  d3[3]             /* L1 average */
    287        vdup.8          d4,  d3[2]             /* T1 average */
    288        vdup.8          d5,  d2[5]             /* (T1 + L1) average */
    289        vtrn.32         q0,  q2                /* d0 = [d0, d4], d1 = [d1, d5] halves */
    290 .L_pred8x8_dc_end:
    291        mov             r3,  #4                /* 4 iterations, 2 rows each */
    292        add             r2,  r0,  r1,  lsl #2  /* r2 = address of row 4 */
    293 6:      vst1.8          {d0},     [r0,:64], r1 /* rows 0-3 */
    294        vst1.8          {d1},     [r2,:64], r1 /* rows 4-7 */
    295        subs            r3,  r3,  #1
    296        bne             6b
    297        bx              lr
    298 endfunc
    299 
    300 function ff_pred8x8_l0t_dc_neon, export=1
               /*
                * 8x8 DC prediction using the row above and only the upper
                * four bytes of the left column.
                * In: r0 = address of the first output row, r1 = row step.
                * With top[x] = byte at r0 - r1 + x, left[y] = byte at
                * r0 + y*r1 - 1, T0 = top[0..3] sum, T1 = top[4..7] sum,
                * L0 = left[0..3] sum:
                *   rows 0-3, columns 0-3 = (T0 + L0 + 4) >> 3
                *   rows 0-3, columns 4-7 = (T1 + 2) >> 2
                *   rows 4-7, columns 0-3 = (T0 + 2) >> 2
                *   rows 4-7, columns 4-7 = (T1 + 2) >> 2
                * Lanes 4-7 of d1 are not loaded; results derived from them
                * are computed but never selected.
                * Branches to .L_pred8x8_dc_end (inside ff_pred8x8_dc_neon),
                * which stores d0 to rows 0-3 and d1 to rows 4-7 and returns.
                * Scratch: r2, q0-q2 (r3 and flags are used by the shared tail).
                */
    301        sub             r2,  r0,  r1           /* r2 = row above the block */
    302        vld1.8          {d0},     [r2,:64]     /* d0 = top[0..7] */
    303        sub             r2,  r0,  #1           /* r2 = byte left of row 0 */
    304        ldcol.8         d1,  r2,  r1,  4       /* d1 lanes 0-3 = left[0..3] */
    305        vtrn.32         d0,  d1                /* d0 = top 0-3, left 0-3 */
    306        vpaddl.u8       q0,  q0
    307        vpadd.u16       d0,  d0,  d1           /* d0 = [T0, L0, T1, unused] */
    308        vpadd.u16       d1,  d0,  d0           /* d1[0] = T0 + L0 */
    309        vrshrn.u16      d2,  q0,  #3
    310        vrshrn.u16      d3,  q0,  #2
    311        vdup.8          d0,  d2[4]             /* (T0 + L0) average */
    312        vdup.8          d1,  d3[0]             /* T0 average */
    313        vdup.8          q2,  d3[2]             /* T1 average in d4 and d5 */
    314        vtrn.32         q0,  q2                /* d0 = [d0, d4], d1 = [d1, d5] halves */
    315        b               .L_pred8x8_dc_end
    316 endfunc
    317 
    318 function ff_pred8x8_l00_dc_neon, export=1
               /*
                * 8x8 DC prediction using only the upper four bytes of the
                * left column.
                * In: r0 = address of the first output row, r1 = row step.
                * With left[y] = byte at r0 + y*r1 - 1:
                *   rows 0-3 = (left[0]+..+left[3] + 2) >> 2 in every column
                *   rows 4-7 = constant 128
                * Lanes 4-7 of d0 are not loaded; sums derived from them are
                * never selected.
                * Branches to .L_pred8x8_dc_end (inside ff_pred8x8_dc_neon),
                * which stores d0 to rows 0-3 and d1 to rows 4-7 and returns.
                * Scratch: r2, q0 (r3 and flags are used by the shared tail).
                */
    319        sub             r2,  r0,  #1           /* r2 = byte left of row 0 */
    320        ldcol.8         d0,  r2,  r1,  4       /* d0 lanes 0-3 = left[0..3] */
    321        vpaddl.u8       d0,  d0
    322        vpadd.u16       d0,  d0,  d0           /* lane 0 = sum of the 4 bytes */
    323        vrshrn.u16      d0,  q0,  #2
    324        vmov.i8         d1,  #128              /* rows 4-7 pattern */
    325        vdup.8          d0,  d0[0]             /* rows 0-3 pattern */
    326        b               .L_pred8x8_dc_end
    327 endfunc
    328 
    329 function ff_pred8x8_0lt_dc_neon, export=1
               /*
                * 8x8 DC prediction using the row above and only the lower
                * four bytes of the left column.
                * In: r0 = address of the first output row, r1 = row step.
                * With top[x] = byte at r0 - r1 + x, left[y] = byte at
                * r0 + y*r1 - 1, T0 = top[0..3] sum, T1 = top[4..7] sum,
                * L1 = left[4..7] sum:
                *   rows 0-3, columns 0-3 = (T0 + 2) >> 2
                *   rows 0-3, columns 4-7 = (T1 + 2) >> 2
                *   rows 4-7, columns 0-3 = (L1 + 2) >> 2
                *   rows 4-7, columns 4-7 = (T1 + L1 + 4) >> 3
                * Lanes 0-3 of d1 are not loaded; results derived from them
                * are computed but never selected.
                * Branches to .L_pred8x8_dc_end (inside ff_pred8x8_dc_neon),
                * which stores d0 to rows 0-3 and d1 to rows 4-7 and returns.
                * Scratch: r2, q0-q2 (r3 and flags are used by the shared tail).
                */
    330        sub             r2,  r0,  r1           /* r2 = row above the block */
    331        vld1.8          {d0},     [r2,:64]     /* d0 = top[0..7] */
    332        add             r2,  r0,  r1,  lsl #2  /* r2 = address of row 4 */
    333        sub             r2,  r2,  #1           /* byte left of row 4 */
    334        ldcol.8         d1,  r2,  r1,  4,  hi=1 /* d1 lanes 4-7 = left[4..7] */
    335        vtrn.32         d0,  d1                /* d1 = top 4-7, left 4-7 */
    336        vpaddl.u8       q0,  q0
    337        vpadd.u16       d0,  d0,  d1           /* d0 = [T0, unused, T1, L1] */
    338        vpadd.u16       d1,  d0,  d0           /* d1[1] = T1 + L1 */
    339        vrshrn.u16      d3,  q0,  #2
    340        vrshrn.u16      d2,  q0,  #3
    341        vdup.8          d0,  d3[0]             /* T0 average */
    342        vdup.8          d1,  d3[3]             /* L1 average */
    343        vdup.8          d4,  d3[2]             /* T1 average */
    344        vdup.8          d5,  d2[5]             /* (T1 + L1) average */
    345        vtrn.32         q0,  q2                /* d0 = [d0, d4], d1 = [d1, d5] halves */
    346        b               .L_pred8x8_dc_end
    347 endfunc
    348 
    349 function ff_pred8x8_0l0_dc_neon, export=1
               /*
                * 8x8 DC prediction using only the lower four bytes of the
                * left column.
                * In: r0 = address of the first output row, r1 = row step.
                * With left[y] = byte at r0 + y*r1 - 1:
                *   rows 0-3 = constant 128
                *   rows 4-7 = (left[4]+..+left[7] + 2) >> 2 in every column
                * Lanes 4-7 of d1 are not loaded; sums derived from them are
                * never selected.
                * Branches to .L_pred8x8_dc_end (inside ff_pred8x8_dc_neon),
                * which stores d0 to rows 0-3 and d1 to rows 4-7 and returns.
                * Scratch: r2, q0, q1 (r3 and flags are used by the shared tail).
                */
    350        add             r2,  r0,  r1,  lsl #2  /* r2 = address of row 4 */
    351        sub             r2,  r2,  #1           /* byte left of row 4 */
    352        ldcol.8         d1,  r2,  r1,  4       /* d1 lanes 0-3 = left[4..7] */
    353        vpaddl.u8       d2,  d1
    354        vpadd.u16       d2,  d2,  d2           /* lane 0 = sum of the 4 bytes */
    355        vrshrn.u16      d1,  q1,  #2
    356        vmov.i8         d0,  #128              /* rows 0-3 pattern */
    357        vdup.8          d1,  d1[0]             /* rows 4-7 pattern */
    358        b               .L_pred8x8_dc_end
    359 endfunc
    359 endfunc