tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

h264pred_neon.S (26288B)


      1 /*
      2 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
      3 *
      4 * This file is part of FFmpeg.
      5 *
      6 * FFmpeg is free software; you can redistribute it and/or
      7 * modify it under the terms of the GNU Lesser General Public
      8 * License as published by the Free Software Foundation; either
      9 * version 2.1 of the License, or (at your option) any later version.
     10 *
     11 * FFmpeg is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14 * Lesser General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU Lesser General Public
     17 * License along with FFmpeg; if not, write to the Free Software
     18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     19 */
     20 
     21 #include "libavutil/aarch64/asm.S"
     22 
     23 .macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
               // Load a column of bytes into byte lanes of rd, one single-lane ld1 per row.
               //   rd: vector register receiving the bytes (one byte lane per row)
               //   rs: address register, post-incremented by rt after every load
               //   rt: register holding the step between two rows
               //   n, hi: select which lanes are loaded:
               //     n >= 8 or hi == 0  -> lanes 0-3
               //     n >= 8 or hi == 1  -> lanes 4-7
               //     n == 16            -> additionally lanes 8-15
               // Lanes that are not selected keep their previous contents.
     24 .if \n >= 8 || \hi == 0
     25        ld1             {\rd\().b}[0],  [\rs], \rt
     26        ld1             {\rd\().b}[1],  [\rs], \rt
     27        ld1             {\rd\().b}[2],  [\rs], \rt
     28        ld1             {\rd\().b}[3],  [\rs], \rt
     29 .endif
     30 .if \n >= 8 || \hi == 1
     31        ld1             {\rd\().b}[4],  [\rs], \rt
     32        ld1             {\rd\().b}[5],  [\rs], \rt
     33        ld1             {\rd\().b}[6],  [\rs], \rt
     34        ld1             {\rd\().b}[7],  [\rs], \rt
     35 .endif
     36 .if \n == 16
     37        ld1             {\rd\().b}[8],  [\rs], \rt
     38        ld1             {\rd\().b}[9],  [\rs], \rt
     39        ld1             {\rd\().b}[10], [\rs], \rt
     40        ld1             {\rd\().b}[11], [\rs], \rt
     41        ld1             {\rd\().b}[12], [\rs], \rt
     42        ld1             {\rd\().b}[13], [\rs], \rt
     43        ld1             {\rd\().b}[14], [\rs], \rt
     44        ld1             {\rd\().b}[15], [\rs], \rt
     45 .endif
     46 .endm
     47 
     48 function ff_pred16x16_128_dc_neon, export=1
               // 16x16 8-bit prediction with a constant: every output byte is 128.
               // x0 (destination) and x1 (row step) are consumed by the shared
               // store loop at .L_pred16x16_dc_end, which writes v0 to 16 rows.
               // Presumably called as (uint8_t *src, ptrdiff_t stride) - confirm in the C glue.
     49        movi            v0.16b,  #128
     50        b               .L_pred16x16_dc_end
     51 endfunc
     52 
     53 function ff_pred16x16_top_dc_neon, export=1
               // 16x16 8-bit DC prediction from the row above only:
               // every output byte is (sum of the 16 bytes above + 8) >> 4.
               //   x0: top-left sample of the block, x1: row step in bytes.
     54        sub             x2,  x0,  x1                 // x2 -> row above the block
     55        ld1             {v0.16b},  [x2]
     56        uaddlv          h0,  v0.16b                  // h0 = sum of the 16 bytes
     57        rshrn           v0.8b,  v0.8h,  #4           // rounding shift: (sum + 8) >> 4
     58        dup             v0.16b, v0.b[0]              // broadcast the DC value
     59        b               .L_pred16x16_dc_end          // shared 16-row store loop
     60 endfunc
     61 
     62 function ff_pred16x16_left_dc_neon, export=1
               // 16x16 8-bit DC prediction from the left column only:
               // every output byte is (sum of the 16 bytes to the left + 8) >> 4.
               //   x0: top-left sample of the block, x1: row step in bytes.
     63        sub             x2,  x0,  #1                 // x2 -> byte left of row 0
     64        ldcol.8         v0,  x2,  x1, 16             // gather 16 left-neighbour bytes
     65        uaddlv          h0,  v0.16b                  // h0 = sum of the 16 bytes
     66        rshrn           v0.8b,  v0.8h,  #4           // rounding shift: (sum + 8) >> 4
     67        dup             v0.16b, v0.b[0]              // broadcast the DC value
     68        b               .L_pred16x16_dc_end          // shared 16-row store loop
     69 endfunc
     70 
     71 function ff_pred16x16_dc_neon, export=1
               // 16x16 8-bit DC prediction from the row above and the left column:
               // every output byte is (sum of 16 top + 16 left bytes + 16) >> 5.
               //   x0: top-left sample of the block, x1: row step in bytes.
     72        sub             x2,  x0,  x1                 // x2 -> row above the block
     73        sub             x3,  x0,  #1                 // x3 -> byte left of row 0
     74        ld1             {v0.16b}, [x2]
     75        ldcol.8         v1,  x3,  x1, 16
     76        uaddlv          h0,  v0.16b                  // h0 = sum of the top row
     77        uaddlv          h1,  v1.16b                  // h1 = sum of the left column
     78        add             v0.4h,  v0.4h,  v1.4h
     79        rshrn           v0.8b,  v0.8h,  #5           // rounding shift: (sum + 16) >> 5
     80        dup             v0.16b, v0.b[0]
     81 .L_pred16x16_dc_end:
               // Shared tail, also branched to by the other 16x16 8-bit DC variants:
               // store v0 to 16 consecutive rows, two rows per loop iteration.
     82        mov             w3,  #8
     83 6:      st1             {v0.16b}, [x0], x1
     84        subs            w3,  w3,  #1
     85        st1             {v0.16b}, [x0], x1
     86        b.ne            6b
     87        ret
     88 endfunc
     89 
     90 function ff_pred16x16_hor_neon, export=1
               // 16x16 8-bit horizontal prediction: each of the 16 rows is filled
               // with the byte immediately to its left.
               //   x0: top-left sample of the block, x1: row step in bytes.
     91        sub             x2,  x0,  #1                 // x2 -> byte left of row 0
     92        mov             w3,  #16                     // row counter
     93 1:      ld1r            {v0.16b}, [x2], x1          // replicate the left byte, step to next row
     94        subs            w3,  w3,  #1
     95        st1             {v0.16b}, [x0], x1
     96        b.ne            1b
     97        ret
     98 endfunc
     99 
    100 function ff_pred16x16_vert_neon, export=1
               // 16x16 8-bit vertical prediction: the 16 bytes above the block are
               // copied into each of its 16 rows.
               //   x0: top-left sample of the block, x1: row step in bytes.
    101        sub             x2,  x0,  x1                 // x2 -> row above the block
    102        add             x1,  x1,  x1                 // step two rows at a time
    103        ld1             {v0.16b}, [x2], x1           // load top row; x2 now -> row 1
    104        mov             w3,  #8                      // 8 iterations x 2 rows
    105 1:      subs            w3,  w3,  #1
    106        st1             {v0.16b}, [x0], x1           // even rows
    107        st1             {v0.16b}, [x2], x1           // odd rows
    108        b.ne            1b
    109        ret
    110 endfunc
    111 
    112 function ff_pred16x16_plane_neon, export=1
               // 16x16 8-bit plane prediction.
               //   x0: top-left sample of the block, x1: row step in bytes.
               // Notation: top[i] = byte at x0 - x1 + i, left[j] = byte at x0 + j*x1 - 1
               // (top[-1] and left[-1] are the same corner byte).
               //   H = sum over k=1..8 of k * (top[7+k]  - top[7-k])
               //   V = sum over k=1..8 of k * (left[7+k] - left[7-k])
               //   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
               //   out[y][x] = sat_u8((16*(top[15] + left[15] + 1) + b*(x-7) + c*(y-7)) >> 5)
    113        sub             x3,  x0,  x1                 // x3 -> top[0]
    114        movrel          x4,  p16weight
    115        add             x2,  x3,  #8                 // x2 -> top[8]
    116        sub             x3,  x3,  #1                 // x3 -> top[-1]
    117        ld1             {v0.8b},  [x3]               // v0 = top[-1..6]
    118        ld1             {v2.8b},  [x2], x1           // v2 = top[8..15]
    119        ldcol.8         v1,  x3,  x1                 // v1 = left[-1..6]
    120        add             x3,  x3,  x1                 // skip left[7]
    121        ldcol.8         v3,  x3,  x1                 // v3 = left[8..15]
    122        rev64           v0.8b,  v0.8b                // v0 = top[6..-1]
    123        rev64           v1.8b,  v1.8b                // v1 = left[6..-1]
    124        uaddl           v7.8h,  v2.8b,  v3.8b        // lane 7 = top[15] + left[15]
    125        usubl           v2.8h,  v2.8b,  v0.8b        // top[7+k]  - top[7-k],  k = 1..8
    126        usubl           v3.8h,  v3.8b,  v1.8b        // left[7+k] - left[7-k], k = 1..8
    127        ld1             {v0.8h},     [x4]            // v0 = weights 1..8
    128        mul             v2.8h,  v2.8h,  v0.8h
    129        mul             v3.8h,  v3.8h,  v0.8h
    130        addp            v2.8h,  v2.8h,  v3.8h
    131        addp            v2.8h,  v2.8h,  v2.8h
    132        addp            v2.4h,  v2.4h,  v2.4h        // v2.4h = H, V, H, V
    133        sshll           v3.4s,  v2.4h,  #2
    134        saddw           v2.4s,  v3.4s,  v2.4h        // 5*H, 5*V (32 bit)
    135        rshrn           v4.4h,  v2.4s,  #6           // v4.h[0] = b, v4.h[1] = c
    136        trn2            v5.4h,  v4.4h,  v4.4h        // v5.h[0] = c
    137        add             v2.4h,  v4.4h,  v5.4h        // lane 0 = b + c
    138        shl             v3.4h,  v2.4h,  #3
    139        ext             v7.16b, v7.16b, v7.16b, #14  // rotate: lane 0 = top[15] + left[15]
    140        sub             v3.4h,  v3.4h,  v2.4h   // 7 * (b + c)
    141        add             v7.4h,  v7.4h,  v0.4h        // lane 0 += 1 (first weight)
    142        shl             v2.4h,  v7.4h,  #4           // 16 * (top[15] + left[15] + 1)
    143        sub             v2.4h,  v2.4h,  v3.4h        // lane 0 = value for x = 0, y = 0
    144        shl             v3.4h,  v4.4h,  #4           // lane 0 = 16 * b
    145        ext             v0.16b, v0.16b, v0.16b, #14  // weights rotated: 8,1,2,...,7
    146        sub             v6.4h,  v5.4h,  v3.4h        // lane 0 = c - 16*b
    147        mov             v0.h[0],  wzr                // v0 = 0,1,2,...,7
    148        mul             v0.8h,  v0.8h,  v4.h[0]      // v0 = x * b for x = 0..7
    149        dup             v1.8h,  v2.h[0]
    150        dup             v2.8h,  v4.h[0]
    151        dup             v3.8h,  v6.h[0]
    152        shl             v2.8h,  v2.8h,  #3           // v2 = 8 * b (step to columns 8..15)
    153        add             v1.8h,  v1.8h,  v0.8h        // v1 = row 0, columns 0..7 (before >> 5)
    154        add             v3.8h,  v3.8h,  v2.8h        // v3 = c - 8*b (back to column 0, next row)
    155        mov             w3,  #16                     // row counter
    156 1:
    157        sqshrun         v0.8b,  v1.8h,  #5           // columns 0..7, saturated to 0..255
    158        add             v1.8h,  v1.8h,  v2.8h
    159        sqshrun2        v0.16b, v1.8h,  #5           // columns 8..15
    160        add             v1.8h,  v1.8h,  v3.8h
    161        subs            w3,  w3,  #1
    162        st1             {v0.16b}, [x0], x1
    163        b.ne            1b
    164        ret
    165 endfunc
    166 
    167 const   p16weight, align=4
               // Eight 16-bit values 1..8. Loaded by the plane predictors in this file
               // as gradient weights; the 8x8 and 16x16 plane code also derives the
               // column indices 0..7 from it by rotating and zeroing lane 0.
    168        .short          1,2,3,4,5,6,7,8
    169 endconst
    170 const   p8weight, align=4
               // Weights 1..4 repeated twice (eight 16-bit values). The 8x8 plane
               // predictors multiply a vector holding four horizontal and four
               // vertical differences by this table.
    171        .short          1,2,3,4,1,2,3,4
    172 endconst
    173 
    174 function ff_pred8x8_hor_neon, export=1
               // 8x8 8-bit horizontal prediction: each of the 8 rows is filled with
               // the byte immediately to its left.
               //   x0: top-left sample of the block, x1: row step in bytes.
    175        sub             x2,  x0,  #1                 // x2 -> byte left of row 0
    176        mov             w3,  #8                      // row counter
    177 1:      ld1r            {v0.8b},  [x2], x1          // replicate the left byte, step to next row
    178        subs            w3,  w3,  #1
    179        st1             {v0.8b},  [x0], x1
    180        b.ne            1b
    181        ret
    182 endfunc
    183 
    184 function ff_pred8x8_vert_neon, export=1
               // 8x8 8-bit vertical prediction: the 8 bytes above the block are copied
               // into each of its 8 rows.
               //   x0: top-left sample of the block, x1: row step in bytes.
    185        sub             x2,  x0,  x1                 // x2 -> row above the block
    186        lsl             x1,  x1,  #1                 // step two rows at a time
    187        ld1             {v0.8b},  [x2], x1           // load top row; x2 now -> row 1
    188        mov             w3,  #4                      // 4 iterations x 2 rows
    189 1:      subs            w3,  w3,  #1
    190        st1             {v0.8b},  [x0], x1           // even rows
    191        st1             {v0.8b},  [x2], x1           // odd rows
    192        b.ne            1b
    193        ret
    194 endfunc
    195 
    196 function ff_pred8x8_plane_neon, export=1
               // 8x8 8-bit plane prediction.
               //   x0: top-left sample of the block, x1: row step in bytes.
               // Notation: top[i] = byte at x0 - x1 + i, left[j] = byte at x0 + j*x1 - 1.
               //   H = sum over k=1..4 of k * (top[3+k]  - top[3-k])
               //   V = sum over k=1..4 of k * (left[3+k] - left[3-k])
               //   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
               //   out[y][x] = sat_u8((16*(top[7] + left[7] + 1) + b*(x-3) + c*(y-3)) >> 5)
    197        sub             x3,  x0,  x1                 // x3 -> top[0]
    198        movrel          x4,  p8weight
    199        movrel          x5,  p16weight
    200        add             x2,  x3,  #4                 // x2 -> top[4]
    201        sub             x3,  x3,  #1                 // x3 -> top[-1]
    202        ld1             {v0.s}[0],  [x3]             // v0.b[0..3] = top[-1..2]
    203        ld1             {v2.s}[0],  [x2], x1         // v2.b[0..3] = top[4..7]
    204        ldcol.8         v0,  x3,  x1,  4,  hi=1      // v0.b[4..7] = left[-1..2]
    205        add             x3,  x3,  x1                 // skip left[3]
    206        ldcol.8         v3,  x3,  x1,  4             // v3.b[0..3] = left[4..7]
    207        uaddl           v7.8h,  v2.8b,  v3.8b        // lane 3 = top[7] + left[7]
    208        rev32           v0.8b,  v0.8b                // top[2..-1], left[2..-1]
    209        trn1            v2.2s,  v2.2s,  v3.2s        // v2 = top[4..7], left[4..7]
    210        usubl           v2.8h,  v2.8b,  v0.8b        // four H differences, four V differences
    211        ld1             {v6.8h},  [x4]               // weights 1,2,3,4,1,2,3,4
    212        mul             v2.8h,  v2.8h,  v6.8h
    213        ld1             {v0.8h},  [x5]               // v0 = 1..8
    214        saddlp          v2.4s,  v2.8h
    215        addp            v2.4s,  v2.4s,  v2.4s        // v2.4s = H, V, H, V
    216        shl             v3.4s,  v2.4s,  #4
    217        add             v2.4s,  v3.4s,  v2.4s        // 17*H, 17*V
    218        rshrn           v5.4h,  v2.4s,  #5           // v5.h[0] = b, v5.h[1] = c
    219        addp            v2.4h,  v5.4h,  v5.4h        // lane 0 = b + c
    220        shl             v3.4h,  v2.4h,  #1
    221        add             v3.4h,  v3.4h,  v2.4h        // 3 * (b + c)
    222        rev64           v7.4h,  v7.4h                // lane 0 = top[7] + left[7]
    223        add             v7.4h,  v7.4h,  v0.4h        // lane 0 += 1 (first weight)
    224        shl             v2.4h,  v7.4h,  #4           // 16 * (top[7] + left[7] + 1)
    225        sub             v2.4h,  v2.4h,  v3.4h        // lane 0 = value for x = 0, y = 0
    226        ext             v0.16b, v0.16b, v0.16b, #14  // rotate: 8,1,2,...,7
    227        mov             v0.h[0],  wzr                // v0 = 0,1,2,...,7
    228        mul             v0.8h,  v0.8h,  v5.h[0]      // v0 = x * b for x = 0..7
    229        dup             v1.8h,  v2.h[0]
    230        dup             v2.8h,  v5.h[1]              // v2 = c (per-row increment)
    231        add             v1.8h,  v1.8h,  v0.8h        // v1 = row 0 (before >> 5)
    232        mov             w3,  #8                      // row counter
    233 1:
    234        sqshrun         v0.8b,  v1.8h,  #5           // shift and saturate to 0..255
    235        subs            w3,  w3,  #1
    236        add             v1.8h,  v1.8h,  v2.8h
    237        st1             {v0.8b},  [x0], x1
    238        b.ne            1b
    239        ret
    240 endfunc
    241 
    242 function ff_pred8x8_128_dc_neon, export=1
               // 8x8 8-bit prediction with a constant: every output byte is 128.
               // v0 is the pattern for rows 0-3 and v1 for rows 4-7; both are written
               // by the shared store loop at .L_pred8x8_dc_end (uses x0 and x1).
    243        movi            v0.8b,  #128
    244        movi            v1.8b,  #128
    245        b               .L_pred8x8_dc_end
    246 endfunc
    247 
    248 function ff_pred8x8_top_dc_neon, export=1
               // 8x8 8-bit DC prediction from the row above only. Columns 0-3 of every
               // row get (top[0..3] sum + 2) >> 2, columns 4-7 get (top[4..7] sum + 2) >> 2.
               //   x0: top-left sample of the block, x1: row step in bytes.
    249        sub             x2,  x0,  x1                 // x2 -> row above the block
    250        ld1             {v0.8b},  [x2]
    251        uaddlp          v0.4h,  v0.8b                // four pair sums
    252        addp            v0.4h,  v0.4h,  v0.4h        // s0, s1, s0, s1 (sums of 4 bytes)
    253        zip1            v0.8h,  v0.8h,  v0.8h        // s0, s0, s1, s1, ...
    254        rshrn           v2.8b,  v0.8h,  #2           // d0, d0, d1, d1, ...
    255        zip1            v0.8b,  v2.8b,  v2.8b        // d0 x4, d1 x4 (rows 0-3)
    256        zip1            v1.8b,  v2.8b,  v2.8b        // same pattern for rows 4-7
    257        b               .L_pred8x8_dc_end
    258 endfunc
    259 
    260 function ff_pred8x8_left_dc_neon, export=1
               // 8x8 8-bit DC prediction from the left column only. Rows 0-3 are filled
               // with (left[0..3] sum + 2) >> 2, rows 4-7 with (left[4..7] sum + 2) >> 2.
               //   x0: top-left sample of the block, x1: row step in bytes.
    261        sub             x2,  x0,  #1                 // x2 -> byte left of row 0
    262        ldcol.8         v0,  x2,  x1                 // v0.8b = left[0..7]
    263        uaddlp          v0.4h,  v0.8b                // four pair sums
    264        addp            v0.4h,  v0.4h,  v0.4h        // lane 0 = upper sum, lane 1 = lower sum
    265        rshrn           v2.8b,  v0.8h,  #2           // b[0] = upper DC, b[1] = lower DC
    266        dup             v1.8b,  v2.b[1]              // rows 4-7
    267        dup             v0.8b,  v2.b[0]              // rows 0-3
    268        b               .L_pred8x8_dc_end
    269 endfunc
    270 
    271 function ff_pred8x8_dc_neon, export=1
               // 8x8 8-bit DC prediction, computed per 4x4 quadrant.
               //   x0: top-left sample of the block, x1: row step in bytes.
               // With T0/T1 = sums of top[0..3]/top[4..7] and L0/L1 = sums of
               // left[0..3]/left[4..7]:
               //   top-left     = (T0 + L0 + 4) >> 3     top-right    = (T1 + 2) >> 2
               //   bottom-left  = (L1 + 2) >> 2          bottom-right = (T1 + L1 + 4) >> 3
    272        sub             x2,  x0,  x1                 // x2 -> row above the block
    273        sub             x3,  x0,  #1                 // x3 -> byte left of row 0
    274        ld1             {v0.8b}, [x2]
    275        ldcol.8         v1,  x3,  x1
    276        uaddlp          v0.4h,  v0.8b                // pair sums of the top row
    277        uaddlp          v1.4h,  v1.8b                // pair sums of the left column
    278        trn1            v2.2s,  v0.2s,  v1.2s
    279        trn2            v3.2s,  v0.2s,  v1.2s
    280        addp            v4.4h,  v2.4h,  v3.4h        // v4 = T0, L0, T1, L1
    281        addp            v5.4h,  v4.4h,  v4.4h        // v5 = T0+L0, T1+L1, ...
    282        rshrn           v6.8b,  v5.8h,  #3
    283        rshrn           v7.8b,  v4.8h,  #2
    284        dup             v0.8b,  v6.b[0]              // top-left quadrant
    285        dup             v2.8b,  v7.b[2]              // top-right quadrant
    286        dup             v1.8b,  v7.b[3]              // bottom-left quadrant
    287        dup             v3.8b,  v6.b[1]              // bottom-right quadrant
    288        zip1            v0.2s,  v0.2s,  v2.2s        // v0 = row pattern for rows 0-3
    289        zip1            v1.2s,  v1.2s,  v3.2s        // v1 = row pattern for rows 4-7
    290 .L_pred8x8_dc_end:
               // Shared tail, also branched to by the other 8x8 8-bit DC variants:
               // write v0 to rows 0-3 and v1 to rows 4-7.
    291        mov             w3,  #4
    292        add             x2,  x0,  x1,  lsl #2        // x2 -> row 4
    293 6:      subs            w3,  w3,  #1
    294        st1             {v0.8b},  [x0], x1
    295        st1             {v1.8b},  [x2], x1
    296        b.ne            6b
    297        ret
    298 endfunc
    299 
    300 function ff_pred8x8_l0t_dc_neon, export=1
               // 8x8 8-bit DC variant that reads the row above and only left[0..3].
               //   x0: top-left sample of the block, x1: row step in bytes.
               // With T0/T1 = sums of top[0..3]/top[4..7], L0 = sum of left[0..3]:
               //   top-left    = (T0 + L0 + 4) >> 3      top-right    = (T1 + 2) >> 2
               //   bottom-left = (T0 + 2) >> 2           bottom-right = (T1 + 2) >> 2
               // Lanes of v1 that ldcol.8 does not load hold stale data; the results
               // derived from them are never selected by the dup instructions below.
    301        sub             x2,  x0,  x1                 // x2 -> row above the block
    302        sub             x3,  x0,  #1                 // x3 -> byte left of row 0
    303        ld1             {v0.8b},  [x2]
    304        ldcol.8         v1,  x3,  x1,  4             // v1.b[0..3] = left[0..3]
    305        zip1            v0.4s,  v0.4s,  v1.4s        // top[0..3], left[0..3], top[4..7], (stale)
    306        uaddlp          v0.8h,  v0.16b
    307        addp            v0.8h,  v0.8h,  v0.8h        // v0.4h = T0, L0, T1, (stale)
    308        addp            v1.4h,  v0.4h,  v0.4h        // lane 0 = T0 + L0
    309        rshrn           v2.8b,  v0.8h,  #2
    310        rshrn           v3.8b,  v1.8h,  #3
    311        dup             v4.8b,  v3.b[0]              // (T0 + L0 + 4) >> 3
    312        dup             v6.8b,  v2.b[2]              // (T1 + 2) >> 2
    313        dup             v5.8b,  v2.b[0]              // (T0 + 2) >> 2
    314        zip1            v0.2s,  v4.2s,  v6.2s        // rows 0-3
    315        zip1            v1.2s,  v5.2s,  v6.2s        // rows 4-7
    316        b               .L_pred8x8_dc_end
    317 endfunc
    318 
    319 function ff_pred8x8_l00_dc_neon, export=1
               // 8x8 8-bit DC variant that reads only left[0..3]:
               // rows 0-3 are filled with (left[0..3] sum + 2) >> 2, rows 4-7 with 128.
               //   x0: top-left sample of the block, x1: row step in bytes.
    320        sub             x2,  x0,  #1                 // x2 -> byte left of row 0
    321        ldcol.8         v0,  x2,  x1,  4             // v0.b[0..3] = left[0..3]
    322        uaddlp          v0.4h,  v0.8b
    323        addp            v0.4h,  v0.4h,  v0.4h        // lane 0 = sum of the four bytes
    324        rshrn           v0.8b,  v0.8h,  #2
    325        movi            v1.8b,  #128                 // rows 4-7
    326        dup             v0.8b,  v0.b[0]              // rows 0-3
    327        b               .L_pred8x8_dc_end
    328 endfunc
    329 
    330 function ff_pred8x8_0lt_dc_neon, export=1
               // 8x8 8-bit DC variant that reads the row above and only left[4..7].
               //   x0: top-left sample of the block, x1: row step in bytes.
               // With T0/T1 = sums of top[0..3]/top[4..7], L1 = sum of left[4..7]:
               //   top-left    = (T0 + 2) >> 2           top-right    = (T1 + 2) >> 2
               //   bottom-left = (L1 + 2) >> 2           bottom-right = (T1 + L1 + 4) >> 3
               // Lanes of v1 that ldcol.8 does not load hold stale data; the results
               // derived from them are never selected by the dup instructions below.
    331        add             x3,  x0,  x1,  lsl #2        // x3 -> row 4
    332        sub             x2,  x0,  x1                 // x2 -> row above the block
    333        sub             x3,  x3,  #1                 // x3 -> byte left of row 4
    334        ld1             {v0.8b},  [x2]
    335        ldcol.8         v1,  x3,  x1,  4,  hi=1      // v1.b[4..7] = left[4..7]
    336        zip1            v0.4s,  v0.4s,  v1.4s        // top[0..3], (stale), top[4..7], left[4..7]
    337        uaddlp          v0.8h,  v0.16b
    338        addp            v0.8h,  v0.8h,  v0.8h        // v0.4h = T0, (stale), T1, L1
    339        addp            v1.4h,  v0.4h,  v0.4h        // lane 1 = T1 + L1
    340        rshrn           v2.8b,  v0.8h,  #2
    341        rshrn           v3.8b,  v1.8h,  #3
    342        dup             v4.8b,  v2.b[0]              // (T0 + 2) >> 2
    343        dup             v5.8b,  v2.b[3]              // (L1 + 2) >> 2
    344        dup             v6.8b,  v2.b[2]              // (T1 + 2) >> 2
    345        dup             v7.8b,  v3.b[1]              // (T1 + L1 + 4) >> 3
    346        zip1            v0.2s,  v4.2s,  v6.2s        // rows 0-3
    347        zip1            v1.2s,  v5.2s,  v7.2s        // rows 4-7
    348        b               .L_pred8x8_dc_end
    349 endfunc
    350 
    351 function ff_pred8x8_0l0_dc_neon, export=1
               // 8x8 8-bit DC variant that reads only left[4..7]:
               // rows 0-3 are filled with 128, rows 4-7 with (left[4..7] sum + 2) >> 2.
               //   x0: top-left sample of the block, x1: row step in bytes.
    352        add             x2,  x0,  x1,  lsl #2        // x2 -> row 4
    353        sub             x2,  x2,  #1                 // x2 -> byte left of row 4
    354        ldcol.8         v1,  x2,  x1,  4             // v1.b[0..3] = left[4..7]
    355        uaddlp          v2.4h,  v1.8b
    356        addp            v2.4h,  v2.4h,  v2.4h        // lane 0 = sum of the four bytes
    357        rshrn           v1.8b,  v2.8h,  #2
    358        movi            v0.8b,  #128                 // rows 0-3
    359        dup             v1.8b,  v1.b[0]              // rows 4-7
    360        b               .L_pred8x8_dc_end
    361 endfunc
    362 
    363 .macro ldcol.16  rd,  rs,  rt,  n=4,  hi=0
               // Load a column of 16-bit samples into halfword lanes of rd, one
               // single-lane ld1 per row.
               //   rd: vector register receiving the samples
               //   rs: address register, post-incremented by rt after every load
               //   rt: register holding the step between two rows
               //   n, hi: select which lanes are loaded:
               //     n >= 4 and hi == 0  -> lanes 0-3
               //     n == 8 or hi == 1   -> lanes 4-7
               //   (so n=8 with the default hi=0 loads all eight lanes)
               // Lanes that are not selected keep their previous contents.
    364 .if \n >= 4 && \hi == 0
    365        ld1             {\rd\().h}[0],  [\rs], \rt
    366        ld1             {\rd\().h}[1],  [\rs], \rt
    367        ld1             {\rd\().h}[2],  [\rs], \rt
    368        ld1             {\rd\().h}[3],  [\rs], \rt
    369 .endif
    370 .if \n == 8 || \hi == 1
    371        ld1             {\rd\().h}[4],  [\rs], \rt
    372        ld1             {\rd\().h}[5],  [\rs], \rt
    373        ld1             {\rd\().h}[6],  [\rs], \rt
    374        ld1             {\rd\().h}[7],  [\rs], \rt
    375 .endif
    376 .endm
    377 
    378 // slower than C
    379 /*
    380 function ff_pred16x16_128_dc_neon_10, export=1
    381        movi            v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1)
    382 
    383        b               .L_pred16x16_dc_10_end
    384 endfunc
    385 */
    386 
    387 function ff_pred16x16_top_dc_neon_10, export=1
               // 16x16 DC prediction on 16-bit samples from the row above only:
               // every output sample is (sum of the 16 samples above + 8) >> 4.
               //   x0: top-left sample of the block, x1: row step in bytes.
    388        sub             x2,  x0,  x1
    389 
    390        ld1             {v0.8h, v1.8h}, [x2]
    391 
    392        add             v0.8h, v0.8h, v1.8h
    393        addv            h0, v0.8h                    // h0 = sum of all 16 samples
    394 
    395        urshr           v0.4h,  v0.4h,  #4           // rounding shift: (sum + 8) >> 4
    396        dup             v0.8h, v0.h[0]
    397        b               .L_pred16x16_dc_10_end       // shared 16-row store loop
    398 endfunc
    399 
    400 // slower than C
    401 /*
    402 function ff_pred16x16_left_dc_neon_10, export=1
    403        sub             x2,  x0,  #2 // access to the "left" column
    404        ldcol.16        v0,  x2,  x1,  8
    405        ldcol.16        v1,  x2,  x1,  8 // load "left" column
    406 
    407        add             v0.8h, v0.8h, v1.8h
    408        addv            h0,  v0.8h
    409 
    410        urshr           v0.4h,  v0.4h,  #4
    411        dup             v0.8h, v0.h[0]
    412        b               .L_pred16x16_dc_10_end
    413 endfunc
    414 */
    415 
    416 function ff_pred16x16_dc_neon_10, export=1
               // 16x16 DC prediction on 16-bit samples from the row above and the left
               // column: every output sample is (sum of 16 top + 16 left + 16) >> 5.
               //   x0: top-left sample of the block, x1: row step in bytes.
    417        sub             x2,  x0,  x1 // access to the "top" row
    418        sub             x3,  x0,  #2 // access to the "left" column
    419 
    420        ld1             {v0.8h, v1.8h}, [x2]
    421        ldcol.16        v2,  x3,  x1,  8
    422        ldcol.16        v3,  x3,  x1,  8 // load pixels in "top" row and "left" col
    423 
    424        add             v0.8h, v0.8h, v1.8h
    425        add             v2.8h, v2.8h, v3.8h
    426        add             v0.8h, v0.8h, v2.8h
    427        addv            h0, v0.8h                    // h0 = sum of all 32 samples
    428 
    429        urshr           v0.4h,  v0.4h,  #5           // rounding shift: (sum + 16) >> 5
    430        dup             v0.8h,  v0.h[0]
    431 .L_pred16x16_dc_10_end:
               // Shared tail, also branched to by ff_pred16x16_top_dc_neon_10:
               // store v0 (duplicated into v1) to 16 rows of 32 bytes, two rows per iteration.
    432        mov             v1.16b,  v0.16b
    433        mov             w3,  #8
    434 6:      st1             {v0.8h, v1.8h}, [x0], x1
    435        subs            w3,  w3,  #1
    436        st1             {v0.8h, v1.8h}, [x0], x1
    437        b.ne            6b
    438        ret
    439 endfunc
    440 
    441 function ff_pred16x16_hor_neon_10, export=1
               // 16x16 horizontal prediction on 16-bit samples: each of the 16 rows is
               // filled with the sample immediately to its left.
               //   x0: top-left sample of the block, x1: row step in bytes.
    442        sub             x2,  x0,  #2                 // x2 -> sample left of row 0
    443        add             x3,  x0,  #16                // x3 -> second half of the row (samples 8..15)
    444 
    445        mov             w4,  #16                     // row counter
    446 1:      ld1r            {v0.8h},  [x2],  x1         // replicate the left sample, step to next row
    447        subs            w4,  w4,  #1
    448        st1             {v0.8h},  [x0],  x1
    449        st1             {v0.8h},  [x3],  x1
    450        b.ne            1b
    451        ret
    452 endfunc
    453 
    454 function ff_pred16x16_vert_neon_10, export=1
               // 16x16 vertical prediction on 16-bit samples: the 16 samples above the
               // block are copied into each of its 16 rows.
               //   x0: top-left sample of the block, x1: row step in bytes.
    455        sub             x2,  x0,  x1                 // x2 -> row above the block
    456        add             x1,  x1,  x1                 // step two rows at a time
    457 
    458        ld1             {v0.8h, v1.8h},  [x2],  x1   // load top row; x2 now -> row 1
    459 
    460        mov             w3,  #8                      // 8 iterations x 2 rows
    461 1:      subs            w3,  w3,  #1
    462        st1             {v0.8h, v1.8h},  [x0],  x1   // even rows
    463        st1             {v0.8h, v1.8h},  [x2],  x1   // odd rows
    464 
    465        b.ne            1b
    466        ret
    467 endfunc
    468 
    469 function ff_pred16x16_plane_neon_10, export=1
               // 16x16 plane prediction on 16-bit samples, result clamped to 0..1023.
               //   x0: top-left sample of the block, x1: row step in bytes.
               // Notation: top[i] = sample at x0 - x1 + 2*i, left[j] = sample at x0 + j*x1 - 2.
               //   H = sum over k=1..8 of k * (top[7+k]  - top[7-k])
               //   V = sum over k=1..8 of k * (left[7+k] - left[7-k])
               //   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
               //   out[y][x] = clamp((16*(top[15] + left[15] + 1) + b*(x-7) + c*(y-7)) >> 5, 0, 1023)
               // NOTE(review): H and V are reduced in 16-bit lanes (mul + addp) before
               // the sshll widening. If inputs span the full 10-bit range the sum can
               // reach 36 * 1023 = 36828, above the signed 16-bit maximum - confirm
               // whether such inputs can occur and whether the C reference matches.
    470        sub             x3,  x0,  x1                 // x3 -> top[0]
    471        movrel          x4,  p16weight
    472        add             x2,  x3,  #16                // x2 -> top[8]
    473        sub             x3,  x3,  #2                 // x3 -> top[-1]
    474        ld1             {v0.8h},  [x3]               // v0 = top[-1..6]
    475        ld1             {v2.8h},  [x2], x1           // v2 = top[8..15]
    476        ldcol.16        v1,  x3,  x1, 8              // v1 = left[-1..6]
    477        add             x3,  x3,  x1                 // skip left[7]
    478        ldcol.16        v3,  x3,  x1, 8              // v3 = left[8..15]
    479 
    480        rev64           v16.8h,  v0.8h
    481        rev64           v17.8h,  v1.8h
    482        ext             v0.16b, v16.16b, v16.16b, #8 // v0 = top[6..-1]  (full 8-lane reversal)
    483        ext             v1.16b, v17.16b, v17.16b, #8 // v1 = left[6..-1]
    484 
    485        add             v7.8h,  v2.8h,  v3.8h        // lane 7 = top[15] + left[15]
    486        sub             v2.8h,  v2.8h,  v0.8h        // top[7+k]  - top[7-k],  k = 1..8
    487        sub             v3.8h,  v3.8h,  v1.8h        // left[7+k] - left[7-k], k = 1..8
    488        ld1             {v0.8h},     [x4]            // v0 = weights 1..8
    489        mul             v2.8h,  v2.8h,  v0.8h
    490        mul             v3.8h,  v3.8h,  v0.8h
    491        addp            v2.8h,  v2.8h,  v3.8h
    492        addp            v2.8h,  v2.8h,  v2.8h
    493        addp            v2.4h,  v2.4h,  v2.4h        // v2.4h = H, V, H, V
    494        sshll           v3.4s,  v2.4h,  #2
    495        saddw           v2.4s,  v3.4s,  v2.4h        // 5*H, 5*V (32 bit)
    496        rshrn           v4.4h,  v2.4s,  #6           // v4.h[0] = b, v4.h[1] = c
    497        trn2            v5.4h,  v4.4h,  v4.4h        // v5.h[0] = c
    498        add             v2.4h,  v4.4h,  v5.4h        // lane 0 = b + c
    499        shl             v3.4h,  v2.4h,  #3
    500        ext             v7.16b, v7.16b, v7.16b, #14  // rotate: lane 0 = top[15] + left[15]
    501        sub             v3.4h,  v3.4h,  v2.4h   // 7 * (b + c)
    502        add             v7.4h,  v7.4h,  v0.4h        // lane 0 += 1 (first weight)
    503        shl             v2.4h,  v7.4h,  #4           // 16 * (top[15] + left[15] + 1)
    504        ssubl           v2.4s,  v2.4h,  v3.4h        // lane 0 = value for x = 0, y = 0 (32 bit)
    505        ext             v0.16b, v0.16b, v0.16b, #14  // weights rotated: 8,1,2,...,7
    506        sxtl            v6.4s,  v5.4h          // c
    507 
    508        mov             v0.h[0],  wzr                // v0 = 0,1,2,...,7
    509        mul             v0.8h,  v0.8h,  v4.h[0]      // v0 = x * b for x = 0..7
    510        dup             v16.4s, v2.s[0]
    511        dup             v17.4s, v2.s[0]
    512        dup             v2.8h,  v4.h[0]        // b
    513        dup             v3.4s,  v6.s[0]        // c
    514        sshll           v2.4s,  v2.4h,  #3     // b * 8
    515        saddw           v16.4s, v16.4s, v0.4h        // row 0, columns 0..3
    516        saddw2          v17.4s, v17.4s, v0.8h        // row 0, columns 4..7
    517        sub             v3.4s,  v3.4s,  v2.4s        // c - 8*b (back to column 0, next row)
    518 
    519        mov             w3,      #16
    520        mvni            v4.8h,   #0xFC, lsl #8 // 1023 for clipping
    521 1:
    522        sqshrun         v0.4h,  v16.4s, #5           // columns 0..7 -> v0
    523        sqshrun2        v0.8h,  v17.4s, #5
    524        add             v16.4s, v16.4s, v2.4s        // advance by 8 columns
    525        add             v17.4s, v17.4s, v2.4s
    526        sqshrun         v1.4h,  v16.4s, #5           // columns 8..15 -> v1
    527        sqshrun2        v1.8h,  v17.4s, #5
    528        add             v16.4s, v16.4s, v3.4s        // next row, columns 0..7
    529        add             v17.4s, v17.4s, v3.4s
    530 
    531        subs            w3,  w3,  #1
    532 
    533        smin            v0.8h,  v0.8h,  v4.8h        // upper clamp to 1023
    534        smin            v1.8h,  v1.8h,  v4.8h
    535 
    536        st1             {v0.8h, v1.8h}, [x0], x1
    537        b.ne            1b
    538        ret
    539 endfunc
    540 
    541 function ff_pred8x8_hor_neon_10, export=1
               // 8x8 horizontal prediction on 16-bit samples: each of the 8 rows is
               // filled with the sample immediately to its left.
               //   x0: top-left sample of the block, x1: row step in bytes.
    542        sub             x2,  x0,  #2                 // x2 -> sample left of row 0
    543        mov             w3,  #8                      // row counter
    544 
    545 1:      ld1r            {v0.8h},  [x2], x1          // replicate the left sample, step to next row
    546        subs            w3,  w3,  #1
    547        st1             {v0.8h},  [x0], x1
    548        b.ne            1b
    549        ret
    550 endfunc
    551 
    552 function ff_pred8x8_vert_neon_10, export=1
               // 8x8 vertical prediction on 16-bit samples: the 8 samples above the
               // block are copied into each of its 8 rows.
               //   x0: top-left sample of the block, x1: row step in bytes.
    553        sub             x2,  x0,  x1                 // x2 -> row above the block
    554        lsl             x1,  x1,  #1                 // step two rows at a time
    555 
    556        ld1             {v0.8h},  [x2], x1           // load top row; x2 now -> row 1
    557        mov             w3,  #4                      // 4 iterations x 2 rows
    558 1:      subs            w3,  w3,  #1
    559        st1             {v0.8h},  [x0], x1           // even rows
    560        st1             {v0.8h},  [x2], x1           // odd rows
    561        b.ne            1b
    562        ret
    563 endfunc
    564 
    565 function ff_pred8x8_plane_neon_10, export=1
               // 8x8 plane prediction on 16-bit samples, result clamped to 0..1023.
               //   x0: top-left sample of the block, x1: row step in bytes.
               // Notation: top[i] = sample at x0 - x1 + 2*i, left[j] = sample at x0 + j*x1 - 2.
               //   H = sum over k=1..4 of k * (top[3+k]  - top[3-k])
               //   V = sum over k=1..4 of k * (left[3+k] - left[3-k])
               //   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
               //   out[y][x] = clamp((16*(top[7] + left[7] + 1) + b*(x-3) + c*(y-3)) >> 5, 0, 1023)
    566        sub             x3,  x0,  x1                 // x3 -> top[0]
    567        movrel          x4,  p8weight
    568        movrel          x5,  p16weight
    569        add             x2,  x3,  #8                 // x2 -> top[4]
    570        sub             x3,  x3,  #2                 // x3 -> top[-1]
    571        ld1             {v0.d}[0],  [x3]             // v0.h[0..3] = top[-1..2]
    572        ld1             {v2.d}[0],  [x2], x1         // v2.h[0..3] = top[4..7]
    573        ldcol.16        v0,  x3,  x1,  hi=1          // v0.h[4..7] = left[-1..2]
    574        add             x3,  x3,  x1                 // skip left[3]
    575        ldcol.16        v3,  x3,  x1,  4             // v3.h[0..3] = left[4..7]
    576        add             v7.8h,  v2.8h,  v3.8h        // lane 3 = top[7] + left[7]
    577        rev64           v0.8h,  v0.8h                // top[2..-1], left[2..-1]
    578        trn1            v2.2d,  v2.2d,  v3.2d        // v2 = top[4..7], left[4..7]
    579        sub             v2.8h,  v2.8h,  v0.8h        // four H differences, four V differences
    580        ld1             {v6.8h},  [x4]               // weights 1,2,3,4,1,2,3,4
    581        mul             v2.8h,  v2.8h,  v6.8h
    582        ld1             {v0.8h},  [x5]               // v0 = 1..8
    583        saddlp          v2.4s,  v2.8h
    584        addp            v2.4s,  v2.4s,  v2.4s        // v2.4s = H, V, H, V
    585        shl             v3.4s,  v2.4s,  #4
    586        add             v2.4s,  v3.4s,  v2.4s        // 17*H, 17*V
    587        rshrn           v5.4h,  v2.4s,  #5           // v5.h[0] = b, v5.h[1] = c
    588        addp            v2.4h,  v5.4h,  v5.4h        // lane 0 = b + c
    589        shl             v3.4h,  v2.4h,  #1
    590        add             v3.4h,  v3.4h,  v2.4h        // 3 * (b + c)
    591        rev64           v7.4h,  v7.4h                // lane 0 = top[7] + left[7]
    592        add             v7.4h,  v7.4h,  v0.4h        // lane 0 += 1 (first weight)
    593        shl             v2.4h,  v7.4h,  #4           // 16 * (top[7] + left[7] + 1)
    594        ssubl           v2.4s,  v2.4h,  v3.4h        // lane 0 = value for x = 0, y = 0 (32 bit)
    595        ext             v0.16b, v0.16b, v0.16b, #14  // rotate: 8,1,2,...,7
    596        mov             v0.h[0],  wzr                // v0 = 0,1,2,...,7
    597        dup             v1.4s,  v2.s[0]
    598        dup             v2.4s,  v2.s[0]
    599        dup             v3.8h,  v5.h[1]              // v3 = c (per-row increment)
    600        smlal           v1.4s,  v0.4h,  v5.h[0]      // row 0, columns 0..3: += x * b
    601        smlal2          v2.4s,  v0.8h,  v5.h[0]      // row 0, columns 4..7: += x * b
    602        mov             w3,  #8
    603        mvni            v4.8h,  #0xFC,  lsl #8 // 1023 for clipping
    604 1:
    605        sqshrun         v0.4h,  v1.4s,  #5           // shift and saturate at 0 from below
    606        sqshrun2        v0.8h,  v2.4s,  #5
    607 
    608        saddw           v1.4s,  v1.4s,  v3.4h        // next row: += c
    609        saddw           v2.4s,  v2.4s,  v3.4h
    610 
    611        subs            w3,  w3,  #1
    612 
    613        smin            v0.8h,  v0.8h,  v4.8h        // upper clamp to 1023
    614 
    615        st1             {v0.8h},  [x0],  x1
    616        b.ne            1b
    617        ret
    618 endfunc
    619 
    620 function ff_pred8x8_128_dc_neon_10, export=1
               // 8x8 prediction on 16-bit samples with a constant: every output sample is 512.
               // v0 is the pattern for rows 0-3 and v1 for rows 4-7; both are written
               // by the shared store loop at .L_pred8x8_dc_10_end (uses x0 and x1).
    621        movi            v0.8h,  #2, lsl #8      // 512, 1 << (bit_depth - 1)
    622        movi            v1.8h,  #2, lsl #8
    623        b               .L_pred8x8_dc_10_end
    624 endfunc
    625 
    626 function ff_pred8x8_top_dc_neon_10, export=1
               // 8x8 DC prediction on 16-bit samples from the row above only. Columns 0-3
               // of every row get (top[0..3] sum + 2) >> 2, columns 4-7 get
               // (top[4..7] sum + 2) >> 2.
               //   x0: top-left sample of the block, x1: row step in bytes.
    627        sub             x2,  x0,  x1                 // x2 -> row above the block
    628        ld1             {v0.8h},  [x2]
    629 
    630        addp            v0.8h,  v0.8h,  v0.8h        // four pair sums
    631        addp            v0.4h,  v0.4h,  v0.4h        // s0, s1, s0, s1 (sums of 4 samples)
    632        zip1            v0.4h,  v0.4h,  v0.4h        // s0, s0, s1, s1
    633        urshr           v2.4h,  v0.4h,  #2           // d0, d0, d1, d1
    634        zip1            v0.8h,  v2.8h,  v2.8h        // d0 x4, d1 x4 (rows 0-3)
    635        zip1            v1.8h,  v2.8h,  v2.8h        // same pattern for rows 4-7
    636        b               .L_pred8x8_dc_10_end
    637 endfunc
    638 
    639 function ff_pred8x8_left_dc_neon_10, export=1
               // 8x8 DC prediction on 16-bit samples from the left column only. Rows 0-3
               // are filled with (left[0..3] sum + 2) >> 2, rows 4-7 with
               // (left[4..7] sum + 2) >> 2.
               //   x0: top-left sample of the block, x1: row step in bytes.
    640        sub             x2,  x0,  #2                 // x2 -> sample left of row 0
    641        ldcol.16        v0,  x2,  x1,  8             // v0 = left[0..7]
    642 
    643        addp            v0.8h,  v0.8h,  v0.8h        // four pair sums
    644        addp            v0.4h,  v0.4h,  v0.4h        // lane 0 = upper sum, lane 1 = lower sum
    645        urshr           v2.4h,  v0.4h,  #2
    646        dup             v1.8h,  v2.h[1]              // rows 4-7
    647        dup             v0.8h,  v2.h[0]              // rows 0-3
    648        b               .L_pred8x8_dc_10_end
    649 endfunc
    650 
    651 function ff_pred8x8_dc_neon_10, export=1
               // 8x8 DC prediction on 16-bit samples, computed per 4x4 quadrant.
               //   x0: top-left sample of the block, x1: row step in bytes.
               // With T0/T1 = sums of top[0..3]/top[4..7] and L0/L1 = sums of
               // left[0..3]/left[4..7]:
               //   top-left     = (T0 + L0 + 4) >> 3     top-right    = (T1 + 2) >> 2
               //   bottom-left  = (L1 + 2) >> 2          bottom-right = (T1 + L1 + 4) >> 3
    652        sub             x2,  x0,  x1                 // x2 -> row above the block
    653        sub             x3,  x0,  #2                 // x3 -> sample left of row 0
    654 
    655        ld1             {v0.8h}, [x2]
    656        ldcol.16        v1,  x3,  x1, 8
    657 
    658        addp            v0.8h,  v0.8h, v0.8h         // pair sums of the top row
    659        addp            v1.8h,  v1.8h, v1.8h         // pair sums of the left column
    660        trn1            v2.2s,  v0.2s,  v1.2s
    661        trn2            v3.2s,  v0.2s,  v1.2s
    662        addp            v4.4h,  v2.4h,  v3.4h        // v4 = T0, L0, T1, L1
    663        addp            v5.4h,  v4.4h,  v4.4h        // v5 = T0+L0, T1+L1, ...
    664        urshr           v6.4h,  v5.4h,  #3
    665        urshr           v7.4h,  v4.4h,  #2
    666        dup             v0.8h,  v6.h[0]              // top-left quadrant
    667        dup             v2.8h,  v7.h[2]              // top-right quadrant
    668        dup             v1.8h,  v7.h[3]              // bottom-left quadrant
    669        dup             v3.8h,  v6.h[1]              // bottom-right quadrant
    670        zip1            v0.2d,  v0.2d,  v2.2d        // v0 = row pattern for rows 0-3
    671        zip1            v1.2d,  v1.2d,  v3.2d        // v1 = row pattern for rows 4-7
    672 .L_pred8x8_dc_10_end:
               // Shared tail, also branched to by the other 8x8 16-bit DC variants:
               // write v0 to rows 0-3 and v1 to rows 4-7.
    673        mov             w3,  #4
    674        add             x2,  x0,  x1,  lsl #2        // x2 -> row 4
    675 
    676 6:      st1             {v0.8h},  [x0], x1
    677        subs            w3,  w3,  #1
    678        st1             {v1.8h},  [x2], x1
    679        b.ne            6b
    680        ret
    681 endfunc
    682 
    683 function ff_pred8x8_l0t_dc_neon_10, export=1
               // 8x8 DC variant on 16-bit samples that reads the row above and only left[0..3].
               //   x0: top-left sample of the block, x1: row step in bytes.
               // With T0/T1 = sums of top[0..3]/top[4..7], L0 = sum of left[0..3]:
               //   top-left    = (T0 + L0 + 4) >> 3      top-right    = (T1 + 2) >> 2
               //   bottom-left = (T0 + 2) >> 2           bottom-right = (T1 + 2) >> 2
    684        sub             x2,  x0,  x1                 // x2 -> row above the block
    685        sub             x3,  x0,  #2                 // x3 -> sample left of row 0
    686 
    687        ld1             {v0.8h},  [x2]
    688        ldcol.16        v1,  x3,  x1, 4              // v1.h[0..3] = left[0..3]
    689 
    690        addp            v0.8h,  v0.8h,  v0.8h
    691        addp            v1.4h,  v1.4h,  v1.4h
    692        addp            v0.4h,  v0.4h,  v0.4h        // v0.4h = T0, T1, T0, T1
    693        addp            v1.4h,  v1.4h,  v1.4h        // every lane = L0
    694        add             v1.4h,  v1.4h,  v0.4h        // lane 0 = T0 + L0
    695 
    696        urshr           v2.4h,  v0.4h,  #2
    697        urshr           v3.4h,  v1.4h,  #3      // the pred4x4 part
    698 
    699        dup             v4.4h,  v3.h[0]              // (T0 + L0 + 4) >> 3
    700        dup             v5.4h,  v2.h[0]              // (T0 + 2) >> 2
    701        dup             v6.4h,  v2.h[1]              // (T1 + 2) >> 2
    702 
    703        zip1            v0.2d,  v4.2d,  v6.2d        // rows 0-3
    704        zip1            v1.2d,  v5.2d,  v6.2d        // rows 4-7
    705        b               .L_pred8x8_dc_10_end
    706 endfunc
    707 
    708 function ff_pred8x8_l00_dc_neon_10, export=1
               // 8x8 DC variant on 16-bit samples that reads only left[0..3]:
               // rows 0-3 are filled with (left[0..3] sum + 2) >> 2, rows 4-7 with 512.
               //   x0: top-left sample of the block, x1: row step in bytes.
    709        sub             x2,  x0,  #2                 // x2 -> sample left of row 0
    710 
    711        ldcol.16        v0,  x2,  x1,  4             // v0.h[0..3] = left[0..3]
    712 
    713        addp            v0.4h,  v0.4h,  v0.4h
    714        addp            v0.4h,  v0.4h,  v0.4h        // every lane = sum of the four samples
    715        urshr           v0.4h,  v0.4h,  #2
    716 
    717        movi            v1.8h,  #2, lsl #8      // 512
    718        dup             v0.8h,  v0.h[0]              // rows 0-3
    719        b               .L_pred8x8_dc_10_end
    720 endfunc
    721 
    722 function ff_pred8x8_0lt_dc_neon_10, export=1
               // 8x8 DC variant on 16-bit samples that reads the row above and only left[4..7].
               //   x0: top-left sample of the block, x1: row step in bytes.
               // With T0/T1 = sums of top[0..3]/top[4..7], L1 = sum of left[4..7]:
               //   top-left    = (T0 + 2) >> 2           top-right    = (T1 + 2) >> 2
               //   bottom-left = (L1 + 2) >> 2           bottom-right = (T1 + L1 + 4) >> 3
               // Lanes 0-3 of v1 are not loaded and hold stale data; the results
               // derived from them are never selected by the dup instructions below.
    723        add             x3,  x0,  x1,  lsl #2        // x3 -> row 4
    724        sub             x2,  x0,  x1                 // x2 -> row above the block
    725        sub             x3,  x3,  #2                 // x3 -> sample left of row 4
    726 
    727        ld1             {v0.8h},  [x2]
    728        ldcol.16        v1,  x3,  x1,  hi=1          // v1.h[4..7] = left[4..7]
    729 
    730        addp            v0.8h,  v0.8h,  v0.8h
    731        addp            v1.8h,  v1.8h,  v1.8h
    732        addp            v0.4h,  v0.4h,  v0.4h        // v0.4h = T0, T1, T0, T1
    733        addp            v1.4h,  v1.4h,  v1.4h        // v1.4h = (stale), L1, (stale), L1
    734        zip1            v0.2s,  v0.2s,  v1.2s        // v0.4h = T0, T1, (stale), L1
    735        add             v1.4h,  v0.4h,  v1.4h        // lane 1 = T1 + L1
    736 
    737        urshr           v2.4h,  v0.4h,  #2
    738        urshr           v3.4h,  v1.4h,  #3
    739 
    740        dup             v4.4h,  v2.h[0]              // (T0 + 2) >> 2
    741        dup             v5.4h,  v2.h[3]              // (L1 + 2) >> 2
    742        dup             v6.4h,  v2.h[1]              // (T1 + 2) >> 2
    743        dup             v7.4h,  v3.h[1]              // (T1 + L1 + 4) >> 3
    744 
    745        zip1            v0.2d,  v4.2d,  v6.2d        // rows 0-3
    746        zip1            v1.2d,  v5.2d,  v7.2d        // rows 4-7
    747        b               .L_pred8x8_dc_10_end
    748 endfunc
    749 
    750 function ff_pred8x8_0l0_dc_neon_10, export=1
               // 8x8 DC variant on 16-bit samples that reads only left[4..7]:
               // rows 0-3 are filled with 512, rows 4-7 with (left[4..7] sum + 2) >> 2.
               //   x0: top-left sample of the block, x1: row step in bytes.
    751        add             x2,  x0,  x1,  lsl #2        // x2 -> row 4
    752        sub             x2,  x2,  #2                 // x2 -> sample left of row 4
    753 
    754        ldcol.16        v1,  x2,  x1,  4             // v1.h[0..3] = left[4..7]
    755 
    756        addp            v2.8h,  v1.8h,  v1.8h
    757        addp            v2.4h,  v2.4h,  v2.4h        // lane 0 = sum of the four samples
    758        urshr           v1.4h,  v2.4h,  #2
    759 
    760        movi            v0.8h,  #2,  lsl #8     // 512
    761        dup             v1.8h,  v1.h[0]              // rows 4-7
    762        b               .L_pred8x8_dc_10_end
    763 endfunc
    763 endfunc