tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

vp8dsp_neon.S (67146B)


      1 /*
      2 * VP8 NEON optimisations
      3 *
      4 * Copyright (c) 2010 Rob Clark <rob@ti.com>
      5 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
      6 * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
      7 * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
      8 *
      9 * This file is part of FFmpeg.
     10 *
     11 * FFmpeg is free software; you can redistribute it and/or
     12 * modify it under the terms of the GNU Lesser General Public
     13 * License as published by the Free Software Foundation; either
     14 * version 2.1 of the License, or (at your option) any later version.
     15 *
     16 * FFmpeg is distributed in the hope that it will be useful,
     17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     19 * Lesser General Public License for more details.
     20 *
     21 * You should have received a copy of the GNU Lesser General Public
     22 * License along with FFmpeg; if not, write to the Free Software
     23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     24 */
     25 
     26 #include "libavutil/aarch64/asm.S"
     27 #include "neon.S"
     28 
     29 function ff_vp8_luma_dc_wht_neon, export=1
     30        ld1             {v0.4h - v3.4h}, [x1]
     31        movi            v30.8h, #0
     32 
     33        add             v4.4h,  v0.4h,  v3.4h
     34        add             v6.4h,  v1.4h,  v2.4h
     35        st1             {v30.8h}, [x1], #16
     36        sub             v7.4h,  v1.4h,  v2.4h
     37        sub             v5.4h,  v0.4h,  v3.4h
     38        st1             {v30.8h}, [x1]
     39        add             v0.4h,  v4.4h,  v6.4h
     40        add             v1.4h,  v5.4h,  v7.4h
     41        sub             v2.4h,  v4.4h,  v6.4h
     42        sub             v3.4h,  v5.4h,  v7.4h
     43 
     44        movi            v16.4h, #3
     45 
     46        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7
     47 
     48        add             v0.4h,  v0.4h,  v16.4h
     49 
     50        add             v4.4h,  v0.4h,  v3.4h
     51        add             v6.4h,  v1.4h,  v2.4h
     52        sub             v7.4h,  v1.4h,  v2.4h
     53        sub             v5.4h,  v0.4h,  v3.4h
     54        add             v0.4h,  v4.4h,  v6.4h
     55        add             v1.4h,  v5.4h,  v7.4h
     56        sub             v2.4h,  v4.4h,  v6.4h
     57        sub             v3.4h,  v5.4h,  v7.4h
     58 
     59        sshr            v0.4h,  v0.4h,  #3
     60        sshr            v1.4h,  v1.4h,  #3
     61        sshr            v2.4h,  v2.4h,  #3
     62        sshr            v3.4h,  v3.4h,  #3
     63 
     64        mov             x3,  #32
     65        st1             {v0.h}[0],  [x0], x3
     66        st1             {v1.h}[0],  [x0], x3
     67        st1             {v2.h}[0],  [x0], x3
     68        st1             {v3.h}[0],  [x0], x3
     69        st1             {v0.h}[1],  [x0], x3
     70        st1             {v1.h}[1],  [x0], x3
     71        st1             {v2.h}[1],  [x0], x3
     72        st1             {v3.h}[1],  [x0], x3
     73        st1             {v0.h}[2],  [x0], x3
     74        st1             {v1.h}[2],  [x0], x3
     75        st1             {v2.h}[2],  [x0], x3
     76        st1             {v3.h}[2],  [x0], x3
     77        st1             {v0.h}[3],  [x0], x3
     78        st1             {v1.h}[3],  [x0], x3
     79        st1             {v2.h}[3],  [x0], x3
     80        st1             {v3.h}[3],  [x0], x3
     81 
     82        ret
     83 endfunc
     84 
     85 function ff_vp8_idct_add_neon, export=1
     86        ld1             {v0.8b - v3.8b},  [x1]
     87        mov             w4,  #20091
     88        movk            w4,  #35468/2, lsl #16
     89        dup             v4.2s, w4
     90 
     91        smull           v26.4s, v1.4h,  v4.h[0]
     92        smull           v27.4s, v3.4h,  v4.h[0]
     93        sqdmulh         v20.4h, v1.4h,  v4.h[1]
     94        sqdmulh         v23.4h, v3.4h,  v4.h[1]
     95        shrn            v21.4h, v26.4s, #16
     96        shrn            v22.4h, v27.4s, #16
     97        add             v21.4h, v21.4h, v1.4h
     98        add             v22.4h, v22.4h, v3.4h
     99 
    100        add             v16.4h,  v0.4h,   v2.4h
    101        sub             v17.4h,  v0.4h,   v2.4h
    102 
    103        add             v18.4h,  v21.4h,  v23.4h
    104        sub             v19.4h,  v20.4h,  v22.4h
    105 
    106        add             v0.4h,   v16.4h,  v18.4h
    107        add             v1.4h,   v17.4h,  v19.4h
    108        sub             v3.4h,   v16.4h,  v18.4h
    109        sub             v2.4h,   v17.4h,  v19.4h
    110 
    111        transpose_4x4H  v0, v1, v2, v3, v24, v5, v6, v7
    112 
    113        movi            v29.8h, #0
    114        smull           v26.4s,     v1.4h,  v4.h[0]
    115        st1             {v29.8h},   [x1],   #16
    116        smull           v27.4s,     v3.4h,  v4.h[0]
    117        st1             {v29.16b},  [x1]
    118        sqdmulh         v21.4h,     v1.4h,  v4.h[1]
    119        sqdmulh         v23.4h,     v3.4h,  v4.h[1]
    120        shrn            v20.4h,     v26.4s, #16
    121        shrn            v22.4h,     v27.4s, #16
    122        add             v20.4h,     v20.4h, v1.4h
    123        add             v22.4h,     v22.4h, v3.4h
    124        add             v16.4h,     v0.4h,  v2.4h
    125        sub             v17.4h,     v0.4h,  v2.4h
    126 
    127        add             v18.4h,     v20.4h, v23.4h
    128        ld1             {v24.s}[0], [x0],   x2
    129        sub             v19.4h, v21.4h, v22.4h
    130        ld1             {v25.s}[0], [x0],   x2
    131        add             v0.4h,      v16.4h, v18.4h
    132        add             v1.4h,      v17.4h, v19.4h
    133        ld1             {v26.s}[0], [x0],   x2
    134        sub             v3.4h,      v16.4h, v18.4h
    135        sub             v2.4h,      v17.4h, v19.4h
    136        ld1             {v27.s}[0], [x0],   x2
    137        srshr           v0.4h,      v0.4h,  #3
    138        srshr           v1.4h,      v1.4h,  #3
    139        srshr           v2.4h,      v2.4h,  #3
    140        srshr           v3.4h,      v3.4h,  #3
    141 
    142        sub             x0,  x0,  x2,  lsl #2
    143 
    144        transpose_4x4H  v0, v1, v2, v3, v5, v6, v7, v16
    145 
    146        uaddw           v0.8h,  v0.8h, v24.8b
    147        uaddw           v1.8h,  v1.8h, v25.8b
    148        uaddw           v2.8h,  v2.8h, v26.8b
    149        uaddw           v3.8h,  v3.8h, v27.8b
    150        sqxtun          v0.8b,  v0.8h
    151        sqxtun          v1.8b,  v1.8h
    152        sqxtun          v2.8b,  v2.8h
    153        sqxtun          v3.8b,  v3.8h
    154 
    155        st1             {v0.s}[0],  [x0], x2
    156        st1             {v1.s}[0],  [x0], x2
    157        st1             {v2.s}[0],  [x0], x2
    158        st1             {v3.s}[0],  [x0], x2
    159 
    160        ret
    161 endfunc
    162 
    163 function ff_vp8_idct_dc_add4uv_neon, export=1
    164        movi            v0.4h,  #0
    165        mov             x3,     #32
    166        ld1r            {v16.4h},  [x1]
    167        st1             {v0.h}[0], [x1], x3
    168        ld1r            {v17.4h},  [x1]
    169        st1             {v0.h}[0], [x1], x3
    170        ld1r            {v18.4h},  [x1]
    171        st1             {v0.h}[0], [x1], x3
    172        ld1r            {v19.4h},  [x1]
    173        st1             {v0.h}[0], [x1], x3
    174        ins             v16.d[1],  v17.d[0]
    175        ins             v18.d[1],  v19.d[0]
    176        mov             x3,  x0
    177        srshr           v16.8h,    v16.8h,  #3            // dc >>= 3
    178        ld1             {v0.8b},   [x0], x2
    179        srshr           v18.8h,    v18.8h,  #3
    180        ld1             {v1.8b},   [x0], x2
    181        uaddw           v20.8h,    v16.8h, v0.8b
    182        ld1             {v2.8b},   [x0], x2
    183        uaddw           v0.8h,     v16.8h, v1.8b
    184        ld1             {v3.8b},   [x0], x2
    185        uaddw           v22.8h,    v16.8h, v2.8b
    186        ld1             {v4.8b},   [x0], x2
    187        uaddw           v2.8h,     v16.8h, v3.8b
    188        ld1             {v5.8b},   [x0], x2
    189        uaddw           v24.8h,    v18.8h, v4.8b
    190        ld1             {v6.8b},   [x0], x2
    191        uaddw           v4.8h,     v18.8h, v5.8b
    192        ld1             {v7.8b},   [x0], x2
    193        uaddw           v26.8h,    v18.8h, v6.8b
    194        sqxtun          v20.8b,    v20.8h
    195        uaddw           v6.8h,     v18.8h, v7.8b
    196        sqxtun          v21.8b,    v0.8h
    197        sqxtun          v22.8b,    v22.8h
    198        st1             {v20.8b},  [x3], x2
    199        sqxtun          v23.8b,    v2.8h
    200        st1             {v21.8b},  [x3], x2
    201        sqxtun          v24.8b,    v24.8h
    202        st1             {v22.8b},  [x3], x2
    203        sqxtun          v25.8b,    v4.8h
    204        st1             {v23.8b},  [x3], x2
    205        sqxtun          v26.8b,    v26.8h
    206        st1             {v24.8b},  [x3], x2
    207        sqxtun          v27.8b,    v6.8h
    208        st1             {v25.8b},  [x3], x2
    209        st1             {v26.8b},  [x3], x2
    210        st1             {v27.8b},  [x3], x2
    211 
    212        ret
    213 endfunc
    214 
    215 function ff_vp8_idct_dc_add4y_neon, export=1
    216        movi            v0.16b,  #0
    217        mov             x3,  #32
    218        ld1r            {v16.4h},    [x1]
    219        st1             {v0.h}[0],   [x1], x3
    220        ld1r            {v17.4h},    [x1]
    221        st1             {v0.h}[0],   [x1], x3
    222        zip1            v16.2d,      v16.2d, v17.2d
    223        ld1r            {v18.4h},    [x1]
    224        st1             {v0.h}[0],   [x1], x3
    225        ld1r            {v19.4h},    [x1]
    226        st1             {v0.h}[0],   [x1], x3
    227        zip1            v18.2d,      v18.2d, v19.2d
    228        srshr           v16.8h,      v16.8h,  #3            // dc >>= 3
    229        ld1             {v0.16b},     [x0], x2
    230        srshr           v18.8h,       v18.8h,  #3
    231        ld1             {v1.16b},     [x0], x2
    232        uaddw           v20.8h,       v16.8h,  v0.8b
    233        ld1             {v2.16b},     [x0], x2
    234        uaddw2          v0.8h,        v18.8h,   v0.16b
    235        ld1             {v3.16b},     [x0], x2
    236        uaddw           v21.8h, v16.8h,  v1.8b
    237        uaddw2          v1.8h,  v18.8h,  v1.16b
    238        uaddw           v22.8h, v16.8h,  v2.8b
    239        uaddw2          v2.8h,  v18.8h,  v2.16b
    240        uaddw           v23.8h, v16.8h,  v3.8b
    241        uaddw2          v3.8h,  v18.8h,  v3.16b
    242        sub             x0,  x0,  x2,  lsl #2
    243        sqxtun          v20.8b,  v20.8h
    244        sqxtun2         v20.16b, v0.8h
    245        sqxtun          v21.8b,  v21.8h
    246        sqxtun2         v21.16b, v1.8h
    247        sqxtun          v22.8b,  v22.8h
    248        st1             {v20.16b},    [x0], x2
    249        sqxtun2         v22.16b, v2.8h
    250        st1             {v21.16b},    [x0], x2
    251        sqxtun          v23.8b,  v23.8h
    252        st1             {v22.16b},    [x0], x2
    253        sqxtun2         v23.16b, v3.8h
    254        st1             {v23.16b},    [x0], x2
    255 
    256        ret
    257 endfunc
    258 
    259 function ff_vp8_idct_dc_add_neon, export=1
    260        mov             w3,       #0
    261        ld1r            {v2.8h},  [x1]
    262        strh            w3,       [x1]
    263        srshr           v2.8h,  v2.8h,  #3
    264        ld1             {v0.s}[0],  [x0], x2
    265        ld1             {v0.s}[1],  [x0], x2
    266        uaddw           v3.8h,  v2.8h,  v0.8b
    267        ld1             {v1.s}[0],  [x0], x2
    268        ld1             {v1.s}[1],  [x0], x2
    269        uaddw           v4.8h,  v2.8h,  v1.8b
    270        sqxtun          v0.8b,  v3.8h
    271        sqxtun          v1.8b,  v4.8h
    272        sub             x0,  x0,  x2, lsl #2
    273        st1             {v0.s}[0],  [x0], x2
    274        st1             {v0.s}[1],  [x0], x2
    275        st1             {v1.s}[0],  [x0], x2
    276        st1             {v1.s}[1],  [x0], x2
    277        ret
    278 endfunc
    279 
    280 // Register layout:
    281 //   P3..Q3 -> v0..v7
    282 //   flim_E -> v22
    283 //   flim_I -> v23
    284 //   hev_thresh -> x5
    285 //
    286 .macro  vp8_loop_filter, inner=0, simple=0, hev_thresh
    287    .if \simple
    288        uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
    289        uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)
    290        uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
    291        ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
    292        uqadd           v19.16b, v17.16b,  v18.16b    // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
    293        movi            v21.16b, #0x80
    294        cmhs            v16.16b, v22.16b, v19.16b    // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    295    .else
    296        // calculate hev and normal_limit:
    297        uabd            v20.16b, v2.16b,  v3.16b      // abs(P1-P0)
    298        uabd            v21.16b, v5.16b,  v4.16b      // abs(Q1-Q0)
    299        uabd            v18.16b, v0.16b,  v1.16b      // abs(P3-P2)
    300        uabd            v19.16b, v1.16b,  v2.16b      // abs(P2-P1)
    301        cmhs            v16.16b, v23.16b, v20.16b     // abs(P1-P0) <= flim_I
    302        cmhs            v17.16b, v23.16b, v21.16b     // abs(Q1-Q0) <= flim_I
    303        cmhs            v18.16b, v23.16b, v18.16b     // abs(P3-P2) <= flim_I
    304        cmhs            v19.16b, v23.16b, v19.16b     // abs(P2-P1) <= flim_I
    305        and             v16.16b, v17.16b, v16.16b
    306        uabd            v17.16b, v7.16b,  v6.16b      // abs(Q3-Q2)
    307        and             v16.16b, v16.16b, v19.16b
    308        uabd            v19.16b, v6.16b,  v5.16b      // abs(Q2-Q1)
    309        and             v16.16b, v16.16b, v18.16b
    310        cmhs            v18.16b, v23.16b, v17.16b     // abs(Q3-Q2) <= flim_I
    311        cmhs            v19.16b, v23.16b, v19.16b     // abs(Q2-Q1) <= flim_I
    312        uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
    313        uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)
    314        and             v16.16b, v16.16b, v18.16b
    315        uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
    316        and             v16.16b, v16.16b, v19.16b
    317        ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
    318        dup             v23.16b, \hev_thresh          // hev_thresh
    319        uqadd           v19.16b, v17.16b, v18.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
    320        cmhi            v20.16b, v20.16b, v23.16b     // abs(P1-P0) > hev_thresh
    321        cmhs            v19.16b, v22.16b, v19.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
    322        cmhi            v22.16b, v21.16b, v23.16b     // abs(Q1-Q0) > hev_thresh
    323        and             v16.16b, v16.16b, v19.16b
    324        movi            v21.16b, #0x80
    325        orr             v17.16b, v20.16b, v22.16b
    326    .endif
    327 
    328        // at this point:
    329        //   v16: normal_limit
    330        //   v17: hev
    331 
    332        // convert to signed value:
    333        eor             v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
    334        eor             v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80
    335 
    336        movi            v20.8h, #3
    337        ssubl           v18.8h, v4.8b,  v3.8b             // QS0 - PS0
    338        ssubl2          v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
    339        eor             v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
    340        eor             v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
    341        mul             v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
    342        mul             v19.8h, v19.8h, v20.8h
    343 
    344        sqsub           v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
    345        movi            v22.16b, #4
    346        movi            v23.16b, #3
    347    .if \inner
    348        and             v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
    349    .endif
    350        saddw           v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
    351        saddw2          v19.8h,  v19.8h, v20.16b
    352        sqxtn           v18.8b,  v18.8h                   // narrow result back into v18
    353        sqxtn2          v18.16b, v19.8h
    354    .if !\inner && !\simple
    355        eor             v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
    356        eor             v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
    357    .endif
    358        and             v18.16b, v18.16b, v16.16b         // w &= normal_limit
    359 
    360        // registers used at this point..
    361        //   v0 -> P3  (don't corrupt)
    362        //   v1-v6 -> PS2-QS2
    363        //   v7 -> Q3  (don't corrupt)
    364        //   v17 -> hev
    365        //   v18 -> w
    366        //   v21 -> #0x80
    367        //   v22 -> #4
    368        //   v23 -> #3
    369        //   v16, v19, v29 -> unused
    370        //
    371        // filter_common:   is4tap==1
    372        //   c1 = clamp(w + 4) >> 3;
    373        //   c2 = clamp(w + 3) >> 3;
    374        //   Q0 = s2u(QS0 - c1);
    375        //   P0 = s2u(PS0 + c2);
    376 
    377    .if \simple
    378        sqadd           v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
    379        sqadd           v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
    380        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
    381        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
    382        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
    383        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
    384        eor             v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
    385        eor             v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
    386        eor             v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
    387        eor             v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    388    .elseif \inner
    389        // the !is4tap case of filter_common, only used for inner blocks
    390        //   c3 = ((c1&~hev) + 1) >> 1;
    391        //   Q1 = s2u(QS1 - c3);
    392        //   P1 = s2u(PS1 + c3);
    393        sqadd           v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
    394        sqadd           v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
    395        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
    396        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
    397        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
    398        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
    399        bic             v19.16b, v19.16b, v17.16b           // c1 & ~hev
    400        eor             v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
    401        srshr           v19.16b, v19.16b, #1                // c3 >>= 1
    402        eor             v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
    403        sqsub           v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
    404        sqadd           v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
    405        eor             v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
    406        eor             v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    407    .else
    408        and             v20.16b, v18.16b, v17.16b           // w & hev
    409        sqadd           v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
    410        sqadd           v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
    411        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
    412        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
    413        bic             v18.16b, v18.16b, v17.16b           // w &= ~hev
    414        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
    415        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
    416 
    417        // filter_mbedge:
    418        //   a = clamp((27*w + 63) >> 7);
    419        //   Q0 = s2u(QS0 - a);
    420        //   P0 = s2u(PS0 + a);
    421        //   a = clamp((18*w + 63) >> 7);
    422        //   Q1 = s2u(QS1 - a);
    423        //   P1 = s2u(PS1 + a);
    424        //   a = clamp((9*w + 63) >> 7);
    425        //   Q2 = s2u(QS2 - a);
    426        //   P2 = s2u(PS2 + a);
    427        movi            v17.8h,  #63
    428        sshll           v22.8h,  v18.8b, #3
    429        sshll2          v23.8h,  v18.16b, #3
    430        saddw           v22.8h,  v22.8h, v18.8b
    431        saddw2          v23.8h,  v23.8h, v18.16b
    432        add             v16.8h,  v17.8h, v22.8h
    433        add             v17.8h,  v17.8h, v23.8h           //  9*w + 63
    434        add             v19.8h,  v16.8h, v22.8h
    435        add             v20.8h,  v17.8h, v23.8h           // 18*w + 63
    436        add             v22.8h,  v19.8h, v22.8h
    437        add             v23.8h,  v20.8h, v23.8h           // 27*w + 63
    438        sqshrn          v16.8b,  v16.8h,  #7
    439        sqshrn2         v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
    440        sqshrn          v19.8b,  v19.8h, #7
    441        sqshrn2         v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
    442        sqshrn          v22.8b,  v22.8h, #7
    443        sqshrn2         v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
    444        sqadd           v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
    445        sqsub           v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
    446        sqadd           v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
    447        sqsub           v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
    448        sqadd           v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
    449        sqsub           v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
    450        eor             v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
    451        eor             v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
    452        eor             v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
    453        eor             v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
    454        eor             v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
    455        eor             v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
    456    .endif
    457 .endm
    458 
    459 .macro  vp8_v_loop_filter16 name, inner=0, simple=0
    460 function ff_vp8_v_loop_filter16\name\()_neon, export=1
    461        sub             x0,  x0,  x1,  lsl #1+!\simple
    462 
    463        // Load pixels:
    464    .if !\simple
    465        ld1             {v0.16b},     [x0], x1 // P3
    466        ld1             {v1.16b},     [x0], x1 // P2
    467    .endif
    468        ld1             {v2.16b},     [x0], x1 // P1
    469        ld1             {v3.16b},     [x0], x1 // P0
    470        ld1             {v4.16b},     [x0], x1 // Q0
    471        ld1             {v5.16b},     [x0], x1 // Q1
    472    .if !\simple
    473        ld1             {v6.16b},     [x0], x1 // Q2
    474        ld1             {v7.16b},     [x0]     // Q3
    475        dup             v23.16b, w3                 // flim_I
    476    .endif
    477        dup             v22.16b, w2                 // flim_E
    478 
    479        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4
    480 
    481        // back up to P2:  dst -= stride * 6
    482        sub             x0,  x0,  x1,  lsl #2
    483    .if !\simple
    484        sub             x0,  x0,  x1,  lsl #1
    485 
    486        // Store pixels:
    487        st1             {v1.16b},     [x0], x1 // P2
    488    .endif
    489        st1             {v2.16b},     [x0], x1 // P1
    490        st1             {v3.16b},     [x0], x1 // P0
    491        st1             {v4.16b},     [x0], x1 // Q0
    492        st1             {v5.16b},     [x0], x1 // Q1
    493    .if !\simple
    494        st1             {v6.16b},     [x0]     // Q2
    495    .endif
    496 
    497        ret
    498 endfunc
    499 .endm
    500 
    501 vp8_v_loop_filter16
    502 vp8_v_loop_filter16 _inner,  inner=1
    503 vp8_v_loop_filter16 _simple, simple=1
    504 
    505 .macro  vp8_v_loop_filter8uv name, inner=0
    506 function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
    507        sub             x0,  x0,  x2,  lsl #2
    508        sub             x1,  x1,  x2,  lsl #2
    509        // Load pixels:
    510        ld1             {v0.d}[0],     [x0], x2  // P3
    511        ld1             {v0.d}[1],     [x1], x2  // P3
    512        ld1             {v1.d}[0],     [x0], x2  // P2
    513        ld1             {v1.d}[1],     [x1], x2  // P2
    514        ld1             {v2.d}[0],     [x0], x2  // P1
    515        ld1             {v2.d}[1],     [x1], x2  // P1
    516        ld1             {v3.d}[0],     [x0], x2  // P0
    517        ld1             {v3.d}[1],     [x1], x2  // P0
    518        ld1             {v4.d}[0],     [x0], x2  // Q0
    519        ld1             {v4.d}[1],     [x1], x2  // Q0
    520        ld1             {v5.d}[0],     [x0], x2  // Q1
    521        ld1             {v5.d}[1],     [x1], x2  // Q1
    522        ld1             {v6.d}[0],     [x0], x2  // Q2
    523        ld1             {v6.d}[1],     [x1], x2  // Q2
    524        ld1             {v7.d}[0],     [x0]      // Q3
    525        ld1             {v7.d}[1],     [x1]      // Q3
    526 
    527        dup             v22.16b, w3                 // flim_E
    528        dup             v23.16b, w4                 // flim_I
    529 
    530        vp8_loop_filter inner=\inner, hev_thresh=w5
    531 
    532        // back up to P2:  u,v -= stride * 6
    533        sub             x0,  x0,  x2,  lsl #2
    534        sub             x1,  x1,  x2,  lsl #2
    535        sub             x0,  x0,  x2,  lsl #1
    536        sub             x1,  x1,  x2,  lsl #1
    537 
    538        // Store pixels:
    539 
    540        st1             {v1.d}[0],     [x0], x2  // P2
    541        st1             {v1.d}[1],     [x1], x2  // P2
    542        st1             {v2.d}[0],     [x0], x2  // P1
    543        st1             {v2.d}[1],     [x1], x2  // P1
    544        st1             {v3.d}[0],     [x0], x2  // P0
    545        st1             {v3.d}[1],     [x1], x2  // P0
    546        st1             {v4.d}[0],     [x0], x2  // Q0
    547        st1             {v4.d}[1],     [x1], x2  // Q0
    548        st1             {v5.d}[0],     [x0], x2  // Q1
    549        st1             {v5.d}[1],     [x1], x2  // Q1
    550        st1             {v6.d}[0],     [x0]      // Q2
    551        st1             {v6.d}[1],     [x1]      // Q2
    552 
    553        ret
    554 endfunc
    555 .endm
    556 
    557 vp8_v_loop_filter8uv
    558 vp8_v_loop_filter8uv _inner, inner=1
    559 
    560 .macro  vp8_h_loop_filter16 name, inner=0, simple=0
    561 function ff_vp8_h_loop_filter16\name\()_neon, export=1
    562 
    563        sub             x0,  x0,  #4
    564        // Load pixels:
    565        ld1             {v0.d}[0], [x0], x1
    566        ld1             {v1.d}[0], [x0], x1
    567        ld1             {v2.d}[0], [x0], x1
    568        ld1             {v3.d}[0], [x0], x1
    569        ld1             {v4.d}[0], [x0], x1
    570        ld1             {v5.d}[0], [x0], x1
    571        ld1             {v6.d}[0], [x0], x1
    572        ld1             {v7.d}[0], [x0], x1
    573        ld1             {v0.d}[1], [x0], x1
    574        ld1             {v1.d}[1], [x0], x1
    575        ld1             {v2.d}[1], [x0], x1
    576        ld1             {v3.d}[1], [x0], x1
    577        ld1             {v4.d}[1], [x0], x1
    578        ld1             {v5.d}[1], [x0], x1
    579        ld1             {v6.d}[1], [x0], x1
    580        ld1             {v7.d}[1], [x0], x1
    581 
    582        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
    583 
    584        dup             v22.16b, w2                 // flim_E
    585    .if !\simple
    586        dup             v23.16b, w3                 // flim_I
    587    .endif
    588 
    589        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4
    590 
    591        sub             x0,  x0,  x1, lsl #4    // backup 16 rows
    592 
    593        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
    594 
    595        // Store pixels:
    596        st1             {v0.d}[0], [x0], x1
    597        st1             {v1.d}[0], [x0], x1
    598        st1             {v2.d}[0], [x0], x1
    599        st1             {v3.d}[0], [x0], x1
    600        st1             {v4.d}[0], [x0], x1
    601        st1             {v5.d}[0], [x0], x1
    602        st1             {v6.d}[0], [x0], x1
    603        st1             {v7.d}[0], [x0], x1
    604        st1             {v0.d}[1], [x0], x1
    605        st1             {v1.d}[1], [x0], x1
    606        st1             {v2.d}[1], [x0], x1
    607        st1             {v3.d}[1], [x0], x1
    608        st1             {v4.d}[1], [x0], x1
    609        st1             {v5.d}[1], [x0], x1
    610        st1             {v6.d}[1], [x0], x1
    611        st1             {v7.d}[1], [x0]
    612 
    613        ret
    614 endfunc
    615 .endm
    616 
    617 vp8_h_loop_filter16
    618 vp8_h_loop_filter16 _inner,  inner=1
    619 vp8_h_loop_filter16 _simple, simple=1
    620 
    621 .macro  vp8_h_loop_filter8uv name, inner=0
    622 function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
    623        sub             x0,  x0,  #4
    624        sub             x1,  x1,  #4
    625 
    626        // Load pixels:
    627        ld1             {v0.d}[0],     [x0], x2 // load u
    628        ld1             {v0.d}[1],     [x1], x2 // load v
    629        ld1             {v1.d}[0],     [x0], x2
    630        ld1             {v1.d}[1],     [x1], x2
    631        ld1             {v2.d}[0],     [x0], x2
    632        ld1             {v2.d}[1],     [x1], x2
    633        ld1             {v3.d}[0],     [x0], x2
    634        ld1             {v3.d}[1],     [x1], x2
    635        ld1             {v4.d}[0],     [x0], x2
    636        ld1             {v4.d}[1],     [x1], x2
    637        ld1             {v5.d}[0],     [x0], x2
    638        ld1             {v5.d}[1],     [x1], x2
    639        ld1             {v6.d}[0],     [x0], x2
    640        ld1             {v6.d}[1],     [x1], x2
    641        ld1             {v7.d}[0],     [x0], x2
    642        ld1             {v7.d}[1],     [x1], x2
    643 
    644        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
    645 
    646        dup             v22.16b, w3                 // flim_E
    647        dup             v23.16b, w4                 // flim_I
    648 
    649        vp8_loop_filter inner=\inner, hev_thresh=w5
    650 
    651        sub             x0,  x0,  x2, lsl #3    // backup u 8 rows
    652        sub             x1,  x1,  x2, lsl #3    // backup v 8 rows
    653 
    654        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
    655 
    656        // Store pixels:
    657        st1             {v0.d}[0],     [x0], x2 // load u
    658        st1             {v0.d}[1],     [x1], x2 // load v
    659        st1             {v1.d}[0],     [x0], x2
    660        st1             {v1.d}[1],     [x1], x2
    661        st1             {v2.d}[0],     [x0], x2
    662        st1             {v2.d}[1],     [x1], x2
    663        st1             {v3.d}[0],     [x0], x2
    664        st1             {v3.d}[1],     [x1], x2
    665        st1             {v4.d}[0],     [x0], x2
    666        st1             {v4.d}[1],     [x1], x2
    667        st1             {v5.d}[0],     [x0], x2
    668        st1             {v5.d}[1],     [x1], x2
    669        st1             {v6.d}[0],     [x0], x2
    670        st1             {v6.d}[1],     [x1], x2
    671        st1             {v7.d}[0],     [x0]
    672        st1             {v7.d}[1],     [x1]
    673 
    674        ret
    675 
    676 endfunc
    677 .endm
    678 
    679 vp8_h_loop_filter8uv
    680 vp8_h_loop_filter8uv _inner, inner=1
    681 
    682 
    683 function ff_put_vp8_pixels16_neon, export=1
    684 1:
    685        subs            w4, w4, #4
    686        ld1             {v0.16b},     [x2], x3
    687        ld1             {v1.16b},     [x2], x3
    688        ld1             {v2.16b},     [x2], x3
    689        ld1             {v3.16b},     [x2], x3
    690        st1             {v0.16b},     [x0], x1
    691        st1             {v1.16b},     [x0], x1
    692        st1             {v2.16b},     [x0], x1
    693        st1             {v3.16b},     [x0], x1
    694        b.gt            1b
    695        ret
    696 endfunc
    697 
    698 function ff_put_vp8_pixels8_neon, export=1
    699 1:
    700        subs            w4, w4, #4
    701        ld1             {v0.8b},   [x2], x3
    702        ld1             {v0.d}[1], [x2], x3
    703        ld1             {v1.8b},   [x2], x3
    704        ld1             {v1.d}[1], [x2], x3
    705        st1             {v0.8b},   [x0], x1
    706        st1             {v0.d}[1], [x0], x1
    707        st1             {v1.8b},   [x0], x1
    708        st1             {v1.d}[1], [x0], x1
    709        b.gt            1b
    710        ret
    711 endfunc
    712 
    713 /* 4/6-tap 8th-pel MC */
    714 
    715 .macro  vp8_epel8_h6    d,   s0,   s1
    716        ext             v22.8b, \s0\().8b,  \s1\().8b,  #1
    717        uxtl            v18.8h, \s0\().8b
    718        ext             v23.8b, \s0\().8b,  \s1\().8b,  #2
    719        uxtl            v19.8h, v22.8b
    720        ext             v24.8b, \s0\().8b,  \s1\().8b,  #3
    721        uxtl            v21.8h, v23.8b
    722        ext             v25.8b, \s0\().8b,  \s1\().8b,  #4
    723        uxtl            v22.8h, v24.8b
    724        ext             v26.8b, \s0\().8b,  \s1\().8b,  #5
    725        uxtl            v25.8h, v25.8b
    726        mul             v21.8h, v21.8h, v0.h[2]
    727        uxtl            v26.8h, v26.8b
    728        mul             v22.8h, v22.8h, v0.h[3]
    729        mls             v21.8h, v19.8h, v0.h[1]
    730        mls             v22.8h, v25.8h, v0.h[4]
    731        mla             v21.8h, v18.8h, v0.h[0]
    732        mla             v22.8h, v26.8h, v0.h[5]
    733        sqadd           v22.8h, v21.8h, v22.8h
    734        sqrshrun        \d\().8b, v22.8h, #7
    735 .endm
    736 
    737 .macro  vp8_epel16_h6   d0,  v0,  v1
    738        ext             v22.16b, \v0\().16b, \v1\().16b, #3
    739        ext             v23.16b, \v0\().16b, \v1\().16b, #4
    740        uxtl            v19.8h,  v22.8b
    741        uxtl2           v22.8h,  v22.16b
    742        ext             v3.16b,  \v0\().16b, \v1\().16b, #2
    743        uxtl            v20.8h,  v23.8b
    744        uxtl2           v23.8h,  v23.16b
    745        ext             v16.16b, \v0\().16b, \v1\().16b, #1
    746        uxtl            v18.8h,  v3.8b
    747        uxtl2           v3.8h,   v3.16b
    748        ext             v2.16b,  \v0\().16b, \v1\().16b, #5
    749        uxtl            v21.8h,  v2.8b
    750        uxtl2           v2.8h,   v2.16b
    751        uxtl            v17.8h,  v16.8b
    752        uxtl2           v16.8h,  v16.16b
    753        mul             v19.8h,  v19.8h, v0.h[3]
    754        mul             v18.8h,  v18.8h, v0.h[2]
    755        mul             v3.8h,   v3.8h,  v0.h[2]
    756        mul             v22.8h,  v22.8h, v0.h[3]
    757        mls             v19.8h,  v20.8h, v0.h[4]
    758        uxtl            v20.8h,  \v0\().8b
    759        uxtl2           v1.8h,   \v0\().16b
    760        mls             v18.8h,  v17.8h, v0.h[1]
    761        mls             v3.8h,   v16.8h, v0.h[1]
    762        mls             v22.8h,  v23.8h, v0.h[4]
    763        mla             v18.8h,  v20.8h, v0.h[0]
    764        mla             v19.8h,  v21.8h, v0.h[5]
    765        mla             v3.8h,   v1.8h,  v0.h[0]
    766        mla             v22.8h,  v2.8h,  v0.h[5]
    767        sqadd           v19.8h,  v18.8h, v19.8h
    768        sqadd           v22.8h,  v3.8h,  v22.8h
    769        sqrshrun        \d0\().8b,  v19.8h, #7
    770        sqrshrun2       \d0\().16b, v22.8h, #7
    771 .endm
    772 
    773 .macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
    774        uxtl            \s0\().8h, \s0\().8b
    775        uxtl            \s3\().8h, \s3\().8b
    776        uxtl            \s6\().8h, \s6\().8b
    777        uxtl            \s1\().8h, \s1\().8b
    778        uxtl            \s4\().8h, \s4\().8b
    779        uxtl            \s2\().8h, \s2\().8b
    780        uxtl            \s5\().8h, \s5\().8b
    781        mul             \s0\().8h, \s0\().8h, v0.h[0]
    782        mul             v31.8h   , \s3\().8h, v0.h[3]
    783        mul             \s3\().8h, \s3\().8h, v0.h[2]
    784        mul             \s6\().8h, \s6\().8h, v0.h[5]
    785 
    786        mls             \s0\().8h, \s1\().8h, v0.h[1]
    787        mls             v31.8h   , \s4\().8h, v0.h[4]
    788        mls             \s3\().8h, \s2\().8h, v0.h[1]
    789        mls             \s6\().8h, \s5\().8h, v0.h[4]
    790 
    791        mla             \s0\().8h, \s2\().8h, v0.h[2]
    792        mla             v31.8h   , \s5\().8h, v0.h[5]
    793        mla             \s3\().8h, \s1\().8h, v0.h[0]
    794        mla             \s6\().8h, \s4\().8h, v0.h[3]
    795        sqadd           v31.8h   , \s0\().8h, v31.8h
    796        sqadd           \s6\().8h, \s3\().8h, \s6\().8h
    797        sqrshrun        \d0\().8b, v31.8h,    #7
    798        sqrshrun        \d1\().8b, \s6\().8h, #7
    799 .endm
    800 
    801 .macro  vp8_epel8_h4    d,   v0,   v1
    802        ext             v22.8b, \v0\().8b,  \v1\().8b,  #1
    803        uxtl            v19.8h, \v0\().8b
    804        ext             v23.8b, \v0\().8b,  \v1\().8b,  #2
    805        uxtl            v20.8h, v22.8b
    806        ext             v25.8b, \v0\().8b,  \v1\().8b,  #3
    807        uxtl            v22.8h, v23.8b
    808        uxtl            v25.8h, v25.8b
    809        mul             v20.8h, v20.8h, v0.h[2]
    810        mul             v22.8h, v22.8h, v0.h[3]
    811        mls             v20.8h, v19.8h, v0.h[1]
    812        mls             v22.8h, v25.8h, v0.h[4]
    813        sqadd           v22.8h, v20.8h, v22.8h
    814        sqrshrun        \d\().8b, v22.8h, #7
    815 .endm
    816 
    817 .macro  vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4
    818        uxtl            \s0\().8h,  \s0\().8b
    819        uxtl            \s1\().8h,  \s1\().8b
    820        uxtl            \s2\().8h,  \s2\().8b
    821        uxtl            \s3\().8h,  \s3\().8b
    822        uxtl            \s4\().8h,  \s4\().8b
    823        mul             v21.8h,     \s1\().8h, v0.h[2]
    824        mul             v23.8h,     \s2\().8h, v0.h[3]
    825        mul             \s2\().8h,  \s2\().8h, v0.h[2]
    826        mul             v22.8h,     \s3\().8h, v0.h[3]
    827        mls             v21.8h,     \s0\().8h, v0.h[1]
    828        mls             v23.8h,     \s3\().8h, v0.h[4]
    829        mls             \s2\().8h,  \s1\().8h, v0.h[1]
    830        mls             v22.8h,     \s4\().8h, v0.h[4]
    831        sqadd           v21.8h,     v21.8h,    v23.8h
    832        sqadd           \s2\().8h,  \s2\().8h, v22.8h
    833        sqrshrun        \d0\().8b,  v21.8h,    #7
    834        sqrshrun2       \d0\().16b, \s2\().8h, #7
    835 .endm
    836 
    837 
    838 // note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
    839 // arithmetic can be used to apply filters
    840 const   subpel_filters, align=4
    841        .short     0,   6, 123,  12,   1,   0,   0,   0
    842        .short     2,  11, 108,  36,   8,   1,   0,   0
    843        .short     0,   9,  93,  50,   6,   0,   0,   0
    844        .short     3,  16,  77,  77,  16,   3,   0,   0
    845        .short     0,   6,  50,  93,   9,   0,   0,   0
    846        .short     1,   8,  36, 108,  11,   2,   0,   0
    847        .short     0,   1,  12, 123,   6,   0,   0,   0
    848 endconst
    849 
    850 function ff_put_vp8_epel16_v6_neon, export=1
    851        sub             x2,  x2,  x3,  lsl #1
    852 
    853        sxtw            x4,  w4
    854        sxtw            x6,  w6
    855        movrel          x17,  subpel_filters, -16
    856        add             x6,  x17,  x6, lsl #4  // y
    857        ld1             {v0.8h},     [x6]
    858 1:
    859        ld1             {v1.1d - v2.1d},    [x2], x3
    860        ld1             {v3.1d - v4.1d},    [x2], x3
    861        ld1             {v16.1d - v17.1d},  [x2], x3
    862        ld1             {v18.1d - v19.1d},  [x2], x3
    863        ld1             {v20.1d - v21.1d},  [x2], x3
    864        ld1             {v22.1d - v23.1d},  [x2], x3
    865        ld1             {v24.1d - v25.1d},  [x2]
    866        sub             x2,  x2,  x3, lsl #2
    867 
    868        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
    869        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
    870 
    871        st1             {v1.1d - v2.1d}, [x0], x1
    872        st1             {v3.1d - v4.1d}, [x0], x1
    873        subs            x4, x4, #2
    874        b.ne            1b
    875 
    876        ret
    877 endfunc
    878 
    879 function ff_put_vp8_epel16_h6_neon, export=1
    880        sub             x2,  x2,  #2
    881        sxtw            x5,  w5 // x
    882 
    883        // first pass (horizontal):
    884        movrel          x17,  subpel_filters, -16
    885        add             x5,  x17,  x5, lsl #4 // x
    886        ld1             {v0.8h},  [x5]
    887 1:
    888        ld1             {v1.16b, v2.16b}, [x2], x3
    889        vp8_epel16_h6   v1, v1, v2
    890        st1             {v1.16b}, [x0], x1
    891 
    892        subs            w4, w4, #1
    893        b.ne            1b
    894        ret
    895 endfunc
    896 
    897 
    898 function ff_put_vp8_epel16_h6v6_neon, export=1
    899        sub             x2,  x2,  x3,  lsl #1
    900        sub             x2,  x2,  #2
    901 
    902        // first pass (horizontal):
    903        movrel          x17,  subpel_filters, -16
    904        sxtw            x5,  w5 // x
    905        add             x16,  x17,  x5, lsl #4 // x
    906        sub             sp,  sp,  #336+16
    907        ld1             {v0.8h},  [x16]
    908        add             x7,  sp,  #15
    909        sxtw            x4,  w4
    910        add             x16, x4, #5   // h
    911        bic             x7,  x7,  #15
    912 1:
    913        ld1             {v1.16b, v2.16b}, [x2], x3
    914        vp8_epel16_h6   v1, v1, v2
    915        st1             {v1.16b}, [x7], #16
    916        subs            x16, x16, #1
    917        b.ne            1b
    918 
    919 
    920        // second pass (vertical):
    921        sxtw            x6,  w6
    922        add             x6,  x17,  x6, lsl #4  // y
    923        add             x7,  sp,  #15
    924        ld1             {v0.8h},     [x6]
    925        bic             x7,  x7,  #15
    926 2:
    927        ld1             {v1.8b - v4.8b},    [x7], #32
    928        ld1             {v16.8b - v19.8b},  [x7], #32
    929        ld1             {v20.8b - v23.8b},  [x7], #32
    930        ld1             {v24.8b - v25.8b},  [x7]
    931        sub             x7,  x7,  #64
    932 
    933        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
    934        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
    935        trn1            v1.2d, v1.2d, v2.2d
    936        trn1            v3.2d, v3.2d, v4.2d
    937 
    938        st1             {v1.16b}, [x0], x1
    939        st1             {v3.16b}, [x0], x1
    940        subs            x4, x4, #2
    941        b.ne            2b
    942 
    943        add             sp,  sp,  #336+16
    944        ret
    945 endfunc
    946 
    947 function ff_put_vp8_epel8_v6_neon, export=1
    948        sub             x2,  x2,  x3,  lsl #1
    949 
    950        movrel          x7,  subpel_filters, -16
    951        add             x6,  x7,  w6, uxtw #4
    952        ld1             {v0.8h},  [x6]
    953 1:
    954        ld1             {v2.8b},  [x2], x3
    955        ld1             {v3.8b},  [x2], x3
    956        ld1             {v4.8b},  [x2], x3
    957        ld1             {v5.8b},  [x2], x3
    958        ld1             {v6.8b},  [x2], x3
    959        ld1             {v7.8b},  [x2], x3
    960        ld1             {v28.8b}, [x2]
    961 
    962        sub             x2,  x2,  x3,  lsl #2
    963 
    964        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28
    965 
    966        st1             {v2.8b}, [x0], x1
    967        st1             {v3.8b}, [x0], x1
    968        subs            w4,  w4,  #2
    969        b.ne            1b
    970 
    971        ret
    972 endfunc
    973 
    974 function ff_put_vp8_epel8_h6_neon, export=1
    975        sub             x2,  x2,  #2
    976 
    977        movrel          x7,  subpel_filters, -16
    978        add             x5,  x7,  w5, uxtw #4
    979        ld1             {v0.8h},        [x5]
    980 1:
    981        ld1             {v2.8b, v3.8b}, [x2], x3
    982 
    983        vp8_epel8_h6    v2,  v2,  v3
    984 
    985        st1             {v2.8b}, [x0], x1
    986        subs            w4,  w4,  #1
    987        b.ne            1b
    988 
    989        ret
    990 endfunc
    991 
    992 function ff_put_vp8_epel8_h6v6_neon, export=1
    993        sub             x2,  x2,  x3,  lsl #1
    994        sub             x2,  x2,  #2
    995        sxtw            x4,  w4
    996 
    997        // first pass (horizontal):
    998        movrel          x17,  subpel_filters, -16
    999        sxtw            x5,  w5
   1000        add             x5,  x17,  x5, lsl #4 // x
   1001        sub             sp,  sp,  #168+16
   1002        ld1             {v0.8h},  [x5]
   1003        add             x7,  sp,  #15
   1004        add             x16, x4,  #5   // h
   1005        bic             x7,  x7,  #15
   1006 1:
   1007        ld1             {v1.8b, v2.8b}, [x2], x3
   1008 
   1009        vp8_epel8_h6    v1, v1, v2
   1010 
   1011        st1             {v1.8b}, [x7], #8
   1012        subs            x16, x16, #1
   1013        b.ne            1b
   1014 
   1015        // second pass (vertical):
   1016        sxtw            x6,  w6
   1017        add             x6,  x17,  x6, lsl #4  // y
   1018        add             x7,  sp,   #15
   1019        ld1             {v0.8h},   [x6]
   1020        bic             x7,  x7,   #15
   1021 2:
   1022        ld1             {v1.8b - v4.8b}, [x7], #32
   1023        ld1             {v5.8b - v7.8b}, [x7]
   1024 
   1025        sub             x7,  x7,  #16
   1026 
   1027        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7
   1028 
   1029        st1             {v1.8b}, [x0], x1
   1030        st1             {v2.8b}, [x0], x1
   1031        subs            x4, x4, #2
   1032        b.ne            2b
   1033 
   1034        add             sp,  sp,  #168+16
   1035        ret
   1036 endfunc
   1037 
   1038 function ff_put_vp8_epel8_v4_neon, export=1
   1039        sub             x2,  x2,  x3
   1040 
   1041        movrel          x7,  subpel_filters, -16
   1042        add             x6,  x7,  w6, uxtw #4
   1043        ld1             {v0.8h},     [x6]
   1044 1:
   1045        ld1             {v2.8b},     [x2], x3
   1046        ld1             {v3.8b},     [x2], x3
   1047        ld1             {v4.8b},     [x2], x3
   1048        ld1             {v5.8b},     [x2], x3
   1049        ld1             {v6.8b},     [x2]
   1050        sub             x2,  x2,  x3,  lsl #1
   1051 
   1052        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6
   1053 
   1054        st1             {v2.d}[0], [x0], x1
   1055        st1             {v2.d}[1], [x0], x1
   1056        subs            w4,  w4,  #2
   1057        b.ne            1b
   1058 
   1059        ret
   1060 endfunc
   1061 
   1062 function ff_put_vp8_epel8_h4_neon, export=1
   1063        sub             x2,  x2,  #1
   1064 
   1065        movrel          x7,  subpel_filters, -16
   1066        add             x5,  x7,  w5, uxtw #4
   1067        ld1             {v0.8h},       [x5]
   1068 1:
   1069        ld1             {v2.8b,v3.8b}, [x2], x3
   1070 
   1071        vp8_epel8_h4    v2,  v2,  v3
   1072 
   1073        st1             {v2.8b}, [x0], x1
   1074        subs            w4,  w4,  #1
   1075        b.ne            1b
   1076 
   1077        ret
   1078 endfunc
   1079 
   1080 function ff_put_vp8_epel8_h4v6_neon, export=1
   1081        sub             x2,  x2,  x3,  lsl #1
   1082        sub             x2,  x2,  #1
   1083        sxtw            x4,  w4
   1084 
   1085        // first pass (horizontal):
   1086        movrel          x17,  subpel_filters, -16
   1087        sxtw            x5,  w5
   1088        add             x5,  x17,  x5, lsl #4 // x
   1089        sub             sp,  sp,  #168+16
   1090        ld1             {v0.8h},  [x5]
   1091        add             x7,  sp,  #15
   1092        add             x16, x4, #5   // h
   1093        bic             x7,  x7,  #15
   1094 1:
   1095        ld1             {v1.8b, v2.8b}, [x2], x3
   1096 
   1097        vp8_epel8_h4    v1, v1, v2
   1098 
   1099        st1             {v1.8b}, [x7], #8
   1100        subs            x16, x16, #1
   1101        b.ne            1b
   1102 
   1103        // second pass (vertical):
   1104        sxtw            x6,  w6
   1105        add             x6,  x17,  x6, lsl #4  // y
   1106        add             x7,  sp,   #15
   1107        ld1             {v0.8h},   [x6]
   1108        bic             x7,  x7,   #15
   1109 2:
   1110        ld1             {v1.8b - v4.8b}, [x7], #32
   1111        ld1             {v5.8b - v7.8b}, [x7]
   1112 
   1113        sub             x7,  x7,  #16
   1114 
   1115        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7
   1116 
   1117        st1             {v1.8b}, [x0], x1
   1118        st1             {v2.8b}, [x0], x1
   1119        subs            x4, x4, #2
   1120        b.ne            2b
   1121 
   1122        add             sp,  sp,  #168+16
   1123        ret
   1124 endfunc
   1125 
   1126 function ff_put_vp8_epel8_h4v4_neon, export=1
   1127        sub             x2,  x2,  x3
   1128        sub             x2,  x2,  #1
   1129        sxtw            x4,  w4
   1130 
   1131 
   1132        // first pass (horizontal):
   1133        movrel          x17,  subpel_filters, -16
   1134        sxtw            x5,  w5
   1135        add             x5,  x17,  x5, lsl #4 // x
   1136        sub             sp,  sp,  #168+16
   1137        ld1             {v0.8h},  [x5]
   1138        add             x7,  sp,  #15
   1139        add             x16, x4, #3   // h
   1140        bic             x7,  x7,  #15
   1141 1:
   1142        ld1             {v1.8b, v2.8b}, [x2], x3
   1143 
   1144        vp8_epel8_h4    v1, v1, v2
   1145 
   1146        st1             {v1.8b}, [x7], #8
   1147        subs            x16, x16, #1
   1148        b.ne            1b
   1149 
   1150        // second pass (vertical):
   1151        sxtw            x6,  w6
   1152        add             x6,  x17,  x6, lsl #4  // y
   1153        add             x7,  sp,   #15
   1154        ld1             {v0.8h},   [x6]
   1155        bic             x7,  x7,   #15
   1156 2:
   1157        ld1             {v1.8b - v2.8b}, [x7], #16
   1158        ld1             {v3.8b - v5.8b}, [x7]
   1159 
   1160        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5
   1161 
   1162        st1             {v1.d}[0], [x0], x1
   1163        st1             {v1.d}[1], [x0], x1
   1164        subs            x4, x4, #2
   1165        b.ne            2b
   1166 
   1167        add             sp,  sp,  #168+16
   1168        ret
   1169 endfunc
   1170 
   1171 function ff_put_vp8_epel8_h6v4_neon, export=1
   1172        sub             x2,  x2,  x3
   1173        sub             x2,  x2,  #2
   1174        sxtw            x4,  w4
   1175 
   1176 
   1177        // first pass (horizontal):
   1178        movrel          x17,  subpel_filters, -16
   1179        sxtw            x5,  w5
   1180        add             x5,  x17,  x5, lsl #4 // x
   1181        sub             sp,  sp,  #168+16
   1182        ld1             {v0.8h},  [x5]
   1183        add             x7,  sp,  #15
   1184        add             x16, x4, #3   // h
   1185        bic             x7,  x7,  #15
   1186 1:
   1187        ld1             {v1.8b, v2.8b}, [x2], x3
   1188 
   1189        vp8_epel8_h6    v1, v1, v2
   1190 
   1191        st1             {v1.8b}, [x7], #8
   1192        subs            x16, x16, #1
   1193        b.ne            1b
   1194 
   1195        // second pass (vertical):
   1196        sxtw            x6,  w6
   1197        add             x6,  x17,  x6, lsl #4  // y
   1198        add             x7,  sp,   #15
   1199        ld1             {v0.8h},   [x6]
   1200        bic             x7,  x7,   #15
   1201 2:
   1202        ld1             {v1.8b - v2.8b}, [x7], #16
   1203        ld1             {v3.8b - v5.8b}, [x7]
   1204 
   1205        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5
   1206 
   1207        st1             {v1.d}[0], [x0], x1
   1208        st1             {v1.d}[1], [x0], x1
   1209        subs            x4, x4, #2
   1210        b.ne            2b
   1211 
   1212        add             sp,  sp,  #168+16
   1213        ret
   1214 endfunc
   1215 
   1216 function ff_put_vp8_epel4_v6_neon, export=1
   1217        sub             x2,  x2,  x3,  lsl #1
   1218 
   1219        movrel          x7,  subpel_filters, -16
   1220        add             x6,  x7,  w6, uxtw #4
   1221        ld1             {v0.8h},    [x6]
   1222 1:
   1223        ld1r            {v2.2s},    [x2], x3
   1224        ld1r            {v3.2s},    [x2], x3
   1225        ld1r            {v4.2s},    [x2], x3
   1226        ld1r            {v5.2s},    [x2], x3
   1227        ld1r            {v6.2s},    [x2], x3
   1228        ld1r            {v7.2s},    [x2], x3
   1229        ld1r            {v28.2s},   [x2]
   1230        sub             x2,  x2,  x3,  lsl #2
   1231        ld1             {v2.s}[1],  [x2], x3
   1232        ld1             {v3.s}[1],  [x2], x3
   1233        ld1             {v4.s}[1],  [x2], x3
   1234        ld1             {v5.s}[1],  [x2], x3
   1235        ld1             {v6.s}[1],  [x2], x3
   1236        ld1             {v7.s}[1],  [x2], x3
   1237        ld1             {v28.s}[1], [x2]
   1238        sub             x2,  x2,  x3,  lsl #2
   1239 
   1240        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28
   1241 
   1242        st1             {v2.s}[0],  [x0], x1
   1243        st1             {v3.s}[0],  [x0], x1
   1244        st1             {v2.s}[1],  [x0], x1
   1245        st1             {v3.s}[1],  [x0], x1
   1246        subs            w4,  w4,  #4
   1247        b.ne            1b
   1248 
   1249        ret
   1250 endfunc
   1251 
   1252 function ff_put_vp8_epel4_h6_neon, export=1
   1253        sub             x2,  x2,  #2
   1254 
   1255        movrel          x7,  subpel_filters, -16
   1256        add             x5,  x7,  w5, uxtw #4
   1257        ld1             {v0.8h},       [x5]
   1258 1:
   1259        ld1             {v2.8b,v3.8b}, [x2], x3
   1260        vp8_epel8_h6    v2,  v2,  v3
   1261        st1             {v2.s}[0], [x0], x1
   1262        subs            w4,  w4,  #1
   1263        b.ne            1b
   1264 
   1265        ret
   1266 endfunc
   1267 
   1268 function ff_put_vp8_epel4_h6v6_neon, export=1
   1269        sub             x2,  x2,  x3,  lsl #1
   1270        sub             x2,  x2,  #2
   1271 
   1272        movrel          x7,  subpel_filters, -16
   1273        add             x5,  x7,  w5, uxtw #4
   1274        ld1             {v0.8h},       [x5]
   1275 
   1276        sub             sp,  sp,  #52
   1277        add             w8,  w4,  #5
   1278        mov             x9,  sp
   1279 1:
   1280        ld1             {v2.8b,v3.8b}, [x2], x3
   1281        vp8_epel8_h6    v2,  v2,  v3
   1282        st1             {v2.s}[0],     [x9], #4
   1283        subs            w8,  w8,  #1
   1284        b.ne            1b
   1285 
   1286        add             x6,  x7,  w6, uxtw #4
   1287        ld1             {v0.8h},       [x6]
   1288        mov             x9,  sp
   1289 2:
   1290        ld1             {v2.8b,v3.8b}, [x9], #16
   1291        ld1             {v6.8b},       [x9], #8
   1292        ld1r            {v28.2s},      [x9]
   1293        sub             x9,  x9,  #16
   1294        ld1             {v4.8b,v5.8b}, [x9], #16
   1295        ld1             {v7.8b},       [x9], #8
   1296        ld1             {v28.s}[1],    [x9]
   1297        sub             x9,  x9,  #16
   1298        trn1            v1.2s, v2.2s, v4.2s
   1299        trn2            v4.2s, v2.2s, v4.2s
   1300        trn1            v2.2s, v3.2s, v5.2s
   1301        trn2            v5.2s, v3.2s, v5.2s
   1302        trn1            v3.2s, v6.2s, v7.2s
   1303        trn2            v7.2s, v6.2s, v7.2s
   1304        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
   1305        st1             {v2.s}[0],  [x0], x1
   1306        st1             {v3.s}[0],  [x0], x1
   1307        st1             {v2.s}[1],  [x0], x1
   1308        st1             {v3.s}[1],  [x0], x1
   1309        subs            w4,  w4,  #4
   1310        b.ne            2b
   1311 
   1312        add             sp,  sp,  #52
   1313        ret
   1314 endfunc
   1315 
   1316 function ff_put_vp8_epel4_h4v6_neon, export=1
   1317        sub             x2,  x2,  x3,  lsl #1
   1318        sub             x2,  x2,  #1
   1319 
   1320        movrel          x7,  subpel_filters, -16
   1321        add             x5,  x7,  w5, uxtw #4
   1322        ld1             {v0.8h},       [x5]
   1323 
   1324        sub             sp,  sp,  #52
   1325        add             w8,  w4,  #5
   1326        mov             x9,  sp
   1327 1:
   1328        ld1             {v2.8b},       [x2], x3
   1329        vp8_epel8_h4    v2,  v2,  v2
   1330        st1             {v2.s}[0],     [x9], #4
   1331        subs            w8,  w8,  #1
   1332        b.ne            1b
   1333 
   1334        add             x6,  x7,  w6, uxtw #4
   1335        ld1             {v0.8h},       [x6]
   1336        mov             x9,  sp
   1337 2:
   1338        ld1             {v2.8b,v3.8b}, [x9], #16
   1339        ld1             {v6.8b},       [x9], #8
   1340        ld1r            {v28.2s},      [x9]
   1341        sub             x9,  x9,  #16
   1342        ld1             {v4.8b,v5.8b}, [x9], #16
   1343        ld1             {v7.8b},       [x9], #8
   1344        ld1             {v28.s}[1],    [x9]
   1345        sub             x9,  x9,  #16
   1346        trn1            v1.2s, v2.2s, v4.2s
   1347        trn2            v4.2s, v2.2s, v4.2s
   1348        trn1            v2.2s, v3.2s, v5.2s
   1349        trn2            v5.2s, v3.2s, v5.2s
   1350        trn1            v3.2s, v6.2s, v7.2s
   1351        trn2            v7.2s, v6.2s, v7.2s
   1352        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
   1353        st1             {v2.s}[0],  [x0], x1
   1354        st1             {v3.s}[0],  [x0], x1
   1355        st1             {v2.s}[1],  [x0], x1
   1356        st1             {v3.s}[1],  [x0], x1
   1357        subs            w4,  w4,  #4
   1358        b.ne            2b
   1359 
   1360        add             sp,  sp,  #52
   1361        ret
   1362 endfunc
   1363 
   1364 function ff_put_vp8_epel4_h6v4_neon, export=1
   1365        sub             x2,  x2,  x3
   1366        sub             x2,  x2,  #2
   1367 
   1368        movrel          x7,  subpel_filters, -16
   1369        add             x5,  x7,  w5, uxtw #4
   1370        ld1             {v0.8h},       [x5]
   1371 
   1372        sub             sp,  sp,  #44
   1373        add             w8,  w4,  #3
   1374        mov             x9,  sp
   1375 1:
   1376        ld1             {v2.8b,v3.8b}, [x2], x3
   1377        vp8_epel8_h6    v2, v2, v3
   1378        st1             {v2.s}[0],     [x9], #4
   1379        subs            w8,  w8,  #1
   1380        b.ne            1b
   1381 
   1382        add             x6,  x7,  w6, uxtw #4
   1383        ld1             {v0.8h},       [x6]
   1384        mov             x9,  sp
   1385 2:
   1386        ld1             {v2.8b,v3.8b}, [x9], #16
   1387        ld1r            {v6.2s},       [x9]
   1388        sub             x9,  x9,  #8
   1389        ld1             {v4.8b,v5.8b}, [x9], #16
   1390        ld1             {v6.s}[1],     [x9]
   1391        sub             x9,  x9,  #8
   1392        trn1            v1.2s, v2.2s, v4.2s
   1393        trn2            v4.2s, v2.2s, v4.2s
   1394        trn1            v2.2s, v3.2s, v5.2s
   1395        trn2            v5.2s, v3.2s, v5.2s
   1396        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
   1397        st1             {v1.s}[0],  [x0], x1
   1398        st1             {v1.s}[2],  [x0], x1
   1399        st1             {v1.s}[1],  [x0], x1
   1400        st1             {v1.s}[3],  [x0], x1
   1401        subs            w4,  w4,  #4
   1402        b.ne            2b
   1403 
   1404        add             sp,  sp,  #44
   1405        ret
   1406 endfunc
   1407 
   1408 function ff_put_vp8_epel4_h4_neon, export=1
   1409        sub             x2,  x2,  #1
   1410 
   1411        movrel          x7,  subpel_filters, -16
   1412        add             x5,  x7,  w5, uxtw #4
   1413        ld1             {v0.8h},    [x5]
   1414 1:
   1415        ld1             {v2.8b},    [x2], x3
   1416        vp8_epel8_h4    v2,  v2,  v2
   1417        st1             {v2.s}[0],  [x0], x1
   1418        subs            w4,  w4,  #1
   1419        b.ne            1b
   1420 
   1421        ret
   1422 endfunc
   1423 
   1424 function ff_put_vp8_epel4_v4_neon, export=1
   1425        sub             x2,  x2,  x3
   1426 
   1427        movrel          x7,  subpel_filters, -16
   1428        add             x6,  x7,  w6, uxtw #4
   1429        ld1             {v0.8h},   [x6]
   1430 1:
   1431        ld1r            {v2.2s},   [x2], x3
   1432        ld1r            {v3.2s},   [x2], x3
   1433        ld1r            {v4.2s},   [x2], x3
   1434        ld1r            {v5.2s},   [x2], x3
   1435        ld1r            {v6.2s},   [x2]
   1436        sub             x2,  x2,  x3,  lsl #1
   1437        ld1             {v2.s}[1], [x2], x3
   1438        ld1             {v3.s}[1], [x2], x3
   1439        ld1             {v4.s}[1], [x2], x3
   1440        ld1             {v5.s}[1], [x2], x3
   1441        ld1             {v6.s}[1], [x2]
   1442        sub             x2,  x2,  x3,  lsl #1
   1443 
   1444        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6
   1445 
   1446        st1             {v2.s}[0], [x0], x1
   1447        st1             {v2.s}[2], [x0], x1
   1448        st1             {v2.s}[1], [x0], x1
   1449        st1             {v2.s}[3], [x0], x1
   1450        subs            w4,  w4,  #4
   1451        b.ne            1b
   1452 
   1453        ret
   1454 endfunc
   1455 
   1456 function ff_put_vp8_epel4_h4v4_neon, export=1
   1457        sub             x2,  x2,  x3
   1458        sub             x2,  x2,  #1
   1459 
   1460        movrel          x7,  subpel_filters, -16
   1461        add             x5,  x7,  w5, uxtw #4
   1462        ld1             {v0.8h},       [x5]
   1463 
   1464        sub             sp,  sp,  #44
   1465        add             w8,  w4,  #3
   1466        mov             x9,  sp
   1467 1:
   1468        ld1             {v2.8b},       [x2], x3
   1469        vp8_epel8_h4    v2,  v2,  v3
   1470        st1             {v2.s}[0],     [x9], #4
   1471        subs            w8,  w8,  #1
   1472        b.ne            1b
   1473 
   1474        add             x6,  x7,  w6, uxtw #4
   1475        ld1             {v0.8h},       [x6]
   1476        mov             x9,  sp
   1477 2:
   1478        ld1             {v2.8b,v3.8b}, [x9], #16
   1479        ld1r            {v6.2s},       [x9]
   1480        sub             x9,  x9,  #8
   1481        ld1             {v4.8b,v5.8b}, [x9], #16
   1482        ld1             {v6.s}[1],     [x9]
   1483        sub             x9,  x9,  #8
   1484        trn1            v1.2s, v2.2s, v4.2s
   1485        trn2            v4.2s, v2.2s, v4.2s
   1486        trn1            v2.2s, v3.2s, v5.2s
   1487        trn2            v5.2s, v3.2s, v5.2s
   1488        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
   1489        st1             {v1.s}[0], [x0], x1
   1490        st1             {v1.s}[2], [x0], x1
   1491        st1             {v1.s}[1], [x0], x1
   1492        st1             {v1.s}[3], [x0], x1
   1493        subs            w4,  w4,  #4
   1494        b.ne            2b
   1495 
   1496        add             sp,  sp,  #44
   1497        ret
   1498 endfunc
   1499 
   1500 /* Bilinear MC */
   1501 
   1502 function ff_put_vp8_bilin16_h_neon, export=1
   1503        mov             w7,     #8
   1504        dup             v0.8b,  w5
   1505        sub             w5,     w7,     w5
   1506        dup             v1.8b,  w5
   1507 1:
   1508        subs            w4,     w4,     #2
   1509        ld1             {v2.8b,v3.8b,v4.8b},    [x2], x3
   1510        ext             v5.8b,  v3.8b,  v4.8b,  #1
   1511        ext             v4.8b,  v2.8b,  v3.8b,  #1
   1512        umull           v16.8h, v2.8b,  v1.8b
   1513        umlal           v16.8h, v4.8b,  v0.8b
   1514        ld1             {v18.8b,v19.8b,v20.8b}, [x2], x3
   1515        umull           v6.8h,  v3.8b,  v1.8b
   1516        umlal           v6.8h,  v5.8b,  v0.8b
   1517        ext             v21.8b, v19.8b, v20.8b, #1
   1518        ext             v20.8b, v18.8b, v19.8b, #1
   1519        umull           v22.8h, v18.8b, v1.8b
   1520        umlal           v22.8h, v20.8b, v0.8b
   1521        umull           v24.8h, v19.8b, v1.8b
   1522        umlal           v24.8h, v21.8b, v0.8b
   1523        rshrn           v4.8b,  v16.8h, #3
   1524        rshrn2          v4.16b, v6.8h,  #3
   1525        rshrn           v6.8b,  v22.8h, #3
   1526        rshrn2          v6.16b, v24.8h, #3
   1527        st1             {v4.16b}, [x0], x1
   1528        st1             {v6.16b}, [x0], x1
   1529        b.gt            1b
   1530 
   1531        ret
   1532 endfunc
   1533 
   1534 function ff_put_vp8_bilin16_v_neon, export=1
   1535        mov             w7,     #8
   1536        dup             v0.16b, w6
   1537        sub             w6,     w7,     w6
   1538        dup             v1.16b, w6
   1539 
   1540        ld1             {v2.16b}, [x2], x3
   1541 1:
   1542        subs            w4,     w4,     #2
   1543        ld1             {v4.16b}, [x2], x3
   1544        umull           v6.8h,  v2.8b,  v1.8b
   1545        umlal           v6.8h,  v4.8b,  v0.8b
   1546        umull2          v16.8h, v2.16b, v1.16b
   1547        umlal2          v16.8h, v4.16b, v0.16b
   1548        ld1             {v2.16b}, [x2], x3
   1549        umull           v18.8h, v4.8b,  v1.8b
   1550        umlal           v18.8h, v2.8b,  v0.8b
   1551        umull2          v20.8h, v4.16b, v1.16b
   1552        umlal2          v20.8h, v2.16b, v0.16b
   1553        rshrn           v4.8b,  v6.8h,  #3
   1554        rshrn2          v4.16b, v16.8h, #3
   1555        rshrn           v6.8b,  v18.8h, #3
   1556        rshrn2          v6.16b, v20.8h, #3
   1557        st1             {v4.16b}, [x0], x1
   1558        st1             {v6.16b}, [x0], x1
   1559        b.gt            1b
   1560 
   1561        ret
   1562 endfunc
   1563 
   1564 function ff_put_vp8_bilin16_hv_neon, export=1
   1565        mov             w7,      #8
   1566        dup             v0.8b,   w5            // mx
   1567        sub             w5,      w7,     w5
   1568        dup             v1.8b,   w5
   1569        dup             v2.16b,  w6            // my
   1570        sub             w6,      w7,     w6
   1571        dup             v3.16b,  w6
   1572 
   1573        ld1             {v4.8b,v5.8b,v6.8b},    [x2], x3
   1574 
   1575        ext             v7.8b,   v5.8b,  v6.8b, #1
   1576        ext             v6.8b,   v4.8b,  v5.8b, #1
   1577        umull           v16.8h,  v4.8b,  v1.8b
   1578        umlal           v16.8h,  v6.8b,  v0.8b
   1579        umull           v18.8h,  v5.8b,  v1.8b
   1580        umlal           v18.8h,  v7.8b,  v0.8b
   1581        rshrn           v4.8b,   v16.8h, #3
   1582        rshrn2          v4.16b,  v18.8h, #3
   1583 1:
   1584        subs            w4,  w4,  #2
   1585        ld1             {v18.8b,v19.8b,v20.8b},  [x2], x3
   1586        ext             v21.8b,  v19.8b, v20.8b, #1
   1587        ext             v20.8b,  v18.8b, v19.8b, #1
   1588        umull           v22.8h,  v18.8b, v1.8b
   1589        umlal           v22.8h,  v20.8b, v0.8b
   1590        ld1             {v26.8b,v27.8b,v28.8b},  [x2], x3
   1591        umull           v24.8h,  v19.8b, v1.8b
   1592        umlal           v24.8h,  v21.8b, v0.8b
   1593        ext             v29.8b,  v27.8b, v28.8b, #1
   1594        ext             v28.8b,  v26.8b, v27.8b, #1
   1595        umull           v16.8h,  v26.8b, v1.8b
   1596        umlal           v16.8h,  v28.8b, v0.8b
   1597        umull           v18.8h,  v27.8b, v1.8b
   1598        umlal           v18.8h,  v29.8b, v0.8b
   1599        rshrn           v6.8b,   v22.8h, #3
   1600        rshrn2          v6.16b,  v24.8h, #3
   1601        umull           v24.8h,  v4.8b,  v3.8b
   1602        umlal           v24.8h,  v6.8b,  v2.8b
   1603        umull2          v30.8h,  v4.16b, v3.16b
   1604        umlal2          v30.8h,  v6.16b, v2.16b
   1605        rshrn           v4.8b,   v16.8h, #3
   1606        rshrn2          v4.16b,  v18.8h, #3
   1607        umull           v20.8h,  v6.8b,  v3.8b
   1608        umlal           v20.8h,  v4.8b,  v2.8b
   1609        umull2          v22.8h,  v6.16b, v3.16b
   1610        umlal2          v22.8h,  v4.16b, v2.16b
   1611        rshrn           v24.8b,  v24.8h, #3
   1612        rshrn2          v24.16b, v30.8h, #3
   1613        st1             {v24.16b}, [x0], x1
   1614        rshrn           v20.8b,  v20.8h, #3
   1615        rshrn2          v20.16b, v22.8h, #3
   1616        st1             {v20.16b}, [x0], x1
   1617        b.gt            1b
   1618 
   1619        ret
   1620 endfunc
   1621 
   1622 function ff_put_vp8_bilin8_h_neon, export=1
   1623        mov             w7,     #8
   1624        dup             v0.8b,  w5
   1625        sub             w5,     w7,     w5
   1626        dup             v1.8b,  w5
   1627 1:
   1628        subs            w4,     w4,     #2
   1629        ld1             {v2.8b,v3.8b},  [x2],  x3
   1630        ext             v3.8b,  v2.8b,  v3.8b, #1
   1631        umull           v4.8h,  v2.8b,  v1.8b
   1632        umlal           v4.8h,  v3.8b,  v0.8b
   1633        ld1             {v6.8b,v7.8b},  [x2],  x3
   1634        ext             v7.8b,  v6.8b,  v7.8b, #1
   1635        umull           v16.8h, v6.8b,  v1.8b
   1636        umlal           v16.8h, v7.8b,  v0.8b
   1637        rshrn           v4.8b,  v4.8h,  #3
   1638        rshrn           v16.8b, v16.8h, #3
   1639        st1             {v4.8b},  [x0], x1
   1640        st1             {v16.8b}, [x0], x1
   1641        b.gt            1b
   1642 
   1643        ret
   1644 endfunc
   1645 
   1646 function ff_put_vp8_bilin8_v_neon, export=1
   1647        mov             w7,      #8
   1648        dup             v0.8b,   w6
   1649        sub             w6,      w7,    w6
   1650        dup             v1.8b,   w6
   1651 
   1652        ld1             {v2.8b}, [x2],  x3
   1653 1:
   1654        subs            w4,      w4,    #2
   1655        ld1             {v3.8b}, [x2],  x3
   1656        umull           v4.8h,   v2.8b, v1.8b
   1657        umlal           v4.8h,   v3.8b, v0.8b
   1658        ld1             {v2.8b}, [x2],  x3
   1659        umull           v6.8h,   v3.8b, v1.8b
   1660        umlal           v6.8h,   v2.8b, v0.8b
   1661        rshrn           v4.8b,   v4.8h, #3
   1662        rshrn           v6.8b,   v6.8h, #3
   1663        st1             {v4.8b}, [x0],  x1
   1664        st1             {v6.8b}, [x0],  x1
   1665        b.gt            1b
   1666 
   1667        ret
   1668 endfunc
   1669 
   1670 function ff_put_vp8_bilin8_hv_neon, export=1
   1671        mov             w7,     #8
   1672        dup             v0.8b,  w5             // mx
   1673        sub             w5,     w7,     w5
   1674        dup             v1.8b,  w5
   1675        dup             v2.8b,  w6             // my
   1676        sub             w6,     w7,     w6
   1677        dup             v3.8b,  w6
   1678 
   1679        ld1             {v4.8b,v5.8b},  [x2],  x3
   1680        ext             v5.8b,  v4.8b,  v5.8b, #1
   1681        umull           v18.8h, v4.8b,  v1.8b
   1682        umlal           v18.8h, v5.8b,  v0.8b
   1683        rshrn           v22.8b, v18.8h, #3
   1684 1:
   1685        subs            w4,     w4,     #2
   1686        ld1             {v6.8b,v7.8b},  [x2],  x3
   1687        ext             v7.8b,  v6.8b,  v7.8b, #1
   1688        umull           v16.8h, v6.8b,  v1.8b
   1689        umlal           v16.8h, v7.8b,  v0.8b
   1690        ld1             {v4.8b,v5.8b},  [x2],  x3
   1691        ext             v5.8b,  v4.8b,  v5.8b, #1
   1692        umull           v18.8h, v4.8b,  v1.8b
   1693        umlal           v18.8h, v5.8b,  v0.8b
   1694        rshrn           v16.8b, v16.8h, #3
   1695        umull           v20.8h, v22.8b, v3.8b
   1696        umlal           v20.8h, v16.8b, v2.8b
   1697        rshrn           v22.8b, v18.8h, #3
   1698        umull           v24.8h, v16.8b, v3.8b
   1699        umlal           v24.8h, v22.8b, v2.8b
   1700        rshrn           v20.8b, v20.8h, #3
   1701        st1             {v20.8b}, [x0], x1
   1702        rshrn           v23.8b, v24.8h, #3
   1703        st1             {v23.8b}, [x0], x1
   1704        b.gt            1b
   1705 
   1706        ret
   1707 endfunc
   1708 
   1709 function ff_put_vp8_bilin4_h_neon, export=1
   1710        mov             w7,      #8
   1711        dup             v0.8b,   w5
   1712        sub             w5,      w7,     w5
   1713        dup             v1.8b,   w5
   1714 1:
   1715        subs            w4,      w4,     #2
   1716        ld1             {v2.8b}, [x2],   x3
   1717        ext             v3.8b,   v2.8b,  v3.8b,  #1
   1718        ld1             {v6.8b}, [x2],   x3
   1719        ext             v7.8b,   v6.8b,  v7.8b,  #1
   1720        trn1            v2.2s,   v2.2s,  v6.2s
   1721        trn1            v3.2s,   v3.2s,  v7.2s
   1722        umull           v4.8h,   v2.8b,  v1.8b
   1723        umlal           v4.8h,   v3.8b,  v0.8b
   1724        rshrn           v4.8b,   v4.8h,  #3
   1725        st1             {v4.s}[0], [x0], x1
   1726        st1             {v4.s}[1], [x0], x1
   1727        b.gt            1b
   1728 
   1729        ret
   1730 endfunc
   1731 
   1732 function ff_put_vp8_bilin4_v_neon, export=1
   1733        mov             w7,     #8
   1734        dup             v0.8b,  w6
   1735        sub             w6,     w7,  w6
   1736        dup             v1.8b,  w6
   1737 
   1738        ld1r            {v2.2s},    [x2], x3
   1739 1:
   1740        ld1r            {v3.2s},   [x2]
   1741        ld1             {v2.s}[1], [x2], x3
   1742        ld1             {v3.s}[1], [x2], x3
   1743        umull           v4.8h,  v2.8b,  v1.8b
   1744        umlal           v4.8h,  v3.8b,  v0.8b
   1745        trn2            v2.2s,  v3.2s,  v2.2s
   1746        rshrn           v4.8b,  v4.8h,  #3
   1747        st1             {v4.s}[0], [x0], x1
   1748        st1             {v4.s}[1], [x0], x1
   1749        subs            w4,     w4,     #2
   1750        b.gt            1b
   1751 
   1752        ret
   1753 endfunc
   1754 
   1755 function ff_put_vp8_bilin4_hv_neon, export=1
   1756        mov             w7,      #8
   1757        dup             v0.8b,   w5             // mx
   1758        sub             w5,      w7,     w5
   1759        dup             v1.8b,   w5
   1760        dup             v2.8b,   w6             // my
   1761        sub             w6,      w7,     w6
   1762        dup             v3.8b,   w6
   1763 
   1764        ld1             {v4.8b}, [x2],   x3
   1765        ext             v5.8b,   v4.8b,  v4.8b,  #1
   1766        umull           v18.8h,  v4.8b,  v1.8b
   1767        umlal           v18.8h,  v5.8b,  v0.8b
   1768        rshrn           v22.8b,  v18.8h, #3
   1769 1:
   1770        subs            w4,      w4,     #2
   1771        ld1             {v6.8b}, [x2],   x3
   1772        ext             v7.8b,   v6.8b,  v6.8b,  #1
   1773        ld1             {v4.8b}, [x2],   x3
   1774        ext             v5.8b,   v4.8b,  v4.8b,  #1
   1775        trn1            v6.2s,   v6.2s,  v4.2s
   1776        trn1            v7.2s,   v7.2s,  v5.2s
   1777        umull           v16.8h,  v6.8b,  v1.8b
   1778        umlal           v16.8h,  v7.8b,  v0.8b
   1779        rshrn           v16.8b,  v16.8h, #3
   1780        umull           v20.8h,  v16.8b, v2.8b
   1781        trn1            v22.2s,  v22.2s, v16.2s
   1782        umlal           v20.8h,  v22.8b, v3.8b
   1783        rev64           v22.2s,  v16.2s
   1784        rshrn           v20.8b,  v20.8h, #3
   1785        st1             {v20.s}[0], [x0], x1
   1786        st1             {v20.s}[1], [x0], x1
   1787        b.gt            1b
   1788 
   1789        ret
   1790 endfunc