tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

vp8dsp_neon.S (66432B)


      1 /*
      2 * VP8 NEON optimisations
      3 *
      4 * Copyright (c) 2010 Rob Clark <rob@ti.com>
      5 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
      6 *
      7 * This file is part of FFmpeg.
      8 *
      9 * FFmpeg is free software; you can redistribute it and/or
     10 * modify it under the terms of the GNU Lesser General Public
     11 * License as published by the Free Software Foundation; either
     12 * version 2.1 of the License, or (at your option) any later version.
     13 *
     14 * FFmpeg is distributed in the hope that it will be useful,
     15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     17 * Lesser General Public License for more details.
     18 *
     19 * You should have received a copy of the GNU Lesser General Public
     20 * License along with FFmpeg; if not, write to the Free Software
     21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     22 */
     23 
     24 #include "libavutil/arm/asm.S"
     25 #include "neon.S"
     26 
     27 function ff_vp8_luma_dc_wht_neon, export=1
     28        vld1.16         {q0-q1},  [r1,:128]
     29        vmov.i16        q15, #0
     30 
     31        vadd.i16        d4,  d0,  d3
     32        vadd.i16        d6,  d1,  d2
     33        vst1.16         {q15},    [r1,:128]!
     34        vsub.i16        d7,  d1,  d2
     35        vsub.i16        d5,  d0,  d3
     36        vst1.16         {q15},    [r1,:128]
     37        vadd.i16        q0,  q2,  q3
     38        vsub.i16        q1,  q2,  q3
     39 
     40        vmov.i16        q8, #3
     41 
     42        vtrn.32         d0,  d2
     43        vtrn.32         d1,  d3
     44        vtrn.16         d0,  d1
     45        vtrn.16         d2,  d3
     46 
     47        vadd.i16        d0,  d0,  d16
     48 
     49        vadd.i16        d4,  d0,  d3
     50        vadd.i16        d6,  d1,  d2
     51        vsub.i16        d7,  d1,  d2
     52        vsub.i16        d5,  d0,  d3
     53        vadd.i16        q0,  q2,  q3
     54        vsub.i16        q1,  q2,  q3
     55 
     56        vshr.s16        q0,  q0,  #3
     57        vshr.s16        q1,  q1,  #3
     58 
     59        mov             r3,  #32
     60        vst1.16         {d0[0]},  [r0,:16], r3
     61        vst1.16         {d1[0]},  [r0,:16], r3
     62        vst1.16         {d2[0]},  [r0,:16], r3
     63        vst1.16         {d3[0]},  [r0,:16], r3
     64        vst1.16         {d0[1]},  [r0,:16], r3
     65        vst1.16         {d1[1]},  [r0,:16], r3
     66        vst1.16         {d2[1]},  [r0,:16], r3
     67        vst1.16         {d3[1]},  [r0,:16], r3
     68        vst1.16         {d0[2]},  [r0,:16], r3
     69        vst1.16         {d1[2]},  [r0,:16], r3
     70        vst1.16         {d2[2]},  [r0,:16], r3
     71        vst1.16         {d3[2]},  [r0,:16], r3
     72        vst1.16         {d0[3]},  [r0,:16], r3
     73        vst1.16         {d1[3]},  [r0,:16], r3
     74        vst1.16         {d2[3]},  [r0,:16], r3
     75        vst1.16         {d3[3]},  [r0,:16], r3
     76 
     77        bx              lr
     78 endfunc
     79 
     80 function ff_vp8_idct_add_neon, export=1
     81        vld1.16         {q0-q1},  [r1,:128]
     82        movw            r3,  #20091
     83        movt            r3,  #35468/2
     84        vdup.32         d4,  r3
     85 
     86        vmull.s16       q12, d1,  d4[0]
     87        vmull.s16       q13, d3,  d4[0]
     88        vqdmulh.s16     d20, d1,  d4[1]
     89        vqdmulh.s16     d23, d3,  d4[1]
     90        vshrn.s32       d21, q12, #16
     91        vshrn.s32       d22, q13, #16
     92        vadd.s16        d21, d21, d1
     93        vadd.s16        d22, d22, d3
     94 
     95        vadd.s16        d16, d0,  d2
     96        vsub.s16        d17, d0,  d2
     97        vadd.s16        d18, d21, d23
     98        vsub.s16        d19, d20, d22
     99        vadd.s16        q0,  q8,  q9
    100        vsub.s16        q1,  q8,  q9
    101 
    102        vtrn.32         d0,  d3
    103        vtrn.32         d1,  d2
    104        vtrn.16         d0,  d1
    105        vtrn.16         d3,  d2
    106 
    107        vmov.i16        q15, #0
    108        vmull.s16       q12, d1,  d4[0]
    109        vst1.16         {q15},    [r1,:128]!
    110        vmull.s16       q13, d2,  d4[0]
    111        vst1.16         {q15},    [r1,:128]
    112        vqdmulh.s16     d21, d1,  d4[1]
    113        vqdmulh.s16     d23, d2,  d4[1]
    114        vshrn.s32       d20, q12, #16
    115        vshrn.s32       d22, q13, #16
    116        vadd.i16        d20, d20, d1
    117        vadd.i16        d22, d22, d2
    118 
    119        vadd.i16        d16, d0,  d3
    120        vsub.i16        d17, d0,  d3
    121        vadd.i16        d18, d20, d23
    122        vld1.32         {d20[]},  [r0,:32], r2
    123        vsub.i16        d19, d21, d22
    124        vld1.32         {d22[]},  [r0,:32], r2
    125        vadd.s16        q0,  q8,  q9
    126        vld1.32         {d23[]},  [r0,:32], r2
    127        vsub.s16        q1,  q8,  q9
    128        vld1.32         {d21[]},  [r0,:32], r2
    129        vrshr.s16       q0,  q0,  #3
    130        vtrn.32         q10, q11
    131        vrshr.s16       q1,  q1,  #3
    132 
    133        sub             r0,  r0,  r2,  lsl #2
    134 
    135        vtrn.32         d0,  d3
    136        vtrn.32         d1,  d2
    137        vtrn.16         d0,  d1
    138        vtrn.16         d3,  d2
    139 
    140        vaddw.u8        q0,  q0,  d20
    141        vaddw.u8        q1,  q1,  d21
    142        vqmovun.s16     d0,  q0
    143        vqmovun.s16     d1,  q1
    144 
    145        vst1.32         {d0[0]},  [r0,:32], r2
    146        vst1.32         {d0[1]},  [r0,:32], r2
    147        vst1.32         {d1[1]},  [r0,:32], r2
    148        vst1.32         {d1[0]},  [r0,:32], r2
    149 
    150        bx              lr
    151 endfunc
    152 
    153 function ff_vp8_idct_dc_add_neon, export=1
    154        mov             r3,  #0
    155        ldrsh           r12, [r1]
    156        strh            r3,  [r1]
    157        vdup.16         q1,  r12
    158        vrshr.s16       q1,  q1,  #3
    159        vld1.32         {d0[]},   [r0,:32], r2
    160        vld1.32         {d1[]},   [r0,:32], r2
    161        vld1.32         {d0[1]},  [r0,:32], r2
    162        vld1.32         {d1[1]},  [r0,:32], r2
    163        vaddw.u8        q2,  q1,  d0
    164        vaddw.u8        q3,  q1,  d1
    165        sub             r0,  r0,  r2, lsl #2
    166        vqmovun.s16     d0,  q2
    167        vqmovun.s16     d1,  q3
    168        vst1.32         {d0[0]},  [r0,:32], r2
    169        vst1.32         {d1[0]},  [r0,:32], r2
    170        vst1.32         {d0[1]},  [r0,:32], r2
    171        vst1.32         {d1[1]},  [r0,:32], r2
    172        bx              lr
    173 endfunc
    174 
    175 function ff_vp8_idct_dc_add4uv_neon, export=1
    176        vmov.i16        d0,  #0
    177        mov             r3,  #32
    178        vld1.16         {d16[]},  [r1,:16]
    179        vst1.16         {d0[0]},  [r1,:16], r3
    180        vld1.16         {d17[]},  [r1,:16]
    181        vst1.16         {d0[0]},  [r1,:16], r3
    182        vld1.16         {d18[]},  [r1,:16]
    183        vst1.16         {d0[0]},  [r1,:16], r3
    184        vld1.16         {d19[]},  [r1,:16]
    185        vst1.16         {d0[0]},  [r1,:16], r3
    186        mov             r3,  r0
    187        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
    188        vld1.8          {d0},     [r0,:64], r2
    189        vrshr.s16       q9,  q9,  #3
    190        vld1.8          {d1},     [r0,:64], r2
    191        vaddw.u8        q10, q8,  d0
    192        vld1.8          {d2},     [r0,:64], r2
    193        vaddw.u8        q0,  q8,  d1
    194        vld1.8          {d3},     [r0,:64], r2
    195        vaddw.u8        q11, q8,  d2
    196        vld1.8          {d4},     [r0,:64], r2
    197        vaddw.u8        q1,  q8,  d3
    198        vld1.8          {d5},     [r0,:64], r2
    199        vaddw.u8        q12, q9,  d4
    200        vld1.8          {d6},     [r0,:64], r2
    201        vaddw.u8        q2,  q9,  d5
    202        vld1.8          {d7},     [r0,:64], r2
    203        vaddw.u8        q13, q9,  d6
    204        vqmovun.s16     d20, q10
    205        vaddw.u8        q3,  q9,  d7
    206        vqmovun.s16     d21, q0
    207        vqmovun.s16     d22, q11
    208        vst1.8          {d20},    [r3,:64], r2
    209        vqmovun.s16     d23, q1
    210        vst1.8          {d21},    [r3,:64], r2
    211        vqmovun.s16     d24, q12
    212        vst1.8          {d22},    [r3,:64], r2
    213        vqmovun.s16     d25, q2
    214        vst1.8          {d23},    [r3,:64], r2
    215        vqmovun.s16     d26, q13
    216        vst1.8          {d24},    [r3,:64], r2
    217        vqmovun.s16     d27, q3
    218        vst1.8          {d25},    [r3,:64], r2
    219        vst1.8          {d26},    [r3,:64], r2
    220        vst1.8          {d27},    [r3,:64], r2
    221 
    222        bx              lr
    223 endfunc
    224 
    225 function ff_vp8_idct_dc_add4y_neon, export=1
    226        vmov.i16        d0,  #0
    227        mov             r3,  #32
    228        vld1.16         {d16[]},  [r1,:16]
    229        vst1.16         {d0[0]},  [r1,:16], r3
    230        vld1.16         {d17[]},  [r1,:16]
    231        vst1.16         {d0[0]},  [r1,:16], r3
    232        vld1.16         {d18[]},  [r1,:16]
    233        vst1.16         {d0[0]},  [r1,:16], r3
    234        vld1.16         {d19[]},  [r1,:16]
    235        vst1.16         {d0[0]},  [r1,:16], r3
    236        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
    237        vld1.8          {q0},     [r0,:128], r2
    238        vrshr.s16       q9,  q9,  #3
    239        vld1.8          {q1},     [r0,:128], r2
    240        vaddw.u8        q10, q8,  d0
    241        vld1.8          {q2},     [r0,:128], r2
    242        vaddw.u8        q0,  q9,  d1
    243        vld1.8          {q3},     [r0,:128], r2
    244        vaddw.u8        q11, q8,  d2
    245        vaddw.u8        q1,  q9,  d3
    246        vaddw.u8        q12, q8,  d4
    247        vaddw.u8        q2,  q9,  d5
    248        vaddw.u8        q13, q8,  d6
    249        vaddw.u8        q3,  q9,  d7
    250        sub             r0,  r0,  r2,  lsl #2
    251        vqmovun.s16     d20, q10
    252        vqmovun.s16     d21, q0
    253        vqmovun.s16     d22, q11
    254        vqmovun.s16     d23, q1
    255        vqmovun.s16     d24, q12
    256        vst1.8          {q10},    [r0,:128], r2
    257        vqmovun.s16     d25, q2
    258        vst1.8          {q11},    [r0,:128], r2
    259        vqmovun.s16     d26, q13
    260        vst1.8          {q12},    [r0,:128], r2
    261        vqmovun.s16     d27, q3
    262        vst1.8          {q13},    [r0,:128], r2
    263 
    264        bx              lr
    265 endfunc
    266 
    267 @ Register layout:
    268 @   P3..Q3 -> q0..q7
    269 @   flim_E -> q14
    270 @   flim_I -> q15
    271 @   hev_thresh -> r12
    272 @
    273 .macro  vp8_loop_filter, inner=0, simple=0
    274    .if \simple
    275        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
    276        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
    277        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
    278        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
    279        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
    280        vmov.i8         q13, #0x80
    281        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    282    .else
    283        @ calculate hev and normal_limit:
    284        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
    285        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
    286        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
    287        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
    288        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
    289        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
    290        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
    291        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
    292        vand            q8,  q8,  q9
    293        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
    294        vand            q8,  q8,  q11
    295        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
    296        vand            q8,  q8,  q10
    297        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
    298        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
    299        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
    300        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
    301        vand            q8,  q8,  q10
    302        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
    303        vand            q8,  q8,  q11
    304        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
    305        vdup.8          q15, r12                @ hev_thresh
    306        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
    307        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
    308        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
    309        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
    310        vand            q8,  q8,  q11
    311        vmov.i8         q13, #0x80
    312        vorr            q9,  q12, q14
    313    .endif
    314 
    315        @ at this point:
    316        @   q8: normal_limit
    317        @   q9: hev
    318 
    319        @ convert to signed value:
    320        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
    321        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80
    322 
    323        vmov.i16        q12, #3
    324        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
    325        vsubl.s8        q11, d9,  d7            @   (widened to 16 bits)
    326        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
    327        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
    328        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
    329        vmul.i16        q11, q11, q12
    330 
    331        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
    332        vmov.i8         q14, #4
    333        vmov.i8         q15, #3
    334    .if \inner
    335        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
    336    .endif
    337        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
    338        vaddw.s8        q11, q11, d25
    339        vqmovn.s16      d20, q10                @ narrow result back into q10
    340        vqmovn.s16      d21, q11
    341    .if !\inner && !\simple
    342        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
    343        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
    344    .endif
    345        vand            q10, q10, q8            @ w &= normal_limit
    346 
    347        @ registers used at this point..
    348        @   q0 -> P3  (don't corrupt)
    349        @   q1-q6 -> PS2-QS2
    350        @   q7 -> Q3  (don't corrupt)
    351        @   q9 -> hev
    352        @   q10 -> w
    353        @   q13 -> #0x80
    354        @   q14 -> #4
    355        @   q15 -> #3
    356        @   q8, q11, q12 -> unused
    357 
    358        @ filter_common:   is4tap==1
    359        @   c1 = clamp(w + 4) >> 3;
    360        @   c2 = clamp(w + 3) >> 3;
    361        @   Q0 = s2u(QS0 - c1);
    362        @   P0 = s2u(PS0 + c2);
    363 
    364    .if \simple
    365        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
    366        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
    367        vshr.s8         q11, q11, #3            @ c1 >>= 3
    368        vshr.s8         q12, q12, #3            @ c2 >>= 3
    369        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
    370        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
    371        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
    372        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
    373        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
    374        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    375    .elseif \inner
    376        @ the !is4tap case of filter_common, only used for inner blocks
    377        @   c3 = ((c1&~hev) + 1) >> 1;
    378        @   Q1 = s2u(QS1 - c3);
    379        @   P1 = s2u(PS1 + c3);
    380        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
    381        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
    382        vshr.s8         q11, q11, #3            @ c1 >>= 3
    383        vshr.s8         q12, q12, #3            @ c2 >>= 3
    384        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
    385        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
    386        vbic            q11, q11, q9            @ c1 & ~hev
    387        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
    388        vrshr.s8        q11, q11, #1            @ c3 >>= 1
    389        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
    390        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
    391        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
    392        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
    393        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    394    .else
    395        vand            q12, q10, q9            @ w & hev
    396        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
    397        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
    398        vshr.s8         q11, q11, #3            @ c1 >>= 3
    399        vshr.s8         q12, q12, #3            @ c2 >>= 3
    400        vbic            q10, q10, q9            @ w &= ~hev
    401        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
    402        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
    403 
    404        @ filter_mbedge:
    405        @   a = clamp((27*w + 63) >> 7);
    406        @   Q0 = s2u(QS0 - a);
    407        @   P0 = s2u(PS0 + a);
    408        @   a = clamp((18*w + 63) >> 7);
    409        @   Q1 = s2u(QS1 - a);
    410        @   P1 = s2u(PS1 + a);
    411        @   a = clamp((9*w + 63) >> 7);
    412        @   Q2 = s2u(QS2 - a);
    413        @   P2 = s2u(PS2 + a);
    414        vmov.i16        q9,  #63
    415        vshll.s8        q14, d20, #3
    416        vshll.s8        q15, d21, #3
    417        vaddw.s8        q14, q14, d20
    418        vaddw.s8        q15, q15, d21
    419        vadd.s16        q8,  q9,  q14
    420        vadd.s16        q9,  q9,  q15           @  9*w + 63
    421        vadd.s16        q11, q8,  q14
    422        vadd.s16        q12, q9,  q15           @ 18*w + 63
    423        vadd.s16        q14, q11, q14
    424        vadd.s16        q15, q12, q15           @ 27*w + 63
    425        vqshrn.s16      d16, q8,  #7
    426        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
    427        vqshrn.s16      d22, q11, #7
    428        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
    429        vqshrn.s16      d28, q14, #7
    430        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
    431        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
    432        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
    433        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
    434        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
    435        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
    436        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
    437        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
    438        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
    439        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    440        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
    441        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
    442        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
    443    .endif
    444 .endm
    445 
    446 .macro  vp8_v_loop_filter16 name, inner=0, simple=0
    447 function ff_vp8_v_loop_filter16\name\()_neon, export=1
    448        vpush           {q4-q7}
    449        sub             r0,  r0,  r1,  lsl #1+!\simple
    450 
    451        @ Load pixels:
    452    .if !\simple
    453        ldr             r12, [sp, #64]          @ hev_thresh
    454        vld1.8          {q0},     [r0,:128], r1 @ P3
    455        vld1.8          {q1},     [r0,:128], r1 @ P2
    456    .endif
    457        vld1.8          {q2},     [r0,:128], r1 @ P1
    458        vld1.8          {q3},     [r0,:128], r1 @ P0
    459        vld1.8          {q4},     [r0,:128], r1 @ Q0
    460        vld1.8          {q5},     [r0,:128], r1 @ Q1
    461    .if !\simple
    462        vld1.8          {q6},     [r0,:128], r1 @ Q2
    463        vld1.8          {q7},     [r0,:128]     @ Q3
    464        vdup.8          q15, r3                 @ flim_I
    465    .endif
    466        vdup.8          q14, r2                 @ flim_E
    467 
    468        vp8_loop_filter inner=\inner, simple=\simple
    469 
    470        @ back up to P2:  dst -= stride * 6
    471        sub             r0,  r0,  r1,  lsl #2
    472    .if !\simple
    473        sub             r0,  r0,  r1,  lsl #1
    474 
    475        @ Store pixels:
    476        vst1.8          {q1},     [r0,:128], r1 @ P2
    477    .endif
    478        vst1.8          {q2},     [r0,:128], r1 @ P1
    479        vst1.8          {q3},     [r0,:128], r1 @ P0
    480        vst1.8          {q4},     [r0,:128], r1 @ Q0
    481        vst1.8          {q5},     [r0,:128], r1 @ Q1
    482    .if !\simple
    483        vst1.8          {q6},     [r0,:128]     @ Q2
    484    .endif
    485 
    486        vpop            {q4-q7}
    487        bx              lr
    488 endfunc
    489 .endm
    490 
    491 vp8_v_loop_filter16
    492 vp8_v_loop_filter16 _inner,  inner=1
    493 vp8_v_loop_filter16 _simple, simple=1
    494 
    495 .macro  vp8_v_loop_filter8uv name, inner=0
    496 function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
    497        vpush           {q4-q7}
    498        sub             r0,  r0,  r2,  lsl #2
    499        sub             r1,  r1,  r2,  lsl #2
    500        ldr             r12, [sp, #64]          @ flim_I
    501 
    502        @ Load pixels:
    503        vld1.8          {d0},     [r0,:64], r2  @ P3
    504        vld1.8          {d1},     [r1,:64], r2  @ P3
    505        vld1.8          {d2},     [r0,:64], r2  @ P2
    506        vld1.8          {d3},     [r1,:64], r2  @ P2
    507        vld1.8          {d4},     [r0,:64], r2  @ P1
    508        vld1.8          {d5},     [r1,:64], r2  @ P1
    509        vld1.8          {d6},     [r0,:64], r2  @ P0
    510        vld1.8          {d7},     [r1,:64], r2  @ P0
    511        vld1.8          {d8},     [r0,:64], r2  @ Q0
    512        vld1.8          {d9},     [r1,:64], r2  @ Q0
    513        vld1.8          {d10},    [r0,:64], r2  @ Q1
    514        vld1.8          {d11},    [r1,:64], r2  @ Q1
    515        vld1.8          {d12},    [r0,:64], r2  @ Q2
    516        vld1.8          {d13},    [r1,:64], r2  @ Q2
    517        vld1.8          {d14},    [r0,:64]      @ Q3
    518        vld1.8          {d15},    [r1,:64]      @ Q3
    519 
    520        vdup.8          q14, r3                 @ flim_E
    521        vdup.8          q15, r12                @ flim_I
    522        ldr             r12, [sp, #68]          @ hev_thresh
    523 
    524        vp8_loop_filter inner=\inner
    525 
    526        @ back up to P2:  u,v -= stride * 6
    527        sub             r0,  r0,  r2,  lsl #2
    528        sub             r1,  r1,  r2,  lsl #2
    529        sub             r0,  r0,  r2,  lsl #1
    530        sub             r1,  r1,  r2,  lsl #1
    531 
    532        @ Store pixels:
    533        vst1.8          {d2},     [r0,:64], r2  @ P2
    534        vst1.8          {d3},     [r1,:64], r2  @ P2
    535        vst1.8          {d4},     [r0,:64], r2  @ P1
    536        vst1.8          {d5},     [r1,:64], r2  @ P1
    537        vst1.8          {d6},     [r0,:64], r2  @ P0
    538        vst1.8          {d7},     [r1,:64], r2  @ P0
    539        vst1.8          {d8},     [r0,:64], r2  @ Q0
    540        vst1.8          {d9},     [r1,:64], r2  @ Q0
    541        vst1.8          {d10},    [r0,:64], r2  @ Q1
    542        vst1.8          {d11},    [r1,:64], r2  @ Q1
    543        vst1.8          {d12},    [r0,:64]      @ Q2
    544        vst1.8          {d13},    [r1,:64]      @ Q2
    545 
    546        vpop            {q4-q7}
    547        bx              lr
    548 endfunc
    549 .endm
    550 
    551 vp8_v_loop_filter8uv
    552 vp8_v_loop_filter8uv _inner, inner=1
    553 
    554 .macro  vp8_h_loop_filter16 name, inner=0, simple=0
    555 function ff_vp8_h_loop_filter16\name\()_neon, export=1
    556        vpush           {q4-q7}
    557        sub             r0,  r0,  #4
    558    .if !\simple
    559        ldr             r12, [sp, #64]          @ hev_thresh
    560    .endif
    561 
    562        @ Load pixels:
    563        vld1.8          {d0},     [r0], r1      @ load first 8-line src data
    564        vld1.8          {d2},     [r0], r1
    565        vld1.8          {d4},     [r0], r1
    566        vld1.8          {d6},     [r0], r1
    567        vld1.8          {d8},     [r0], r1
    568        vld1.8          {d10},    [r0], r1
    569        vld1.8          {d12},    [r0], r1
    570        vld1.8          {d14},    [r0], r1
    571        vld1.8          {d1},     [r0], r1      @ load second 8-line src data
    572        vld1.8          {d3},     [r0], r1
    573        vld1.8          {d5},     [r0], r1
    574        vld1.8          {d7},     [r0], r1
    575        vld1.8          {d9},     [r0], r1
    576        vld1.8          {d11},    [r0], r1
    577        vld1.8          {d13},    [r0], r1
    578        vld1.8          {d15},    [r0], r1
    579 
    580        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
    581 
    582        vdup.8          q14, r2                 @ flim_E
    583    .if !\simple
    584        vdup.8          q15, r3                 @ flim_I
    585    .endif
    586 
    587        vp8_loop_filter inner=\inner, simple=\simple
    588 
    589        sub             r0,  r0,  r1, lsl #4    @ backup 16 rows
    590 
    591        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
    592 
    593        @ Store pixels:
    594        vst1.8          {d0},     [r0],     r1
    595        vst1.8          {d2},     [r0],     r1
    596        vst1.8          {d4},     [r0],     r1
    597        vst1.8          {d6},     [r0],     r1
    598        vst1.8          {d8},     [r0],     r1
    599        vst1.8          {d10},    [r0],     r1
    600        vst1.8          {d12},    [r0],     r1
    601        vst1.8          {d14},    [r0],     r1
    602        vst1.8          {d1},     [r0],     r1
    603        vst1.8          {d3},     [r0],     r1
    604        vst1.8          {d5},     [r0],     r1
    605        vst1.8          {d7},     [r0],     r1
    606        vst1.8          {d9},     [r0],     r1
    607        vst1.8          {d11},    [r0],     r1
    608        vst1.8          {d13},    [r0],     r1
    609        vst1.8          {d15},    [r0]
    610 
    611        vpop            {q4-q7}
    612        bx              lr
    613 endfunc
    614 .endm
    615 
    616 vp8_h_loop_filter16
    617 vp8_h_loop_filter16 _inner,  inner=1
    618 vp8_h_loop_filter16 _simple, simple=1
    619 
    620 .macro  vp8_h_loop_filter8uv name, inner=0
    621 function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
    622        vpush           {q4-q7}
    623        sub             r0,  r0,  #4
    624        sub             r1,  r1,  #4
    625        ldr             r12, [sp, #64]          @ flim_I
    626 
    627        @ Load pixels:
    628        vld1.8          {d0},     [r0], r2      @ load u
    629        vld1.8          {d1},     [r1], r2      @ load v
    630        vld1.8          {d2},     [r0], r2
    631        vld1.8          {d3},     [r1], r2
    632        vld1.8          {d4},     [r0], r2
    633        vld1.8          {d5},     [r1], r2
    634        vld1.8          {d6},     [r0], r2
    635        vld1.8          {d7},     [r1], r2
    636        vld1.8          {d8},     [r0], r2
    637        vld1.8          {d9},     [r1], r2
    638        vld1.8          {d10},    [r0], r2
    639        vld1.8          {d11},    [r1], r2
    640        vld1.8          {d12},    [r0], r2
    641        vld1.8          {d13},    [r1], r2
    642        vld1.8          {d14},    [r0], r2
    643        vld1.8          {d15},    [r1], r2
    644 
    645        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
    646 
    647        vdup.8          q14, r3                 @ flim_E
    648        vdup.8          q15, r12                @ flim_I
    649        ldr             r12, [sp, #68]          @ hev_thresh
    650 
    651        vp8_loop_filter inner=\inner
    652 
    653        sub             r0,  r0,  r2, lsl #3    @ backup u 8 rows
    654        sub             r1,  r1,  r2, lsl #3    @ backup v 8 rows
    655 
    656        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
    657 
    658        @ Store pixels:
    659        vst1.8          {d0},     [r0], r2
    660        vst1.8          {d1},     [r1], r2
    661        vst1.8          {d2},     [r0], r2
    662        vst1.8          {d3},     [r1], r2
    663        vst1.8          {d4},     [r0], r2
    664        vst1.8          {d5},     [r1], r2
    665        vst1.8          {d6},     [r0], r2
    666        vst1.8          {d7},     [r1], r2
    667        vst1.8          {d8},     [r0], r2
    668        vst1.8          {d9},     [r1], r2
    669        vst1.8          {d10},    [r0], r2
    670        vst1.8          {d11},    [r1], r2
    671        vst1.8          {d12},    [r0], r2
    672        vst1.8          {d13},    [r1], r2
    673        vst1.8          {d14},    [r0]
    674        vst1.8          {d15},    [r1]
    675 
    676        vpop            {q4-q7}
    677        bx              lr
    678 endfunc
    679 .endm
    680 
    681 vp8_h_loop_filter8uv
    682 vp8_h_loop_filter8uv _inner, inner=1
    683 
    684 function ff_put_vp8_pixels16_neon, export=1
    685        ldr             r12, [sp, #0]           @ h
    686 1:
    687        subs            r12, r12, #4
    688        vld1.8          {q0},     [r2], r3
    689        vld1.8          {q1},     [r2], r3
    690        vld1.8          {q2},     [r2], r3
    691        vld1.8          {q3},     [r2], r3
    692        vst1.8          {q0},     [r0,:128], r1
    693        vst1.8          {q1},     [r0,:128], r1
    694        vst1.8          {q2},     [r0,:128], r1
    695        vst1.8          {q3},     [r0,:128], r1
    696        bgt             1b
    697        bx              lr
    698 endfunc
    699 
    700 function ff_put_vp8_pixels8_neon, export=1
    701        ldr             r12, [sp, #0]           @ h
    702 1:
    703        subs            r12, r12, #4
    704        vld1.8          {d0},     [r2], r3
    705        vld1.8          {d1},     [r2], r3
    706        vld1.8          {d2},     [r2], r3
    707        vld1.8          {d3},     [r2], r3
    708        vst1.8          {d0},     [r0,:64], r1
    709        vst1.8          {d1},     [r0,:64], r1
    710        vst1.8          {d2},     [r0,:64], r1
    711        vst1.8          {d3},     [r0,:64], r1
    712        bgt             1b
    713        bx              lr
    714 endfunc
    715 
    716 /* 4/6-tap 8th-pel MC */
    717 
    718 .macro  vp8_epel8_h6    d,   a,   b
    719        vext.8          d27, \a,  \b,  #1
    720        vmovl.u8        q8,  \a
    721        vext.8          d28, \a,  \b,  #2
    722        vmovl.u8        q9,  d27
    723        vext.8          d29, \a,  \b,  #3
    724        vmovl.u8        q10, d28
    725        vext.8          d30, \a,  \b,  #4
    726        vmovl.u8        q11, d29
    727        vext.8          d31, \a,  \b,  #5
    728        vmovl.u8        q12, d30
    729        vmul.u16        q10, q10, d0[2]
    730        vmovl.u8        q13, d31
    731        vmul.u16        q11, q11, d0[3]
    732        vmls.u16        q10, q9,  d0[1]
    733        vmls.u16        q11, q12, d1[0]
    734        vmla.u16        q10, q8,  d0[0]
    735        vmla.u16        q11, q13, d1[1]
    736        vqadd.s16       q11, q10, q11
    737        vqrshrun.s16    \d,  q11, #7
    738 .endm
    739 
    740 .macro  vp8_epel16_h6   d0,  d1,  s0,  s1,  s2,  q0,  q1
    741        vext.8          q14, \q0, \q1, #3
    742        vext.8          q15, \q0, \q1, #4
    743        vmovl.u8        q11, d28
    744        vmovl.u8        q14, d29
    745        vext.8          q3,  \q0, \q1, #2
    746        vmovl.u8        q12, d30
    747        vmovl.u8        q15, d31
    748        vext.8          q8,  \q0, \q1, #1
    749        vmovl.u8        q10, d6
    750        vmovl.u8        q3,  d7
    751        vext.8          q2,  \q0, \q1, #5
    752        vmovl.u8        q13, d4
    753        vmovl.u8        q2,  d5
    754        vmovl.u8        q9,  d16
    755        vmovl.u8        q8,  d17
    756        vmul.u16        q11, q11, d0[3]
    757        vmul.u16        q10, q10, d0[2]
    758        vmul.u16        q3,  q3,  d0[2]
    759        vmul.u16        q14, q14, d0[3]
    760        vmls.u16        q11, q12, d1[0]
    761        vmovl.u8        q12, \s0
    762        vmovl.u8        q1,  \s1
    763        vmls.u16        q10, q9,  d0[1]
    764        vmls.u16        q3,  q8,  d0[1]
    765        vmls.u16        q14, q15, d1[0]
    766        vmla.u16        q10, q12, d0[0]
    767        vmla.u16        q11, q13, d1[1]
    768        vmla.u16        q3,  q1,  d0[0]
    769        vmla.u16        q14, q2,  d1[1]
    770        vqadd.s16       q11, q10, q11
    771        vqadd.s16       q14, q3,  q14
    772        vqrshrun.s16    \d0, q11, #7
    773        vqrshrun.s16    \d1, q14, #7
    774 .endm
    775 
    776 .macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
    777        vmovl.u8        q10, \s0
    778        vmovl.u8        q11, \s3
    779        vmovl.u8        q14, \s6
    780        vmovl.u8        q9,  \s1
    781        vmovl.u8        q12, \s4
    782        vmovl.u8        q8,  \s2
    783        vmovl.u8        q13, \s5
    784        vmul.u16        q10, q10, d0[0]
    785        vmul.u16        q15, q11, d0[3]
    786        vmul.u16        q11, q11, d0[2]
    787        vmul.u16        q14, q14, d1[1]
    788        vmls.u16        q10, q9,  d0[1]
    789        vmls.u16        q15, q12, d1[0]
    790        vmls.u16        q11, q8,  d0[1]
    791        vmls.u16        q14, q13, d1[0]
    792        vmla.u16        q10, q8,  d0[2]
    793        vmla.u16        q15, q13, d1[1]
    794        vmla.u16        q11, q9,  d0[0]
    795        vmla.u16        q14, q12, d0[3]
    796        vqadd.s16       q15, q10, q15
    797        vqadd.s16       q14, q11, q14
    798        vqrshrun.s16    \d0, q15, #7
    799        vqrshrun.s16    \d1, q14, #7
    800 .endm
    801 
    802 .macro  vp8_epel8_h4    d,   a,   b
    803        vext.8          d28, \a,  \b,  #1
    804        vmovl.u8        q9,  \a
    805        vext.8          d29, \a,  \b,  #2
    806        vmovl.u8        q10, d28
    807        vext.8          d30, \a,  \b,  #3
    808        vmovl.u8        q11, d29
    809        vmovl.u8        q12, d30
    810        vmul.u16        q10, q10, d0[2]
    811        vmul.u16        q11, q11, d0[3]
    812        vmls.u16        q10, q9,  d0[1]
    813        vmls.u16        q11, q12, d1[0]
    814        vqadd.s16       q11, q10, q11
    815        vqrshrun.s16    \d,  q11, #7
    816 .endm
    817 
    818 .macro  vp8_epel8_v4_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4
    819        vmovl.u8        q9,  \s0
    820        vmovl.u8        q10, \s1
    821        vmovl.u8        q11, \s2
    822        vmovl.u8        q12, \s3
    823        vmovl.u8        q13, \s4
    824        vmul.u16        q8,  q10, d0[2]
    825        vmul.u16        q14, q11, d0[3]
    826        vmul.u16        q11, q11, d0[2]
    827        vmul.u16        q15, q12, d0[3]
    828        vmls.u16        q8,  q9,  d0[1]
    829        vmls.u16        q14, q12, d1[0]
    830        vmls.u16        q11, q10, d0[1]
    831        vmls.u16        q15, q13, d1[0]
    832        vqadd.s16       q8,  q8,  q14
    833        vqadd.s16       q11, q11, q15
    834        vqrshrun.s16    \d0, q8,  #7
    835        vqrshrun.s16    \d1, q11, #7
    836 .endm
    837 
    838 function ff_put_vp8_epel16_v6_neon, export=1
    839        sub             r2,  r2,  r3,  lsl #1
    840        push            {r4,lr}
    841        vpush           {d8-d15}
    842 
    843        ldr             r4,  [sp, #80]          @ my
    844        movrel          lr,  subpel_filters-16
    845        ldr             r12, [sp, #72]          @ h
    846        add             r4,  lr,  r4, lsl #4
    847        vld1.16         {q0},     [r4,:128]
    848 1:
    849        vld1.8          {d2-d3},  [r2], r3
    850        vld1.8          {d4-d5},  [r2], r3
    851        vld1.8          {d6-d7},  [r2], r3
    852        vld1.8          {d8-d9},  [r2], r3
    853        vld1.8          {d10-d11},[r2], r3
    854        vld1.8          {d12-d13},[r2], r3
    855        vld1.8          {d14-d15},[r2]
    856        sub             r2,  r2,  r3,  lsl #2
    857 
    858        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
    859        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15
    860 
    861        vst1.8          {d2-d3},  [r0,:128], r1
    862        vst1.8          {d4-d5},  [r0,:128], r1
    863        subs            r12, r12, #2
    864        bne             1b
    865 
    866        vpop            {d8-d15}
    867        pop             {r4,pc}
    868 endfunc
    869 
    870 function ff_put_vp8_epel16_h6_neon, export=1
    871        sub             r2,  r2,  #2
    872        push            {r4,lr}
    873 
    874        ldr             r4,  [sp, #12]          @ mx
    875        movrel          lr,  subpel_filters-16
    876        ldr             r12, [sp, #8]           @ h
    877        add             r4,  lr,  r4, lsl #4
    878        vld1.16         {q0},     [r4,:128]
    879 1:
    880        vld1.8          {d2-d4},  [r2], r3
    881 
    882        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2
    883 
    884        vst1.8          {d2-d3}, [r0,:128], r1
    885        subs            r12, r12, #1
    886        bne             1b
    887 
    888        pop             {r4,pc}
    889 endfunc
    890 
    891 function ff_put_vp8_epel16_h6v6_neon, export=1
    892        sub             r2,  r2,  r3,  lsl #1
    893        sub             r2,  r2,  #2
    894        push            {r4,lr}
    895        vpush           {d8-d15}
    896 
    897        @ first pass (horizontal):
    898        ldr             r4,  [sp, #64+8+4]          @ mx
    899        movrel          lr,  subpel_filters-16
    900        ldr             r12, [sp, #64+8+0]          @ h
    901        add             r4,  lr,  r4, lsl #4
    902        sub             sp,  sp,  #336+16
    903        vld1.16         {q0},     [r4,:128]
    904        add             lr,  sp,  #15
    905        add             r12, r12, #5
    906        bic             lr,  lr,  #15
    907 1:
    908        vld1.8          {d2,d3,d4}, [r2], r3
    909 
    910        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2
    911 
    912        vst1.8          {d2-d3}, [lr,:128]!
    913        subs            r12, r12, #1
    914        bne             1b
    915 
    916        @ second pass (vertical):
    917        ldr             r4,  [sp, #336+16+64+8+8]   @ my
    918        movrel          lr,  subpel_filters-16
    919        ldr             r12, [sp, #336+16+64+8+0]   @ h
    920        add             r4,  lr,  r4, lsl #4
    921        add             lr,  sp,  #15
    922        vld1.16         {q0},     [r4,:128]
    923        bic             lr,  lr,  #15
    924 2:
    925        vld1.8          {d2-d5},  [lr,:128]!
    926        vld1.8          {d6-d9},  [lr,:128]!
    927        vld1.8          {d10-d13},[lr,:128]!
    928        vld1.8          {d14-d15},[lr,:128]
    929        sub             lr,  lr,  #64
    930 
    931        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
    932        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15
    933 
    934        vst1.8          {d2-d3}, [r0,:128], r1
    935        vst1.8          {d4-d5}, [r0,:128], r1
    936        subs            r12, r12, #2
    937        bne             2b
    938 
    939        add             sp,  sp,  #336+16
    940        vpop            {d8-d15}
    941        pop             {r4,pc}
    942 endfunc
    943 
    944 function ff_put_vp8_epel8_v6_neon, export=1
    945        sub             r2,  r2,  r3,  lsl #1
    946        push            {r4,lr}
    947 
    948        ldr             r4,  [sp, #16]          @ my
    949        movrel          lr,  subpel_filters-16
    950        ldr             r12, [sp, #8]           @ h
    951        add             r4,  lr,  r4, lsl #4
    952        vld1.16         {q0},     [r4,:128]
    953 1:
    954        vld1.8          {d2},  [r2], r3
    955        vld1.8          {d3},  [r2], r3
    956        vld1.8          {d4},  [r2], r3
    957        vld1.8          {d5},  [r2], r3
    958        vld1.8          {d6},  [r2], r3
    959        vld1.8          {d7},  [r2], r3
    960        vld1.8          {d28}, [r2]
    961 
    962        sub             r2,  r2,  r3,  lsl #2
    963 
    964        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28
    965 
    966        vst1.8          {d2}, [r0,:64], r1
    967        vst1.8          {d3}, [r0,:64], r1
    968        subs            r12, r12, #2
    969        bne             1b
    970 
    971        pop             {r4,pc}
    972 endfunc
    973 
    974 function ff_put_vp8_epel8_h6_neon, export=1
    975        sub             r2,  r2,  #2
    976        push            {r4,lr}
    977 
    978        ldr             r4,  [sp, #12]          @ mx
    979        movrel          lr,  subpel_filters-16
    980        ldr             r12, [sp, #8]           @ h
    981        add             r4,  lr,  r4, lsl #4
    982        vld1.16         {q0},     [r4,:128]
    983 1:
    984        vld1.8          {d2,d3}, [r2], r3
    985 
    986        vp8_epel8_h6    d2,  d2,  d3
    987 
    988        vst1.8          {d2}, [r0,:64], r1
    989        subs            r12, r12, #1
    990        bne             1b
    991 
    992        pop             {r4,pc}
    993 endfunc
    994 
    995 function ff_put_vp8_epel8_h6v6_neon, export=1
    996        sub             r2,  r2,  r3,  lsl #1
    997        sub             r2,  r2,  #2
    998        push            {r4,lr}
    999 
   1000        @ first pass (horizontal):
   1001        ldr             r4,  [sp, #12]          @ mx
   1002        movrel          lr,  subpel_filters-16
   1003        ldr             r12, [sp, #8]           @ h
   1004        add             r4,  lr,  r4, lsl #4
   1005        sub             sp,  sp,  #168+16
   1006        vld1.16         {q0},     [r4,:128]
   1007        add             lr,  sp,  #15
   1008        add             r12, r12, #5
   1009        bic             lr,  lr,  #15
   1010 1:
   1011        vld1.8          {d2,d3}, [r2], r3
   1012 
   1013        vp8_epel8_h6    d2,  d2,  d3
   1014 
   1015        vst1.8          {d2}, [lr,:64]!
   1016        subs            r12, r12, #1
   1017        bne             1b
   1018 
   1019        @ second pass (vertical):
   1020        ldr             r4,  [sp, #168+16+16]   @ my
   1021        movrel          lr,  subpel_filters-16
   1022        ldr             r12, [sp, #168+16+8]    @ h
   1023        add             r4,  lr,  r4, lsl #4
   1024        add             lr,  sp,  #15
   1025        vld1.16         {q0},     [r4,:128]
   1026        bic             lr,  lr,  #15
   1027 2:
   1028        vld1.8          {d2-d5},  [lr,:128]!
   1029        vld1.8          {d6-d7},  [lr,:128]!
   1030        vld1.8          {d30},    [lr,:64]
   1031        sub             lr,  lr,  #32
   1032 
   1033        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30
   1034 
   1035        vst1.8          {d2}, [r0,:64], r1
   1036        vst1.8          {d3}, [r0,:64], r1
   1037        subs            r12, r12, #2
   1038        bne             2b
   1039 
   1040        add             sp,  sp,  #168+16
   1041        pop             {r4,pc}
   1042 endfunc
   1043 
   1044 function ff_put_vp8_epel8_v4_neon, export=1
   1045        sub             r2,  r2,  r3
   1046        push            {r4,lr}
   1047 
   1048        ldr             r4,  [sp, #16]          @ my
   1049        movrel          lr,  subpel_filters-16
   1050        ldr             r12, [sp, #8]           @ h
   1051        add             r4,  lr,  r4, lsl #4
   1052        vld1.16         {q0},     [r4,:128]
   1053 1:
   1054        vld1.8          {d2},     [r2], r3
   1055        vld1.8          {d3},     [r2], r3
   1056        vld1.8          {d4},     [r2], r3
   1057        vld1.8          {d5},     [r2], r3
   1058        vld1.8          {d6},     [r2]
   1059        sub             r2,  r2,  r3,  lsl #1
   1060 
   1061        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
   1062 
   1063        vst1.8          {d2}, [r0,:64], r1
   1064        vst1.8          {d3}, [r0,:64], r1
   1065        subs            r12, r12, #2
   1066        bne             1b
   1067 
   1068        pop             {r4,pc}
   1069 endfunc
   1070 
   1071 function ff_put_vp8_epel8_h4_neon, export=1
   1072        sub             r2,  r2,  #1
   1073        push            {r4,lr}
   1074 
   1075        ldr             r4,  [sp, #12]          @ mx
   1076        movrel          lr,  subpel_filters-16
   1077        ldr             r12, [sp, #8]           @ h
   1078        add             r4,  lr,  r4, lsl #4
   1079        vld1.16         {q0},     [r4,:128]
   1080 1:
   1081        vld1.8          {d2,d3}, [r2], r3
   1082 
   1083        vp8_epel8_h4    d2,  d2,  d3
   1084 
   1085        vst1.8          {d2}, [r0,:64], r1
   1086        subs            r12, r12, #1
   1087        bne             1b
   1088 
   1089        pop             {r4,pc}
   1090 endfunc
   1091 
   1092 function ff_put_vp8_epel8_h4v4_neon, export=1
   1093        sub             r2,  r2,  r3
   1094        sub             r2,  r2,  #1
   1095        push            {r4,lr}
   1096 
   1097        @ first pass (horizontal):
   1098        ldr             r4,  [sp, #12]          @ mx
   1099        movrel          lr,  subpel_filters-16
   1100        ldr             r12, [sp, #8]           @ h
   1101        add             r4,  lr,  r4, lsl #4
   1102        sub             sp,  sp,  #168+16
   1103        vld1.16         {q0},     [r4,:128]
   1104        add             lr,  sp,  #15
   1105        add             r12, r12, #3
   1106        bic             lr,  lr,  #15
   1107 1:
   1108        vld1.8          {d2,d3}, [r2], r3
   1109 
   1110        vp8_epel8_h4    d2,  d2,  d3
   1111 
   1112        vst1.8          {d2}, [lr,:64]!
   1113        subs            r12, r12, #1
   1114        bne             1b
   1115 
   1116        @ second pass (vertical):
   1117        ldr             r4,  [sp, #168+16+16]   @ my
   1118        movrel          lr,  subpel_filters-16
   1119        ldr             r12, [sp, #168+16+8]    @ h
   1120        add             r4,  lr,  r4, lsl #4
   1121        add             lr,  sp,  #15
   1122        vld1.16         {q0},     [r4,:128]
   1123        bic             lr,  lr,  #15
   1124 2:
   1125        vld1.8          {d2-d5},  [lr,:128]!
   1126        vld1.8          {d6},     [lr,:64]
   1127        sub             lr,  lr,  #16
   1128 
   1129        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
   1130 
   1131        vst1.8          {d2},     [r0,:64], r1
   1132        vst1.8          {d3},     [r0,:64], r1
   1133        subs            r12, r12, #2
   1134        bne             2b
   1135 
   1136        add             sp,  sp,  #168+16
   1137        pop             {r4,pc}
   1138 endfunc
   1139 
   1140 function ff_put_vp8_epel8_h6v4_neon, export=1
   1141        sub             r2,  r2,  r3
   1142        sub             r2,  r2,  #2
   1143        push            {r4,lr}
   1144 
   1145        @ first pass (horizontal):
   1146        ldr             r4,  [sp, #12]          @ mx
   1147        movrel          lr,  subpel_filters-16
   1148        ldr             r12, [sp, #8]           @ h
   1149        add             r4,  lr,  r4, lsl #4
   1150        sub             sp,  sp,  #168+16
   1151        vld1.16         {q0},     [r4,:128]
   1152        add             lr,  sp,  #15
   1153        add             r12, r12, #3
   1154        bic             lr,  lr,  #15
   1155 1:
   1156        vld1.8          {d2,d3}, [r2], r3
   1157 
   1158        vp8_epel8_h6    d2,  d2,  d3
   1159 
   1160        vst1.8          {d2}, [lr,:64]!
   1161        subs            r12, r12, #1
   1162        bne             1b
   1163 
   1164        @ second pass (vertical):
   1165        ldr             r4,  [sp, #168+16+16]   @ my
   1166        movrel          lr,  subpel_filters-16
   1167        ldr             r12, [sp, #168+16+8]    @ h
   1168        add             r4,  lr,  r4, lsl #4
   1169        add             lr,  sp,  #15
   1170        vld1.16         {q0},     [r4,:128]
   1171        bic             lr,  lr,  #15
   1172 2:
   1173        vld1.8          {d2-d5},  [lr,:128]!
   1174        vld1.8          {d6},     [lr,:64]
   1175        sub             lr,  lr,  #16
   1176 
   1177        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
   1178 
   1179        vst1.8          {d2},     [r0,:64], r1
   1180        vst1.8          {d3},     [r0,:64], r1
   1181        subs            r12, r12, #2
   1182        bne             2b
   1183 
   1184        add             sp,  sp,  #168+16
   1185        pop             {r4,pc}
   1186 endfunc
   1187 
   1188 function ff_put_vp8_epel8_h4v6_neon, export=1
   1189        sub             r2,  r2,  r3,  lsl #1
   1190        sub             r2,  r2,  #1
   1191        push            {r4,lr}
   1192 
   1193        @ first pass (horizontal):
   1194        ldr             r4,  [sp, #12]          @ mx
   1195        movrel          lr,  subpel_filters-16
   1196        ldr             r12, [sp, #8]           @ h
   1197        add             r4,  lr,  r4, lsl #4
   1198        sub             sp,  sp,  #168+16
   1199        vld1.16         {q0},     [r4,:128]
   1200        add             lr,  sp,  #15
   1201        add             r12, r12, #5
   1202        bic             lr,  lr,  #15
   1203 1:
   1204        vld1.8          {d2,d3}, [r2], r3
   1205 
   1206        vp8_epel8_h4    d2,  d2,  d3
   1207 
   1208        vst1.8          {d2}, [lr,:64]!
   1209        subs            r12, r12, #1
   1210        bne             1b
   1211 
   1212        @ second pass (vertical):
   1213        ldr             r4,  [sp, #168+16+16]   @ my
   1214        movrel          lr,  subpel_filters-16
   1215        ldr             r12, [sp, #168+16+8]    @ h
   1216        add             r4,  lr,  r4, lsl #4
   1217        add             lr,  sp,  #15
   1218        vld1.16         {q0},     [r4,:128]
   1219        bic             lr,  lr,  #15
   1220 2:
   1221        vld1.8          {d2-d5},  [lr,:128]!
   1222        vld1.8          {d6-d7},  [lr,:128]!
   1223        vld1.8          {d30},    [lr,:64]
   1224        sub             lr,  lr,  #32
   1225 
   1226        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30
   1227 
   1228        vst1.8          {d2}, [r0,:64], r1
   1229        vst1.8          {d3}, [r0,:64], r1
   1230        subs            r12, r12, #2
   1231        bne             2b
   1232 
   1233        add             sp,  sp,  #168+16
   1234        pop             {r4,pc}
   1235 endfunc
   1236 
   1237 .ltorg
   1238 
   1239 function ff_put_vp8_epel4_v6_neon, export=1
   1240        sub             r2,  r2,  r3,  lsl #1
   1241        push            {r4,lr}
   1242 
   1243        ldr             r4,  [sp, #16]          @ my
   1244        movrel          lr,  subpel_filters-16
   1245        ldr             r12, [sp, #8]           @ h
   1246        add             r4,  lr,  r4, lsl #4
   1247        vld1.16         {q0},     [r4,:128]
   1248 1:
   1249        vld1.32         {d2[]},   [r2], r3
   1250        vld1.32         {d3[]},   [r2], r3
   1251        vld1.32         {d4[]},   [r2], r3
   1252        vld1.32         {d5[]},   [r2], r3
   1253        vld1.32         {d6[]},   [r2], r3
   1254        vld1.32         {d7[]},   [r2], r3
   1255        vld1.32         {d28[]},  [r2]
   1256        sub             r2,  r2,  r3,  lsl #2
   1257        vld1.32         {d2[1]},  [r2], r3
   1258        vld1.32         {d3[1]},  [r2], r3
   1259        vld1.32         {d4[1]},  [r2], r3
   1260        vld1.32         {d5[1]},  [r2], r3
   1261        vld1.32         {d6[1]},  [r2], r3
   1262        vld1.32         {d7[1]},  [r2], r3
   1263        vld1.32         {d28[1]}, [r2]
   1264        sub             r2,  r2,  r3,  lsl #2
   1265 
   1266        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28
   1267 
   1268        vst1.32         {d2[0]},  [r0,:32], r1
   1269        vst1.32         {d3[0]},  [r0,:32], r1
   1270        vst1.32         {d2[1]},  [r0,:32], r1
   1271        vst1.32         {d3[1]},  [r0,:32], r1
   1272        subs            r12, r12, #4
   1273        bne             1b
   1274 
   1275        pop             {r4,pc}
   1276 endfunc
   1277 
   1278 function ff_put_vp8_epel4_h6_neon, export=1
   1279        sub             r2,  r2,  #2
   1280        push            {r4,lr}
   1281 
   1282        ldr             r4,  [sp, #12]          @ mx
   1283        movrel          lr,  subpel_filters-16
   1284        ldr             r12, [sp, #8]           @ h
   1285        add             r4,  lr,  r4, lsl #4
   1286        vld1.16         {q0},     [r4,:128]
   1287 1:
   1288        vld1.8          {q1},     [r2], r3
   1289        vp8_epel8_h6    d2,  d2,  d3
   1290        vst1.32         {d2[0]},  [r0,:32], r1
   1291        subs            r12, r12, #1
   1292        bne             1b
   1293 
   1294        pop             {r4,pc}
   1295 endfunc
   1296 
   1297 function ff_put_vp8_epel4_h6v6_neon, export=1
   1298        sub             r2,  r2,  r3,  lsl #1
   1299        sub             r2,  r2,  #2
   1300        push            {r4,lr}
   1301 
   1302        ldr             r4,  [sp, #12]          @ mx
   1303        movrel          lr,  subpel_filters-16
   1304        ldr             r12, [sp, #8]           @ h
   1305        add             r4,  lr,  r4, lsl #4
   1306        sub             sp,  sp,  #52+16
   1307        vld1.16         {q0},     [r4,:128]
   1308        add             lr,  sp,  #15
   1309        add             r12, r12, #5
   1310        bic             lr,  lr,  #15
   1311 1:
   1312        vld1.8          {q1},     [r2], r3
   1313        vp8_epel8_h6    d2,  d2,  d3
   1314        vst1.32         {d2[0]},  [lr,:32]!
   1315        subs            r12, r12, #1
   1316        bne             1b
   1317 
   1318        ldr             r4,  [sp, #52+16+16]    @ my
   1319        movrel          lr,  subpel_filters-16
   1320        ldr             r12, [sp, #52+16+8]     @ h
   1321        add             r4,  lr,  r4, lsl #4
   1322        add             lr,  sp,  #15
   1323        vld1.16         {q0},     [r4,:128]
   1324        bic             lr,  lr,  #15
   1325 2:
   1326        vld1.8          {d2-d3},  [lr,:128]!
   1327        vld1.8          {d6},     [lr,:64]!
   1328        vld1.32         {d28[]},  [lr,:32]
   1329        sub             lr,  lr,  #16
   1330        vld1.8          {d4-d5},  [lr]!
   1331        vld1.8          {d7},     [lr,:64]!
   1332        vld1.32         {d28[1]}, [lr,:32]
   1333        sub             lr,  lr,  #16
   1334        vtrn.32         q1,  q2
   1335        vtrn.32         d6,  d7
   1336        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
   1337        vst1.32         {d2[0]},  [r0,:32], r1
   1338        vst1.32         {d3[0]},  [r0,:32], r1
   1339        vst1.32         {d2[1]},  [r0,:32], r1
   1340        vst1.32         {d3[1]},  [r0,:32], r1
   1341        subs            r12, r12, #4
   1342        bne             2b
   1343 
   1344        add             sp,  sp,  #52+16
   1345        pop             {r4,pc}
   1346 endfunc
   1347 
   1348 function ff_put_vp8_epel4_h4v6_neon, export=1
   1349        sub             r2,  r2,  r3,  lsl #1
   1350        sub             r2,  r2,  #1
   1351        push            {r4,lr}
   1352 
   1353        ldr             r4,  [sp, #12]          @ mx
   1354        movrel          lr,  subpel_filters-16
   1355        ldr             r12, [sp, #8]           @ h
   1356        add             r4,  lr,  r4, lsl #4
   1357        sub             sp,  sp,  #52+16
   1358        vld1.16         {q0},     [r4,:128]
   1359        add             lr,  sp,  #15
   1360        add             r12, r12, #5
   1361        bic             lr,  lr,  #15
   1362 1:
   1363        vld1.8          {d2},     [r2], r3
   1364        vp8_epel8_h4    d2,  d2,  d2
   1365        vst1.32         {d2[0]},  [lr,:32]!
   1366        subs            r12, r12, #1
   1367        bne             1b
   1368 
   1369        ldr             r4,  [sp, #52+16+16]    @ my
   1370        movrel          lr,  subpel_filters-16
   1371        ldr             r12, [sp, #52+16+8]     @ h
   1372        add             r4,  lr,  r4, lsl #4
   1373        add             lr,  sp,  #15
   1374        vld1.16         {q0},     [r4,:128]
   1375        bic             lr,  lr,  #15
   1376 2:
   1377        vld1.8          {d2-d3},  [lr,:128]!
   1378        vld1.8          {d6},     [lr,:64]!
   1379        vld1.32         {d28[]},  [lr,:32]
   1380        sub             lr,  lr,  #16
   1381        vld1.8          {d4-d5},  [lr]!
   1382        vld1.8          {d7},     [lr,:64]!
   1383        vld1.32         {d28[1]}, [lr,:32]
   1384        sub             lr,  lr,  #16
   1385        vtrn.32         q1,  q2
   1386        vtrn.32         d6,  d7
   1387        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
   1388        vst1.32         {d2[0]},  [r0,:32], r1
   1389        vst1.32         {d3[0]},  [r0,:32], r1
   1390        vst1.32         {d2[1]},  [r0,:32], r1
   1391        vst1.32         {d3[1]},  [r0,:32], r1
   1392        subs            r12, r12, #4
   1393        bne             2b
   1394 
   1395        add             sp,  sp,  #52+16
   1396        pop             {r4,pc}
   1397 endfunc
   1398 
   1399 function ff_put_vp8_epel4_h6v4_neon, export=1
   1400        sub             r2,  r2,  r3
   1401        sub             r2,  r2,  #2
   1402        push            {r4,lr}
   1403 
   1404        ldr             r4,  [sp, #12]          @ mx
   1405        movrel          lr,  subpel_filters-16
   1406        ldr             r12, [sp, #8]           @ h
   1407        add             r4,  lr,  r4, lsl #4
   1408        sub             sp,  sp,  #44+16
   1409        vld1.16         {q0},     [r4,:128]
   1410        add             lr,  sp,  #15
   1411        add             r12, r12, #3
   1412        bic             lr,  lr,  #15
   1413 1:
   1414        vld1.8          {q1},     [r2], r3
   1415        vp8_epel8_h6    d2,  d2,  d3
   1416        vst1.32         {d2[0]},  [lr,:32]!
   1417        subs            r12, r12, #1
   1418        bne             1b
   1419 
   1420        ldr             r4,  [sp, #44+16+16]    @ my
   1421        movrel          lr,  subpel_filters-16
   1422        ldr             r12, [sp, #44+16+8]     @ h
   1423        add             r4,  lr,  r4, lsl #4
   1424        add             lr,  sp,  #15
   1425        vld1.16         {q0},     [r4,:128]
   1426        bic             lr,  lr,  #15
   1427 2:
   1428        vld1.8          {d2-d3},  [lr,:128]!
   1429        vld1.32         {d6[]},   [lr,:32]
   1430        sub             lr,  lr,  #8
   1431        vld1.8          {d4-d5},  [lr]!
   1432        vld1.32         {d6[1]},  [lr,:32]
   1433        sub             lr,  lr,  #8
   1434        vtrn.32         q1,  q2
   1435        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
   1436        vst1.32         {d2[0]},  [r0,:32], r1
   1437        vst1.32         {d3[0]},  [r0,:32], r1
   1438        vst1.32         {d2[1]},  [r0,:32], r1
   1439        vst1.32         {d3[1]},  [r0,:32], r1
   1440        subs            r12, r12, #4
   1441        bne             2b
   1442 
   1443        add             sp,  sp,  #44+16
   1444        pop             {r4,pc}
   1445 endfunc
   1446 
   1447 function ff_put_vp8_epel4_h4_neon, export=1
   1448        sub             r2,  r2,  #1
   1449        push            {r4,lr}
   1450 
   1451        ldr             r4,  [sp, #12]          @ mx
   1452        movrel          lr,  subpel_filters-16
   1453        ldr             r12, [sp, #8]           @ h
   1454        add             r4,  lr,  r4, lsl #4
   1455        vld1.16         {q0},     [r4,:128]
   1456 1:
   1457        vld1.8          {d2},     [r2], r3
   1458        vp8_epel8_h4    d2,  d2,  d2
   1459        vst1.32         {d2[0]},  [r0,:32], r1
   1460        subs            r12, r12, #1
   1461        bne             1b
   1462 
   1463        pop             {r4,pc}
   1464 endfunc
   1465 
   1466 function ff_put_vp8_epel4_v4_neon, export=1
   1467        sub             r2,  r2,  r3
   1468        push            {r4,lr}
   1469 
   1470        ldr             r4,  [sp, #16]          @ my
   1471        movrel          lr,  subpel_filters-16
   1472        ldr             r12, [sp, #8]           @ h
   1473        add             r4,  lr,  r4, lsl #4
   1474        vld1.16         {q0},     [r4,:128]
   1475 1:
   1476        vld1.32         {d2[]},   [r2], r3
   1477        vld1.32         {d3[]},   [r2], r3
   1478        vld1.32         {d4[]},   [r2], r3
   1479        vld1.32         {d5[]},   [r2], r3
   1480        vld1.32         {d6[]},   [r2]
   1481        sub             r2,  r2,  r3,  lsl #1
   1482        vld1.32         {d2[1]},  [r2], r3
   1483        vld1.32         {d3[1]},  [r2], r3
   1484        vld1.32         {d4[1]},  [r2], r3
   1485        vld1.32         {d5[1]},  [r2], r3
   1486        vld1.32         {d6[1]},  [r2]
   1487        sub             r2,  r2,  r3,  lsl #1
   1488 
   1489        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
   1490 
   1491        vst1.32         {d2[0]},  [r0,:32], r1
   1492        vst1.32         {d3[0]},  [r0,:32], r1
   1493        vst1.32         {d2[1]},  [r0,:32], r1
   1494        vst1.32         {d3[1]},  [r0,:32], r1
   1495        subs            r12, r12, #4
   1496        bne             1b
   1497 
   1498        pop             {r4,pc}
   1499 endfunc
   1500 
   1501 function ff_put_vp8_epel4_h4v4_neon, export=1
   1502        sub             r2,  r2,  r3
   1503        sub             r2,  r2,  #1
   1504        push            {r4,lr}
   1505 
   1506        ldr             r4,  [sp, #12]          @ mx
   1507        movrel          lr,  subpel_filters-16
   1508        ldr             r12, [sp, #8]           @ h
   1509        add             r4,  lr,  r4, lsl #4
   1510        sub             sp,  sp,  #44+16
   1511        vld1.16         {q0},     [r4,:128]
   1512        add             lr,  sp,  #15
   1513        add             r12, r12, #3
   1514        bic             lr,  lr,  #15
   1515 1:
   1516        vld1.8          {d2},     [r2], r3
   1517        vp8_epel8_h4    d2,  d2,  d3
   1518        vst1.32         {d2[0]},  [lr,:32]!
   1519        subs            r12, r12, #1
   1520        bne             1b
   1521 
   1522        ldr             r4,  [sp, #44+16+16]    @ my
   1523        movrel          lr,  subpel_filters-16
   1524        ldr             r12, [sp, #44+16+8]     @ h
   1525        add             r4,  lr,  r4, lsl #4
   1526        add             lr,  sp,  #15
   1527        vld1.16         {q0},     [r4,:128]
   1528        bic             lr,  lr,  #15
   1529 2:
   1530        vld1.8          {d2-d3},  [lr,:128]!
   1531        vld1.32         {d6[]},   [lr,:32]
   1532        sub             lr,  lr,  #8
   1533        vld1.8          {d4-d5},  [lr]!
   1534        vld1.32         {d6[1]},  [lr,:32]
   1535        sub             lr,  lr,  #8
   1536        vtrn.32         q1,  q2
   1537        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
   1538        vst1.32         {d2[0]},  [r0,:32], r1
   1539        vst1.32         {d3[0]},  [r0,:32], r1
   1540        vst1.32         {d2[1]},  [r0,:32], r1
   1541        vst1.32         {d3[1]},  [r0,:32], r1
   1542        subs            r12, r12, #4
   1543        bne             2b
   1544 
   1545        add             sp,  sp,  #44+16
   1546        pop             {r4,pc}
   1547 endfunc
   1548 
   1549 @ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
   1550 @ arithmetic can be used to apply filters
   1551 const   subpel_filters, align=4
   1552        .short     0,   6, 123,  12,   1,   0,   0,   0
   1553        .short     2,  11, 108,  36,   8,   1,   0,   0
   1554        .short     0,   9,  93,  50,   6,   0,   0,   0
   1555        .short     3,  16,  77,  77,  16,   3,   0,   0
   1556        .short     0,   6,  50,  93,   9,   0,   0,   0
   1557        .short     1,   8,  36, 108,  11,   2,   0,   0
   1558        .short     0,   1,  12, 123,   6,   0,   0,   0
   1559 endconst
   1560 
   1561 /* Bilinear MC */
   1562 
   1563 function ff_put_vp8_bilin16_h_neon, export=1
   1564        ldr             r12, [sp, #4]           @ mx
   1565        vdup.8          d0,  r12
   1566        rsb             r12, r12, #8
   1567        vdup.8          d1,  r12
   1568        ldr             r12, [sp]               @ h
   1569 1:
   1570        subs            r12, r12, #2
   1571        vld1.8          {d2-d4},  [r2], r3
   1572        vext.8          q2,  q1,  q2,  #1
   1573        vmull.u8        q8,  d2,  d1
   1574        vmlal.u8        q8,  d4,  d0
   1575        vld1.8          {d18-d20},[r2], r3
   1576        vmull.u8        q3,  d3,  d1
   1577        vmlal.u8        q3,  d5,  d0
   1578        vext.8          q10, q9,  q10, #1
   1579        vmull.u8        q11, d18, d1
   1580        vmlal.u8        q11, d20, d0
   1581        vmull.u8        q12, d19, d1
   1582        vmlal.u8        q12, d21, d0
   1583        vrshrn.u16      d4,  q8,  #3
   1584        vrshrn.u16      d5,  q3,  #3
   1585        vrshrn.u16      d6,  q11, #3
   1586        vrshrn.u16      d7,  q12, #3
   1587        vst1.8          {q2},     [r0,:128], r1
   1588        vst1.8          {q3},     [r0,:128], r1
   1589        bgt             1b
   1590 
   1591        bx              lr
   1592 endfunc
   1593 
   1594 function ff_put_vp8_bilin16_v_neon, export=1
   1595        ldr             r12, [sp, #8]           @ my
   1596        vdup.8          d0,  r12
   1597        rsb             r12, r12, #8
   1598        vdup.8          d1,  r12
   1599        ldr             r12, [sp]               @ h
   1600        vld1.8          {q1},     [r2], r3
   1601 1:
   1602        subs            r12, r12, #2
   1603        vld1.8          {q2},     [r2], r3
   1604        vmull.u8        q3,  d2,  d1
   1605        vmlal.u8        q3,  d4,  d0
   1606        vmull.u8        q8,  d3,  d1
   1607        vmlal.u8        q8,  d5,  d0
   1608        vld1.8          {q1},     [r2], r3
   1609        vmull.u8        q9,  d4,  d1
   1610        vmlal.u8        q9,  d2,  d0
   1611        vmull.u8        q10, d5,  d1
   1612        vmlal.u8        q10, d3,  d0
   1613        vrshrn.u16      d4,  q3,  #3
   1614        vrshrn.u16      d5,  q8,  #3
   1615        vrshrn.u16      d6,  q9,  #3
   1616        vrshrn.u16      d7,  q10, #3
   1617        vst1.8          {q2},     [r0,:128], r1
   1618        vst1.8          {q3},     [r0,:128], r1
   1619        bgt             1b
   1620 
   1621        bx              lr
   1622 endfunc
   1623 
   1624 function ff_put_vp8_bilin16_hv_neon, export=1
   1625        ldr             r12, [sp, #4]           @ mx
   1626        vdup.8          d0,  r12
   1627        rsb             r12, r12, #8
   1628        vdup.8          d1,  r12
   1629        ldr             r12, [sp, #8]           @ my
   1630        vdup.8          d2,  r12
   1631        rsb             r12, r12, #8
   1632        vdup.8          d3,  r12
   1633        ldr             r12, [sp]               @ h
   1634 
   1635        vld1.8          {d4-d6},  [r2], r3
   1636        vext.8          q3,  q2,  q3,  #1
   1637        vmull.u8        q8,  d4,  d1
   1638        vmlal.u8        q8,  d6,  d0
   1639        vmull.u8        q9,  d5,  d1
   1640        vmlal.u8        q9,  d7,  d0
   1641        vrshrn.u16      d4,  q8,  #3
   1642        vrshrn.u16      d5,  q9,  #3
   1643 1:
   1644        subs            r12, r12, #2
   1645        vld1.8          {d18-d20},[r2], r3
   1646        vext.8          q10, q9,  q10, #1
   1647        vmull.u8        q11, d18, d1
   1648        vmlal.u8        q11, d20, d0
   1649        vld1.8          {d26-d28},[r2], r3
   1650        vmull.u8        q12, d19, d1
   1651        vmlal.u8        q12, d21, d0
   1652        vext.8          q14, q13, q14, #1
   1653        vmull.u8        q8,  d26, d1
   1654        vmlal.u8        q8,  d28, d0
   1655        vmull.u8        q9,  d27, d1
   1656        vmlal.u8        q9,  d29, d0
   1657        vrshrn.u16      d6,  q11, #3
   1658        vrshrn.u16      d7,  q12, #3
   1659        vmull.u8        q12, d4,  d3
   1660        vmlal.u8        q12, d6,  d2
   1661        vmull.u8        q15, d5,  d3
   1662        vmlal.u8        q15, d7,  d2
   1663        vrshrn.u16      d4,  q8,  #3
   1664        vrshrn.u16      d5,  q9,  #3
   1665        vmull.u8        q10, d6,  d3
   1666        vmlal.u8        q10, d4,  d2
   1667        vmull.u8        q11, d7,  d3
   1668        vmlal.u8        q11, d5,  d2
   1669        vrshrn.u16      d24, q12, #3
   1670        vrshrn.u16      d25, q15, #3
   1671        vst1.8          {q12},    [r0,:128], r1
   1672        vrshrn.u16      d20, q10, #3
   1673        vrshrn.u16      d21, q11, #3
   1674        vst1.8          {q10},    [r0,:128], r1
   1675        bgt             1b
   1676 
   1677        bx              lr
   1678 endfunc
   1679 
   1680 function ff_put_vp8_bilin8_h_neon, export=1
   1681        ldr             r12, [sp, #4]           @ mx
   1682        vdup.8          d0,  r12
   1683        rsb             r12, r12, #8
   1684        vdup.8          d1,  r12
   1685        ldr             r12, [sp]               @ h
   1686 1:
   1687        subs            r12, r12, #2
   1688        vld1.8          {q1},     [r2], r3
   1689        vext.8          d3,  d2,  d3,  #1
   1690        vmull.u8        q2,  d2,  d1
   1691        vmlal.u8        q2,  d3,  d0
   1692        vld1.8          {q3},     [r2], r3
   1693        vext.8          d7,  d6,  d7,  #1
   1694        vmull.u8        q8,  d6,  d1
   1695        vmlal.u8        q8,  d7,  d0
   1696        vrshrn.u16      d4,  q2,  #3
   1697        vrshrn.u16      d16, q8,  #3
   1698        vst1.8          {d4},     [r0,:64], r1
   1699        vst1.8          {d16},    [r0,:64], r1
   1700        bgt             1b
   1701 
   1702        bx              lr
   1703 endfunc
   1704 
   1705 function ff_put_vp8_bilin8_v_neon, export=1
   1706        ldr             r12, [sp, #8]           @ my
   1707        vdup.8          d0,  r12
   1708        rsb             r12, r12,  #8
   1709        vdup.8          d1,  r12
   1710        ldr             r12, [sp]               @ h
   1711        vld1.8          {d2},     [r2], r3
   1712 1:
   1713        subs            r12, r12, #2
   1714        vld1.8          {d3},     [r2], r3
   1715        vmull.u8        q2,  d2,  d1
   1716        vmlal.u8        q2,  d3,  d0
   1717        vld1.8          {d2},     [r2], r3
   1718        vmull.u8        q3,  d3,  d1
   1719        vmlal.u8        q3,  d2,  d0
   1720        vrshrn.u16      d4,  q2,  #3
   1721        vrshrn.u16      d6,  q3,  #3
   1722        vst1.8          {d4},     [r0,:64], r1
   1723        vst1.8          {d6},     [r0,:64], r1
   1724        bgt             1b
   1725 
   1726        bx              lr
   1727 endfunc
   1728 
   1729 function ff_put_vp8_bilin8_hv_neon, export=1
   1730        ldr             r12, [sp, #4]           @ mx
   1731        vdup.8          d0,  r12
   1732        rsb             r12, r12, #8
   1733        vdup.8          d1,  r12
   1734        ldr             r12, [sp, #8]           @ my
   1735        vdup.8          d2,  r12
   1736        rsb             r12, r12, #8
   1737        vdup.8          d3,  r12
   1738        ldr             r12, [sp]               @ h
   1739 
   1740        vld1.8          {q2},     [r2], r3
   1741        vext.8          d5,  d4,  d5,  #1
   1742        vmull.u8        q9,  d4,  d1
   1743        vmlal.u8        q9,  d5,  d0
   1744        vrshrn.u16      d22, q9,  #3
   1745 1:
   1746        subs            r12, r12, #2
   1747        vld1.8          {q3},     [r2], r3
   1748        vext.8          d7,  d6,  d7,  #1
   1749        vmull.u8        q8,  d6,  d1
   1750        vmlal.u8        q8,  d7,  d0
   1751        vld1.8          {q2},     [r2], r3
   1752        vext.8          d5,  d4,  d5,  #1
   1753        vmull.u8        q9,  d4,  d1
   1754        vmlal.u8        q9,  d5,  d0
   1755        vrshrn.u16      d16, q8,  #3
   1756        vmull.u8        q10, d22, d3
   1757        vmlal.u8        q10, d16, d2
   1758        vrshrn.u16      d22, q9,  #3
   1759        vmull.u8        q12, d16, d3
   1760        vmlal.u8        q12, d22, d2
   1761        vrshrn.u16      d20, q10, #3
   1762        vst1.8          {d20},    [r0,:64], r1
   1763        vrshrn.u16      d23, q12, #3
   1764        vst1.8          {d23},    [r0,:64], r1
   1765        bgt             1b
   1766 
   1767        bx              lr
   1768 endfunc
   1769 
   1770 function ff_put_vp8_bilin4_h_neon, export=1
   1771        ldr             r12, [sp, #4]           @ mx
   1772        vdup.8          d0,  r12
   1773        rsb             r12, r12, #8
   1774        vdup.8          d1,  r12
   1775        ldr             r12, [sp]               @ h
   1776 1:
   1777        subs            r12, r12, #2
   1778        vld1.8          {d2},     [r2], r3
   1779        vext.8          d3,  d2,  d3,  #1
   1780        vld1.8          {d6},     [r2], r3
   1781        vext.8          d7,  d6,  d7,  #1
   1782        vtrn.32         q1,  q3
   1783        vmull.u8        q2,  d2,  d1
   1784        vmlal.u8        q2,  d3,  d0
   1785        vrshrn.u16      d4,  q2,  #3
   1786        vst1.32         {d4[0]},  [r0,:32], r1
   1787        vst1.32         {d4[1]}, [r0,:32], r1
   1788        bgt             1b
   1789 
   1790        bx              lr
   1791 endfunc
   1792 
   1793 function ff_put_vp8_bilin4_v_neon, export=1
   1794        ldr             r12, [sp, #8]           @ my
   1795        vdup.8          d0,  r12
   1796        rsb             r12, r12, #8
   1797        vdup.8          d1,  r12
   1798        ldr             r12, [sp]               @ h
   1799        vld1.32         {d2[]},   [r2], r3
   1800 1:
   1801        vld1.32         {d3[]},   [r2]
   1802        vld1.32         {d2[1]},  [r2], r3
   1803        vld1.32         {d3[1]},  [r2], r3
   1804        vmull.u8        q2,  d2,  d1
   1805        vmlal.u8        q2,  d3,  d0
   1806        vtrn.32         d3,  d2
   1807        vrshrn.u16      d4,  q2,  #3
   1808        vst1.32         {d4[0]},  [r0,:32], r1
   1809        vst1.32         {d4[1]},  [r0,:32], r1
   1810        subs            r12, r12, #2
   1811        bgt             1b
   1812 
   1813        bx              lr
   1814 endfunc
   1815 
   1816 function ff_put_vp8_bilin4_hv_neon, export=1
   1817        ldr             r12, [sp, #4]           @ mx
   1818        vdup.8          d0,  r12
   1819        rsb             r12, r12, #8
   1820        vdup.8          d1,  r12
   1821        ldr             r12, [sp, #8]           @ my
   1822        vdup.8          d2,  r12
   1823        rsb             r12, r12, #8
   1824        vdup.8          d3,  r12
   1825        ldr             r12, [sp]               @ h
   1826 
   1827        vld1.8          {d4},     [r2], r3
   1828        vext.8          d5,  d4,  d4,  #1
   1829        vmull.u8        q9,  d4,  d1
   1830        vmlal.u8        q9,  d5,  d0
   1831        vrshrn.u16      d22, q9,  #3
   1832 1:
   1833        subs            r12, r12, #2
   1834        vld1.8          {d6},     [r2], r3
   1835        vext.8          d7,  d6,  d6,  #1
   1836        vld1.8          {d4},     [r2], r3
   1837        vext.8          d5,  d4,  d4,  #1
   1838        vtrn.32         q3,  q2
   1839        vmull.u8        q8,  d6,  d1
   1840        vmlal.u8        q8,  d7,  d0
   1841        vrshrn.u16      d16, q8,  #3
   1842        vmull.u8        q10, d16, d2
   1843        vtrn.32         d22, d16
   1844        vmlal.u8        q10, d22, d3
   1845        vrev64.32       d22, d16
   1846        vrshrn.u16      d20, q10, #3
   1847        vst1.32         {d20[0]}, [r0,:32], r1
   1848        vst1.32         {d20[1]}, [r0,:32], r1
   1849        bgt             1b
   1850 
   1851        bx              lr
   1852 endfunc