vp9mc_16bpp_neon.S (21885B)


/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ All public functions in this file have the following signature:
@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
@                             const uint8_t *ref, ptrdiff_t ref_stride,
@                             int h, int mx, int my);

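@ Per the AAPCS, the first four arguments (dst, dst_stride, ref, ref_stride)
@ arrive in r0-r3, while h, mx and my are passed on the stack. This is why
@ each entry point below fetches h with an ldr from [sp] (adjusted for any
@ registers pushed beforehand).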
function ff_vp9_copy128_neon, export=1
        ldr             r12, [sp]
        sub             r1,  r1,  #96
        sub             r3,  r3,  #96
1:
        subs            r12, r12, #1
        vld1.16         {q0,  q1},  [r2]!
        vst1.16         {q0,  q1},  [r0, :128]!
        vld1.16         {q2,  q3},  [r2]!
        vst1.16         {q2,  q3},  [r0, :128]!
        vld1.16         {q8,  q9},  [r2]!
        vst1.16         {q8,  q9},  [r0, :128]!
        vld1.16         {q10, q11}, [r2], r3
        vst1.16         {q10, q11}, [r0, :128], r1
        bne             1b
        bx              lr
endfunc
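@ Worked out: a row is 64 pixels * 2 bytes = 128 bytes, moved as four
@ 32-byte q-register pairs. The first three transfers post-increment the
@ pointers by 96 bytes in total, so both strides were reduced by 96 above
@ and the final transfer advances each pointer by exactly one full row.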

function ff_vp9_avg64_16_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        sub             r1,  r1,  #96
        sub             r3,  r3,  #96
        mov             lr,  r0
1:
        subs            r12, r12, #1
        vld1.16         {q8,  q9},  [r2]!
        vld1.16         {q0,  q1},  [r0, :128]!
        vld1.16         {q10, q11}, [r2]!
        vrhadd.u16      q0,  q0,  q8
        vld1.16         {q2,  q3},  [r0, :128]!
        vrhadd.u16      q1,  q1,  q9
        vld1.16         {q12, q13}, [r2]!
        vrhadd.u16      q2,  q2,  q10
        vst1.16         {q0,  q1},  [lr, :128]!
        vrhadd.u16      q3,  q3,  q11
        vld1.16         {q8,  q9},  [r0, :128]!
        vst1.16         {q2,  q3},  [lr, :128]!
        vrhadd.u16      q8,  q8,  q12
        vld1.16         {q14, q15}, [r2], r3
        vrhadd.u16      q9,  q9,  q13
        vld1.16         {q10, q11}, [r0, :128], r1
        vrhadd.u16      q10, q10, q14
        vst1.16         {q8,  q9},  [lr, :128]!
        vrhadd.u16      q11, q11, q15
        vst1.16         {q10, q11}, [lr, :128], r1
        bne             1b
        pop             {pc}
endfunc
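@ vrhadd.u16 is a rounding halving add, (a + b + 1) >> 1 per lane, i.e. the
@ rounded average that VP9's compound (avg) prediction requires. Loads,
@ averages and stores are interleaved above to hide memory latency.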

function ff_vp9_avg32_16_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        sub             r1,  r1,  #32
        sub             r3,  r3,  #32
        mov             lr,  r0
1:
        subs            r12, r12, #1
        vld1.16         {q8,  q9},  [r2]!
        vld1.16         {q0,  q1},  [r0, :128]!
        vld1.16         {q10, q11}, [r2], r3
        vrhadd.u16      q0,  q0,  q8
        vld1.16         {q2,  q3},  [r0, :128], r1
        vrhadd.u16      q1,  q1,  q9
        vrhadd.u16      q2,  q2,  q10
        vst1.16         {q0,  q1},  [lr, :128]!
        vrhadd.u16      q3,  q3,  q11
        vst1.16         {q2,  q3},  [lr, :128], r1
        bne             1b
        pop             {pc}
endfunc

function ff_vp9_avg16_16_neon, export=1
        ldr             r12, [sp]
1:
        subs            r12, r12, #1
        vld1.16         {q2,  q3},  [r2], r3
        vld1.16         {q0,  q1},  [r0, :128]
        vrhadd.u16      q0,  q0,  q2
        vrhadd.u16      q1,  q1,  q3
        vst1.16         {q0,  q1},  [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg8_16_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        mov             lr,  r0
1:
        subs            r12, r12, #2
        vld1.16         {q2},  [r2], r3
        vld1.16         {q0},  [r0, :128], r1
        vld1.16         {q3},  [r2], r3
        vrhadd.u16      q0,  q0,  q2
        vld1.16         {q1},  [r0, :128], r1
        vrhadd.u16      q1,  q1,  q3
        vst1.16         {q0},  [lr, :128], r1
        vst1.16         {q1},  [lr, :128], r1
        bne             1b
        pop             {pc}
endfunc

function ff_vp9_avg4_16_neon, export=1
        ldr             r12, [sp]
1:
        subs            r12, r12, #2
        vld1.16         {d2},  [r2], r3
        vld1.16         {d0},  [r0, :64], r1
        vld1.16         {d3},  [r2], r3
        vrhadd.u16      d0,  d0,  d2
        vld1.16         {d1},  [r0, :64]
        sub             r0,  r0,  r1
        vrhadd.u16      d1,  d1,  d3
        vst1.16         {d0},  [r0, :64], r1
        vst1.16         {d1},  [r0, :64], r1
        bne             1b
        bx              lr
endfunc

@ Helper macros for vmull/vmlal with a constant from either d0 or d1 depending on index
.macro vmull_lane dst, src, idx
.if \idx < 4
        vmull.s16       \dst, \src, d0[\idx]
.else
        vmull.s16       \dst, \src, d1[\idx - 4]
.endif
.endm
.macro vmlal_lane dst, src, idx
.if \idx < 4
        vmlal.s16       \dst, \src, d0[\idx]
.else
        vmlal.s16       \dst, \src, d1[\idx - 4]
.endif
.endm

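@ The eight 16-bit filter taps are loaded as one q register (q0), so taps
@ 0-3 sit in d0[0]..d0[3] and taps 4-7 in d1[0]..d1[3]. Scalar lanes for
@ vmull.s16/vmlal.s16 can only index within a single d register, hence the
@ d0/d1 selection in the macros above.
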
@ Extract a vector from src1-src2 and src3-src4, and multiply-accumulate
@ into dst1 and dst3 (or dst1-dst2 and dst3-dst4 for size >= 8)
.macro extmlal dst1, dst2, dst3, dst4, src1, src2, src3, src4, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src3, \src4, #(2*\offset)
        vmlal_lane      \dst1,  d28, \offset
        vmlal_lane      \dst3,  d30, \offset
.if \size >= 8
        vmlal_lane      \dst2,  d29, \offset
        vmlal_lane      \dst4,  d31, \offset
.endif
.endm
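@ vext.8 with #(2*\offset) shifts the source window by \offset 16-bit
@ pixels (2 bytes each), so every extmlal call realigns the row for tap
@ number \offset before multiply-accumulating it.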


@ Instantiate a horizontal filter function for the given size.
@ This can work on 4 or 8 pixels in parallel; for larger
@ widths it will do 8 pixels at a time and loop horizontally.
@ The actual width (in bytes) is passed in r5, the height in r4 and
@ the filter coefficients in r12.
.macro do_8tap_h type, size
function \type\()_8tap_\size\()h
        sub             r2,  r2,  #6
        add             r6,  r0,  r1
        add             r7,  r2,  r3
        add             r1,  r1,  r1
        add             r3,  r3,  r3
        @ Only size >= 8 loops horizontally and needs
        @ reduced dst stride
.if \size >= 8
        sub             r1,  r1,  r5
.endif
        @ size >= 8 loads two qwords and increments r2,
        @ for size 4 it's enough with three dwords and no
        @ postincrement
.if \size >= 8
        sub             r3,  r3,  r5
        sub             r3,  r3,  #16
.endif
        @ Load the filter vector
        vld1.16         {q0},  [r12, :128]
1:
.if \size >= 8
        mov             r12, r5
.endif
        @ Load src
.if \size >= 8
        vld1.16         {q8,  q9},  [r2]!
        vld1.16         {q10, q11}, [r7]!
.else
        vld1.16         {d16, d17, d18}, [r2]
        vld1.16         {d20, d21, d22}, [r7]
.endif
2:

        vmull.s16       q1,  d16, d0[0]
        vmull.s16       q12, d20, d0[0]
.if \size >= 8
        vmull.s16       q2,  d17, d0[0]
        vmull.s16       q13, d21, d0[0]
.endif
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 1, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 2, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 3, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 4, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 5, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 6, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 7, \size

        @ Round, shift and saturate.
        @ The vqrshrun takes care of clamping negative values to zero, but
        @ we manually need to do vmin with the max pixel value.
        vqrshrun.s32    d2,  q1,  #7
        vqrshrun.s32    d24, q12, #7
.if \size >= 8
        vqrshrun.s32    d3,  q2,  #7
        vqrshrun.s32    d25, q13, #7
        vmin.u16        q1,  q1,  q3
        vmin.u16        q12, q12, q3
.else
        vmin.u16        d2,  d2,  d6
        vmin.u16        d24, d24, d6
.endif
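        @ In scalar terms, each pixel produced above corresponds to this
        @ sketch (illustrative C, not part of FFmpeg; max = (1 << bpp) - 1,
        @ kept in q3/d6):
        @
        @   int sum = 0;
        @   for (int k = 0; k < 8; k++)
        @       sum += src[x + k - 3] * filter[k];
        @   dst[x] = av_clip((sum + 64) >> 7, 0, max);
        @
        @ vqrshrun.s32 #7 does the rounding shift plus the clamp at zero,
        @ and vmin.u16 applies the upper bound.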
        @ Average
.ifc \type,avg
.if \size >= 8
        vld1.16         {q14}, [r0, :128]
        vld1.16         {q15}, [r6, :128]
        vrhadd.u16      q1,  q1,  q14
        vrhadd.u16      q12, q12, q15
.else
        vld1.16         {d28}, [r0, :64]
        vld1.16         {d30}, [r6, :64]
        vrhadd.u16      d2,  d2,  d28
        vrhadd.u16      d24, d24, d30
.endif
.endif
        @ Store and loop horizontally (for size >= 8)
.if \size >= 8
        subs            r12, r12, #16
        vst1.16         {q1},  [r0, :128]!
        vst1.16         {q12}, [r6, :128]!
        beq             3f
        vmov            q8,  q9
        vmov            q10, q11
        vld1.16         {q9},  [r2]!
        vld1.16         {q11}, [r7]!
        b               2b
.else @ \size == 4
        vst1.16         {d2},  [r0, :64]
        vst1.16         {d24}, [r6, :64]
.endif
3:
        @ Loop vertically
        add             r0,  r0,  r1
        add             r6,  r6,  r1
        add             r2,  r2,  r3
        add             r7,  r7,  r3
        subs            r4,  r4,  #2
        bne             1b
        pop             {r4-r7}
        bx              lr
endfunc
.endm
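@ Both generated variants filter two output rows per pass: r6/r7 walk a
@ second dst/src row one original stride below r0/r2, the strides in r1/r3
@ are doubled on entry, and the vertical loop steps the height down by 2.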

.macro do_8tap_h_size size
do_8tap_h put, \size
do_8tap_h avg, \size
.endm

do_8tap_h_size 4
do_8tap_h_size 8

.macro do_8tap_h_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
        push            {r4-r7}
        ldr             r4,  [sp, #16]
        ldr             r5,  [sp, #20]
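        @ Build the max pixel value (1 << bpp) - 1 in every lane of q3:
        @ inverting ((0xffff << bpp) & 0xffff) yields 0x03ff for 10 bpp and
        @ 0x0fff for 12 bpp.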
        vmvn.u16        q3,  #((0xffff << \bpp) & 0xffff)
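        @ Index the filter table: judging by the offsets used here, each of
        @ the three filter banks (smooth/regular/sharp, selected via
        @ \offset) is 16 rows * 8 taps * 2 bytes = 256 bytes, and mx (in
        @ r5) selects the 16-byte row of taps.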
        movrelx         r12, X(ff_vp9_subpel_filters), r6
        add             r12, r12, 256*\offset
        add             r12, r12, r5, lsl #4
        mov             r5,  #2*\size
.if \size >= 8
        b               \type\()_8tap_8h
.else
        b               \type\()_8tap_4h
.endif
endfunc
.endm

.macro do_8tap_h_filters size, bpp
do_8tap_h_func put, regular, 1, \size, \bpp
do_8tap_h_func avg, regular, 1, \size, \bpp
do_8tap_h_func put, sharp,   2, \size, \bpp
do_8tap_h_func avg, sharp,   2, \size, \bpp
do_8tap_h_func put, smooth,  0, \size, \bpp
do_8tap_h_func avg, smooth,  0, \size, \bpp
.endm

.macro do_8tap_h_filters_bpp bpp
do_8tap_h_filters 64, \bpp
do_8tap_h_filters 32, \bpp
do_8tap_h_filters 16, \bpp
do_8tap_h_filters 8,  \bpp
do_8tap_h_filters 4,  \bpp
.endm

do_8tap_h_filters_bpp 10
do_8tap_h_filters_bpp 12

.ltorg
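@ Flush the literal pool here: the many functions instantiated above emit
@ enough literals (e.g. via movrelx) that they could otherwise end up out
@ of reach of their ldr instructions.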

@ Vertical filters

@ Round, shift and saturate and store qreg1-4
.macro do_store4 qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, minreg, type
        vqrshrun.s32    \dreg1,  \qreg1, #7
        vqrshrun.s32    \dreg2,  \qreg2, #7
        vqrshrun.s32    \dreg3,  \qreg3, #7
        vqrshrun.s32    \dreg4,  \qreg4, #7
.ifc \type,avg
        vld1.16         {\tmp1},  [r6, :64], r1
        vld1.16         {\tmp2},  [r6, :64], r1
        vld1.16         {\tmp3},  [r6, :64], r1
        vld1.16         {\tmp4},  [r6, :64], r1
.endif
        vmin.u16        \dreg1,  \dreg1,  \minreg
        vmin.u16        \dreg2,  \dreg2,  \minreg
        vmin.u16        \dreg3,  \dreg3,  \minreg
        vmin.u16        \dreg4,  \dreg4,  \minreg
.ifc \type,avg
        vrhadd.u16      \dreg1,  \dreg1,  \tmp1
        vrhadd.u16      \dreg2,  \dreg2,  \tmp2
        vrhadd.u16      \dreg3,  \dreg3,  \tmp3
        vrhadd.u16      \dreg4,  \dreg4,  \tmp4
.endif
        vst1.16         {\dreg1}, [r0, :64], r1
        vst1.16         {\dreg2}, [r0, :64], r1
        vst1.16         {\dreg3}, [r0, :64], r1
        vst1.16         {\dreg4}, [r0, :64], r1
.endm

@ Round, shift and saturate and store qreg1-4
@ qreg1-2 belong to one line and qreg3-4 to the second line.
@ dreg1-2 == qreg1, dreg3-4 == qreg2.
.macro do_store8 qreg1, qreg2, qreg3, qreg4, dreg1, dreg2, dreg3, dreg4, minreg, type
        vqrshrun.s32    \dreg1,  \qreg1, #7
        vqrshrun.s32    \dreg2,  \qreg2, #7
        vqrshrun.s32    \dreg3,  \qreg3, #7
        vqrshrun.s32    \dreg4,  \qreg4, #7
.ifc \type,avg
        vld1.16         {\qreg3},  [r6, :128], r1
        vld1.16         {\qreg4},  [r6, :128], r1
.endif
        vmin.u16        \qreg1,  \qreg1,  \minreg
        vmin.u16        \qreg2,  \qreg2,  \minreg
.ifc \type,avg
        vrhadd.u16      \qreg1,  \qreg1,  \qreg3
        vrhadd.u16      \qreg2,  \qreg2,  \qreg4
.endif
        vst1.16         {\qreg1}, [r0, :128], r1
        vst1.16         {\qreg2}, [r0, :128], r1
.endm

@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
@ (src1-src8 into dst1, src2-src9 into dst2).
.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
        vmull.s16       \dst1, \src1, d0[0]
        vmull.s16       \dst2, \src2, d0[0]
        vmull.s16       \tmp1, \src2, d0[1]
        vmull.s16       \tmp2, \src3, d0[1]
        vmlal.s16       \dst1, \src3, d0[2]
        vmlal.s16       \dst2, \src4, d0[2]
        vmlal.s16       \tmp1, \src4, d0[3]
        vmlal.s16       \tmp2, \src5, d0[3]
        vmlal.s16       \dst1, \src5, d1[0]
        vmlal.s16       \dst2, \src6, d1[0]
        vmlal.s16       \tmp1, \src6, d1[1]
        vmlal.s16       \tmp2, \src7, d1[1]
        vmlal.s16       \dst1, \src7, d1[2]
        vmlal.s16       \dst2, \src8, d1[2]
        vmlal.s16       \tmp1, \src8, d1[3]
        vmlal.s16       \tmp2, \src9, d1[3]
        vadd.s32        \dst1, \dst1, \tmp1
        vadd.s32        \dst2, \dst2, \tmp2
.endm
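@ Splitting the taps across two accumulator pairs (dst and tmp) halves the
@ multiply-accumulate dependency chain, which presumably suits in-order
@ NEON pipelines; the partial sums are recombined with the final vadd.s32.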

@ Evaluate the filter twice in parallel. This does the same as convolve4 above,
@ but with double width (two input/output registers per row).
.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15, src16, src17, src18
        vmull.s16       \dst1, \src1,  d0[0]
        vmull.s16       \dst2, \src2,  d0[0]
        vmull.s16       \dst3, \src3,  d0[0]
        vmull.s16       \dst4, \src4,  d0[0]
        vmlal.s16       \dst1, \src3,  d0[1]
        vmlal.s16       \dst2, \src4,  d0[1]
        vmlal.s16       \dst3, \src5,  d0[1]
        vmlal.s16       \dst4, \src6,  d0[1]
        vmlal.s16       \dst1, \src5,  d0[2]
        vmlal.s16       \dst2, \src6,  d0[2]
        vmlal.s16       \dst3, \src7,  d0[2]
        vmlal.s16       \dst4, \src8,  d0[2]
        vmlal.s16       \dst1, \src7,  d0[3]
        vmlal.s16       \dst2, \src8,  d0[3]
        vmlal.s16       \dst3, \src9,  d0[3]
        vmlal.s16       \dst4, \src10, d0[3]
        vmlal.s16       \dst1, \src9,  d1[0]
        vmlal.s16       \dst2, \src10, d1[0]
        vmlal.s16       \dst3, \src11, d1[0]
        vmlal.s16       \dst4, \src12, d1[0]
        vmlal.s16       \dst1, \src11, d1[1]
        vmlal.s16       \dst2, \src12, d1[1]
        vmlal.s16       \dst3, \src13, d1[1]
        vmlal.s16       \dst4, \src14, d1[1]
        vmlal.s16       \dst1, \src13, d1[2]
        vmlal.s16       \dst2, \src14, d1[2]
        vmlal.s16       \dst3, \src15, d1[2]
        vmlal.s16       \dst4, \src16, d1[2]
        vmlal.s16       \dst1, \src15, d1[3]
        vmlal.s16       \dst2, \src16, d1[3]
        vmlal.s16       \dst3, \src17, d1[3]
        vmlal.s16       \dst4, \src18, d1[3]
.endm

@ Instantiate a vertical filter function for filtering 8 pixels at a time.
@ The height is passed in r4, the width in r5 and the filter coefficients
@ in r12.
.macro do_8tap_8v type
function \type\()_8tap_8v
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        vld1.16         {q0},  [r12, :128]
1:
.ifc \type,avg
        mov             r6,  r0
.endif
        mov             r12, r4

        vld1.16         {q5},  [r2], r3
        vld1.16         {q6},  [r2], r3
        vld1.16         {q7},  [r2], r3
        vld1.16         {q8},  [r2], r3
        vld1.16         {q9},  [r2], r3
        vld1.16         {q10}, [r2], r3
        vld1.16         {q11}, [r2], r3
2:
        vld1.16         {q12}, [r2], r3
        vld1.16         {q13}, [r2], r3
        vld1.16         {q14}, [r2], r3
        vld1.16         {q15}, [r2], r3
        convolve8       q2,  q3,  q4,  q5,  d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27
        do_store8       q2,  q3,  q4,  q5,  d4,  d5,  d6,  d7,  q1,  \type
        convolve8       q2,  q3,  q4,  q5,  d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
        do_store8       q2,  q3,  q4,  q5,  d4,  d5,  d6,  d7,  q1,  \type

        subs            r12, r12, #4
        beq             8f

        vld1.16         {q4},  [r2], r3
        vld1.16         {q5},  [r2], r3
        vld1.16         {q6},  [r2], r3
        vld1.16         {q7},  [r2], r3
        convolve8       q2,  q3,  q8,  q9,  d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8,  d9,  d10, d11
        do_store8       q2,  q3,  q8,  q9,  d4,  d5,  d6,  d7,  q1,  \type
        convolve8       q2,  q3,  q8,  q9,  d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8,  d9,  d10, d11, d12, d13, d14, d15
        do_store8       q2,  q3,  q8,  q9,  d4,  d5,  d6,  d7,  q1,  \type

        subs            r12, r12, #4
        beq             8f

        vld1.16         {q8},  [r2], r3
        vld1.16         {q9},  [r2], r3
        vld1.16         {q10}, [r2], r3
        vld1.16         {q11}, [r2], r3
        convolve8       q2,  q3,  q12, q13, d26, d27, d28, d29, d30, d31, d8,  d9,  d10, d11, d12, d13, d14, d15, d16, d17, d18, d19
        do_store8       q2,  q3,  q12, q13, d4,  d5,  d6,  d7,  q1,  \type
        convolve8       q2,  q3,  q12, q13, d30, d31, d8,  d9,  d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23
        do_store8       q2,  q3,  q12, q13, d4,  d5,  d6,  d7,  q1,  \type

        subs            r12, r12, #4
        bne             2b

8:
        subs            r5,  r5,  #8
        beq             9f
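        @ Advance to the next 8-pixel-wide column: rewind r0 by the h rows
        @ written and r2 by the h + 7 rows read (h output rows plus the
        @ 7-row filter lead-in), then step both 16 bytes (8 pixels) right.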
        @ r0 -= h * dst_stride
        mls             r0,  r1,  r4,  r0
        @ r2 -= h * src_stride
        mls             r2,  r3,  r4,  r2
        @ r2 -= 8 * src_stride
        sub             r2,  r2,  r3, lsl #3
        @ r2 += 1 * src_stride
        add             r2,  r2,  r3
        add             r2,  r2,  #16
        add             r0,  r0,  #16
        b               1b
9:
        vpop            {q4-q7}
        pop             {r4-r6}
        bx              lr
endfunc
.endm

do_8tap_8v put
do_8tap_8v avg
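@ The inner loop above is unrolled three times so that the nine-row sliding
@ window simply rotates through different q4-q15 register assignments,
@ avoiding vmov shuffles between iterations.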

@ Instantiate a vertical filter function for filtering a 4 pixels wide
@ slice. This is only designed to work for 4 or 8 output lines.
.macro do_8tap_4v type
function \type\()_8tap_4v
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        vld1.16         {q0},  [r12, :128]
.ifc \type,avg
        mov             r6,  r0
.endif

        vld1.16         {d16}, [r2], r3
        vld1.16         {d17}, [r2], r3
        vld1.16         {d18}, [r2], r3
        vld1.16         {d19}, [r2], r3
        vld1.16         {d20}, [r2], r3
        vld1.16         {d21}, [r2], r3
        vld1.16         {d22}, [r2], r3
        vld1.16         {d23}, [r2], r3
        vld1.16         {d24}, [r2], r3
        vld1.16         {d25}, [r2], r3
        vld1.16         {d26}, [r2], r3
        convolve4       q2,  q3,  d16, d17, d18, d19, d20, d21, d22, d23, d24, q14, q15
        convolve4       q14, q15, d18, d19, d20, d21, d22, d23, d24, d25, d26, q8,  q9
        do_store4       q2,  d4,  q3,  d6,  q14, d28, q15, d30, d5,  d7,  d29, d31, d2,  \type

        subs            r4,  r4,  #4
        beq             9f

        vld1.16         {d27}, [r2], r3
        vld1.16         {d28}, [r2], r3
        vld1.16         {d29}, [r2], r3
        vld1.16         {d30}, [r2], r3
        convolve4       q2,  q3,  d20, d21, d22, d23, d24, d25, d26, d27, d28, q8,  q9
        convolve4       q8,  q9,  d22, d23, d24, d25, d26, d27, d28, d29, d30, q10, q11
        do_store4       q2,  d4,  q3,  d6,  q8,  d16, q9,  d18, d5,  d7,  d17, d19, d2,  \type

9:
        pop             {r4-r6}
        bx              lr
endfunc
.endm

do_8tap_4v put
do_8tap_4v avg

.macro do_8tap_v_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
        push            {r4-r6}
        ldr             r4,  [sp, #12]
        ldr             r5,  [sp, #20]
.if \size >= 8
        vpush           {q4-q7}
.endif
        vmvn.u16        q1,  #((0xffff << \bpp) & 0xffff)
        movrelx         r12, X(ff_vp9_subpel_filters), r6
        add             r12, r12, 256*\offset
        add             r12, r12, r5, lsl #4
        mov             r5,  #\size
.if \size >= 8
        b               \type\()_8tap_8v
.else
        b               \type\()_8tap_4v
.endif
endfunc
.endm

.macro do_8tap_v_filters size, bpp
do_8tap_v_func put, regular, 1, \size, \bpp
do_8tap_v_func avg, regular, 1, \size, \bpp
do_8tap_v_func put, sharp,   2, \size, \bpp
do_8tap_v_func avg, sharp,   2, \size, \bpp
do_8tap_v_func put, smooth,  0, \size, \bpp
do_8tap_v_func avg, smooth,  0, \size, \bpp
.endm

.macro do_8tap_v_filters_bpp bpp
do_8tap_v_filters 64, \bpp
do_8tap_v_filters 32, \bpp
do_8tap_v_filters 16, \bpp
do_8tap_v_filters 8,  \bpp
do_8tap_v_filters 4,  \bpp
.endm

do_8tap_v_filters_bpp 10
do_8tap_v_filters_bpp 12