tor-browser — The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

vp9mc_neon.S

/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ All public functions in this file have the following signature:
@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
@                             const uint8_t *ref, ptrdiff_t ref_stride,
@                             int h, int mx, int my);

function ff_vp9_copy64_neon, export=1
        ldr             r12, [sp]
        sub             r1,  r1,  #32
        sub             r3,  r3,  #32
1:
        vld1.8          {q0,  q1},  [r2]!
        vst1.8          {q0,  q1},  [r0, :128]!
        vld1.8          {q2,  q3},  [r2], r3
        subs            r12, r12, #1
        vst1.8          {q2,  q3},  [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg64_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        sub             r1,  r1,  #32
        sub             r3,  r3,  #32
        mov             lr,  r0
1:
        vld1.8          {q8,  q9},  [r2]!
        vld1.8          {q0,  q1},  [r0, :128]!
        vld1.8          {q10, q11}, [r2], r3
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q2,  q3},  [r0, :128], r1
        vrhadd.u8       q1,  q1,  q9
        vrhadd.u8       q2,  q2,  q10
        vst1.8          {q0,  q1},  [lr, :128]!
        vrhadd.u8       q3,  q3,  q11
        vst1.8          {q2,  q3},  [lr, :128], r1
        subs            r12, r12, #1
        bne             1b
        pop             {pc}
endfunc
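
@ A hedged aside on the semantics of the two functions above: the
@ following C sketch (my reading of the code, not FFmpeg's own C
@ reference) shows what ff_vp9_copy64_neon and ff_vp9_avg64_neon
@ compute. vrhadd.u8 is a rounding halving add, i.e. (a + b + 1) >> 1
@ per byte lane.
/*
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void copy64_c(uint8_t *dst, ptrdiff_t dst_stride,
                     const uint8_t *ref, ptrdiff_t ref_stride,
                     int h, int mx, int my)
{
    for (int y = 0; y < h; y++) {
        memcpy(dst, ref, 64);            // two 32-byte vld1/vst1 pairs
        dst += dst_stride;
        ref += ref_stride;
    }
}

static void avg64_c(uint8_t *dst, ptrdiff_t dst_stride,
                    const uint8_t *ref, ptrdiff_t ref_stride,
                    int h, int mx, int my)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 64; x++)
            dst[x] = (dst[x] + ref[x] + 1) >> 1;   // vrhadd.u8
        dst += dst_stride;
        ref += ref_stride;
    }
}
*/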

function ff_vp9_copy32_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {q0,  q1},  [r2], r3
        subs            r12, r12, #1
        vst1.8          {q0,  q1},  [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg32_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {q2,  q3},  [r2], r3
        vld1.8          {q0,  q1},  [r0, :128]
        vrhadd.u8       q0,  q0,  q2
        vrhadd.u8       q1,  q1,  q3
        subs            r12, r12, #1
        vst1.8          {q0,  q1},  [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_copy16_neon, export=1
        push            {r4,lr}
        ldr             r12, [sp, #8]
        add             r4,  r0,  r1
        add             lr,  r2,  r3
        add             r1,  r1,  r1
        add             r3,  r3,  r3
1:
        vld1.8          {q0},  [r2], r3
        vld1.8          {q1},  [lr], r3
        subs            r12, r12, #2
        vst1.8          {q0},  [r0, :128], r1
        vst1.8          {q1},  [r4, :128], r1
        bne             1b
        pop             {r4,pc}
endfunc

function ff_vp9_avg16_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        mov             lr,  r0
1:
        vld1.8          {q2},  [r2], r3
        vld1.8          {q0},  [r0, :128], r1
        vld1.8          {q3},  [r2], r3
        vrhadd.u8       q0,  q0,  q2
        vld1.8          {q1},  [r0, :128], r1
        vrhadd.u8       q1,  q1,  q3
        subs            r12, r12, #2
        vst1.8          {q0},  [lr, :128], r1
        vst1.8          {q1},  [lr, :128], r1
        bne             1b
        pop             {pc}
endfunc

function ff_vp9_copy8_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {d0},  [r2], r3
        vld1.8          {d1},  [r2], r3
        subs            r12, r12, #2
        vst1.8          {d0},  [r0, :64], r1
        vst1.8          {d1},  [r0, :64], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg8_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {d2},  [r2], r3
        vld1.8          {d0},  [r0, :64], r1
        vld1.8          {d3},  [r2], r3
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d1},  [r0, :64]
        sub             r0,  r0,  r1
        vrhadd.u8       d1,  d1,  d3
        subs            r12, r12, #2
        vst1.8          {d0},  [r0, :64], r1
        vst1.8          {d1},  [r0, :64], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_copy4_neon, export=1
        ldr             r12, [sp]
1:
        vld1.32         {d0[]},   [r2], r3
        vld1.32         {d1[]},   [r2], r3
        vst1.32         {d0[0]},  [r0, :32], r1
        vld1.32         {d2[]},   [r2], r3
        vst1.32         {d1[0]},  [r0, :32], r1
        vld1.32         {d3[]},   [r2], r3
        subs            r12, r12, #4
        vst1.32         {d2[0]},  [r0, :32], r1
        vst1.32         {d3[0]},  [r0, :32], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg4_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        mov             lr,  r0
1:
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d0[]},   [r0, :32], r1
        vld1.32         {d5[]},   [r2], r3
        vrhadd.u8       d0,  d0,  d4
        vld1.32         {d1[]},   [r0, :32], r1
        vld1.32         {d6[]},   [r2], r3
        vrhadd.u8       d1,  d1,  d5
        vld1.32         {d2[]},   [r0, :32], r1
        vld1.32         {d7[]},   [r2], r3
        vrhadd.u8       d2,  d2,  d6
        vld1.32         {d3[]},   [r0, :32], r1
        subs            r12, r12, #4
        vst1.32         {d0[0]},  [lr, :32], r1
        vrhadd.u8       d3,  d3,  d7
        vst1.32         {d1[0]},  [lr, :32], r1
        vst1.32         {d2[0]},  [lr, :32], r1
        vst1.32         {d3[0]},  [lr, :32], r1
        bne             1b
        pop             {pc}
endfunc

@ Helper macros for vmul/vmla with a constant from either d0 or d1 depending on index
.macro vmul_lane dst, src, idx
.if \idx < 4
        vmul.s16        \dst, \src, d0[\idx]
.else
        vmul.s16        \dst, \src, d1[\idx - 4]
.endif
.endm
.macro vmla_lane dst, src, idx
.if \idx < 4
        vmla.s16        \dst, \src, d0[\idx]
.else
        vmla.s16        \dst, \src, d1[\idx - 4]
.endif
.endm

@ Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
@ for size >= 16), and multiply-accumulate into dst1 and dst3 (or
@ dst1-dst2 and dst3-dst4 for size >= 16)
.macro extmla dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size >= 16
        vmla_lane       \dst1,  q14, \offset
        vext.8          q5,  \src2, \src3, #(2*\offset)
        vmla_lane       \dst3,  q15, \offset
        vext.8          q6,  \src5, \src6, #(2*\offset)
        vmla_lane       \dst2,  q5,  \offset
        vmla_lane       \dst4,  q6,  \offset
.elseif \size == 8
        vmla_lane       \dst1,  q14, \offset
        vmla_lane       \dst3,  q15, \offset
.else
        vmla_lane       \dst1d, d28, \offset
        vmla_lane       \dst3d, d30, \offset
.endif
.endm
@ The same as above, but instead of accumulating straight into the
@ destination, use a temp register and accumulate with saturation.
.macro extmulqadd dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size >= 16
        vmul_lane       q14, q14, \offset
        vext.8          q5,  \src2, \src3, #(2*\offset)
        vmul_lane       q15, q15, \offset
        vext.8          q6,  \src5, \src6, #(2*\offset)
        vmul_lane       q5,  q5,  \offset
        vmul_lane       q6,  q6,  \offset
.elseif \size == 8
        vmul_lane       q14, q14, \offset
        vmul_lane       q15, q15, \offset
.else
        vmul_lane       d28, d28, \offset
        vmul_lane       d30, d30, \offset
.endif
.if \size == 4
        vqadd.s16       \dst1d, \dst1d, d28
        vqadd.s16       \dst3d, \dst3d, d30
.else
        vqadd.s16       \dst1,  \dst1,  q14
        vqadd.s16       \dst3,  \dst3,  q15
.if \size >= 16
        vqadd.s16       \dst2,  \dst2,  q5
        vqadd.s16       \dst4,  \dst4,  q6
.endif
.endif
.endm
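
@ What one extmla/extmulqadd step amounts to for a single row, as a
@ hedged C sketch (names are mine): vext.8 with #(2*\offset) forms the
@ sliding window src[offset..offset+7] of 16-bit pixels from two
@ adjacent q registers, and vmla_lane scales it by tap number `offset`.
/*
#include <stdint.h>

static void extmla_step_c(int16_t dst[8], const int16_t src[16],
                          const int16_t filter[8], int offset)
{
    for (int i = 0; i < 8; i++)          // one q register of 16-bit lanes
        dst[i] += src[i + offset] * filter[offset];
}
*/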

@ Instantiate a horizontal filter function for the given size.
@ This can work on 4, 8 or 16 pixels in parallel; for larger
@ widths it will do 16 pixels at a time and loop horizontally.
@ The actual width is passed in r5, the height in r4 and
@ the filter coefficients in r12. idx2 is the index of the largest
@ filter coefficient (3 or 4) and idx1 is the other one.
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
        sub             r2,  r2,  #3
        add             r6,  r0,  r1
        add             r7,  r2,  r3
        add             r1,  r1,  r1
        add             r3,  r3,  r3
        @ Only size >= 16 loops horizontally and needs
        @ reduced dst stride
.if \size >= 16
        sub             r1,  r1,  r5
.endif
        @ size >= 16 loads two qwords and increments r2,
        @ size 4 loads one d register, increments r2 and loads one 32-bit lane;
        @ for size 8, a single qword with no postincrement is enough
.if \size >= 16
        sub             r3,  r3,  r5
        sub             r3,  r3,  #8
.elseif \size == 4
        sub             r3,  r3,  #8
.endif
        @ Load the filter vector
        vld1.16         {q0},  [r12,:128]
1:
.if \size >= 16
        mov             r12, r5
.endif
        @ Load src
.if \size >= 16
        vld1.8          {d18, d19, d20}, [r2]!
        vld1.8          {d24, d25, d26}, [r7]!
.elseif \size == 8
        vld1.8          {q9},  [r2]
        vld1.8          {q12}, [r7]
.else @ size == 4
        vld1.8          {d18}, [r2]!
        vld1.8          {d24}, [r7]!
        vld1.32         {d19[0]}, [r2]
        vld1.32         {d25[0]}, [r7]
.endif
        vmovl.u8        q8,  d18
        vmovl.u8        q9,  d19
        vmovl.u8        q11, d24
        vmovl.u8        q12, d25
.if \size >= 16
        vmovl.u8        q10, d20
        vmovl.u8        q13, d26
.endif
2:

        @ Accumulate, adding idx2 last with a separate
        @ saturating add. The positive filter coefficients
        @ for all indices except idx2 must add up to less
        @ than 127 for this not to overflow.
        vmul.s16        q1,  q8,  d0[0]
        vmul.s16        q3,  q11, d0[0]
.if \size >= 16
        vmul.s16        q2,  q9,  d0[0]
        vmul.s16        q4,  q12, d0[0]
.endif
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 1,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 2,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, \idx1, \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 5,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 6,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 7,     \size
        extmulqadd      q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, \idx2, \size

        @ Round, shift and saturate
        vqrshrun.s16    d2,  q1,  #7
        vqrshrun.s16    d6,  q3,  #7
.if \size >= 16
        vqrshrun.s16    d3,  q2,  #7
        vqrshrun.s16    d7,  q4,  #7
.endif
        @ Average
.ifc \type,avg
.if \size >= 16
        vld1.8          {q14}, [r0,:128]
        vld1.8          {q15}, [r6,:128]
        vrhadd.u8       q1,  q1,  q14
        vrhadd.u8       q3,  q3,  q15
.elseif \size == 8
        vld1.8          {d28}, [r0,:64]
        vld1.8          {d30}, [r6,:64]
        vrhadd.u8       d2,  d2,  d28
        vrhadd.u8       d6,  d6,  d30
.else
        @ We only need d28[0], but [] is faster on some cores
        vld1.32         {d28[]}, [r0,:32]
        vld1.32         {d30[]}, [r6,:32]
        vrhadd.u8       d2,  d2,  d28
        vrhadd.u8       d6,  d6,  d30
.endif
.endif
        @ Store and loop horizontally (for size >= 16)
.if \size >= 16
        subs            r12, r12, #16
        vst1.8          {q1}, [r0,:128]!
        vst1.8          {q3}, [r6,:128]!
        beq             3f
        vmov            q8,  q10
        vmov            q11, q13
        vld1.8          {q10}, [r2]!
        vld1.8          {q13}, [r7]!
        vmovl.u8        q9,  d20
        vmovl.u8        q10, d21
        vmovl.u8        q12, d26
        vmovl.u8        q13, d27
        b               2b
.elseif \size == 8
        vst1.8          {d2}, [r0,:64]
        vst1.8          {d6}, [r6,:64]
.else @ \size == 4
        vst1.32         {d2[0]}, [r0,:32]
        vst1.32         {d6[0]}, [r6,:32]
.endif
3:
        @ Loop vertically
        add             r0,  r0,  r1
        add             r6,  r6,  r1
        add             r2,  r2,  r3
        add             r7,  r7,  r3
        subs            r4,  r4,  #2
        bne             1b
.if \size >= 16
        vpop            {q4-q6}
.endif
        pop             {r4-r7}
        bx              lr
endfunc
.endm
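
@ The accumulation strategy used by do_8tap_h, as a hedged scalar C
@ sketch (mine, not the actual C fallback): all taps except idx2 are
@ summed with plain 16-bit adds, which stay in range because the
@ positive taps other than idx2 sum to less than 127 (255*127 < 32767);
@ the largest tap is then added with saturation (extmulqadd), and
@ vqrshrun.s16 #7 rounds, shifts and clamps to a byte.
/*
#include <stdint.h>

static uint8_t filter8_h_c(const uint8_t *src, const int16_t filter[8],
                           int idx2)
{
    int16_t acc = 0;
    for (int k = 0; k < 8; k++)
        if (k != idx2)
            acc += src[k] * filter[k];       // vmul.s16/extmla, 16-bit
    int32_t sum = acc + src[idx2] * filter[idx2];
    if (sum < -32768) sum = -32768;          // extmulqadd's vqadd.s16
    if (sum >  32767) sum =  32767;
    sum = (sum + 64) >> 7;                   // vqrshrun.s16 #7
    if (sum < 0)   sum = 0;
    if (sum > 255) sum = 255;
    return (uint8_t)sum;
}
*/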

.macro do_8tap_h_size size
do_8tap_h put, \size, 3, 4
do_8tap_h avg, \size, 3, 4
do_8tap_h put, \size, 4, 3
do_8tap_h avg, \size, 4, 3
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16

.macro do_8tap_h_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
        push            {r4-r7}
.if \size >= 16
        vpush           {q4-q6}
        ldr             r4,  [sp, #64]
        ldr             r5,  [sp, #68]
.else
        ldr             r4,  [sp, #16]
        ldr             r5,  [sp, #20]
.endif
        movrelx         r12, X(ff_vp9_subpel_filters), r6
        add             r12, r12, 256*\offset
        cmp             r5,  #8
        add             r12, r12, r5, lsl #4
        mov             r5,  #\size
.if \size >= 16
        bge             \type\()_8tap_16h_34
        b               \type\()_8tap_16h_43
.else
        bge             \type\()_8tap_\size\()h_34
        b               \type\()_8tap_\size\()h_43
.endif
endfunc
.endm
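
@ The r12 arithmetic in do_8tap_h_func is a table lookup into
@ ff_vp9_subpel_filters, which the `256*\offset` and `lsl #4` terms
@ imply is laid out as [type][16 phases][8 taps] of int16_t, i.e.
@ 16*8*2 = 256 bytes per filter type. A hedged C equivalent of the
@ address computation:
/*
#include <stdint.h>

static const int16_t *select_filter(const int16_t filters[3][16][8],
                                    int type,  // 256*\offset selects the type
                                    int phase) // mx; "lsl #4" = *16 bytes
{
    return filters[type][phase];
}
*/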

.macro do_8tap_h_filters size
do_8tap_h_func put, regular, 1, \size
do_8tap_h_func avg, regular, 1, \size
do_8tap_h_func put, sharp,   2, \size
do_8tap_h_func avg, sharp,   2, \size
do_8tap_h_func put, smooth,  0, \size
do_8tap_h_func avg, smooth,  0, \size
.endm

do_8tap_h_filters 64
do_8tap_h_filters 32
do_8tap_h_filters 16
do_8tap_h_filters 8
do_8tap_h_filters 4

.ltorg

@ Vertical filters

@ Round, shift and saturate and store qreg1-2 over 4 lines
.macro do_store4 qreg1, dreg1, qreg2, dreg2, tmp1, tmp2, type
        vqrshrun.s16    \dreg1,  \qreg1, #7
        vqrshrun.s16    \dreg2,  \qreg2, #7
.ifc \type,avg
        vld1.32         {\tmp1[]},   [r0,:32], r1
        vld1.32         {\tmp2[]},   [r0,:32], r1
        vld1.32         {\tmp1[1]},  [r0,:32], r1
        vld1.32         {\tmp2[1]},  [r0,:32], r1
        vrhadd.u8       \dreg1,  \dreg1,  \tmp1
        vrhadd.u8       \dreg2,  \dreg2,  \tmp2
        sub             r0,  r0,  r1, lsl #2
.endif
        vst1.32         {\dreg1[0]}, [r0,:32], r1
        vst1.32         {\dreg2[0]}, [r0,:32], r1
        vst1.32         {\dreg1[1]}, [r0,:32], r1
        vst1.32         {\dreg2[1]}, [r0,:32], r1
.endm

@ Round, shift and saturate and store qreg1-4
.macro do_store qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, type
        vqrshrun.s16    \dreg1,  \qreg1, #7
        vqrshrun.s16    \dreg2,  \qreg2, #7
        vqrshrun.s16    \dreg3,  \qreg3, #7
        vqrshrun.s16    \dreg4,  \qreg4, #7
.ifc \type,avg
        vld1.8          {\tmp1},  [r0,:64], r1
        vld1.8          {\tmp2},  [r0,:64], r1
        vld1.8          {\tmp3},  [r0,:64], r1
        vld1.8          {\tmp4},  [r0,:64], r1
        vrhadd.u8       \dreg1,  \dreg1,  \tmp1
        vrhadd.u8       \dreg2,  \dreg2,  \tmp2
        vrhadd.u8       \dreg3,  \dreg3,  \tmp3
        vrhadd.u8       \dreg4,  \dreg4,  \tmp4
        sub             r0,  r0,  r1, lsl #2
.endif
        vst1.8          {\dreg1}, [r0,:64], r1
        vst1.8          {\dreg2}, [r0,:64], r1
        vst1.8          {\dreg3}, [r0,:64], r1
        vst1.8          {\dreg4}, [r0,:64], r1
.endm
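
@ Both store macros normalize with vqrshrun.s16 #7 and, for avg, blend
@ with vrhadd.u8 before rewinding r0 by the four lines just read. A
@ hedged C model of the avg path for four 8-pixel rows (my sketch):
/*
#include <stddef.h>
#include <stdint.h>

static void do_store_avg_c(uint8_t *dst, ptrdiff_t stride,
                           const int16_t rows[4][8])
{
    for (int r = 0; r < 4; r++, dst += stride)
        for (int x = 0; x < 8; x++) {
            int32_t v = (rows[r][x] + 64) >> 7;      // vqrshrun.s16 #7
            v = v < 0 ? 0 : v > 255 ? 255 : v;
            dst[x] = (dst[x] + v + 1) >> 1;          // vrhadd.u8
        }
}
*/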

@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
@ (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
@ at the end with saturation. Indices 0 and 7 always have negative or zero
@ coefficients, so they can be accumulated into tmp1-tmp2 together with the
@ largest coefficient.
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
        vmul.s16        \dst1, \src2, d0[1]
        vmul.s16        \dst2, \src3, d0[1]
        vmul.s16        \tmp1, \src1, d0[0]
        vmul.s16        \tmp2, \src2, d0[0]
        vmla.s16        \dst1, \src3, d0[2]
        vmla.s16        \dst2, \src4, d0[2]
.if \idx1 == 3
        vmla.s16        \dst1, \src4, d0[3]
        vmla.s16        \dst2, \src5, d0[3]
.else
        vmla.s16        \dst1, \src5, d1[0]
        vmla.s16        \dst2, \src6, d1[0]
.endif
        vmla.s16        \dst1, \src6, d1[1]
        vmla.s16        \dst2, \src7, d1[1]
        vmla.s16        \tmp1, \src8, d1[3]
        vmla.s16        \tmp2, \src9, d1[3]
        vmla.s16        \dst1, \src7, d1[2]
        vmla.s16        \dst2, \src8, d1[2]
.if \idx2 == 3
        vmla.s16        \tmp1, \src4, d0[3]
        vmla.s16        \tmp2, \src5, d0[3]
.else
        vmla.s16        \tmp1, \src5, d1[0]
        vmla.s16        \tmp2, \src6, d1[0]
.endif
        vqadd.s16       \dst1, \dst1, \tmp1
        vqadd.s16       \dst2, \dst2, \tmp2
.endm
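
@ Why convolve splits the sum across dst and tmp (a hedged reading of
@ the comment above): taps 0 and 7 are never positive, so accumulating
@ them in tmp alongside the largest tap cancels part of its magnitude,
@ and the single vqadd.s16 at the end catches any remaining overshoot.
@ Scalar C sketch for one lane:
/*
#include <stdint.h>

static int16_t convolve8_lane_c(const int16_t s[8], const int16_t f[8],
                                int idx2)
{
    int16_t dst = 0, tmp = 0;
    for (int k = 1; k < 7; k++)
        if (k != idx2)
            dst += s[k] * f[k];          // plain vmul/vmla, no saturation
    tmp += s[0] * f[0];                  // tap 0: coefficient <= 0
    tmp += s[7] * f[7];                  // tap 7: coefficient <= 0
    tmp += s[idx2] * f[idx2];            // the largest (positive) tap
    int32_t sum = dst + tmp;             // vqadd.s16
    if (sum < -32768) sum = -32768;
    if (sum >  32767) sum =  32767;
    return (int16_t)sum;
}
*/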

@ Load pixels and extend them to 16 bit
.macro loadl dst1, dst2, dst3, dst4
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
.ifnb \dst4
        vld1.8          {d5}, [r2], r3
.endif
        vmovl.u8        \dst1, d2
        vmovl.u8        \dst2, d3
        vmovl.u8        \dst3, d4
.ifnb \dst4
        vmovl.u8        \dst4, d5
.endif
.endm

@ Instantiate a vertical filter function for filtering 8 pixels at a time.
@ The height is passed in r4, the width in r5 and the filter coefficients
@ in r12. idx2 is the index of the largest filter coefficient (3 or 4)
@ and idx1 is the other one.
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        vld1.16         {q0},  [r12, :128]
1:
        mov             r12, r4

        loadl           q5,  q6,  q7
        loadl           q8,  q9,  q10, q11
2:
        loadl           q12, q13, q14, q15
        convolve        q1,  q2,  q5,  q6,  q7,  q8,  q9,  q10, q11, q12, q13, \idx1, \idx2, q4,  q5
        convolve        q3,  q4,  q7,  q8,  q9,  q10, q11, q12, q13, q14, q15, \idx1, \idx2, q5,  q6
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q4,  d8,  d3,  d5,  d7,  d9,  \type

        subs            r12, r12, #4
        beq             8f

        loadl           q4,  q5,  q6,  q7
        convolve        q1,  q2,  q9,  q10, q11, q12, q13, q14, q15, q4,  q5,  \idx1, \idx2, q8,  q9
        convolve        q3,  q8,  q11, q12, q13, q14, q15, q4,  q5,  q6,  q7,  \idx1, \idx2, q9,  q10
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q8,  d16, d3,  d5,  d7,  d17, \type

        subs            r12, r12, #4
        beq             8f

        loadl           q8,  q9,  q10, q11
        convolve        q1,  q2,  q13, q14, q15, q4,  q5,  q6,  q7,  q8,  q9,  \idx1, \idx2, q12, q13
        convolve        q3,  q12, q15, q4,  q5,  q6,  q7,  q8,  q9,  q10, q11, \idx1, \idx2, q13, q14
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q12, d24, d3,  d5,  d7,  d25, \type

        subs            r12, r12, #4
        bne             2b

8:
        subs            r5,  r5,  #8
        beq             9f
        @ r0 -= h * dst_stride
        mls             r0,  r1,  r4, r0
        @ r2 -= h * src_stride
        mls             r2,  r3,  r4, r2
        @ r2 -= 8 * src_stride
        sub             r2,  r2,  r3, lsl #3
        @ r2 += 1 * src_stride
        add             r2,  r2,  r3
        add             r2,  r2,  #8
        add             r0,  r0,  #8
        b               1b
9:
        vpop            {q4-q7}
        pop             {r4-r5}
        bx              lr
endfunc
.endm

do_8tap_8v put, 3, 4
do_8tap_8v put, 4, 3
do_8tap_8v avg, 3, 4
do_8tap_8v avg, 4, 3
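
@ After each 8-pixel-wide column, the 8f block in do_8tap_8v rewinds:
@ "mls r0, r1, r4, r0" computes r0 - h*dst_stride, and the src pointer
@ additionally backs up the 7 extra context rows consumed vertically
@ before stepping 8 pixels right. A hedged C outline of that outer
@ structure (col8 is a stand-in for the inner vertical loop, which
@ expects src pointing 3 rows above the block):
/*
#include <stddef.h>
#include <stdint.h>

static void filter_v_by_columns(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                int h, int w,
                                void (*col8)(uint8_t *, ptrdiff_t,
                                             const uint8_t *, ptrdiff_t,
                                             int))
{
    for (int x = 0; x < w; x += 8)      // "subs r5, r5, #8" / "add ..., #8"
        col8(dst + x, dst_stride, src + x, src_stride, h);
}
*/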

@ Instantiate a vertical filter function for filtering a 4 pixels wide
@ slice. The first half of each register contains one row, while the
@ second half contains the second-next row (also stored in the first
@ half of the register two steps ahead). The convolution does two
@ outputs at a time; the output of q5-q12 into one, and q4-q13 into
@ another one. The first half of the first output is the first output
@ row, the first half of the other output is the second output row.
@ The second halves of the registers are rows 3 and 4.
@ This is only designed to work for 4 or 8 output lines.
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        vld1.16         {q0},  [r12, :128]

        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vext.8          d2,  d2,  d4,  #4
        vld1.32         {d8[]},   [r2], r3
        vext.8          d3,  d3,  d5,  #4
        vld1.32         {d9[]},   [r2], r3
        vmovl.u8        q5,  d2
        vext.8          d4,  d4,  d6,  #4
        vld1.32         {d28[]},  [r2], r3
        vmovl.u8        q6,  d3
        vext.8          d5,  d5,  d7,  #4
        vld1.32         {d29[]},  [r2], r3
        vmovl.u8        q7,  d4
        vext.8          d6,  d6,  d8,  #4
        vld1.32         {d30[]},  [r2], r3
        vmovl.u8        q8,  d5
        vext.8          d7,  d7,  d9,  #4
        vmovl.u8        q9,  d6
        vext.8          d8,  d8,  d28, #4
        vmovl.u8        q10, d7
        vext.8          d9,  d9,  d29, #4
        vmovl.u8        q11, d8
        vext.8          d28, d28, d30, #4
        vmovl.u8        q12, d9
        vmovl.u8        q13, d28

        convolve        q1,  q2,  q5,  q6,  q7,  q8,  q9,  q10, q11, q12, q13, \idx1, \idx2, q4, q3
        do_store4       q1,  d2,  q2,  d4,  d3,  d5,  \type
        subs            r4,  r4,  #4
        beq             9f

        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vext.8          d29, d29, d2,  #4
        vext.8          d30, d30, d3,  #4
        vld1.32         {d2[1]},  [r2], r3
        vmovl.u8        q14, d29
        vld1.32         {d3[1]},  [r2], r3
        vmovl.u8        q15, d30
        vmovl.u8        q5,  d2
        vmovl.u8        q6,  d3

        convolve        q1,  q2,  q9,  q10, q11, q12, q13, q14, q15, q5,  q6,  \idx1, \idx2, q4, q3
        do_store4       q1,  d2,  q2,  d4,  d3,  d5,  \type

9:
        vpop            {q4-q7}
        pop             {r4-r5}
        bx              lr
endfunc
.endm

do_8tap_4v put, 3, 4
do_8tap_4v put, 4, 3
do_8tap_4v avg, 3, 4
do_8tap_4v avg, 4, 3
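
@ The packing described above do_8tap_4v, as a hedged C picture (names
@ are mine): after the vext.8/vmovl.u8 dance, each widened q register
@ holds two 4-pixel rows, row n in the low half and row n+2 in the high
@ half, so a single 8-wide convolve yields two output rows at once.
/*
#include <stdint.h>

static void pack_row_pair(int16_t out[8], const uint8_t row_n[4],
                          const uint8_t row_n2[4])
{
    for (int i = 0; i < 4; i++) {
        out[i]     = row_n[i];     // low half: row n      (vext.8 ..., #4)
        out[i + 4] = row_n2[i];    // high half: row n + 2 (vmovl.u8 widen)
    }
}
*/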

.macro do_8tap_v_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
        push            {r4-r5}
        vpush           {q4-q7}
        ldr             r4,  [sp, #72]
        movrelx         r12, X(ff_vp9_subpel_filters), r5
        ldr             r5,  [sp, #80]
        add             r12, r12, 256*\offset
        add             r12, r12, r5, lsl #4
        cmp             r5,  #8
        mov             r5,  #\size
.if \size >= 8
        bge             \type\()_8tap_8v_34
        b               \type\()_8tap_8v_43
.else
        bge             \type\()_8tap_4v_34
        b               \type\()_8tap_4v_43
.endif
endfunc
.endm

.macro do_8tap_v_filters size
do_8tap_v_func put, regular, 1, \size
do_8tap_v_func avg, regular, 1, \size
do_8tap_v_func put, sharp,   2, \size
do_8tap_v_func avg, sharp,   2, \size
do_8tap_v_func put, smooth,  0, \size
do_8tap_v_func avg, smooth,  0, \size
.endm

do_8tap_v_filters 64
do_8tap_v_filters 32
do_8tap_v_filters 16
do_8tap_v_filters 8
do_8tap_v_filters 4