tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

vp9mc_16bpp_neon.S (23422B)


      1 /*
      2 * Copyright (c) 2017 Google Inc.
      3 *
      4 * This file is part of FFmpeg.
      5 *
      6 * FFmpeg is free software; you can redistribute it and/or
      7 * modify it under the terms of the GNU Lesser General Public
      8 * License as published by the Free Software Foundation; either
      9 * version 2.1 of the License, or (at your option) any later version.
     10 *
     11 * FFmpeg is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14 * Lesser General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU Lesser General Public
     17 * License along with FFmpeg; if not, write to the Free Software
     18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     19 */
     20 
     21 #include "libavutil/aarch64/asm.S"
     22 
     23 // All public functions in this file have the following signature:
     24 // typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
     25 //                            const uint8_t *ref, ptrdiff_t ref_stride,
     26 //                            int h, int mx, int my);
     27 
     28 function ff_vp9_avg64_16_neon, export=1
        // Rounding-average a 64-pixel-wide block of 16-bit pixels from
        // ref (x2) into dst (x0).  A row is 64 px * 2 B = 128 bytes and
        // is handled as two 64-byte chunks: the first load uses a #64
        // post-increment, the second advances by (stride - 64) — hence
        // the stride adjustments below.  x5 is a separate store pointer.
     29        mov             x5,  x0
     30        sub             x1,  x1,  #64
     31        sub             x3,  x3,  #64
     32 1:
     33        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x2], #64
     34        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
     35        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
        // urhadd = unsigned rounding halving add: (a + b + 1) >> 1
     36        urhadd          v0.8h,  v0.8h,  v4.8h
     37        urhadd          v1.8h,  v1.8h,  v5.8h
     38        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
     39        urhadd          v2.8h,  v2.8h,  v6.8h
     40        urhadd          v3.8h,  v3.8h,  v7.8h
        // one full row per iteration; h is in w4
     41        subs            w4,  w4,  #1
     42        urhadd          v16.8h, v16.8h, v20.8h
     43        urhadd          v17.8h, v17.8h, v21.8h
     44        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x5], #64
     45        urhadd          v18.8h, v18.8h, v22.8h
     46        urhadd          v19.8h, v19.8h, v23.8h
     47        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
     48        b.ne            1b
     49        ret
     50 endfunc
     51 
     52 function ff_vp9_avg32_16_neon, export=1
        // Rounding-average a 32-pixel-wide block of 16-bit pixels from
        // ref (x2) into dst (x0).  A row is 32 px * 2 B = 64 bytes (one
        // 4-register load); two rows are processed per iteration, with
        // x5 as a trailing store pointer.
     53        mov             x5,  x0
     54 1:
     55        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x2], x3
     56        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], x1
     57        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
        // urhadd = (a + b + 1) >> 1, the rounding average
     58        urhadd          v0.8h,  v0.8h,  v4.8h
     59        urhadd          v1.8h,  v1.8h,  v5.8h
     60        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
     61        urhadd          v2.8h,  v2.8h,  v6.8h
     62        urhadd          v3.8h,  v3.8h,  v7.8h
     63        subs            w4,  w4,  #2
     64        urhadd          v16.8h, v16.8h, v20.8h
     65        urhadd          v17.8h, v17.8h, v21.8h
     66        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x5], x1
     67        urhadd          v18.8h, v18.8h, v22.8h
     68        urhadd          v19.8h, v19.8h, v23.8h
     69        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
     70        b.ne            1b
     71        ret
     72 endfunc
     73 
     74 function ff_vp9_avg16_16_neon, export=1
        // Rounding-average a 16-pixel-wide block of 16-bit pixels from
        // ref (x2) into dst (x0), one row (32 bytes) per iteration.
        // dst is loaded without post-increment and only advanced by the
        // stride on the store, so no separate store pointer is needed.
     75 1:
     76        ld1             {v2.8h, v3.8h},  [x2], x3
     77        ld1             {v0.8h, v1.8h},  [x0]
     78        urhadd          v0.8h,  v0.8h,  v2.8h
     79        urhadd          v1.8h,  v1.8h,  v3.8h
     80        subs            w4,  w4,  #1
     81        st1             {v0.8h, v1.8h},  [x0], x1
     82        b.ne            1b
     83        ret
     84 endfunc
     85 
     86 function ff_vp9_avg8_16_neon, export=1
        // Rounding-average an 8-pixel-wide block of 16-bit pixels from
        // ref (x2) into dst (x0); two rows per iteration, x5 trailing
        // store pointer.
     87        mov             x5,  x0
     88 1:
     89        ld1             {v2.8h},  [x2], x3
     90        ld1             {v0.8h},  [x0], x1
     91        ld1             {v3.8h},  [x2], x3
     92        urhadd          v0.8h,  v0.8h,  v2.8h
     93        ld1             {v1.8h},  [x0], x1
     94        urhadd          v1.8h,  v1.8h,  v3.8h
     95        subs            w4,  w4,  #2
     96        st1             {v0.8h},  [x5], x1
     97        st1             {v1.8h},  [x5], x1
     98        b.ne            1b
     99        ret
    100 endfunc
    101 
    102 function ff_vp9_avg4_16_neon, export=1
        // Rounding-average a 4-pixel-wide block of 16-bit pixels from
        // ref (x2) into dst (x0); two rows (8 bytes each) per
        // iteration, with x5 as a trailing store pointer.
    103        mov             x5,  x0
    104 1:
    105        ld1             {v2.4h},  [x2], x3
    106        ld1             {v0.4h},  [x0], x1
    107        ld1             {v3.4h},  [x2], x3
    108        urhadd          v0.4h,  v0.4h,  v2.4h
    109        ld1             {v1.4h},  [x0], x1
    110        urhadd          v1.4h,  v1.4h,  v3.4h
    111        subs            w4,  w4,  #2
    112        st1             {v0.4h},  [x5], x1
        // Fixed: this store used the {v1.8b} arrangement.  It writes the
        // same 8 bytes on little-endian, but the data is 16-bit pixels
        // (loaded and averaged as .4h), and the element ordering would
        // differ on a big-endian configuration; use .4h to match the
        // v0.4h store above.
    113        st1             {v1.4h},  [x5], x1
    114        b.ne            1b
    115        ret
    116 endfunc
    117 
    118 
    119 // Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
    120 // for size >= 16), and multiply-accumulate into dst1 and dst5 (or
    121 // dst1-dst2 and dst5-dst6 for size >= 8 and dst1-dst4 and dst5-dst8
    122 // for size >= 16)
    123 .macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size
        // v20-v23 are clobbered as scratch registers.  \offset counts
        // 16-bit pixels, so the ext byte shift is 2*\offset, and
        // v0.h[\offset] is the matching filter coefficient lane.
    124        ext             v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
    125        ext             v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
    126        smlal           \dst1\().4s, v20.4h, v0.h[\offset]
    127        smlal           \dst5\().4s, v22.4h, v0.h[\offset]
    128 .if \size >= 16
    129        ext             v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
    130        ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
    131 .endif
    132 .if \size >= 8
        // smlal2 accumulates the high halves of the same shifted vectors
    133        smlal2          \dst2\().4s, v20.8h, v0.h[\offset]
    134        smlal2          \dst6\().4s, v22.8h, v0.h[\offset]
    135 .endif
    136 .if \size >= 16
    137        smlal           \dst3\().4s, v21.4h, v0.h[\offset]
    138        smlal           \dst7\().4s, v23.4h, v0.h[\offset]
    139        smlal2          \dst4\().4s, v21.8h, v0.h[\offset]
    140        smlal2          \dst8\().4s, v23.8h, v0.h[\offset]
    141 .endif
    142 .endm
    143 
    144 
    145 // Instantiate a horizontal filter function for the given size.
    146 // This can work on 4, 8 or 16 pixels in parallel; for larger
    147 // widths it will do 16 pixels at a time and loop horizontally.
    148 // The actual width (in bytes) is passed in x5, the height in w4 and
    149 // the filter coefficients in x9.
    150 .macro do_8tap_h type, size
    151 function \type\()_8tap_\size\()h
        // Back x2 up by 3 pixels (6 bytes) so the 8-tap window is
        // centred on the output pixel.
    152        sub             x2,  x2,  #6
        // x6/x7 track the second dst/src row; the strides are doubled
        // because two rows are produced per vertical iteration.
    153        add             x6,  x0,  x1
    154        add             x7,  x2,  x3
    155        add             x1,  x1,  x1
    156        add             x3,  x3,  x3
    157        // Only size >= 16 loops horizontally and needs
    158        // reduced dst stride
    159 .if \size >= 16
    160        sub             x1,  x1,  x5
    161 .endif
    162        // size >= 16 loads two qwords and increments r2,
    163        // for size 4/8 it's enough with one qword and no
    164        // postincrement
    165 .if \size >= 16
    166        sub             x3,  x3,  x5
    167        sub             x3,  x3,  #16
    168 .endif
    169        // Load the filter vector
    170        ld1             {v0.8h},  [x9]
    171 1:
    172 .if \size >= 16
        // x9 is reused as the remaining-bytes counter for the current
        // row pair (the filter pointer it held is no longer needed).
    173        mov             x9,  x5
    174 .endif
    175        // Load src
    176 .if \size >= 16
    177        ld1             {v5.8h,  v6.8h,  v7.8h},  [x2], #48
    178        ld1             {v16.8h, v17.8h, v18.8h}, [x7], #48
    179 .else
    180        ld1             {v5.8h,  v6.8h},  [x2]
    181        ld1             {v16.8h, v17.8h}, [x7]
    182 .endif
    183 2:
    184 
        // Tap 0 starts the accumulators with plain smull; taps 1-7 are
        // folded in by the extmlal invocations below (one per offset).
    185        smull           v1.4s,  v5.4h,  v0.h[0]
    186        smull           v24.4s, v16.4h, v0.h[0]
    187 .if \size >= 8
    188        smull2          v2.4s,  v5.8h,  v0.h[0]
    189        smull2          v25.4s, v16.8h, v0.h[0]
    190 .endif
    191 .if \size >= 16
    192        smull           v3.4s,  v6.4h,  v0.h[0]
    193        smull           v26.4s, v17.4h, v0.h[0]
    194        smull2          v4.4s,  v6.8h,  v0.h[0]
    195        smull2          v27.4s, v17.8h, v0.h[0]
    196 .endif
    197        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 1, \size
    198        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 2, \size
    199        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 3, \size
    200        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 4, \size
    201        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 5, \size
    202        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 6, \size
    203        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 7, \size
    204 
    205        // Round, shift and saturate
    206        // The sqrshrun takes care of clamping negative values to zero, but
    207        // we manually need to do umin with the max pixel value.
    208        sqrshrun        v1.4h,  v1.4s,  #7
    209        sqrshrun        v24.4h, v24.4s, #7
    210 .if \size >= 8
    211        sqrshrun2       v1.8h,  v2.4s,  #7
    212        sqrshrun2       v24.8h, v25.4s, #7
        // v31 holds the max pixel value, set up by the entry point
    213        umin            v1.8h,  v1.8h,  v31.8h
    214        umin            v24.8h, v24.8h, v31.8h
    215 .if \size >= 16
    216        sqrshrun        v2.4h,  v3.4s,  #7
    217        sqrshrun        v25.4h, v26.4s, #7
    218        sqrshrun2       v2.8h,  v4.4s,  #7
    219        sqrshrun2       v25.8h, v27.4s, #7
    220        umin            v2.8h,  v2.8h,  v31.8h
    221        umin            v25.8h, v25.8h, v31.8h
    222 .endif
    223 .else
    224        umin            v1.4h,  v1.4h,  v31.4h
    225        umin            v24.4h, v24.4h, v31.4h
    226 .endif
    227        // Average
    228 .ifc \type,avg
    229 .if \size >= 16
    230        ld1             {v3.8h,  v4.8h},  [x0]
    231        ld1             {v29.8h, v30.8h}, [x6]
    232        urhadd          v1.8h,  v1.8h,  v3.8h
    233        urhadd          v2.8h,  v2.8h,  v4.8h
    234        urhadd          v24.8h, v24.8h, v29.8h
    235        urhadd          v25.8h, v25.8h, v30.8h
    236 .elseif \size >= 8
    237        ld1             {v3.8h},  [x0]
    238        ld1             {v4.8h},  [x6]
    239        urhadd          v1.8h,  v1.8h,  v3.8h
    240        urhadd          v24.8h, v24.8h, v4.8h
    241 .else
    242        ld1             {v3.4h},  [x0]
    243        ld1             {v4.4h},  [x6]
    244        urhadd          v1.4h,  v1.4h,  v3.4h
    245        urhadd          v24.4h, v24.4h, v4.4h
    246 .endif
    247 .endif
    248        // Store and loop horizontally (for size >= 16)
    249 .if \size >= 16
    250        subs            x9,  x9,  #32
    251        st1             {v1.8h,  v2.8h},  [x0], #32
    252        st1             {v24.8h, v25.8h}, [x6], #32
    253        b.eq            3f
        // Shift the pipeline: the last loaded qword becomes the first
        // of the next 16-pixel chunk, then refill two more qwords.
    254        mov             v5.16b,  v7.16b
    255        mov             v16.16b, v18.16b
    256        ld1             {v6.8h,  v7.8h},  [x2], #32
    257        ld1             {v17.8h, v18.8h}, [x7], #32
    258        b               2b
    259 .elseif \size == 8
    260        st1             {v1.8h},  [x0]
    261        st1             {v24.8h}, [x6]
    262 .else // \size == 4
    263        st1             {v1.4h},  [x0]
    264        st1             {v24.4h}, [x6]
    265 .endif
    266 3:
    267        // Loop vertically
    268        add             x0,  x0,  x1
    269        add             x6,  x6,  x1
    270        add             x2,  x2,  x3
    271        add             x7,  x7,  x3
    272        subs            w4,  w4,  #2
    273        b.ne            1b
    274        ret
    275 endfunc
    276 .endm
    277 
    278 .macro do_8tap_h_size size
        // Instantiate both the put and avg variants of the horizontal
        // filter core for one block width.
    279 do_8tap_h put, \size
    280 do_8tap_h avg, \size
    281 .endm
    282 
        // Cores for 4/8/16 pixels in parallel; wider blocks reuse the
        // 16-pixel core and loop horizontally.
    283 do_8tap_h_size 4
    284 do_8tap_h_size 8
    285 do_8tap_h_size 16
    286 
    287 .macro do_8tap_h_func type, filter, offset, size, bpp
        // Public horizontal-filter entry point.  Sets up the clamp
        // vector and the filter pointer, then tail-calls the shared
        // core (which expects the width in bytes in x5, height in w4
        // and the filter coefficient pointer in x9).
    288 function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
        // v31 = max pixel value, (1 << bpp) - 1 (0x03ff / 0x0fff)
    289        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
    290        movrel          x6,  X(ff_vp9_subpel_filters), 256*\offset
        // NOTE(review): a stray "cmp w5, #8" was removed here — its
        // condition flags were never consumed (every conditional branch
        // in the cores is immediately preceded by a flag-setting subs).
        // x9 = &filters[offset][mx] (16 bytes per filter entry)
    291        add             x9,  x6,  w5, uxtw #4
    292        mov             x5,  #2*\size
    293 .if \size >= 16
    294        b               \type\()_8tap_16h
    295 .else
    296        b               \type\()_8tap_\size\()h
    297 .endif
    298 endfunc
    299 .endm
    301 
    302 .macro do_8tap_h_filters size, bpp
        // One public entry point per (type, filter) pair; the offset
        // selects the coefficient table inside ff_vp9_subpel_filters.
    303 do_8tap_h_func put, regular, 1, \size, \bpp
    304 do_8tap_h_func avg, regular, 1, \size, \bpp
    305 do_8tap_h_func put, sharp,   2, \size, \bpp
    306 do_8tap_h_func avg, sharp,   2, \size, \bpp
    307 do_8tap_h_func put, smooth,  0, \size, \bpp
    308 do_8tap_h_func avg, smooth,  0, \size, \bpp
    309 .endm
    310 
    311 .macro do_8tap_h_filters_bpp bpp
    312 do_8tap_h_filters 64, \bpp
    313 do_8tap_h_filters 32, \bpp
    314 do_8tap_h_filters 16, \bpp
    315 do_8tap_h_filters 8,  \bpp
    316 do_8tap_h_filters 4,  \bpp
    317 .endm
    318 
        // Instantiate all horizontal entry points for 10 and 12 bpp.
    319 do_8tap_h_filters_bpp 10
    320 do_8tap_h_filters_bpp 12
    321 
    322 
    323 // Vertical filters
    324 
    325 // Round, shift and saturate and store reg1-reg4
    326 .macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type
        // Narrow the 32-bit accumulators with a rounding 7-bit shift
        // (saturating negatives to zero), clamp to \minreg (the max
        // pixel value), optionally average with the existing dst rows
        // read via x7 (avg only), and store four 4-pixel rows via x0.
    327        sqrshrun        \reg1\().4h,  \reg1\().4s, #7
    328        sqrshrun        \reg2\().4h,  \reg2\().4s, #7
    329        sqrshrun        \reg3\().4h,  \reg3\().4s, #7
    330        sqrshrun        \reg4\().4h,  \reg4\().4s, #7
    331 .ifc \type,avg
    332        ld1             {\tmp1\().4h},  [x7], x1
    333        ld1             {\tmp2\().4h},  [x7], x1
    334        ld1             {\tmp3\().4h},  [x7], x1
    335        ld1             {\tmp4\().4h},  [x7], x1
    336 .endif
    337        umin            \reg1\().4h,  \reg1\().4h,  \minreg\().4h
    338        umin            \reg2\().4h,  \reg2\().4h,  \minreg\().4h
    339        umin            \reg3\().4h,  \reg3\().4h,  \minreg\().4h
    340        umin            \reg4\().4h,  \reg4\().4h,  \minreg\().4h
    341 .ifc \type,avg
    342        urhadd          \reg1\().4h,  \reg1\().4h,  \tmp1\().4h
    343        urhadd          \reg2\().4h,  \reg2\().4h,  \tmp2\().4h
    344        urhadd          \reg3\().4h,  \reg3\().4h,  \tmp3\().4h
    345        urhadd          \reg4\().4h,  \reg4\().4h,  \tmp4\().4h
    346 .endif
    347        st1             {\reg1\().4h},  [x0], x1
    348        st1             {\reg2\().4h},  [x0], x1
    349        st1             {\reg3\().4h},  [x0], x1
    350        st1             {\reg4\().4h},  [x0], x1
    351 .endm
    352 
    353 // Round, shift and saturate and store reg1-8, where
    354 // reg1-2, reg3-4 etc pairwise correspond to 4 rows.
    355 .macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type
        // As do_store4, but for four 8-pixel rows: each row is narrowed
        // from a pair of 4s accumulators into one 8h register with a
        // rounding 7-bit shift.  Once narrowed, reg5-reg8 are free and
        // are reused (avg only) to load the existing dst rows via x7.
    356        sqrshrun        \reg1\().4h,  \reg1\().4s, #7
    357        sqrshrun2       \reg1\().8h,  \reg2\().4s, #7
    358        sqrshrun        \reg2\().4h,  \reg3\().4s, #7
    359        sqrshrun2       \reg2\().8h,  \reg4\().4s, #7
    360        sqrshrun        \reg3\().4h,  \reg5\().4s, #7
    361        sqrshrun2       \reg3\().8h,  \reg6\().4s, #7
    362        sqrshrun        \reg4\().4h,  \reg7\().4s, #7
    363        sqrshrun2       \reg4\().8h,  \reg8\().4s, #7
    364 .ifc \type,avg
    365        ld1             {\reg5\().8h},  [x7], x1
    366        ld1             {\reg6\().8h},  [x7], x1
    367        ld1             {\reg7\().8h},  [x7], x1
    368        ld1             {\reg8\().8h},  [x7], x1
    369 .endif
        // clamp to the max pixel value in \minreg
    370        umin            \reg1\().8h,  \reg1\().8h,  \minreg\().8h
    371        umin            \reg2\().8h,  \reg2\().8h,  \minreg\().8h
    372        umin            \reg3\().8h,  \reg3\().8h,  \minreg\().8h
    373        umin            \reg4\().8h,  \reg4\().8h,  \minreg\().8h
    374 .ifc \type,avg
    375        urhadd          \reg1\().8h,  \reg1\().8h,  \reg5\().8h
    376        urhadd          \reg2\().8h,  \reg2\().8h,  \reg6\().8h
    377        urhadd          \reg3\().8h,  \reg3\().8h,  \reg7\().8h
    378        urhadd          \reg4\().8h,  \reg4\().8h,  \reg8\().8h
    379 .endif
    380        st1             {\reg1\().8h},  [x0], x1
    381        st1             {\reg2\().8h},  [x0], x1
    382        st1             {\reg3\().8h},  [x0], x1
    383        st1             {\reg4\().8h},  [x0], x1
    384 .endm
    385 
    386 // Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
    387 // (src1-src8 into dst1, src2-src9 into dst2).
    388 .macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
        // 4-wide vertical 8-tap: even taps accumulate into dst1/dst2,
        // odd taps into tmp1/tmp2, and the two partial sums are added
        // at the end (shortens the accumulator dependency chain).
    389        smull           \dst1\().4s, \src1\().4h, v0.h[0]
    390        smull           \dst2\().4s, \src2\().4h, v0.h[0]
    391        smull           \tmp1\().4s, \src2\().4h, v0.h[1]
    392        smull           \tmp2\().4s, \src3\().4h, v0.h[1]
    393        smlal           \dst1\().4s, \src3\().4h, v0.h[2]
    394        smlal           \dst2\().4s, \src4\().4h, v0.h[2]
    395        smlal           \tmp1\().4s, \src4\().4h, v0.h[3]
    396        smlal           \tmp2\().4s, \src5\().4h, v0.h[3]
    397        smlal           \dst1\().4s, \src5\().4h, v0.h[4]
    398        smlal           \dst2\().4s, \src6\().4h, v0.h[4]
    399        smlal           \tmp1\().4s, \src6\().4h, v0.h[5]
    400        smlal           \tmp2\().4s, \src7\().4h, v0.h[5]
    401        smlal           \dst1\().4s, \src7\().4h, v0.h[6]
    402        smlal           \dst2\().4s, \src8\().4h, v0.h[6]
    403        smlal           \tmp1\().4s, \src8\().4h, v0.h[7]
    404        smlal           \tmp2\().4s, \src9\().4h, v0.h[7]
    405        add             \dst1\().4s, \dst1\().4s, \tmp1\().4s
    406        add             \dst2\().4s, \dst2\().4s, \tmp2\().4s
    407 .endm
    408 
    409 // Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst4
    410 // (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4).
    411 .macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9
        // 8-wide vertical 8-tap for two output rows: row A (src1-src8)
        // accumulates into dst1 (low half) / dst2 (high half), row B
        // (src2-src9) into dst3/dst4.  v0 holds the filter taps.
    412        smull           \dst1\().4s, \src1\().4h, v0.h[0]
    413        smull2          \dst2\().4s, \src1\().8h, v0.h[0]
    414        smull           \dst3\().4s, \src2\().4h, v0.h[0]
    415        smull2          \dst4\().4s, \src2\().8h, v0.h[0]
    416        smlal           \dst1\().4s, \src2\().4h, v0.h[1]
    417        smlal2          \dst2\().4s, \src2\().8h, v0.h[1]
    418        smlal           \dst3\().4s, \src3\().4h, v0.h[1]
    419        smlal2          \dst4\().4s, \src3\().8h, v0.h[1]
    420        smlal           \dst1\().4s, \src3\().4h, v0.h[2]
    421        smlal2          \dst2\().4s, \src3\().8h, v0.h[2]
    422        smlal           \dst3\().4s, \src4\().4h, v0.h[2]
    423        smlal2          \dst4\().4s, \src4\().8h, v0.h[2]
    424        smlal           \dst1\().4s, \src4\().4h, v0.h[3]
    425        smlal2          \dst2\().4s, \src4\().8h, v0.h[3]
    426        smlal           \dst3\().4s, \src5\().4h, v0.h[3]
    427        smlal2          \dst4\().4s, \src5\().8h, v0.h[3]
    428        smlal           \dst1\().4s, \src5\().4h, v0.h[4]
    429        smlal2          \dst2\().4s, \src5\().8h, v0.h[4]
    430        smlal           \dst3\().4s, \src6\().4h, v0.h[4]
    431        smlal2          \dst4\().4s, \src6\().8h, v0.h[4]
    432        smlal           \dst1\().4s, \src6\().4h, v0.h[5]
    433        smlal2          \dst2\().4s, \src6\().8h, v0.h[5]
    434        smlal           \dst3\().4s, \src7\().4h, v0.h[5]
    435        smlal2          \dst4\().4s, \src7\().8h, v0.h[5]
    436        smlal           \dst1\().4s, \src7\().4h, v0.h[6]
    437        smlal2          \dst2\().4s, \src7\().8h, v0.h[6]
    438        smlal           \dst3\().4s, \src8\().4h, v0.h[6]
    439        smlal2          \dst4\().4s, \src8\().8h, v0.h[6]
    440        smlal           \dst1\().4s, \src8\().4h, v0.h[7]
    441        smlal2          \dst2\().4s, \src8\().8h, v0.h[7]
    442        smlal           \dst3\().4s, \src9\().4h, v0.h[7]
    443        smlal2          \dst4\().4s, \src9\().8h, v0.h[7]
    444 .endm
    445 
    446 // Instantiate a vertical filter function for filtering 8 pixels at a time.
    447 // The height is passed in x4, the width in x5 and the filter coefficients
    448 // in x6.
    449 .macro do_8tap_8v type
    450 function \type\()_8tap_8v
        // Back src up 3 rows so the 8-tap window is centred vertically.
    451        sub             x2,  x2,  x3, lsl #1
    452        sub             x2,  x2,  x3
    453        ld1             {v0.8h},  [x6]
    454 1:
    455 .ifc \type,avg
        // x7 = dst read pointer for averaging (x0 is the write pointer)
    456        mov             x7,  x0
    457 .endif
        // x6 is reused as the remaining-rows counter for this column
    458        mov             x6,  x4
    459 
        // Prime the 7-row history; the main loop below is unrolled
        // three times, rotating through v16-v27 so each stage only has
        // to load 4 new rows and produces 4 output rows.
    460        ld1             {v17.8h}, [x2], x3
    461        ld1             {v18.8h}, [x2], x3
    462        ld1             {v19.8h}, [x2], x3
    463        ld1             {v20.8h}, [x2], x3
    464        ld1             {v21.8h}, [x2], x3
    465        ld1             {v22.8h}, [x2], x3
    466        ld1             {v23.8h}, [x2], x3
    467 2:
    468        ld1             {v24.8h}, [x2], x3
    469        ld1             {v25.8h}, [x2], x3
    470        ld1             {v26.8h}, [x2], x3
    471        ld1             {v27.8h}, [x2], x3
    472 
    473        convolve8       v2,  v3,  v4,  v5,  v17, v18, v19, v20, v21, v22, v23, v24, v25
    474        convolve8       v6,  v7,  v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27
    475        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v30, v31, v1,  \type
    476 
    477        subs            x6,  x6,  #4
    478        b.eq            8f
    479 
    480        ld1             {v16.8h}, [x2], x3
    481        ld1             {v17.8h}, [x2], x3
    482        ld1             {v18.8h}, [x2], x3
    483        ld1             {v19.8h}, [x2], x3
    484        convolve8       v2,  v3,  v4,  v5,  v21, v22, v23, v24, v25, v26, v27, v16, v17
    485        convolve8       v6,  v7,  v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19
    486        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v20, v21, v1,  \type
    487 
    488        subs            x6,  x6,  #4
    489        b.eq            8f
    490 
    491        ld1             {v20.8h}, [x2], x3
    492        ld1             {v21.8h}, [x2], x3
    493        ld1             {v22.8h}, [x2], x3
    494        ld1             {v23.8h}, [x2], x3
    495        convolve8       v2,  v3,  v4,  v5,  v25, v26, v27, v16, v17, v18, v19, v20, v21
    496        convolve8       v6,  v7,  v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23
    497        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v24, v25, v1,  \type
    498 
    499        subs            x6,  x6,  #4
    500        b.ne            2b
    501 
    502 8:
        // Column done; step right to the next 8-pixel (16-byte) column.
    503        subs            x5,  x5,  #8
    504        b.eq            9f
    505        // x0 -= h * dst_stride
    506        msub            x0,  x1,  x4, x0
    507        // x2 -= h * src_stride
    508        msub            x2,  x3,  x4, x2
    509        // x2 -= 8 * src_stride
    510        sub             x2,  x2,  x3, lsl #3
    511        // x2 += 1 * src_stride
    512        add             x2,  x2,  x3
    513        add             x2,  x2,  #16
    514        add             x0,  x0,  #16
    515        b               1b
    516 9:
    517        ret
    518 endfunc
    519 .endm
    520 
    521 do_8tap_8v put
    522 do_8tap_8v avg
    523 
    524 
    525 // Instantiate a vertical filter function for filtering a 4 pixels wide
    526 // slice. This only is designed to work for 4 or 8 output lines.
    527 .macro do_8tap_4v type
    528 function \type\()_8tap_4v
        // Back src up 3 rows so the 8-tap window is centred vertically.
    529        sub             x2,  x2,  x3, lsl #1
    530        sub             x2,  x2,  x3
    531        ld1             {v0.8h},  [x6]
    532 .ifc \type,avg
        // x7 = dst read pointer for averaging (x0 is the write pointer)
    533        mov             x7,  x0
    534 .endif
    535 
        // Load the 11 rows needed for the first 4 output rows.
    536        ld1             {v16.4h}, [x2], x3
    537        ld1             {v17.4h}, [x2], x3
    538        ld1             {v18.4h}, [x2], x3
    539        ld1             {v19.4h}, [x2], x3
    540        ld1             {v20.4h}, [x2], x3
    541        ld1             {v21.4h}, [x2], x3
    542        ld1             {v22.4h}, [x2], x3
    543        ld1             {v23.4h}, [x2], x3
    544        ld1             {v24.4h}, [x2], x3
    545        ld1             {v25.4h}, [x2], x3
    546        ld1             {v26.4h}, [x2], x3
    547 
    548        convolve4       v2,  v3,  v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31
    549        convolve4       v4,  v5,  v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31
    550        do_store4       v2,  v3,  v4,  v5,  v28, v29, v30, v31, v1,  \type
    551 
        // Only h == 4 or h == 8 is supported (see comment above the
        // macro): either return now or produce exactly 4 more rows.
    552        subs            x4,  x4,  #4
    553        b.eq            9f
    554 
    555        ld1             {v27.4h}, [x2], x3
    556        ld1             {v28.4h}, [x2], x3
    557        ld1             {v29.4h}, [x2], x3
    558        ld1             {v30.4h}, [x2], x3
    559 
    560        convolve4       v2,  v3,  v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17
    561        convolve4       v4,  v5,  v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17
    562        do_store4       v2,  v3,  v4,  v5,  v16, v17, v18, v19, v1,  \type
    563 
    564 9:
    565        ret
    566 endfunc
    567 .endm
    568 
    569 do_8tap_4v put
    570 do_8tap_4v avg
    571 
    572 
    573 .macro do_8tap_v_func type, filter, offset, size, bpp
        // Public vertical-filter entry point: set up the clamp vector
        // (v1) and filter pointer (x6), then tail-call the shared core
        // with the width in x5 and the height in x4.
    574 function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
        // zero-extend h: the cores use x4 in 64-bit arithmetic (msub)
    575        uxtw            x4,  w4
        // v1 = max pixel value, (1 << bpp) - 1
    576        mvni            v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
    577        movrel          x5,  X(ff_vp9_subpel_filters), 256*\offset
        // x6 = &filters[offset][my] (16 bytes per filter entry)
    578        add             x6,  x5,  w6, uxtw #4
    579        mov             x5,  #\size
    580 .if \size >= 8
    581        b               \type\()_8tap_8v
    582 .else
    583        b               \type\()_8tap_4v
    584 .endif
    585 endfunc
    586 .endm
    587 
    588 .macro do_8tap_v_filters size, bpp
        // One public entry point per (type, filter) pair; the offset
        // selects the coefficient table inside ff_vp9_subpel_filters.
    589 do_8tap_v_func put, regular, 1, \size, \bpp
    590 do_8tap_v_func avg, regular, 1, \size, \bpp
    591 do_8tap_v_func put, sharp,   2, \size, \bpp
    592 do_8tap_v_func avg, sharp,   2, \size, \bpp
    593 do_8tap_v_func put, smooth,  0, \size, \bpp
    594 do_8tap_v_func avg, smooth,  0, \size, \bpp
    595 .endm
    596 
    597 .macro do_8tap_v_filters_bpp bpp
    598 do_8tap_v_filters 64, \bpp
    599 do_8tap_v_filters 32, \bpp
    600 do_8tap_v_filters 16, \bpp
    601 do_8tap_v_filters 8,  \bpp
    602 do_8tap_v_filters 4,  \bpp
    603 .endm
    604 
        // Instantiate all vertical entry points for 10 and 12 bpp.
    605 do_8tap_v_filters_bpp 10
    606 do_8tap_v_filters_bpp 12