tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mc16_sve.S (58216B)


      1 /*
      2 * Copyright © 2024, Arm Limited
      3 * All rights reserved.
      4 *
      5 * Redistribution and use in source and binary forms, with or without
      6 * modification, are permitted provided that the following conditions are met:
      7 *
      8 * 1. Redistributions of source code must retain the above copyright notice, this
      9 *    list of conditions and the following disclaimer.
     10 *
     11 * 2. Redistributions in binary form must reproduce the above copyright notice,
     12 *    this list of conditions and the following disclaimer in the documentation
     13 *    and/or other materials provided with the distribution.
     14 *
     15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 */
     26 
     27 #include "src/arm/asm.S"
     28 #include "util.S"
     29 
         // Shifted-immediate operand text for SVE `sub z.h, z.h, #imm, lsl #8`
         // (see the `sub z0.h, z0.h, #PREP_BIAS` uses below): 32<<8 = 8192,
         // and 224<<8 = 57344, which wraps to -8192 in 16-bit arithmetic.
      30 #define PREP_BIAS 32, lsl #8        // 8192
      31 #define PREP_BIAS_NEG 224, lsl #8   // -8192
     32 
         // Build-time gate for the whole SVE2 path (the matching #endif is
         // later in the file, outside this chunk). ENABLE_SVE/ENABLE_SVE2 are
         // presumably assembler-feature macros from the included asm.S —
         // their definitions are not visible here.
      33 #if HAVE_SVE2
      34 ENABLE_SVE
      35 ENABLE_SVE2
     36 
      37 // No spaces in these expressions, due to gas-preprocessor. Each value is
      38 // biased by -1, folding the negative offset into the `mc_subpel_filters` address.
         // Packed filter-type selectors: two biased row offsets (n*15-1) into
         // `mc_subpel_filters`, one in bits 13:7 and one in bits 6:0. The
         // 8tap body extracts the halves with ubfx/and and picks one with
         // csel depending on whether the block dimension is <= 4.
      39 #define REGULAR1        (((0*15-1)<<7)|(3*15-1))
      40 #define SMOOTH1         (((1*15-1)<<7)|(4*15-1))
      41 #define SHARP1          (((2*15-1)<<7)|(3*15-1))
     42 
         // Power-of-two alignment exponents for `.align` (on AArch64 GAS,
         // `.align 2` means 2^2 = 4-byte alignment), applied respectively to
         // function entries, jump targets and loop heads.
      43 #define FUNC_ALIGN      2
      44 #define JUMP_ALIGN      2
      45 #define LOOP_ALIGN      2
     46 
     47 
     48 // Shuffle indices to permute horizontal samples in preparation for input to
     49 // 16-bit SDOT instructions. The 8-tap horizontal convolution uses sample
     50 // indices in the interval of [-3, 4] relative to the current sample position.
      51 const h_tbl_sve, align=4
         // Two TBL index rows; each 8-byte half selects four consecutive
         // 16-bit samples, with successive halves advancing by one sample
         // (2 bytes): sample windows [0..3],[1..4] then [2..5],[3..6].
      52        .byte  0,  1,  2,  3,  4,  5,  6,  7,   2,  3,  4,  5,  6,  7,  8,  9
      53        .byte  4,  5,  6,  7,  8,  9, 10, 11,   6,  7,  8,  9, 10, 11, 12, 13
      54 endconst
     55 
     56 // Vertical convolutions also use 16-bit SDOT instructions, where two 128-bit
     57 // registers contain a transposed 4x4 matrix of values. Subsequent iterations
     58 // of the vertical convolution can reuse the 3x4 sub-matrix from the previous
     59 // loop iteration. These shuffle indices shift and merge this 4x4 matrix with
     60 // the values of a new line.
      61 const v_tbl_sve, align=4
         // TBL merge masks over a two-register table: byte indices 2..7 and
         // 10..15 keep three of the four 16-bit values in each half of the
         // first source register (dropping the oldest sample), while indices
         // >= 16 insert replacement 16-bit values taken from the second
         // source register. The five rows differ only in which bytes of the
         // second register they pull in.
      62        .byte  2,  3,  4,  5,  6,  7, 16, 17,  10, 11, 12, 13, 14, 15, 24, 25
      63        .byte  2,  3,  4,  5,  6,  7, 16, 17,  10, 11, 12, 13, 14, 15, 18, 19
      64        .byte  2,  3,  4,  5,  6,  7, 20, 21,  10, 11, 12, 13, 14, 15, 22, 23
      65        .byte  2,  3,  4,  5,  6,  7, 24, 25,  10, 11, 12, 13, 14, 15, 26, 27
      66        .byte  2,  3,  4,  5,  6,  7, 28, 29,  10, 11, 12, 13, 14, 15, 30, 31
      67 endconst
     68 
     69 
         // Emits one exported entry point, \op\()_8tap_\type\()_16bpc_\isa,
         // for a single (horizontal, vertical) filter-type pairing. The
         // packed selectors (REGULAR1/SMOOTH1/SHARP1) land in x9 (horizontal)
         // and x10 (vertical), then control branches to the shared
         // \op\()_8tap_\isa body. jump=0 is used only by the final variant,
         // which is placed immediately before that body and falls through.
      70 .macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
      71 function \op\()_8tap_\type\()_16bpc_\isa, export=1, align=FUNC_ALIGN
      72        mov             x9,  \type_h
      73        mov             x10, \type_v
      74    .if \jump
      75        b               \op\()_8tap_\isa
      76    .endif
      77 endfunc
      78 .endm
     79 
         // Instantiates the nine regular/smooth/sharp H/V filter combinations
         // for one operation (`type` is put or prep, per the .ifc tests in
         // the body below). The remaining macro arguments are register
         // aliases consumed by the shared body — presumably bound to the
         // put/prep register assignments at the instantiation site, which is
         // not visible in this chunk (TODO confirm against the full file).
      80 .macro filter_8tap_fn type, isa, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, xmx, xmy, ldst, lsrc, wd_strd, ws_strd
      81 make_8tap_fn \type, sharp,          SHARP1,   SHARP1,   \isa
      82 make_8tap_fn \type, sharp_smooth,   SHARP1,   SMOOTH1,  \isa
      83 make_8tap_fn \type, sharp_regular,  SHARP1,   REGULAR1, \isa
      84 make_8tap_fn \type, smooth_sharp,   SMOOTH1,  SHARP1,   \isa
      85 make_8tap_fn \type, smooth,         SMOOTH1,  SMOOTH1,  \isa
      86 make_8tap_fn \type, smooth_regular, SMOOTH1,  REGULAR1, \isa
      87 make_8tap_fn \type, regular_sharp,  REGULAR1, SHARP1,   \isa
      88 make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1,  \isa
      89 make_8tap_fn \type, regular,        REGULAR1, REGULAR1, \isa, jump=0
     90 
     91 function \type\()_8tap_\isa, align=FUNC_ALIGN
     92        clz             w8, \w
     93        mov             w11, #0x4081                    // (1<<14) | (1<<7) | 1
     94        ptrue           p0.b, vl16
     95        sub             w8, w8, #24                     // for jump tables
     96        movrel          x12, X(mc_subpel_filters)
     97        cbnz            \mx, L(\type\()_8tap_h_hv_\isa)
     98 .ifc \type, prep
     99        cbz             \my, prep_sve
    100 .else   // put
    101        cbnz            \my, L(\type\()_8tap_v_\isa)
    102        mov             w9, w8
    103        b               X(put_16bpc_neon)
    104 
    105        .align JUMP_ALIGN
    106 .endif
    107 
    108 L(\type\()_8tap_v_\isa):
    109        madd            \my, \my, w11, w10
    110        movrel          x13, v_tbl_sve
    111 .ifc \bdmax, w8                                         // put case, but skip
    112        ld1r            {v5.8h}, [sp]                   // loading into w8
    113 .endif
    114        sub             \src, \src, \s_strd             // src - s_strd
    115        ubfx            w11, \my, #7, #7
    116        and             \my, \my, #0x7F
    117        ldr             q6, [x13]
    118        cmp             \h, #4
    119        csel            \my, \my, w11, le
    120        sub             \src, \src, \s_strd, lsl #1     // src - 3 * s_strd
    121        add             \xmy, x12, \xmy, lsl #3         // subpel V filter address
    122        ldp             q28, q29, [x13, #16]
    123        ld1sb           {z7.h}, p0/z, [\xmy]
    124 .ifc \type, prep
    125        clz             \bdmax, \bdmax
    126        sub             \bdmax, \bdmax, #24
    127        dup             v5.4s, \bdmax
    128 .endif
    129        cmp             \w, #8
    130        b.lt            40f
    131 
    132        // .align JUMP_ALIGN   // fallthrough
    133 80:     // V - 8xN+
    134        ldp             q30, q31, [x13, #48]
    135 .ifc \type, prep
    136        add             \wd_strd, \w, \w                // d_strd = 2 * w
    137 .endif
    138        .align LOOP_ALIGN
    139 81:
    140        add             \lsrc, \src, \s_strd, lsl #1
    141 
    142        ldr             q16, [\src]
    143        ldr             q17, [\src, \s_strd]
    144        ldr             q18, [\lsrc]
    145        ldr             q19, [\lsrc, \s_strd]
    146        add             \lsrc, \lsrc, \s_strd, lsl #1
    147        mov             \ldst, \dst
    148 
    149        ldr             q20, [\lsrc]
    150        ldr             q21, [\lsrc, \s_strd]
    151        add             \lsrc, \lsrc, \s_strd, lsl #1
    152        ldr             q22, [\lsrc]
    153        ldr             q23, [\lsrc, \s_strd]
    154        add             \lsrc, \lsrc, \s_strd, lsl #1
    155        sub             w8, \h, #1
    156 
    157        zip1            v0.8h, v16.8h, v17.8h
    158        zip2            v1.8h, v16.8h, v17.8h
    159        zip1            v2.8h, v18.8h, v19.8h
    160        zip2            v3.8h, v18.8h, v19.8h
    161 
    162        zip1            v18.8h, v20.8h, v21.8h
    163        zip2            v21.8h, v20.8h, v21.8h
    164        zip1            v24.8h, v22.8h, v23.8h
    165        zip2            v27.8h, v22.8h, v23.8h
    166 
    167        zip1            v16.4s, v0.4s, v2.4s
    168        zip2            v19.4s, v0.4s, v2.4s
    169        zip1            v22.4s, v1.4s, v3.4s
    170        zip2            v25.4s, v1.4s, v3.4s
    171 
    172        zip1            v17.4s, v18.4s, v24.4s
    173        zip2            v20.4s, v18.4s, v24.4s
    174        zip1            v23.4s, v21.4s, v27.4s
    175        zip2            v26.4s, v21.4s, v27.4s
    176 
    177        .align LOOP_ALIGN
    178 8:
    179        ld1             {v18.16b}, [\lsrc], \s_strd
    180 
    181        movi            v0.2d, #0
    182        movi            v1.2d, #0
    183        movi            v2.2d, #0
    184        movi            v3.2d, #0
    185        mov             v21.16b, v18.16b
    186        mov             v24.16b, v18.16b
    187        mov             v27.16b, v18.16b
    188 
    189        sdot            z0.d, z16.h, z7.h[0]
    190        tbl             v16.16b, {v16.16b, v17.16b}, v6.16b
    191        sdot            z1.d, z19.h, z7.h[0]
    192        tbl             v19.16b, {v19.16b, v20.16b}, v6.16b
    193        sdot            z2.d, z22.h, z7.h[0]
    194        tbl             v22.16b, {v22.16b, v23.16b}, v6.16b
    195        subs            w8, w8, #1
    196        sdot            z3.d, z25.h, z7.h[0]
    197        tbl             v25.16b, {v25.16b, v26.16b}, v6.16b
    198 
    199        sdot            z0.d, z17.h, z7.h[1]
    200        tbl             v17.16b, {v17.16b, v18.16b}, v28.16b
    201        sdot            z1.d, z20.h, z7.h[1]
    202        tbl             v20.16b, {v20.16b, v21.16b}, v29.16b
    203        sdot            z2.d, z23.h, z7.h[1]
    204        tbl             v23.16b, {v23.16b, v24.16b}, v30.16b
    205        sdot            z3.d, z26.h, z7.h[1]
    206        tbl             v26.16b, {v26.16b, v27.16b}, v31.16b
    207 
    208        uzp1            v0.4s, v0.4s, v1.4s
    209        uzp1            v1.4s, v2.4s, v3.4s
    210 .ifc \type, prep
    211        srshl           v0.4s, v0.4s, v5.4s
    212        srshl           v1.4s, v1.4s, v5.4s
    213        uzp1            v0.8h, v0.8h, v1.8h
    214        sub             z0.h, z0.h, #PREP_BIAS
    215 .else   // put
    216        sqrshrun        v0.4h, v0.4s, #6
    217        sqrshrun2       v0.8h, v1.4s, #6
    218        umin            v0.8h, v0.8h, v5.8h
    219 .endif
    220        st1             {v0.16b}, [\ldst], \d_strd
    221        b.gt            8b
    222 
    223        movi            v0.2d, #0
    224        movi            v1.2d, #0
    225        movi            v2.2d, #0
    226        movi            v3.2d, #0
    227 
    228        sdot            z0.d, z16.h, z7.h[0]
    229        sdot            z1.d, z19.h, z7.h[0]
    230        sdot            z2.d, z22.h, z7.h[0]
    231        sdot            z3.d, z25.h, z7.h[0]
    232 
    233        sdot            z0.d, z17.h, z7.h[1]
    234        sdot            z1.d, z20.h, z7.h[1]
    235        sdot            z2.d, z23.h, z7.h[1]
    236        sdot            z3.d, z26.h, z7.h[1]
    237        subs            \w, \w, #8
    238 
    239        uzp1            v0.4s, v0.4s, v1.4s
    240        uzp1            v1.4s, v2.4s, v3.4s
    241 .ifc \type, prep
    242        srshl           v0.4s, v0.4s, v5.4s
    243        srshl           v1.4s, v1.4s, v5.4s
    244        uzp1            v0.8h, v0.8h, v1.8h
    245        sub             z0.h, z0.h, #PREP_BIAS
    246 .else   // put
    247        sqrshrun        v0.4h, v0.4s, #6
    248        sqrshrun2       v0.8h, v1.4s, #6
    249        umin            v0.8h, v0.8h, v5.8h
    250 .endif
    251        str             q0, [\ldst]
    252 
    253        add             \dst, \dst, #16
    254        add             \src, \src, #16
    255        b.gt            81b
    256        ret
    257 
    258        .align JUMP_ALIGN
    259 40:     // V - 4xN, put only: 2xN
    260 .ifc \type, put
    261        lsr             \d_strd, \d_strd, #1        // hword index for `st1h`
    262        whilelt         p1.h, wzr, \w               // masking for writes
    263 .endif
    264        cmp             \h, #4
    265        b.le            44f
    266 
    267        ldr             d16, [\src]
    268        ldr             d17, [\src, \s_strd]
    269        add             \src, \src, \s_strd, lsl #1
    270        ldr             d18, [\src]
    271        ldr             d19, [\src, \s_strd]
    272        add             \src, \src, \s_strd, lsl #1
    273 
    274        ldr             d20, [\src]
    275        ldr             d21, [\src, \s_strd]
    276        add             \src, \src, \s_strd, lsl #1
    277        ldr             d22, [\src]
    278        ldr             d23, [\src, \s_strd]
    279        add             \src, \src, \s_strd, lsl #1
    280        sub             \h, \h, #2
    281 
    282        zip1            v0.8h, v16.8h, v17.8h
    283        zip1            v2.8h, v18.8h, v19.8h
    284        zip1            v18.8h, v20.8h, v21.8h
    285        zip1            v24.8h, v22.8h, v23.8h
    286 
    287        zip1            v16.4s, v0.4s, v2.4s
    288        zip2            v19.4s, v0.4s, v2.4s
    289        zip1            v17.4s, v18.4s, v24.4s
    290        zip2            v20.4s, v18.4s, v24.4s
    291 
    292        .align LOOP_ALIGN
    293 4:
    294        ldr             d18, [\src]
    295        ldr             d24, [\src, \s_strd]
    296        add             \src, \src, \s_strd, lsl #1
    297 
    298        movi            v0.2d, #0
    299        movi            v1.2d, #0
    300        movi            v2.2d, #0
    301        movi            v3.2d, #0
    302        mov             v21.16b, v18.16b
    303        mov             v27.16b, v24.16b
    304 
    305        sdot            z0.d, z16.h, z7.h[0]
    306        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
    307        sdot            z1.d, z19.h, z7.h[0]
    308        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
    309        sdot            z0.d, z17.h, z7.h[1]
    310        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
    311        sdot            z1.d, z20.h, z7.h[1]
    312        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b
    313        subs            \h, \h, #2
    314 
    315        sdot            z2.d, z22.h, z7.h[0]
    316        tbl             v16.16b, {v22.16b, v23.16b}, v6.16b
    317        sdot            z3.d, z25.h, z7.h[0]
    318        tbl             v19.16b, {v25.16b, v26.16b}, v6.16b
    319        sdot            z2.d, z23.h, z7.h[1]
    320        tbl             v17.16b, {v23.16b, v24.16b}, v28.16b
    321        sdot            z3.d, z26.h, z7.h[1]
    322        tbl             v20.16b, {v26.16b, v27.16b}, v29.16b
    323 
    324        uzp1            v0.4s, v0.4s, v1.4s
    325        uzp1            v1.4s, v2.4s, v3.4s
    326 .ifc \type, prep
    327        srshl           v0.4s, v0.4s, v5.4s
    328        srshl           v1.4s, v1.4s, v5.4s
    329        uzp1            v0.8h, v0.8h, v1.8h
    330        sub             z0.h, z0.h, #PREP_BIAS
    331        str             q0, [\dst], #16
    332 .else   // put
    333        sqrshrun        v0.4h, v0.4s, #6
    334        sqrshrun        v1.4h, v1.4s, #6
    335        umin            v0.4h, v0.4h, v5.4h
    336        umin            v1.4h, v1.4h, v5.4h
    337        st1h            {z0.h}, p1, [\dst]
    338        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
    339        add             \dst, \dst, \d_strd, lsl #2
    340 .endif
    341        b.gt            4b
    342 
    343        ldr             d18, [\src]
    344 
    345        movi            v0.2d, #0
    346        movi            v1.2d, #0
    347        movi            v2.2d, #0
    348        movi            v3.2d, #0
    349        mov             v21.16b, v18.16b
    350 
    351        sdot            z0.d, z16.h, z7.h[0]
    352        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
    353        sdot            z1.d, z19.h, z7.h[0]
    354        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
    355        sdot            z0.d, z17.h, z7.h[1]
    356        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
    357        sdot            z1.d, z20.h, z7.h[1]
    358        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b
    359 
    360        sdot            z2.d, z22.h, z7.h[0]
    361        sdot            z3.d, z25.h, z7.h[0]
    362        sdot            z2.d, z23.h, z7.h[1]
    363        sdot            z3.d, z26.h, z7.h[1]
    364 
    365        uzp1            v0.4s, v0.4s, v1.4s
    366        uzp1            v1.4s, v2.4s, v3.4s
    367 .ifc \type, prep
    368        srshl           v0.4s, v0.4s, v5.4s
    369        srshl           v1.4s, v1.4s, v5.4s
    370        uzp1            v0.8h, v0.8h, v1.8h
    371        sub             z0.h, z0.h, #PREP_BIAS
    372        str             q0, [\dst]
    373 .else   // put
    374        sqrshrun        v0.4h, v0.4s, #6
    375        sqrshrun        v1.4h, v1.4s, #6
    376        umin            v0.4h, v0.4h, v5.4h
    377        umin            v1.4h, v1.4h, v5.4h
    378        st1h            {z0.h}, p1, [\dst]
    379        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
    380 .endif
    381        ret
    382 
    383        .align JUMP_ALIGN
    384 44:     // V - 4x4, put only: 4x2, 2x4, 2x2
    385        add             \src, \src, \s_strd, lsl #1     // src - s_strd
    386        subs            \h, \h, #2
    387 
    388        ldr             d16, [\src]
    389        ldr             d17, [\src, \s_strd]
    390        add             \src, \src, \s_strd, lsl #1
    391        ldr             d18, [\src]
    392        ldr             d19, [\src, \s_strd]
    393        add             \src, \src, \s_strd, lsl #1
    394 
    395        ext             v7.16b, v7.16b, v7.16b, #4      // [\xmy + 2 * 2]
    396 
    397        zip1            v0.8h, v16.8h, v17.8h
    398        zip1            v2.8h, v18.8h, v19.8h
    399        zip1            v16.4s, v0.4s, v2.4s
    400        zip2            v19.4s, v0.4s, v2.4s
    401 
    402 .ifc \type, put
    403        b.eq            42f
    404 .endif
    405        ldr             d17, [\src]
    406        ldr             d23, [\src, \s_strd]
    407        add             \src, \src, \s_strd, lsl #1
    408 
    409        movi            v0.2d, #0
    410        movi            v1.2d, #0
    411        movi            v2.2d, #0
    412        movi            v3.2d, #0
    413        mov             v20.16b, v17.16b
    414        mov             v26.16b, v23.16b
    415 
    416        sdot            z0.d, z16.h, z7.h[0]
    417        tbl             v22.16b, {v16.16b, v17.16b}, v28.16b
    418        sdot            z1.d, z19.h, z7.h[0]
    419        tbl             v25.16b, {v19.16b, v20.16b}, v29.16b
    420        sdot            z2.d, z22.h, z7.h[0]
    421        tbl             v16.16b, {v22.16b, v23.16b}, v28.16b
    422        sdot            z3.d, z25.h, z7.h[0]
    423        tbl             v19.16b, {v25.16b, v26.16b}, v29.16b
    424 
    425        uzp1            v0.4s, v0.4s, v1.4s
    426        uzp1            v1.4s, v2.4s, v3.4s
    427 .ifc \type, prep
    428        srshl           v0.4s, v0.4s, v5.4s
    429        srshl           v1.4s, v1.4s, v5.4s
    430        uzp1            v0.8h, v0.8h, v1.8h
    431        sub             z0.h, z0.h, #PREP_BIAS
    432        str             q0, [\dst], #16
    433 .else   // put
    434        sqrshrun        v0.4h, v0.4s, #6
    435        sqrshrun        v1.4h, v1.4s, #6
    436        umin            v0.4h, v0.4h, v5.4h
    437        umin            v1.4h, v1.4h, v5.4h
    438        st1h            {z0.h}, p1, [\dst]
    439        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
    440        add             \dst, \dst, \d_strd, lsl #2
    441 .endif
    442 
    443 .ifc \type, put
    444        .align JUMP_ALIGN
    445 42:
    446 .endif
    447        ldr             d17, [\src]
    448 
    449        movi            v0.2d, #0
    450        movi            v1.2d, #0
    451        movi            v2.2d, #0
    452        movi            v3.2d, #0
    453        mov             v20.16b, v17.16b
    454 
    455        sdot            z0.d, z16.h, z7.h[0]
    456        tbl             v22.16b, {v16.16b, v17.16b}, v28.16b
    457        sdot            z1.d, z19.h, z7.h[0]
    458        tbl             v25.16b, {v19.16b, v20.16b}, v29.16b
    459 
    460        sdot            z2.d, z22.h, z7.h[0]
    461        sdot            z3.d, z25.h, z7.h[0]
    462 
    463        uzp1            v0.4s, v0.4s, v1.4s
    464        uzp1            v1.4s, v2.4s, v3.4s
    465 .ifc \type, prep
    466        srshl           v0.4s, v0.4s, v5.4s
    467        srshl           v1.4s, v1.4s, v5.4s
    468        uzp1            v0.8h, v0.8h, v1.8h
    469        sub             z0.h, z0.h, #PREP_BIAS
    470        str             q0, [\dst]
    471 .else   // put
    472        sqrshrun        v0.4h, v0.4s, #6
    473        sqrshrun        v1.4h, v1.4s, #6
    474        umin            v0.4h, v0.4h, v5.4h
    475        umin            v1.4h, v1.4h, v5.4h
    476        st1h            {z0.h}, p1, [\dst]
    477        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
    478 .endif
    479        ret
    480 
    481        .align JUMP_ALIGN
    482 L(\type\()_8tap_h_hv_\isa):
    483        madd            \mx, \mx, w11, w9
    484        movrel          x13, h_tbl_sve
    485        sub             \src, \src, #6              // src - 3 * 2
    486        ubfx            w9, \mx, #7, #7
    487        and             \mx, \mx, #0x7F
    488        cmp             \w, #4
    489        csel            \mx, \mx, w9, le
    490        ldp             q30, q31, [x13]
    491        add             \xmx, x12, \xmx, lsl #3     // subpel H filter address
    492        cbz             \my, L(\type\()_8tap_h_\isa)
    493 
    494        // HV cases
    495        madd            w14, \my, w11, w10
    496 .ifc \bdmax, w8
    497        ldr             \bdmax, [sp]
    498 .endif
    499        ubfx            w11, w14, #7, #7
    500        and             w14, w14, #0x7F
    501        ld1sb           {z4.h}, p0/z, [\xmx]
    502        cmp             \h, #4
    503        csel            w14, w14, w11, le
    504 .ifc \type, put
    505        dup             v29.8h, \bdmax
    506 .endif
    507        clz             \bdmax, \bdmax
    508        add             \xmy, x12, x14, lsl #3      // subpel V filter address
    509        ld1sb           {z7.h}, p0/z, [\xmy]
    510 .ifc \type, put
    511        mov             w9, #12
    512        sub             w9, w9, \bdmax
    513        dup             v6.4s, w9
    514 .endif
    515        sub             \bdmax, \bdmax, #24
    516        mov             x15, x30
    517        sub             \src, \src, \s_strd         // src - s_strd - 3 * 2
    518        dup             v5.4s, \bdmax
    519        cmp             w10, SHARP1
    520        b.ne            L(\type\()_6tap_hv_\isa)    // vertical != SHARP1
    521 
    522        // HV 8-tap cases
    523        cmp             \w, #4
    524        b.le            40f
    525 
    526        // .align JUMP_ALIGN    // fallthrough
    527 80:     // HV8 - 8xN+
    528 .ifc \type, prep
    529        add             \wd_strd, \w, \w                // d_strd = 2 * w
    530 .endif
    531        cmp             \h, #4
    532        b.le            84f
    533        sub             \src, \src, \s_strd, lsl #1     // src - 3 * s_strd - 3 * 2
    534 
    535        .align LOOP_ALIGN
    536 81:
    537        mov             \lsrc, \src
    538        mov             \ldst, \dst
    539        mov             w8, \h
    540 
    541        bl              L(\type\()_hv_filter8_\isa)
    542        uzp1            v16.8h, v23.8h, v24.8h
    543        bl              L(\type\()_hv_filter8_\isa)
    544        uzp1            v17.8h, v23.8h, v24.8h
    545        bl              L(\type\()_hv_filter8_\isa)
    546        uzp1            v18.8h, v23.8h, v24.8h
    547        bl              L(\type\()_hv_filter8_\isa)
    548        uzp1            v19.8h, v23.8h, v24.8h
    549        bl              L(\type\()_hv_filter8_\isa)
    550        uzp1            v20.8h, v23.8h, v24.8h
    551        bl              L(\type\()_hv_filter8_\isa)
    552        uzp1            v21.8h, v23.8h, v24.8h
    553        bl              L(\type\()_hv_filter8_\isa)
    554        uzp1            v22.8h, v23.8h, v24.8h
    555 
    556        .align LOOP_ALIGN
    557 8:
    558        ldp             q24, q28, [\lsrc]
    559        smull           v0.4s, v16.4h, v7.h[0]
    560        smull2          v1.4s, v16.8h, v7.h[0]
    561        mov             v16.16b, v17.16b
    562 
    563        movi            v2.2d, #0
    564        movi            v3.2d, #0
    565        tbl             v23.16b, {v24.16b}, v30.16b
    566        tbl             v24.16b, {v24.16b}, v31.16b
    567 
    568        ldur            q26, [\lsrc, #8]
    569        smlal           v0.4s, v17.4h, v7.h[1]
    570        smlal2          v1.4s, v17.8h, v7.h[1]
    571        mov             v17.16b, v18.16b
    572        add             \lsrc, \lsrc, \s_strd
    573 
    574        sdot            z2.d, z23.h, z4.h[0]
    575        sdot            z3.d, z24.h, z4.h[0]
    576        movi            v23.2d, #0
    577        movi            v24.2d, #0
    578        tbl             v25.16b, {v26.16b}, v30.16b
    579        tbl             v26.16b, {v26.16b}, v31.16b
    580        smlal           v0.4s, v18.4h, v7.h[2]
    581        smlal2          v1.4s, v18.8h, v7.h[2]
    582        mov             v18.16b, v19.16b
    583 
    584        sdot            z23.d, z25.h, z4.h[0]
    585        sdot            z24.d, z26.h, z4.h[0]
    586        tbl             v27.16b, {v28.16b}, v30.16b
    587        tbl             v28.16b, {v28.16b}, v31.16b
    588        smlal           v0.4s, v19.4h, v7.h[3]
    589        smlal2          v1.4s, v19.8h, v7.h[3]
    590        mov             v19.16b, v20.16b
    591 
    592        subs            w8, w8, #1
    593        sdot            z2.d, z25.h, z4.h[1]
    594        sdot            z3.d, z26.h, z4.h[1]
    595        sdot            z23.d, z27.h, z4.h[1]
    596        sdot            z24.d, z28.h, z4.h[1]
    597 
    598        smlal           v0.4s, v20.4h, v7.h[4]
    599        smlal2          v1.4s, v20.8h, v7.h[4]
    600        mov             v20.16b, v21.16b
    601 
    602        uzp1            v3.4s, v2.4s, v3.4s
    603        uzp1            v24.4s, v23.4s, v24.4s
    604        smlal           v0.4s, v21.4h, v7.h[5]
    605        smlal2          v1.4s, v21.8h, v7.h[5]
    606        mov             v21.16b, v22.16b
    607 
    608        srshl           v23.4s, v3.4s, v5.4s
    609        srshl           v24.4s, v24.4s, v5.4s
    610        smlal           v0.4s, v22.4h, v7.h[6]
    611        smlal2          v1.4s, v22.8h, v7.h[6]
    612 
    613        uzp1            v22.8h, v23.8h, v24.8h
    614        smlal           v0.4s, v22.4h, v7.h[7]
    615        smlal2          v1.4s, v22.8h, v7.h[7]
    616 
    617 .ifc \type, prep
    618        rshrn           v0.4h, v0.4s, #6
    619        rshrn2          v0.8h, v1.4s, #6
    620        sub             z0.h, z0.h, #PREP_BIAS
    621 .else   // put
    622        srshl           v0.4s, v0.4s, v6.4s
    623        srshl           v1.4s, v1.4s, v6.4s
    624        sqxtun          v0.4h, v0.4s
    625        sqxtun2         v0.8h, v1.4s
    626        umin            v0.8h, v0.8h, v29.8h
    627 .endif
    628        st1             {v0.8h}, [\ldst], \d_strd
    629        b.gt            8b
    630 
    631        subs            \w, \w, #8
    632        add             \src, \src, #16
    633        add             \dst, \dst, #16
    634        b.gt            81b
    635        ret             x15
    636 
    637        .align JUMP_ALIGN
    638 40:     // HV8 - 4xN, put only: 2xN
    639 .ifc \type, put
    640        lsr             \d_strd, \d_strd, #1        // hword index for `st1h`
    641        whilelt         p1.h, wzr, \w               // masking for writes
    642 .endif
    643        ext             v4.16b, v4.16b, v4.16b, #4  // [\xmy + 2 * 2]
    644        add             \src, \src, #4
    645 
    646        cmp             \h, #4
    647        b.le            44f
    648 
    649        sub             \src, \src, \s_strd, lsl #1 // src - 3 * s_strd - 3 * 2
    650        bl              L(\type\()_hv_filter4_\isa)
    651        xtn             v16.4h, v0.4s
    652        bl              L(\type\()_hv_filter4_\isa)
    653        xtn             v17.4h, v0.4s
    654        bl              L(\type\()_hv_filter4_\isa)
    655        xtn             v18.4h, v0.4s
    656        bl              L(\type\()_hv_filter4_\isa)
    657        xtn             v19.4h, v0.4s
    658        bl              L(\type\()_hv_filter4_\isa)
    659        xtn             v20.4h, v0.4s
    660        bl              L(\type\()_hv_filter4_\isa)
    661        xtn             v21.4h, v0.4s
    662        bl              L(\type\()_hv_filter4_\isa)
    663        xtn             v22.4h, v0.4s
    664 
    665        .align LOOP_ALIGN
    666 4:
    667        ld1             {v3.16b}, [\src], \s_strd
    668 
    669        smull           v24.4s, v16.4h, v7.h[0]
    670        smlal           v24.4s, v17.4h, v7.h[1]
    671        tbl             v2.16b, {v3.16b}, v30.16b
    672        tbl             v3.16b, {v3.16b}, v31.16b
    673        movi            v0.2d, #0
    674        movi            v1.2d, #0
    675        mov             v16.16b, v17.16b
    676        mov             v17.16b, v18.16b
    677 
    678        smlal           v24.4s, v18.4h, v7.h[2]
    679        smlal           v24.4s, v19.4h, v7.h[3]
    680        sdot            z0.d, z2.h, z4.h[0]
    681        sdot            z1.d, z3.h, z4.h[0]
    682        mov             v18.16b, v19.16b
    683        mov             v19.16b, v20.16b
    684        uzp1            v0.4s, v0.4s, v1.4s
    685 
    686        smlal           v24.4s, v20.4h, v7.h[4]
    687        smlal           v24.4s, v21.4h, v7.h[5]
    688        srshl           v0.4s, v0.4s, v5.4s
    689        mov             v20.16b, v21.16b
    690        mov             v21.16b, v22.16b
    691 
    692        subs            \h, \h, #1
    693        smlal           v24.4s, v22.4h, v7.h[6]
    694        xtn             v22.4h, v0.4s
    695        smlal           v24.4s, v22.4h, v7.h[7]
    696 
    697 .ifc \type, prep
    698        rshrn           v0.4h, v24.4s, #6
    699        sub             z0.h, z0.h, #PREP_BIAS
    700        str             d0, [\dst], #8
    701 .else   // put
    702        srshl           v0.4s, v24.4s, v6.4s
    703        sqxtun          v0.4h, v0.4s
    704        umin            v0.4h, v0.4h, v29.4h
    705        st1h            {z0.h}, p1, [\dst]
    706        add             \dst, \dst, \d_strd, lsl #1
    707 .endif
    708        b.gt            4b
    709        ret             x15
    710 
        .align JUMP_ALIGN
// Horizontal + 6-tap-vertical (HV) filter entry.  Inputs set up by the
// caller: z4.h = horizontal coefficients, v7 = vertical coefficients,
// v30/v31 = tbl shuffle masks, v5 = intermediate rounding shift (srshl
// amount), v6/v29 = final shift / clamp for the put path.
L(\type\()_6tap_hv_\isa):
        cmp             \w, #4
        b.le            46f                     // w <= 4: 4-wide variant (put: also 2-wide)

        // .align JUMP_ALIGN    // fallthrough
80:     // HV6 - 8xN+
.ifc \type, prep
        add             \wd_strd, \w, \w        // d_strd = 2 * w
.endif
        cmp             \h, #4
        b.le            84f                     // short columns use the 4-tap vertical path
        sub             \src, \src, \s_strd     // src - 2 * s_strd - 3 * 2

        .align LOOP_ALIGN
81:     // one 8-pixel-wide column strip per iteration of this outer loop
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h

        // Prime the vertical pipeline: v16-v20 hold the 5 most recent
        // horizontally filtered rows (8 halfwords each).
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v16.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v17.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v18.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v19.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v20.8h, v23.8h, v24.8h

        .align LOOP_ALIGN
8:      // Per-row loop: horizontally filter one new row (inline copy of
        // the hv_filter8 helper, interleaved for scheduling) while
        // accumulating the 6-tap vertical filter (taps v7.h[1]..h[6])
        // into v0/v1.
        ldp             q24, q28, [\lsrc]

        smull           v0.4s, v16.4h, v7.h[1]
        smull2          v1.4s, v16.8h, v7.h[1]
        mov             v16.16b, v17.16b        // shift row history down by one

        tbl             v23.16b, {v24.16b}, v30.16b
        tbl             v24.16b, {v24.16b}, v31.16b
        movi            v2.2d, #0
        movi            v3.2d, #0

        ldur            q26, [\lsrc, #8]
        add             \lsrc, \lsrc, \s_strd

        sdot            z2.d, z23.h, z4.h[0]
        sdot            z3.d, z24.h, z4.h[0]
        tbl             v25.16b, {v26.16b}, v30.16b
        tbl             v26.16b, {v26.16b}, v31.16b
        movi            v23.2d, #0
        movi            v24.2d, #0

        sdot            z23.d, z25.h, z4.h[0]
        sdot            z24.d, z26.h, z4.h[0]
        tbl             v27.16b, {v28.16b}, v30.16b
        tbl             v28.16b, {v28.16b}, v31.16b
        smlal           v0.4s, v17.4h, v7.h[2]
        smlal2          v1.4s, v17.8h, v7.h[2]
        mov             v17.16b, v18.16b

        sdot            z2.d, z25.h, z4.h[1]
        sdot            z3.d, z26.h, z4.h[1]
        sdot            z23.d, z27.h, z4.h[1]
        sdot            z24.d, z28.h, z4.h[1]

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal2          v1.4s, v18.8h, v7.h[3]
        mov             v18.16b, v19.16b

        uzp1            v3.4s, v2.4s, v3.4s     // narrow 64-bit dot sums to 32-bit lanes
        uzp1            v24.4s, v23.4s, v24.4s
        smlal           v0.4s, v19.4h, v7.h[4]
        smlal2          v1.4s, v19.8h, v7.h[4]
        mov             v19.16b, v20.16b

        srshl           v23.4s, v3.4s, v5.4s    // intermediate rounding shift (amount in v5)
        srshl           v24.4s, v24.4s, v5.4s
        smlal           v0.4s, v20.4h, v7.h[5]
        smlal2          v1.4s, v20.8h, v7.h[5]

        subs            w8, w8, #1
        uzp1            v20.8h, v23.8h, v24.8h  // newest filtered row enters the pipeline
        smlal           v0.4s, v20.4h, v7.h[6]
        smlal2          v1.4s, v20.8h, v7.h[6]

.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        rshrn2          v0.8h, v1.4s, #6
        sub             z0.h, z0.h, #PREP_BIAS
.else   // put
        srshl           v0.4s, v0.4s, v6.4s     // final bitdepth-dependent shift
        sqxtun          v0.4h, v0.4s
        sqxtun2         v0.8h, v1.4s
        umin            v0.8h, v0.8h, v29.8h    // clamp to pixel range (v29, set by caller)
.endif
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            8b

        add             \dst, \dst, #16         // advance to the next 8-pixel column
        subs            \w, \w, #8
        add             \src, \src, #16
        b.gt            81b
        ret             x15
    817 
        .align LOOP_ALIGN
84:     // HV4 - 8x4, 8x2
        // Same structure as the 8xN+ loop above, but with a 4-tap
        // vertical filter (taps v7.h[2]..h[5]) for short columns.
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h

        // Prime with 3 horizontally filtered rows in v17-v19.
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v17.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v18.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v19.8h, v23.8h, v24.8h

        .align LOOP_ALIGN
81:     // per-row loop: inline horizontal filter + 4-tap vertical accumulate
        ldp             q24, q28, [\lsrc]
        ldur            q26, [\lsrc, #8]
        add             \lsrc, \lsrc, \s_strd

        tbl             v23.16b, {v24.16b}, v30.16b
        tbl             v24.16b, {v24.16b}, v31.16b
        movi            v2.2d, #0
        movi            v3.2d, #0
        sdot            z2.d, z23.h, z4.h[0]
        sdot            z3.d, z24.h, z4.h[0]

        tbl             v25.16b, {v26.16b}, v30.16b
        tbl             v26.16b, {v26.16b}, v31.16b
        movi            v23.2d, #0
        movi            v24.2d, #0
        sdot            z23.d, z25.h, z4.h[0]
        sdot            z24.d, z26.h, z4.h[0]

        tbl             v27.16b, {v28.16b}, v30.16b
        tbl             v28.16b, {v28.16b}, v31.16b
        sdot            z2.d, z25.h, z4.h[1]
        sdot            z3.d, z26.h, z4.h[1]
        sdot            z23.d, z27.h, z4.h[1]
        sdot            z24.d, z28.h, z4.h[1]

        smull           v0.4s, v17.4h, v7.h[2]
        smull2          v1.4s, v17.8h, v7.h[2]
        mov             v17.16b, v18.16b        // shift row history down

        subs            w8, w8, #1
        uzp1            v3.4s, v2.4s, v3.4s     // narrow 64-bit dot sums to 32-bit
        uzp1            v24.4s, v23.4s, v24.4s
        smlal           v0.4s, v18.4h, v7.h[3]
        smlal2          v1.4s, v18.8h, v7.h[3]
        mov             v18.16b, v19.16b

        srshl           v23.4s, v3.4s, v5.4s    // intermediate rounding shift
        srshl           v24.4s, v24.4s, v5.4s
        smlal           v0.4s, v19.4h, v7.h[4]
        smlal2          v1.4s, v19.8h, v7.h[4]

        uzp1            v19.8h, v23.8h, v24.8h  // newest filtered row
        smlal           v0.4s, v19.4h, v7.h[5]
        smlal2          v1.4s, v19.8h, v7.h[5]

.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        rshrn2          v0.8h, v1.4s, #6
        sub             z0.h, z0.h, #PREP_BIAS
.else   // put
        srshl           v0.4s, v0.4s, v6.4s     // final bitdepth-dependent shift
        sqxtun          v0.4h, v0.4s
        sqxtun2         v0.8h, v1.4s
        umin            v0.8h, v0.8h, v29.8h    // clamp to pixel range (v29)
.endif
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            81b

        subs            \w, \w, #8
        add             \dst, \dst, #16
        add             \src, \src, #16
        b.gt            84b
        ret             x15
    897 
        .align FUNC_ALIGN
// Horizontal filter for one row of 8 output pixels (16 bpc).
// In:   \lsrc = row pointer (advanced by \s_strd on return),
//       z4.h  = coefficients (two 4-tap sdot groups, lanes [0] and [1]),
//       v30/v31 = tbl shuffle masks, v5 = rounding shift amount.
// Out:  v23.4s / v24.4s = filtered, shifted low/high halves
//       (caller combines them with uzp1 into one .8h row).
// Clobbers: v2, v3, v25-v28.  Called via bl; returns through x30.
L(\type\()_hv_filter8_\isa):
        ldp             q24, q28, [\lsrc]
        ldur            q26, [\lsrc, #8]        // overlapping load for the middle taps
        add             \lsrc, \lsrc, \s_strd

        tbl             v23.16b, {v24.16b}, v30.16b
        tbl             v24.16b, {v24.16b}, v31.16b
        movi            v2.2d, #0
        movi            v3.2d, #0
        sdot            z2.d, z23.h, z4.h[0]    // 4-element dot products into 64-bit lanes
        sdot            z3.d, z24.h, z4.h[0]

        tbl             v25.16b, {v26.16b}, v30.16b
        tbl             v26.16b, {v26.16b}, v31.16b
        movi            v23.2d, #0
        movi            v24.2d, #0
        sdot            z23.d, z25.h, z4.h[0]
        sdot            z24.d, z26.h, z4.h[0]

        tbl             v27.16b, {v28.16b}, v30.16b
        tbl             v28.16b, {v28.16b}, v31.16b
        sdot            z2.d, z25.h, z4.h[1]    // second coefficient group
        sdot            z3.d, z26.h, z4.h[1]
        sdot            z23.d, z27.h, z4.h[1]
        sdot            z24.d, z28.h, z4.h[1]

        uzp1            v3.4s, v2.4s, v3.4s     // 64-bit sums -> 32-bit lanes
        uzp1            v24.4s, v23.4s, v24.4s
        srshl           v23.4s, v3.4s, v5.4s    // intermediate rounding shift
        srshl           v24.4s, v24.4s, v5.4s
        ret
    930 
        .align FUNC_ALIGN
// Horizontal filter for one row of 4 output pixels (16 bpc).
// Uses only coefficient group z4.h[0] (4 taps; the caller pre-rotates
// v4 with ext so lane [0] holds the middle taps).
// In:   \src (advanced by \s_strd), v30/v31 = tbl masks, v5 = shift.
// Out:  v0.4s = filtered, shifted row.  Clobbers v1-v3.
L(\type\()_hv_filter4_\isa):
        ld1             {v3.16b}, [\src], \s_strd

        tbl             v2.16b, {v3.16b}, v30.16b
        tbl             v3.16b, {v3.16b}, v31.16b
        movi            v0.2d, #0
        movi            v1.2d, #0
        sdot            z0.d, z2.h, z4.h[0]     // 4-element dot products into 64-bit lanes
        sdot            z1.d, z3.h, z4.h[0]

        uzp1            v0.4s, v0.4s, v1.4s     // 64-bit sums -> 32-bit lanes
        srshl           v0.4s, v0.4s, v5.4s     // intermediate rounding shift
        ret
    945 
        .align JUMP_ALIGN
46:     // H4V6 - 4xN, put only: 2xN
.ifc \type, put
        lsr             \d_strd, \d_strd, #1        // hword index for `st1h`
        whilelt         p1.h, wzr, \w               // masking for writes
.endif
        // Rotate coefficients by 2 halfwords so z4.h[0] holds the 4
        // middle taps used by hv_filter4, and step src right 2 pixels
        // to match.
        ext             v4.16b, v4.16b, v4.16b, #4  // [\xmy + 2 * 2]
        add             \src, \src, #4

        cmp             \h, #4
        b.le            44f                         // short columns: 4-tap vertical

        sub             \src, \src, \s_strd         // src - 2 * s_strd - 3 * 2
        // Prime the 6-tap vertical pipeline: v16-v20 = 5 filtered rows.
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v16.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v17.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v18.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v19.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v20.4h, v0.4s

        .align LOOP_ALIGN
4:      // per-row loop: inline horizontal filter + 6-tap vertical
        // accumulate (taps v7.h[1]..h[6]) into v24.
        ld1             {v3.16b}, [\src], \s_strd
        smull           v24.4s, v16.4h, v7.h[1]
        smlal           v24.4s, v17.4h, v7.h[2]

        tbl             v2.16b, {v3.16b}, v30.16b
        tbl             v3.16b, {v3.16b}, v31.16b
        movi            v0.2d, #0
        movi            v1.2d, #0
        sdot            z0.d, z2.h, z4.h[0]
        sdot            z1.d, z3.h, z4.h[0]

        mov             v16.16b, v17.16b        // shift row history down
        mov             v17.16b, v18.16b
        smlal           v24.4s, v18.4h, v7.h[3]
        smlal           v24.4s, v19.4h, v7.h[4]
        uzp1            v0.4s, v0.4s, v1.4s

        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b
        subs            \h, \h, #1
        srshl           v0.4s, v0.4s, v5.4s     // intermediate rounding shift
        smlal           v24.4s, v20.4h, v7.h[5]
        xtn             v20.4h, v0.4s           // newest filtered row
        smlal           v24.4s, v20.4h, v7.h[6]

.ifc \type, prep
        rshrn           v0.4h, v24.4s, #6
        sub             z0.h, z0.h, #PREP_BIAS
        str             d0, [\dst], #8
.else   // put
        srshl           v0.4s, v24.4s, v6.4s    // final bitdepth-dependent shift
        sqxtun          v0.4h, v0.4s
        umin            v0.4h, v0.4h, v29.4h    // clamp to pixel range (v29)
        st1h            {z0.h}, p1, [\dst]      // predicated store handles w == 2
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            4b
        ret             x15
   1010 
        .align JUMP_ALIGN
44:     // H4V4 - 4x4, put only: 4x2, 2x4, 2x2
        // 4-tap vertical variant of the loop above (taps v7.h[2]..h[5]).
        // Prime with 3 filtered rows in v17-v19.
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v17.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v18.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v19.4h, v0.4s

        .align LOOP_ALIGN
4:      // per-row loop: inline horizontal filter + 4-tap vertical accumulate
        ld1             {v3.16b}, [\src], \s_strd
        smull           v24.4s, v17.4h, v7.h[2]
        smlal           v24.4s, v18.4h, v7.h[3]

        tbl             v2.16b, {v3.16b}, v30.16b
        tbl             v3.16b, {v3.16b}, v31.16b
        movi            v0.2d, #0
        movi            v1.2d, #0
        sdot            z0.d, z2.h, z4.h[0]
        sdot            z1.d, z3.h, z4.h[0]
        uzp1            v0.4s, v0.4s, v1.4s

        mov             v17.16b, v18.16b        // shift row history down
        mov             v18.16b, v19.16b
        subs            \h, \h, #1
        srshl           v0.4s, v0.4s, v5.4s     // intermediate rounding shift
        smlal           v24.4s, v19.4h, v7.h[4]
        xtn             v19.4h, v0.4s           // newest filtered row
        smlal           v24.4s, v19.4h, v7.h[5]

.ifc \type, prep
        rshrn           v0.4h, v24.4s, #6
        sub             z0.h, z0.h, #PREP_BIAS
        str             d0, [\dst], #8
.else   // put
        srshl           v0.4s, v24.4s, v6.4s    // final bitdepth-dependent shift
        sqxtun          v0.4h, v0.4s
        umin            v0.4h, v0.4h, v29.4h    // clamp to pixel range (v29)
        st1h            {z0.h}, p1, [\dst]      // predicated store handles w == 2
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            4b
        ret             x15
   1055 
        .align JUMP_ALIGN
// Horizontal-only 8-tap filter entry: select the width-specialized loop
// from the jump table (index in x8), set up shift/rounding constants,
// load the coefficients, and branch.
L(\type\()_8tap_h_\isa):
        movrel          x11, \type\()_8tap_h_\isa\()_tbl
        ldrsw           x12, [x11, x8, lsl #2]
.ifc \bdmax, w8
        ldr             \bdmax, [sp]        // reload bdmax if its register aliased the table index
.endif
.ifc \type, prep
        clz             \bdmax, \bdmax
        sub             \bdmax, \bdmax, #24 // 10-bit: -2, 12-bit: -4 (right shift via srshl)
        dup             v5.4s, \bdmax
.else   // put
        mov             w9, #34             // rounding for 10-bit case
        mov             w10, #40            // rounding for 12-bit case
        cmp             \bdmax, #0xFFF
        csel            w9, w9, w10, ne     // select rounding based on \bdmax
        dup             v5.8h, \bdmax       // v5 also serves as the umin clamp value
        dup             v6.2d, x9           // v6 seeds the 64-bit sdot accumulators
.endif
        add             x11, x11, x12
        ld1sb           {z4.h}, p0/z, [\xmx]    // sign-extend 8 coefficient bytes to .h
        br              x11
   1078 
        .align JUMP_ALIGN
20:     // H - 4xN, put only: 2xN
40:
        AARCH64_VALID_JUMP_TARGET
        // 4-tap horizontal: rotate coefficients so z4.h[0] holds the
        // middle taps and step src right to match.
        add             \src, \src, #4              // src - 1 * 2
        ext             v4.16b, v4.16b, v4.16b, #4  // [\xmy + 2 * 2]
.ifc \type, put
        lsr             \d_strd, \d_strd, #1        // hword index for `st1h`
        whilelt         p1.h, wzr, \w               // masking for writes
.endif
        .align LOOP_ALIGN
4:      // two rows per iteration
        ldr             q17, [\src]
        ldr             q19, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1

.ifc \type, prep
        movi            v0.2d, #0
        movi            v1.2d, #0
        movi            v2.2d, #0
        movi            v3.2d, #0
.else
        // put: seed accumulators with the rounding bias (v6)
        mov             v0.16b, v6.16b
        mov             v1.16b, v6.16b
        mov             v2.16b, v6.16b
        mov             v3.16b, v6.16b
.endif
        tbl             v16.16b, {v17.16b}, v30.16b
        tbl             v17.16b, {v17.16b}, v31.16b
        sdot            z0.d, z16.h, z4.h[0]
        sdot            z1.d, z17.h, z4.h[0]
        subs            \h, \h, #2
        tbl             v18.16b, {v19.16b}, v30.16b
        tbl             v19.16b, {v19.16b}, v31.16b
        sdot            z2.d, z18.h, z4.h[0]
        sdot            z3.d, z19.h, z4.h[0]

        uzp1            v0.4s, v0.4s, v1.4s     // 64-bit sums -> 32-bit lanes, row 0
        uzp1            v1.4s, v2.4s, v3.4s     // row 1
.ifc \type, prep
        srshl           v0.4s, v0.4s, v5.4s     // bitdepth-dependent rounding shift
        srshl           v1.4s, v1.4s, v5.4s
        uzp1            v0.8h, v0.8h, v1.8h
        sub             z0.h, z0.h, #PREP_BIAS
        str             q0, [\dst], #16
.else   // put
        sqshrun         v0.4h, v0.4s, #6
        sqshrun         v1.4h, v1.4s, #6
        umin            v0.4h, v0.4h, v5.4h     // clamp to bdmax (v5)
        umin            v1.4h, v1.4h, v5.4h
        st1h            {z0.h}, p1, [\dst]      // predicated store handles w == 2
        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
        add             \dst, \dst, \d_strd, lsl #2
.endif
        b.gt            4b
        ret
   1135 
        .align JUMP_ALIGN
80:     // H - 8xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
8:      // two rows per iteration; each row = 8 outputs from two
        // 4-tap sdot coefficient groups (z4.h[0] and z4.h[1])
        ldp             q17, q21, [\src]
        ldur            q19, [\src, #8]         // overlapping load for middle taps

.ifc \type, prep
        movi            v0.2d, #0
        movi            v2.2d, #0
.else
        mov             v0.16b, v6.16b          // put: seed with rounding bias
        mov             v2.16b, v6.16b
.endif
        tbl             v16.16b, {v17.16b}, v30.16b
        tbl             v17.16b, {v17.16b}, v31.16b
        add             \src, \src, \s_strd
        sdot            z0.d, z16.h, z4.h[0]
        sdot            z2.d, z17.h, z4.h[0]

        tbl             v18.16b, {v19.16b}, v30.16b
        tbl             v19.16b, {v19.16b}, v31.16b
.ifc \type, prep
        movi            v16.2d, #0
        movi            v17.2d, #0
.else
        mov             v16.16b, v6.16b
        mov             v17.16b, v6.16b
.endif
        ldp             q23, q27, [\src]        // second row
        ldur            q25, [\src, #8]

        sdot            z16.d, z18.h, z4.h[0]
        sdot            z17.d, z19.h, z4.h[0]

        tbl             v22.16b, {v23.16b}, v30.16b
        tbl             v23.16b, {v23.16b}, v31.16b
.ifc \type, prep
        movi            v1.2d, #0
        movi            v3.2d, #0
.else
        mov             v1.16b, v6.16b
        mov             v3.16b, v6.16b
.endif
        add             \src, \src, \s_strd
        sdot            z1.d, z22.h, z4.h[0]
        sdot            z3.d, z23.h, z4.h[0]

        tbl             v24.16b, {v25.16b}, v30.16b
        tbl             v25.16b, {v25.16b}, v31.16b
.ifc \type, prep
        movi            v22.2d, #0
        movi            v23.2d, #0
.else
        mov             v22.16b, v6.16b
        mov             v23.16b, v6.16b
.endif
        sdot            z22.d, z24.h, z4.h[0]
        sdot            z23.d, z25.h, z4.h[0]

        tbl             v20.16b, {v21.16b}, v30.16b
        tbl             v21.16b, {v21.16b}, v31.16b
        sdot            z0.d, z18.h, z4.h[1]    // second coefficient group, row 0
        sdot            z2.d, z19.h, z4.h[1]
        tbl             v26.16b, {v27.16b}, v30.16b
        tbl             v27.16b, {v27.16b}, v31.16b
        sdot            z16.d, z20.h, z4.h[1]
        sdot            z17.d, z21.h, z4.h[1]

        sdot            z1.d, z24.h, z4.h[1]    // second coefficient group, row 1
        sdot            z3.d, z25.h, z4.h[1]

        sdot            z22.d, z26.h, z4.h[1]
        sdot            z23.d, z27.h, z4.h[1]

        subs            \h, \h, #2
        uzp1            v0.4s, v0.4s, v2.4s     // 64-bit sums -> 32-bit lanes
        uzp1            v2.4s, v16.4s, v17.4s
        uzp1            v1.4s, v1.4s, v3.4s
        uzp1            v3.4s, v22.4s, v23.4s
.ifc \type, prep
        srshl           v0.4s, v0.4s, v5.4s     // bitdepth-dependent rounding shift
        srshl           v2.4s, v2.4s, v5.4s
        srshl           v1.4s, v1.4s, v5.4s
        srshl           v3.4s, v3.4s, v5.4s
        uzp1            v0.8h, v0.8h, v2.8h
        uzp1            v1.8h, v1.8h, v3.8h
        sub             z0.h, z0.h, #PREP_BIAS
        sub             z1.h, z1.h, #PREP_BIAS
        stp             q0, q1, [\dst], #32
.else   // put
        sqshrun         v0.4h, v0.4s, #6
        sqshrun2        v0.8h, v2.4s, #6
        sqshrun         v1.4h, v1.4s, #6
        sqshrun2        v1.8h, v3.4s, #6
        umin            v0.8h, v0.8h, v5.8h     // clamp to bdmax (v5)
        umin            v1.8h, v1.8h, v5.8h
        st1             {v0.16b}, [\dst], \d_strd
        st1             {v1.16b}, [\dst], \d_strd
.endif
        b.gt            8b
        ret
   1240 
        .align JUMP_ALIGN
160:    // H - 16xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
16:     // one row of 16 outputs per iteration; overlapping loads cover
        // pixels 0-7 (q17), 4-11 (q19), 8-15 (q21), 12-19 (q25), 16-23 (q27)
        ldp             q17, q21, [\src]
        ldur            q19, [\src, #8]

.ifc \type, prep
        movi            v0.2d, #0
        movi            v2.2d, #0
.else
        mov             v0.16b, v6.16b          // put: seed with rounding bias
        mov             v2.16b, v6.16b
.endif
        tbl             v16.16b, {v17.16b}, v30.16b
        tbl             v17.16b, {v17.16b}, v31.16b
        sdot            z0.d, z16.h, z4.h[0]
        sdot            z2.d, z17.h, z4.h[0]

        tbl             v18.16b, {v19.16b}, v30.16b
        tbl             v19.16b, {v19.16b}, v31.16b
.ifc \type, prep
        movi            v16.2d, #0
        movi            v17.2d, #0
.else
        mov             v16.16b, v6.16b
        mov             v17.16b, v6.16b
.endif
        ldur            q25, [\src, #24]
        ldr             q27, [\src, #32]

        sdot            z16.d, z18.h, z4.h[0]
        sdot            z17.d, z19.h, z4.h[0]

        tbl             v22.16b, {v21.16b}, v30.16b
        tbl             v23.16b, {v21.16b}, v31.16b
.ifc \type, prep
        movi            v1.2d, #0
        movi            v3.2d, #0
.else
        mov             v1.16b, v6.16b
        mov             v3.16b, v6.16b
.endif
        add             \src, \src, \s_strd
        sdot            z1.d, z22.h, z4.h[0]    // upper 8 outputs start here
        sdot            z3.d, z23.h, z4.h[0]

        tbl             v24.16b, {v25.16b}, v30.16b
        tbl             v25.16b, {v25.16b}, v31.16b
.ifc \type, prep
        movi            v22.2d, #0
        movi            v23.2d, #0
.else
        mov             v22.16b, v6.16b
        mov             v23.16b, v6.16b
.endif
        sdot            z22.d, z24.h, z4.h[0]
        sdot            z23.d, z25.h, z4.h[0]

        tbl             v20.16b, {v21.16b}, v30.16b
        tbl             v21.16b, {v21.16b}, v31.16b
        sdot            z0.d, z18.h, z4.h[1]    // second coefficient group
        sdot            z2.d, z19.h, z4.h[1]
        tbl             v26.16b, {v27.16b}, v30.16b
        tbl             v27.16b, {v27.16b}, v31.16b
        sdot            z16.d, z20.h, z4.h[1]
        sdot            z17.d, z21.h, z4.h[1]

        sdot            z1.d, z24.h, z4.h[1]
        sdot            z3.d, z25.h, z4.h[1]

        sdot            z22.d, z26.h, z4.h[1]
        sdot            z23.d, z27.h, z4.h[1]

        subs            \h, \h, #1
        uzp1            v0.4s, v0.4s, v2.4s     // 64-bit sums -> 32-bit lanes
        uzp1            v2.4s, v16.4s, v17.4s
        uzp1            v1.4s, v1.4s, v3.4s
        uzp1            v3.4s, v22.4s, v23.4s
.ifc \type, prep
        srshl           v0.4s, v0.4s, v5.4s     // bitdepth-dependent rounding shift
        srshl           v2.4s, v2.4s, v5.4s
        srshl           v1.4s, v1.4s, v5.4s
        srshl           v3.4s, v3.4s, v5.4s
        uzp1            v0.8h, v0.8h, v2.8h
        uzp1            v1.8h, v1.8h, v3.8h
        sub             z0.h, z0.h, #PREP_BIAS
        sub             z1.h, z1.h, #PREP_BIAS
        stp             q0, q1, [\dst], #32
.else   // put
        sqshrun         v0.4h, v0.4s, #6
        sqshrun2        v0.8h, v2.4s, #6
        sqshrun         v1.4h, v1.4s, #6
        sqshrun2        v1.8h, v3.4s, #6
        umin            v0.8h, v0.8h, v5.8h     // clamp to bdmax (v5)
        umin            v1.8h, v1.8h, v5.8h
        st1             {v0.16b, v1.16b}, [\dst], \d_strd
.endif
        b.gt            16b
        ret
   1343 
        .align JUMP_ALIGN
320:    // H - 32xN+
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        // The inner loop advances src/dst by 32 bytes per 16 pixels, so
        // reduce the strides by 2*w to make the row step correct.
.ifc \type, put
        sub             \d_strd, \d_strd, \w, uxtw #1
.endif
        sub             \s_strd, \s_strd, \w, uxtw #1
        mov             w8, \w                  // w8 = pixels remaining in this row

        .align LOOP_ALIGN
32:     // 16 outputs per iteration, same scheme as the 16xN loop
        ldp             q17, q21, [\src]
        ldur            q19, [\src, #8]

.ifc \type, prep
        movi            v0.2d, #0
        movi            v2.2d, #0
.else
        mov             v0.16b, v6.16b          // put: seed with rounding bias
        mov             v2.16b, v6.16b
.endif
        tbl             v16.16b, {v17.16b}, v30.16b
        tbl             v17.16b, {v17.16b}, v31.16b
        sdot            z0.d, z16.h, z4.h[0]
        sdot            z2.d, z17.h, z4.h[0]

        tbl             v18.16b, {v19.16b}, v30.16b
        tbl             v19.16b, {v19.16b}, v31.16b
.ifc \type, prep
        movi            v16.2d, #0
        movi            v17.2d, #0
.else
        mov             v16.16b, v6.16b
        mov             v17.16b, v6.16b
.endif
        ldur            q25, [\src, #24]

        sdot            z16.d, z18.h, z4.h[0]
        sdot            z17.d, z19.h, z4.h[0]

        ldr             q27, [\src, #32]!       // pre-index: src += 32 (16 pixels)

        tbl             v22.16b, {v21.16b}, v30.16b
        tbl             v23.16b, {v21.16b}, v31.16b
.ifc \type, prep
        movi            v1.2d, #0
        movi            v3.2d, #0
.else
        mov             v1.16b, v6.16b
        mov             v3.16b, v6.16b
.endif
        sdot            z1.d, z22.h, z4.h[0]    // upper 8 outputs
        sdot            z3.d, z23.h, z4.h[0]

        tbl             v24.16b, {v25.16b}, v30.16b
        tbl             v25.16b, {v25.16b}, v31.16b
.ifc \type, prep
        movi            v22.2d, #0
        movi            v23.2d, #0
.else
        mov             v22.16b, v6.16b
        mov             v23.16b, v6.16b
.endif
        sdot            z22.d, z24.h, z4.h[0]
        sdot            z23.d, z25.h, z4.h[0]

        tbl             v20.16b, {v21.16b}, v30.16b
        tbl             v21.16b, {v21.16b}, v31.16b
        sdot            z0.d, z18.h, z4.h[1]    // second coefficient group
        sdot            z2.d, z19.h, z4.h[1]
        tbl             v26.16b, {v27.16b}, v30.16b
        tbl             v27.16b, {v27.16b}, v31.16b
        sdot            z16.d, z20.h, z4.h[1]
        sdot            z17.d, z21.h, z4.h[1]

        sdot            z1.d, z24.h, z4.h[1]
        sdot            z3.d, z25.h, z4.h[1]

        sdot            z22.d, z26.h, z4.h[1]
        sdot            z23.d, z27.h, z4.h[1]

        subs            w8, w8, #16
        uzp1            v0.4s, v0.4s, v2.4s     // 64-bit sums -> 32-bit lanes
        uzp1            v2.4s, v16.4s, v17.4s
        uzp1            v1.4s, v1.4s, v3.4s
        uzp1            v3.4s, v22.4s, v23.4s
.ifc \type, prep
        srshl           v0.4s, v0.4s, v5.4s     // bitdepth-dependent rounding shift
        srshl           v2.4s, v2.4s, v5.4s
        srshl           v1.4s, v1.4s, v5.4s
        srshl           v3.4s, v3.4s, v5.4s
        uzp1            v0.8h, v0.8h, v2.8h
        uzp1            v1.8h, v1.8h, v3.8h
        sub             z0.h, z0.h, #PREP_BIAS
        sub             z1.h, z1.h, #PREP_BIAS
.else   // put
        sqshrun         v0.4h, v0.4s, #6
        sqshrun2        v0.8h, v2.4s, #6
        sqshrun         v1.4h, v1.4s, #6
        sqshrun2        v1.8h, v3.4s, #6
        umin            v0.8h, v0.8h, v5.8h     // clamp to bdmax (v5)
        umin            v1.8h, v1.8h, v5.8h
.endif
        stp             q0, q1, [\dst], #32
        b.gt            32b

        // end of row: step to the next one with the reduced strides
        add             \src, \src, \s_strd
.ifc \type, put
        add             \dst, \dst, \d_strd
.endif
        subs            \h, \h, #1
        mov             w8, \w
        b.gt            32b
        ret
   1460 endfunc
   1461 
// Width-dispatch table for the horizontal filter: entries are
// PC-relative offsets, ordered from the widest case down; the
// 2xN entry exists only for the put variant.
jumptable \type\()_8tap_h_\isa\()_tbl
        .word 1280b - \type\()_8tap_h_\isa\()_tbl
        .word 640b  - \type\()_8tap_h_\isa\()_tbl
        .word 320b  - \type\()_8tap_h_\isa\()_tbl
        .word 160b  - \type\()_8tap_h_\isa\()_tbl
        .word 80b   - \type\()_8tap_h_\isa\()_tbl
        .word 40b   - \type\()_8tap_h_\isa\()_tbl
.ifc \type, put
        .word 20b   - \type\()_8tap_h_\isa\()_tbl
.endif
endjumptable
   1473 .endm
   1474 
   1475 
// Plain (unfiltered) "prep": copy a block of 16-bit pixels into dav1d's
// intermediate buffer, scaled and biased:
//     tmp[x] = src[x] * (1 << intermediate_bits) - PREP_BIAS
// intermediate_bits is 4 for 10-bit content and 2 for 12-bit content;
// PREP_BIAS is 8192 (v30 holds -8192, see PREP_BIAS_NEG).
//
// In:  x0 = dst (int16_t *tmp, packed, no stride)
//      x1 = src (16-bit pixels), x2 = src stride in bytes
//      w4 = h, w7 = bdmax (1023 or 4095), x8 = jump-table index for the width
// Clobbers: x8, x9, w6, v0-v7, v16-v23, v29, v30
//
// NOTE(review): the `mad` instructions are predicated on p0, which is not
// initialized in this function — it is presumably an all-true predicate set
// up by earlier code; confirm against the surrounding file.
function prep_sve
        movrel          x9, prep_tbl
        mov             w6, #19
        ldrsw           x8, [x9, x8, lsl #2]        // signed 32-bit offset of the per-width entry
        sub             w6, w6, w7, lsr #8          // 19 - bdmax / 256
        add             x9, x9, x8
        movi            v30.8h, #PREP_BIAS_NEG      // 0xE000 per lane = -8192 = -PREP_BIAS
        dup             v29.8h, w6                  // 10b: 1 << 4, 12b: 1 << 2
        br              x9                          // dispatch to the WxN loop below

        .align JUMP_ALIGN
40:     // prep - 4xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
4:      // two 4-pixel rows (8 bytes each) per iteration
        ldr             d0, [x1]
        ldr             d1, [x1, x2]
        add             x1, x1, x2, lsl #1
        subs            w4, w4, #2
        // z = z * z29 + z30, i.e. (src << intermediate_bits) - PREP_BIAS
        mad             z0.h, p0/m, z29.h, z30.h
        mad             z1.h, p0/m, z29.h, z30.h
        stp             d0, d1, [x0], #16
        b.gt            4b
        ret

        .align JUMP_ALIGN
80:     // prep - 8xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
8:      // two 8-pixel rows per iteration
        ld1             {v0.8h}, [x1], x2
        ld1             {v1.8h}, [x1], x2
        subs            w4, w4, #2
        mad             z0.h, p0/m, z29.h, z30.h
        mad             z1.h, p0/m, z29.h, z30.h
        stp             q0, q1, [x0], #32
        b.gt            8b
        ret

        .align JUMP_ALIGN
160:    // prep - 16xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
16:     // two 16-pixel rows per iteration
        ld1             {v0.8h, v1.8h}, [x1], x2
        mad             z0.h, p0/m, z29.h, z30.h
        mad             z1.h, p0/m, z29.h, z30.h
        subs            w4, w4, #2
        ld1             {v2.8h, v3.8h}, [x1], x2
        mad             z2.h, p0/m, z29.h, z30.h
        mad             z3.h, p0/m, z29.h, z30.h
        stp             q0, q1, [x0]
        stp             q2, q3, [x0, #32]
        add             x0, x0, #64
        b.gt            16b
        ret

        .align JUMP_ALIGN
320:    // prep - 32xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
32:     // one 32-pixel row (64 bytes) per iteration
        ldp             q0, q1, [x1]
        mad             z0.h, p0/m, z29.h, z30.h
        mad             z1.h, p0/m, z29.h, z30.h
        ldp             q2, q3, [x1, #32]
        subs            w4, w4, #1
        mad             z2.h, p0/m, z29.h, z30.h
        mad             z3.h, p0/m, z29.h, z30.h
        add             x1, x1, x2
        stp             q0, q1, [x0]
        stp             q2, q3, [x0, #32]
        add             x0, x0, #64
        b.gt            32b
        ret

        .align JUMP_ALIGN
640:    // prep - 64xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
64:     // one 64-pixel row (128 bytes) per iteration; loads and mads interleaved
        ldp             q0, q1, [x1]
        mad             z0.h, p0/m, z29.h, z30.h
        mad             z1.h, p0/m, z29.h, z30.h
        ldp             q2, q3, [x1, #32]
        mad             z2.h, p0/m, z29.h, z30.h
        mad             z3.h, p0/m, z29.h, z30.h
        ldp             q4, q5, [x1, #64]
        mad             z4.h, p0/m, z29.h, z30.h
        mad             z5.h, p0/m, z29.h, z30.h
        ldp             q6, q7, [x1, #96]
        add             x1, x1, x2
        subs            w4, w4, #1
        mad             z6.h, p0/m, z29.h, z30.h
        mad             z7.h, p0/m, z29.h, z30.h
        stp             q0, q1, [x0]
        stp             q2, q3, [x0, #32]
        stp             q4, q5, [x0, #64]
        stp             q6, q7, [x0, #96]
        add             x0, x0, #128
        b.gt            64b
        ret

        .align JUMP_ALIGN
1280:   // prep - 128xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
128:    // one 128-pixel row (256 bytes) per iteration, using v16-v23 for the
        // upper half so all 16 loads stay in flight
        ldp             q0, q1, [x1]
        mad             z0.h, p0/m, z29.h, z30.h
        mad             z1.h, p0/m, z29.h, z30.h
        ldp             q2, q3, [x1, #32]
        mad             z2.h, p0/m, z29.h, z30.h
        mad             z3.h, p0/m, z29.h, z30.h
        ldp             q4, q5, [x1, #64]
        mad             z4.h, p0/m, z29.h, z30.h
        mad             z5.h, p0/m, z29.h, z30.h
        ldp             q6, q7, [x1, #96]
        mad             z6.h, p0/m, z29.h, z30.h
        mad             z7.h, p0/m, z29.h, z30.h
        ldp             q16, q17, [x1, #128]
        mad             z16.h, p0/m, z29.h, z30.h
        mad             z17.h, p0/m, z29.h, z30.h
        ldp             q18, q19, [x1, #160]
        mad             z18.h, p0/m, z29.h, z30.h
        mad             z19.h, p0/m, z29.h, z30.h
        ldp             q20, q21, [x1, #192]
        mad             z20.h, p0/m, z29.h, z30.h
        mad             z21.h, p0/m, z29.h, z30.h
        ldp             q22, q23, [x1, #224]
        add             x1, x1, x2
        mad             z22.h, p0/m, z29.h, z30.h
        mad             z23.h, p0/m, z29.h, z30.h
        subs            w4, w4, #1
        stp             q0, q1, [x0]
        stp             q2, q3, [x0, #32]
        stp             q4, q5, [x0, #64]
        stp             q6, q7, [x0, #96]
        stp             q16, q17, [x0, #128]
        stp             q18, q19, [x0, #160]
        stp             q20, q21, [x0, #192]
        stp             q22, q23, [x0, #224]
        add             x0, x0, #256
        b.gt            128b
        ret
endfunc
   1628 
// Dispatch table for prep_sve, indexed by block width (widest first).
// Each entry is a 32-bit offset relative to the table base; prep_sve adds
// it to the table address and branches (see ldrsw/add/br above).
jumptable prep_tbl
        .word 1280b - prep_tbl      // w = 128
        .word 640b  - prep_tbl      // w = 64
        .word 320b  - prep_tbl      // w = 32
        .word 160b  - prep_tbl      // w = 16
        .word 80b   - prep_tbl      // w = 8
        .word 40b   - prep_tbl      // w = 4
endjumptable
   1637 
   1638 
// Instantiate the 8-tap filter entry points for both variants. Each line
// below maps the macro's formal operands onto the registers that variant's
// C calling convention places its arguments in; note that prep has no dst
// stride argument, so d_strd lives in a scratch register (x9) there.
// dst(x0), d_strd(x9), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6), bdmax(w7)
// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w9), ws_strd(w2)
filter_8tap_fn prep, sve2, x0, x9, x1, x2, w3, w4, w5, w6, w7, x5, x6, x5, x6, w9, w2

// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7), bdmax(w8)
// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1), ws_strd(w3)
filter_8tap_fn  put, sve2, x0, x1, x2, x3, w4, w5, w6, w7, w8, x6, x7, x6, x7, w1, w3
   1646 
   1647 DISABLE_SVE2
   1648 DISABLE_SVE
   1649 #endif  // HAVE_SVE2