tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

itx.S (121621B)


      1 /******************************************************************************
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2019, Martin Storsjo
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 *****************************************************************************/
     27 
     28 #include "src/arm/asm.S"
     29 #include "util.S"
     30 
     31 // The exported functions in this file have got the following signature:
     32 // void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);
     33 
     34 // Most of the functions use the following register layout:
     35 // x0-x3  external parameters
     36 // x4     function pointer to first transform
     37 // x5     function pointer to second transform
     38 // x6     output parameter for helper function
     39 // x7     input parameter for helper function
     40 // x8     input stride for helper function
     41 // x9-x12 scratch variables for helper functions
     42 // x13    pointer to list of eob thresholds
     43 // x14    return pointer for helper function
     44 // x15    return pointer for main function
     45 
     46 // The SIMD registers most often use the following layout:
     47 // v0-v1   multiplication coefficients
     48 // v2-v7   scratch registers
     49 // v8-v15  unused
     50 // v16-v31 inputs/outputs of transforms
     51 
     52 // Potential further optimizations, that are left unimplemented for now:
     53 // - Trying to keep multiplication coefficients in registers across multiple
     54 //   transform functions. (The register layout is designed to potentially
     55 //   allow this.)
     56 // - Use a simplified version of the transforms themselves for cases where
     57 //   we know a significant number of inputs are zero. E.g. if the eob value
     58 //   indicates only a quarter of input values are set, for idct16 and up,
     59 //   a significant amount of calculation can be skipped, at the cost of more
     60 //   code duplication and special casing.
     61 
      62 const idct_coeffs, align=4
                // 12-bit fixed-point rotation constants (cos/sin pairs); each
                // larger transform's table extends the smaller one's.
      63        // idct4
      64        .short          2896, 2896*8, 1567, 3784
      65        // idct8
      66        .short          799, 4017, 3406, 2276
      67        // idct16
      68        .short          401, 4076, 3166, 2598
      69        .short          1931, 3612, 3920, 1189
      70        // idct32
      71        .short          201, 4091, 3035, 2751
      72        .short          1751, 3703, 3857, 1380
      73        .short          995, 3973, 3513, 2106
      74        .short          2440, 3290, 4052, 601
      75 endconst
     76 
      77 const idct64_coeffs, align=4
                // Four groups of idct64 constants. The *8 entries are
                // pre-scaled by 8 (same style as the 2896*8 entry above,
                // presumably for use with sqrdmulh — matches usage elsewhere);
                // the trailing zero rows pad each group to a fixed stride.
      78        .short          101*8, 4095*8, 2967*8, -2824*8
      79        .short          1660*8, 3745*8, 3822*8, -1474*8
      80        .short          4076, 401, 4017, 799
      81        .short          0, 0, 0, 0
      82 
      83        .short          4036*8, -700*8, 2359*8, 3349*8
      84        .short          3461*8, -2191*8, 897*8, 3996*8
      85        .short          -3166, -2598, -799, -4017
      86        .short          0, 0, 0, 0
      87 
      88        .short          501*8, 4065*8, 3229*8, -2520*8
      89        .short          2019*8, 3564*8, 3948*8, -1092*8
      90        .short          3612, 1931, 2276, 3406
      91        .short          0, 0, 0, 0
      92 
      93        .short          4085*8, -301*8, 2675*8, 3102*8
      94        .short          3659*8, -1842*8, 1285*8, 3889*8
      95        .short          -3920, -1189, -3406, -2276
      96        .short          0, 0, 0, 0
      97 endconst
     98 
      99 const iadst4_coeffs, align=4
     100        // .h[4-5] can be interpreted as .s[2]
     101        .short          1321, 3803, 2482, 3344, 3344, 0 // 3344 stored twice so it is also readable as one .s lane
     102 endconst
    103 
     104 const iadst8_coeffs, align=4
                // iadst8 constants, followed by the idct4 subset it also needs.
     105        .short          4076, 401, 3612, 1931
     106        .short          2598, 3166, 1189, 3920
     107        // idct_coeffs
     108        .short          2896, 0, 1567, 3784, 0, 0, 0, 0
     109 endconst
    110 
     111 const iadst16_coeffs, align=4
                // iadst16 rotation constants (cos/sin pairs).
     112        .short          4091, 201, 3973, 995
     113        .short          3703, 1751, 3290, 2440
     114        .short          2751, 3035, 2106, 3513
     115        .short          1380, 3857, 601, 4052
     116 endconst
    117 
     118 .macro smull_smlal d0, d1, s0, s1, c0, c1, sz // \d0,\d1 = \s0*\c0 + \s1*\c1, widened to 32 bit
     119        smull           \d0\().4s, \s0\().4h, \c0
     120        smlal           \d0\().4s, \s1\().4h, \c1
     121 .ifc \sz, .8h
                // Only produce the high half (\d1) when operating on 8h vectors.
     122        smull2          \d1\().4s, \s0\().8h, \c0
     123        smlal2          \d1\().4s, \s1\().8h, \c1
     124 .endif
     125 .endm
    126 
     127 .macro smull_smlsl d0, d1, s0, s1, c0, c1, sz // \d0,\d1 = \s0*\c0 - \s1*\c1, widened to 32 bit
     128        smull           \d0\().4s, \s0\().4h, \c0
     129        smlsl           \d0\().4s, \s1\().4h, \c1
     130 .ifc \sz, .8h
                // Only produce the high half (\d1) when operating on 8h vectors.
     131        smull2          \d1\().4s, \s0\().8h, \c0
     132        smlsl2          \d1\().4s, \s1\().8h, \c1
     133 .endif
     134 .endm
    135 
     136 .macro sqrshrn_sz d0, s0, s1, shift, sz // narrow \s0 (and \s1 for .8h) from 32 to 16 bit with rounding shift and saturation
     137        sqrshrn         \d0\().4h, \s0\().4s, \shift
     138 .ifc \sz, .8h
     139        sqrshrn2        \d0\().8h, \s1\().4s, \shift
     140 .endif
     141 .endm
    142 
    143 .macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7
    144        sqrdmulh        \r0\sz,  \r0\sz,  \c
    145        sqrdmulh        \r1\sz,  \r1\sz,  \c
    146        sqrdmulh        \r2\sz,  \r2\sz,  \c
    147        sqrdmulh        \r3\sz,  \r3\sz,  \c
    148 .ifnb \r4
    149        sqrdmulh        \r4\sz,  \r4\sz,  \c
    150        sqrdmulh        \r5\sz,  \r5\sz,  \c
    151        sqrdmulh        \r6\sz,  \r6\sz,  \c
    152        sqrdmulh        \r7\sz,  \r7\sz,  \c
    153 .endif
    154 .endm
    155 
     156 .macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4
                // One stage of the software-pipelined add/store sequence:
                // load 8 pixels, round-shift coefficients, widen-add pixels
                // into them, narrow back with unsigned saturation, and store.
                // A blank argument skips the corresponding step.
     157 .ifnb \load
     158        ld1             {\load},  [\src], x1
     159 .endif
     160 .ifnb \shift
     161        srshr           \shift,  \shift,  #\shiftbits
     162 .endif
     163 .ifnb \addsrc
     164        uaddw           \adddst, \adddst, \addsrc
     165 .endif
     166 .ifnb \narrowsrc
     167        sqxtun          \narrowdst, \narrowsrc
     168 .endif
     169 .ifnb \store
     170        st1             {\store},  [\dst], x1
     171 .endif
     172 .endm
     173 .macro load_add_store_8x16 dst, src
                // Pipelined add/store of a transformed 8x16 block (v16-v31)
                // to \dst; \src trails \dst by the pipeline depth.
     174        mov             \src, \dst
     175        load_add_store  v2.8b, v16.8h,      ,       ,       ,      ,      , \dst, \src
     176        load_add_store  v3.8b, v17.8h,      ,       ,       ,      ,      , \dst, \src
     177        load_add_store  v4.8b, v18.8h, v2.8b, v16.8h,       ,      ,      , \dst, \src
     178        load_add_store  v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b,      , \dst, \src
     179        load_add_store  v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src
     180        load_add_store  v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src
     181        load_add_store  v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src
     182        load_add_store  v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src
     183        load_add_store  v4.8b, v24.8h, v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src
     184        load_add_store  v5.8b, v25.8h, v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src
     185        load_add_store  v6.8b, v26.8h, v4.8b, v24.8h, v23.8h, v3.8b, v2.8b, \dst, \src
     186        load_add_store  v7.8b, v27.8h, v5.8b, v25.8h, v24.8h, v4.8b, v3.8b, \dst, \src
     187        load_add_store  v2.8b, v28.8h, v6.8b, v26.8h, v25.8h, v5.8b, v4.8b, \dst, \src
     188        load_add_store  v3.8b, v29.8h, v7.8b, v27.8h, v26.8h, v6.8b, v5.8b, \dst, \src
     189        load_add_store  v4.8b, v30.8h, v2.8b, v28.8h, v27.8h, v7.8b, v6.8b, \dst, \src
     190        load_add_store  v5.8b, v31.8h, v3.8b, v29.8h, v28.8h, v2.8b, v7.8b, \dst, \src
     191        load_add_store       ,       , v4.8b, v30.8h, v29.8h, v3.8b, v2.8b, \dst, \src
     192        load_add_store       ,       , v5.8b, v31.8h, v30.8h, v4.8b, v3.8b, \dst, \src
     193        load_add_store       ,       ,      ,       , v31.8h, v5.8b, v4.8b, \dst, \src
     194        load_add_store       ,       ,      ,       ,       ,      , v5.8b, \dst, \src
     195 .endm
     196 .macro load_add_store_8x8 dst, src, shiftbits=4
                // Pipelined add/store of a transformed 8x8 block (v16-v23);
                // \shiftbits is the final rounding shift applied per row.
     197        mov             \src, \dst
     198        load_add_store  v2.8b, v16.8h,      ,       ,       ,      ,      , \dst, \src, \shiftbits
     199        load_add_store  v3.8b, v17.8h,      ,       ,       ,      ,      , \dst, \src, \shiftbits
     200        load_add_store  v4.8b, v18.8h, v2.8b, v16.8h,       ,      ,      , \dst, \src, \shiftbits
     201        load_add_store  v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b,      , \dst, \src, \shiftbits
     202        load_add_store  v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src, \shiftbits
     203        load_add_store  v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src, \shiftbits
     204        load_add_store  v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src, \shiftbits
     205        load_add_store  v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src, \shiftbits
     206        load_add_store       ,       , v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src, \shiftbits
     207        load_add_store       ,       , v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src, \shiftbits
     208        load_add_store       ,       ,      ,       , v23.8h, v3.8b, v2.8b, \dst, \src, \shiftbits
     209        load_add_store       ,       ,      ,       ,       ,      , v3.8b, \dst, \src, \shiftbits
     210 .endm
     211 .macro load_add_store_8x4 dst, src
                // Pipelined add/store of a transformed 8x4 block (v16-v19).
     212        mov             \src, \dst
     213        load_add_store  v2.8b, v16.8h,      ,       ,       ,      ,      , \dst, \src
     214        load_add_store  v3.8b, v17.8h,      ,       ,       ,      ,      , \dst, \src
     215        load_add_store  v4.8b, v18.8h, v2.8b, v16.8h,       ,      ,      , \dst, \src
     216        load_add_store  v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b,      , \dst, \src
     217        load_add_store       ,       , v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src
     218        load_add_store       ,       , v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src
     219        load_add_store       ,       ,      ,       , v19.8h, v5.8b, v4.8b, \dst, \src
     220        load_add_store       ,       ,      ,       ,       ,      , v5.8b, \dst, \src
     221 .endm
     222 .macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src
                // 4-pixel-wide variant of load_add_store: loads/stores two
                // rows per call as .s lanes, and packs two 4h coefficient
                // rows into one 8h register via ins before shifting.
                // A blank argument skips the corresponding step.
     223 .ifnb \load
     224        ld1             {\load}[0],  [\src], x1
     225 .endif
     226 .ifnb \inssrc
     227        ins             \insdst\().d[1],   \inssrc\().d[0]
     228 .endif
     229 .ifnb \shift
     230        srshr           \shift,  \shift,  #4
     231 .endif
     232 .ifnb \load
     233        ld1             {\load}[1],  [\src], x1
     234 .endif
     235 .ifnb \addsrc
     236        uaddw           \adddst, \adddst, \addsrc
     237 .endif
     238 .ifnb \store
     239        st1             {\store}[0],  [\dst], x1
     240 .endif
     241 .ifnb \narrowsrc
     242        sqxtun          \narrowdst, \narrowsrc
     243 .endif
     244 .ifnb \store
     245        st1             {\store}[1],  [\dst], x1
     246 .endif
     247 .endm
     248 .macro load_add_store_4x16 dst, src
                // Pipelined add/store of a transformed 4x16 block (v16-v31,
                // two rows per register pair).
     249        mov             \src, \dst
     250        load_add_store4 v0.s, v17, v16,       ,      ,       ,       ,      ,     , \dst, \src
     251        load_add_store4 v1.s, v19, v18,       ,      ,       ,       ,      ,     , \dst, \src
     252        load_add_store4 v2.s, v21, v20, v16.8h,      ,       ,       ,      ,     , \dst, \src
     253        load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h,       ,      ,     , \dst, \src
     254        load_add_store4 v4.s, v25, v24, v20.8h, v1.8b, v18.8h, v16.8h, v0.8b,     , \dst, \src
     255        load_add_store4 v5.s, v27, v26, v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src
     256        load_add_store4 v6.s, v29, v28, v24.8h, v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src
     257        load_add_store4 v7.s, v31, v30, v26.8h, v4.8b, v24.8h, v22.8h, v3.8b, v2.s, \dst, \src
     258        load_add_store4     ,    ,    , v28.8h, v5.8b, v26.8h, v24.8h, v4.8b, v3.s, \dst, \src
     259        load_add_store4     ,    ,    , v30.8h, v6.8b, v28.8h, v26.8h, v5.8b, v4.s, \dst, \src
     260        load_add_store4     ,    ,    ,       , v7.8b, v30.8h, v28.8h, v6.8b, v5.s, \dst, \src
     261        load_add_store4     ,    ,    ,       ,      ,       , v30.8h, v7.8b, v6.s, \dst, \src
     262        load_add_store4     ,    ,    ,       ,      ,       ,       ,      , v7.s, \dst, \src
     263 .endm
     264 .macro load_add_store_4x8 dst, src
                // Pipelined add/store of a transformed 4x8 block (v16-v23).
     265        mov             \src, \dst
     266        load_add_store4 v0.s, v17, v16,       ,      ,       ,       ,      ,     , \dst, \src
     267        load_add_store4 v1.s, v19, v18,       ,      ,       ,       ,      ,     , \dst, \src
     268        load_add_store4 v2.s, v21, v20, v16.8h,      ,       ,       ,      ,     , \dst, \src
     269        load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h,       ,      ,     , \dst, \src
     270        load_add_store4     ,    ,    , v20.8h, v1.8b, v18.8h, v16.8h, v0.8b,     , \dst, \src
     271        load_add_store4     ,    ,    , v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src
     272        load_add_store4     ,    ,    ,       , v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src
     273        load_add_store4     ,    ,    ,       ,      ,       , v22.8h, v3.8b, v2.s, \dst, \src
     274        load_add_store4     ,    ,    ,       ,      ,       ,       ,      , v3.s, \dst, \src
     275 .endm
    276 
     277 .macro idct_dc w, h, shift
                // DC-only fast path: when eob (w3) is zero, broadcast the
                // single DC coefficient, scale it (extra 1/sqrt(2)-style step
                // for 2:1 rectangular blocks, plus the per-size \shift and
                // final #4 rounding), then jump to the width-specific
                // add/store loop with the row count in w4. Falls through to
                // the full transform otherwise.
     278        cbnz            w3,  1f
     279        mov             w16, #2896*8
     280        ld1r            {v16.8h}, [x2]
     281        dup             v0.4h,   w16
     282        sqrdmulh        v16.8h,  v16.8h,  v0.h[0]
     283        strh            wzr, [x2]                 // clear the DC coefficient
     284 .if (\w == 2*\h) || (2*\w == \h)
     285        sqrdmulh        v16.8h,  v16.8h,  v0.h[0] // extra scale for rectangular blocks
     286 .endif
     287 .if \shift > 0
     288        srshr           v16.8h,  v16.8h,  #\shift
     289 .endif
     290        sqrdmulh        v16.8h,  v16.8h,  v0.h[0]
     291        srshr           v16.8h,  v16.8h,  #4
     292        mov             w4,  #\h
     293        b               idct_dc_w\w\()_neon
     294 1:
     295 .endm
    296 
     297 function idct_dc_w4_neon
                // Add the broadcast DC value (v16.8h) to a 4-wide block,
                // 4 rows per iteration; w4 = remaining rows, x0/x1 = dst/stride.
     298 1:
     299        ld1             {v0.s}[0], [x0], x1
     300        ld1             {v0.s}[1], [x0], x1
     301        ld1             {v1.s}[0], [x0], x1
     302        ld1             {v1.s}[1], [x0], x1
     303        subs            w4,  w4,  #4
     304        sub             x0,  x0,  x1, lsl #2      // rewind x0 to the first of the 4 rows
     305        uaddw           v0.8h,   v16.8h,  v0.8b
     306        sqxtun          v0.8b,   v0.8h
     307        uaddw           v1.8h,   v16.8h,  v1.8b
     308        st1             {v0.s}[0], [x0], x1
     309        sqxtun          v1.8b,   v1.8h
     310        st1             {v0.s}[1], [x0], x1
     311        st1             {v1.s}[0], [x0], x1
     312        st1             {v1.s}[1], [x0], x1
     313        b.gt            1b
     314        ret
     315 endfunc
    316 
     317 function idct_dc_w8_neon
                // Add the broadcast DC value (v16.8h) to an 8-wide block,
                // 4 rows per iteration; loads and stores are interleaved
                // with the arithmetic.
     318 1:
     319        ld1             {v0.8b}, [x0], x1
     320        ld1             {v1.8b}, [x0], x1
     321        ld1             {v2.8b}, [x0], x1
     322        uaddw           v20.8h,  v16.8h, v0.8b
     323        ld1             {v3.8b}, [x0], x1
     324        sub             x0,  x0,  x1, lsl #2      // rewind x0 to the first of the 4 rows
     325        subs            w4,  w4,  #4
     326        uaddw           v21.8h,  v16.8h, v1.8b
     327        sqxtun          v0.8b,   v20.8h
     328        uaddw           v22.8h,  v16.8h, v2.8b
     329        sqxtun          v1.8b,   v21.8h
     330        uaddw           v23.8h,  v16.8h, v3.8b
     331        st1             {v0.8b}, [x0], x1
     332        sqxtun          v2.8b,   v22.8h
     333        st1             {v1.8b}, [x0], x1
     334        sqxtun          v3.8b,   v23.8h
     335        st1             {v2.8b}, [x0], x1
     336        st1             {v3.8b}, [x0], x1
     337        b.gt            1b
     338        ret
     339 endfunc
    340 
     341 function idct_dc_w16_neon
                // Add the broadcast DC value (v16.8h) to a 16-wide block,
                // 4 rows per iteration; low/high byte halves are widened
                // separately (uaddw/uaddw2) and re-narrowed with saturation.
     342 1:
     343        ld1             {v0.16b}, [x0], x1
     344        ld1             {v1.16b}, [x0], x1
     345        ld1             {v2.16b}, [x0], x1
     346        subs            w4,  w4,  #4
     347        uaddw           v20.8h,  v16.8h, v0.8b
     348        uaddw2          v21.8h,  v16.8h, v0.16b
     349        ld1             {v3.16b}, [x0], x1
     350        uaddw           v22.8h,  v16.8h, v1.8b
     351        uaddw2          v23.8h,  v16.8h, v1.16b
     352        sub             x0,  x0,  x1, lsl #2      // rewind x0 to the first of the 4 rows
     353        uaddw           v24.8h,  v16.8h, v2.8b
     354        uaddw2          v25.8h,  v16.8h, v2.16b
     355        sqxtun          v0.8b,   v20.8h
     356        sqxtun2         v0.16b,  v21.8h
     357        uaddw           v26.8h,  v16.8h, v3.8b
     358        uaddw2          v27.8h,  v16.8h, v3.16b
     359        sqxtun          v1.8b,   v22.8h
     360        sqxtun2         v1.16b,  v23.8h
     361        sqxtun          v2.8b,   v24.8h
     362        sqxtun2         v2.16b,  v25.8h
     363        st1             {v0.16b}, [x0], x1
     364        sqxtun          v3.8b,   v26.8h
     365        sqxtun2         v3.16b,  v27.8h
     366        st1             {v1.16b}, [x0], x1
     367        st1             {v2.16b}, [x0], x1
     368        st1             {v3.16b}, [x0], x1
     369        b.gt            1b
     370        ret
     371 endfunc
    372 
     373 function idct_dc_w32_neon
                // Add the broadcast DC value (v16.8h) to a 32-wide block,
                // 2 rows (2x 32 bytes) per iteration.
     374 1:
     375        ld1             {v0.16b, v1.16b},  [x0], x1
     376        subs            w4,  w4,  #2
     377        uaddw           v20.8h,  v16.8h, v0.8b
     378        uaddw2          v21.8h,  v16.8h, v0.16b
     379        ld1             {v2.16b, v3.16b},  [x0]
     380        uaddw           v22.8h,  v16.8h, v1.8b
     381        uaddw2          v23.8h,  v16.8h, v1.16b
     382        sub             x0,  x0,  x1               // rewind x0 to the first of the 2 rows
     383        uaddw           v24.8h,  v16.8h, v2.8b
     384        uaddw2          v25.8h,  v16.8h, v2.16b
     385        sqxtun          v0.8b,   v20.8h
     386        sqxtun2         v0.16b,  v21.8h
     387        uaddw           v26.8h,  v16.8h, v3.8b
     388        uaddw2          v27.8h,  v16.8h, v3.16b
     389        sqxtun          v1.8b,   v22.8h
     390        sqxtun2         v1.16b,  v23.8h
     391        sqxtun          v2.8b,   v24.8h
     392        sqxtun2         v2.16b,  v25.8h
     393        st1             {v0.16b, v1.16b},  [x0], x1
     394        sqxtun          v3.8b,   v26.8h
     395        sqxtun2         v3.16b,  v27.8h
     396        st1             {v2.16b, v3.16b},  [x0], x1
     397        b.gt            1b
     398        ret
     399 endfunc
    400 
     401 function idct_dc_w64_neon
                // Add the broadcast DC value (v16.8h) to a 64-wide block,
                // 1 row (64 bytes) per iteration.
     402 1:
     403        ld1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x0]
     404        subs            w4,  w4,  #1
     405        uaddw           v20.8h,  v16.8h, v0.8b
     406        uaddw2          v21.8h,  v16.8h, v0.16b
     407        uaddw           v22.8h,  v16.8h, v1.8b
     408        uaddw2          v23.8h,  v16.8h, v1.16b
     409        uaddw           v24.8h,  v16.8h, v2.8b
     410        uaddw2          v25.8h,  v16.8h, v2.16b
     411        sqxtun          v0.8b,   v20.8h
     412        sqxtun2         v0.16b,  v21.8h
     413        uaddw           v26.8h,  v16.8h, v3.8b
     414        uaddw2          v27.8h,  v16.8h, v3.16b
     415        sqxtun          v1.8b,   v22.8h
     416        sqxtun2         v1.16b,  v23.8h
     417        sqxtun          v2.8b,   v24.8h
     418        sqxtun2         v2.16b,  v25.8h
     419        sqxtun          v3.8b,   v26.8h
     420        sqxtun2         v3.16b,  v27.8h
     421        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x0], x1
     422        b.gt            1b
     423        ret
     424 endfunc
    425 
     426 .macro iwht4 // 4-point inverse Walsh-Hadamard transform, in place on v16-v19
     427        add             v16.4h,  v16.4h,  v17.4h
     428        sub             v21.4h,  v18.4h,  v19.4h
     429        sub             v20.4h,  v16.4h,  v21.4h
     430        sshr            v20.4h,  v20.4h,  #1
     431        sub             v18.4h,  v20.4h,  v17.4h
     432        sub             v17.4h,  v20.4h,  v19.4h
     433        add             v19.4h,  v21.4h,  v18.4h
     434        sub             v16.4h,  v16.4h,  v17.4h
     435 .endm
    436 
     437 .macro idct_4 r0, r1, r2, r3, sz
                // 4-point inverse DCT on \r0-\r3 (in place), using the idct4
                // coefficients preloaded in v0; clobbers v2-v7.
     438        smull_smlal     v6,  v7,  \r1, \r3, v0.h[3], v0.h[2], \sz // -> t3
     439        smull_smlsl     v4,  v5,  \r1, \r3, v0.h[2], v0.h[3], \sz // -> t2
     440        smull_smlal     v2,  v3,  \r0, \r2, v0.h[0], v0.h[0], \sz // -> t0
     441        sqrshrn_sz      v6,  v6,  v7,  #12, \sz
     442        sqrshrn_sz      v7,  v4,  v5,  #12, \sz
     443        smull_smlsl     v4,  v5,  \r0, \r2, v0.h[0], v0.h[0], \sz // -> t1
     444        sqrshrn_sz      v2,  v2,  v3,  #12, \sz
     445        sqrshrn_sz      v3,  v4,  v5,  #12, \sz
                // Butterfly: out = t0 +/- t3, t1 +/- t2, with saturation.
     446        sqadd           \r0\sz,  v2\sz,   v6\sz
     447        sqsub           \r3\sz,  v2\sz,   v6\sz
     448        sqadd           \r1\sz,  v3\sz,   v7\sz
     449        sqsub           \r2\sz,  v3\sz,   v7\sz
     450 .endm
    451 
     452 function inv_dct_4h_x4_neon, export=1
                // 1-D 4-point inverse DCT, 4 lanes wide; in/out in v16-v19,
                // clobbers v0, v2-v7, x16.
     453        movrel          x16, idct_coeffs
     454        ld1             {v0.4h}, [x16]
     455        idct_4          v16, v17, v18, v19, .4h
     456        ret
     457 endfunc
    458 
     459 function inv_dct_8h_x4_neon, export=1
                // 1-D 4-point inverse DCT, 8 lanes wide; in/out in v16-v19,
                // clobbers v0, v2-v7, x16.
     460        movrel          x16, idct_coeffs
     461        ld1             {v0.4h}, [x16]
     462        idct_4          v16, v17, v18, v19, .8h
     463        ret
     464 endfunc
    465 
     466 .macro iadst_4x4 o0, o1, o2, o3
                // 4-point inverse ADST, 4 lanes wide; inputs in v16-v19,
                // outputs written to \o0-\o3 (flipadst passes them reversed).
     467        movrel          x16, iadst4_coeffs
     468        ld1             {v0.8h}, [x16]
     469 
     470        ssubl           v3.4s,   v16.4h,  v18.4h
     471        smull           v4.4s,   v16.4h,  v0.h[0]
     472        smlal           v4.4s,   v18.4h,  v0.h[1]
     473        smlal           v4.4s,   v19.4h,  v0.h[2]
     474        smull           v7.4s,   v17.4h,  v0.h[3]
     475        saddw           v3.4s,   v3.4s,   v19.4h // v3 = in0 - in2 + in3
     476        smull           v5.4s,   v16.4h,  v0.h[2]
     477        smlsl           v5.4s,   v18.4h,  v0.h[0]
     478        smlsl           v5.4s,   v19.4h,  v0.h[1]
     479 
     480        add             \o3\().4s, v4.4s,     v5.4s
     481        mul             \o2\().4s, v3.4s,     v0.s[2] // .s[2] aliases the duplicated 3344
     482        add             \o0\().4s, v4.4s,     v7.4s
     483        add             \o1\().4s, v5.4s,     v7.4s
     484        sub             \o3\().4s, \o3\().4s, v7.4s
     485 
                // Narrow all four outputs back to 16 bit with rounding.
     486        sqrshrn         \o0\().4h, \o0\().4s, #12
     487        sqrshrn         \o2\().4h, \o2\().4s, #12
     488        sqrshrn         \o1\().4h, \o1\().4s, #12
     489        sqrshrn         \o3\().4h, \o3\().4s, #12
     490 .endm
    491 
     492 function inv_adst_4h_x4_neon, export=1
                // 1-D 4-point inverse ADST, 4 lanes wide; in/out in v16-v19.
     493        iadst_4x4       v16, v17, v18, v19
     494        ret
     495 endfunc
    496 
     497 function inv_flipadst_4h_x4_neon, export=1
                // Same as inv_adst_4h_x4_neon but with the outputs reversed.
     498        iadst_4x4       v19, v18, v17, v16
     499        ret
     500 endfunc
    501 
     502 .macro iadst_8x4 o0, o1, o2, o3
                // 4-point inverse ADST, 8 lanes wide (low/high halves handled
                // by paired smull/smull2 etc.); inputs in v16-v19, outputs to
                // \o0-\o3. out2 is built in v18 and only copied to v17 when
                // the output order requires it (flipadst), to avoid
                // clobbering an input still in use.
     503        movrel          x16, iadst4_coeffs
     504        ld1             {v0.8h}, [x16]
     505 
     506        ssubl           v2.4s,   v16.4h,  v18.4h
     507        ssubl2          v3.4s,   v16.8h,  v18.8h
     508        smull           v4.4s,   v16.4h,  v0.h[0]
     509        smlal           v4.4s,   v18.4h,  v0.h[1]
     510        smlal           v4.4s,   v19.4h,  v0.h[2]
     511        smull2          v5.4s,   v16.8h,  v0.h[0]
     512        smlal2          v5.4s,   v18.8h,  v0.h[1]
     513        smlal2          v5.4s,   v19.8h,  v0.h[2]
     514        saddw           v2.4s,   v2.4s,   v19.4h
     515        saddw2          v3.4s,   v3.4s,   v19.8h
     516        smull           v6.4s,   v16.4h,  v0.h[2]
     517        smlsl           v6.4s,   v18.4h,  v0.h[0]
     518        smlsl           v6.4s,   v19.4h,  v0.h[1]
     519        smull2          v7.4s,   v16.8h,  v0.h[2]
     520        smlsl2          v7.4s,   v18.8h,  v0.h[0]
     521        smlsl2          v7.4s,   v19.8h,  v0.h[1]
     522 
     523        mul             v18.4s,  v2.4s,   v0.s[2] // out2
     524        mul             v19.4s,  v3.4s,   v0.s[2]
     525 
     526        smull           v2.4s,   v17.4h,  v0.h[3]
     527        smull2          v3.4s,   v17.8h,  v0.h[3]
     528 
     529        add             v16.4s,  v4.4s,   v2.4s // out0
     530        add             v17.4s,  v5.4s,   v3.4s
     531 
     532        add             v4.4s,   v4.4s,   v6.4s // out3
     533        add             v5.4s,   v5.4s,   v7.4s
     534 
     535        add             v6.4s,   v6.4s,   v2.4s // out1
     536        add             v7.4s,   v7.4s,   v3.4s
     537 
     538        sub             v4.4s,   v4.4s,   v2.4s // out3
     539        sub             v5.4s,   v5.4s,   v3.4s
     540 
     541        sqrshrn         v18.4h,  v18.4s, #12
     542        sqrshrn2        v18.8h,  v19.4s, #12
     543 
     544        sqrshrn         \o0\().4h, v16.4s, #12
     545        sqrshrn2        \o0\().8h, v17.4s, #12
     546 
     547 .ifc \o2, v17
                // Flipped output order: move out2 into v17 now that out0 has
                // consumed the old v17 contents.
     548        mov             v17.16b,   v18.16b
     549 .endif
     550 
     551        sqrshrn         \o1\().4h, v6.4s,  #12
     552        sqrshrn2        \o1\().8h, v7.4s,  #12
     553 
     554        sqrshrn         \o3\().4h, v4.4s,  #12
     555        sqrshrn2        \o3\().8h, v5.4s,  #12
     556 .endm
    557 
     558 function inv_adst_8h_x4_neon, export=1
                // 1-D 4-point inverse ADST, 8 lanes wide; in/out in v16-v19.
     559        iadst_8x4       v16, v17, v18, v19
     560        ret
     561 endfunc
    562 
     563 function inv_flipadst_8h_x4_neon, export=1
                // Same as inv_adst_8h_x4_neon but with the outputs reversed.
     564        iadst_8x4       v19, v18, v17, v16
     565        ret
     566 endfunc
    567 
     568 function inv_identity_4h_x4_neon, export=1
                // Identity transform (scale by 5793/4096), 4 lanes wide:
                // out = in + in*(5793-4096)/4096, computed with sqrdmulh so
                // the multiplier stays within 16-bit range.
     569        mov             w16, #(5793-4096)*8
     570        dup             v0.4h,   w16
     571        sqrdmulh        v4.4h,   v16.4h,  v0.h[0]
     572        sqrdmulh        v5.4h,   v17.4h,  v0.h[0]
     573        sqrdmulh        v6.4h,   v18.4h,  v0.h[0]
     574        sqrdmulh        v7.4h,   v19.4h,  v0.h[0]
     575        sqadd           v16.4h,  v16.4h,  v4.4h
     576        sqadd           v17.4h,  v17.4h,  v5.4h
     577        sqadd           v18.4h,  v18.4h,  v6.4h
     578        sqadd           v19.4h,  v19.4h,  v7.4h
     579        ret
     580 endfunc
    581 
     582 function inv_identity_8h_x4_neon, export=1
                // Identity transform (scale by 5793/4096), 8 lanes wide;
                // same scheme as inv_identity_4h_x4_neon.
     583        mov             w16, #(5793-4096)*8
     584        dup             v0.4h,   w16
     585        sqrdmulh        v4.8h,   v16.8h,  v0.h[0]
     586        sqrdmulh        v5.8h,   v17.8h,  v0.h[0]
     587        sqrdmulh        v6.8h,   v18.8h,  v0.h[0]
     588        sqrdmulh        v7.8h,   v19.8h,  v0.h[0]
     589        sqadd           v16.8h,  v16.8h,  v4.8h
     590        sqadd           v17.8h,  v17.8h,  v5.8h
     591        sqadd           v18.8h,  v18.8h,  v6.8h
     592        sqadd           v19.8h,  v19.8h,  v7.8h
     593        ret
     594 endfunc
    595 
     596 .macro identity_8x4_shift1 r0, r1, r2, r3, c
                // Identity scale with a built-in extra >>1: each register is
                // combined with its sqrdmulh-scaled copy via a rounding
                // halving add (srhadd). \c is the caller-supplied multiplier.
     597 .irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h
     598        sqrdmulh        v2.8h,  \i,  \c
     599        srhadd          \i,     \i,  v2.8h
     600 .endr
     601 .endm
    602 
     603 function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1
                // 4x4 WHT+WHT add: loads and zeroes the coefficients,
                // pre-shifts them down by 2, applies iwht4 to rows, transposes,
                // applies iwht4 to columns, then reuses the shared 4x4
                // add/store tail (which skips the usual #4 rounding shift).
     604        mov             x15, x30
     605        movi            v31.8h,  #0
     606        ld1             {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
     607        st1             {v31.8h}, [x2], #16       // clear first half of the coefficient buffer
     608 
     609        sshr            v16.4h,  v16.4h,  #2
     610        sshr            v17.4h,  v17.4h,  #2
     611        sshr            v18.4h,  v18.4h,  #2
     612        sshr            v19.4h,  v19.4h,  #2
     613 
     614        iwht4
     615 
     616        st1             {v31.8h}, [x2], #16       // clear second half of the coefficient buffer
     617        transpose_4x4h  v16, v17, v18, v19, v20, v21, v22, v23
     618 
     619        iwht4
     620 
     621        ld1             {v0.s}[0], [x0], x1
     622        ld1             {v0.s}[1], [x0], x1
     623        ins             v16.d[1], v17.d[0]        // pack rows 0/1 and 2/3 into 8h registers
     624        ins             v18.d[1], v19.d[0]
     625        ld1             {v1.s}[0], [x0], x1
     626        ld1             {v1.s}[1], [x0], x1
     627 
     628        b               L(itx_4x4_end)
     629 endfunc
    630 
     631 function inv_txfm_add_4x4_neon
                // Generic 4x4 path: first transform via x4 (rows), transpose,
                // second transform via x5 (columns), round >>4, then add the
                // result to dst (x0, stride x1). Coefficients at x2 are
                // zeroed as they are consumed. Returns through x15.
     632        movi            v31.8h,  #0
     633        ld1             {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
     634        st1             {v31.8h}, [x2], #16
     635 
     636        blr             x4
     637 
     638        st1             {v31.8h}, [x2], #16
     639        transpose_4x4h  v16, v17, v18, v19, v20, v21, v22, v23
     640 
     641        blr             x5
     642 
     643        ld1             {v0.s}[0], [x0], x1
     644        ld1             {v0.s}[1], [x0], x1
     645        ins             v16.d[1], v17.d[0]        // pack rows 0/1 and 2/3 into 8h registers
     646        ins             v18.d[1], v19.d[0]
     647        ld1             {v1.s}[0], [x0], x1
     648        ld1             {v1.s}[1], [x0], x1
     649        srshr           v16.8h,  v16.8h,  #4
     650        srshr           v18.8h,  v18.8h,  #4
     651 
     652 L(itx_4x4_end):
                // Shared tail (also used by the WHT path): rewind dst,
                // widen-add the four pixel rows, saturate and store.
     653        sub             x0,  x0,  x1, lsl #2
     654        uaddw           v16.8h,  v16.8h,  v0.8b
     655        sqxtun          v0.8b,   v16.8h
     656        uaddw           v18.8h,  v18.8h,  v1.8b
     657        st1             {v0.s}[0], [x0], x1
     658        sqxtun          v1.8b,   v18.8h
     659        st1             {v0.s}[1], [x0], x1
     660        st1             {v1.s}[0], [x0], x1
     661        st1             {v1.s}[1], [x0], x1
     662 
     663        ret             x15
     664 endfunc
    665 
     666 .macro def_fn_4x4 txfm1, txfm2
                // Define the exported 4x4 entry point for a txfm1/txfm2 pair.
                // The dct_dct variant gets an inline DC-only shortcut; all
                // variants otherwise load the two 1-D transform pointers into
                // x4/x5 and tail-call the generic 4x4 path.
     667 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1
     668        mov             x15, x30
     669 
     670 .ifc \txfm1\()_\txfm2, dct_dct
     671        cbnz            w3,  1f                   // eob != 0: run the full transform
     672        mov             w16, #2896*8
     673        ld1r            {v16.8h}, [x2]
     674        dup             v4.8h,   w16
     675        strh            wzr, [x2]                 // clear the DC coefficient
     676        sqrdmulh        v16.8h,  v16.8h,  v4.h[0]
     677        ld1             {v0.s}[0], [x0], x1
     678        sqrdmulh        v20.8h,  v16.8h,  v4.h[0]
     679        ld1             {v0.s}[1], [x0], x1
     680        srshr           v16.8h,  v20.8h,  #4
     681        ld1             {v1.s}[0], [x0], x1
     682        srshr           v18.8h,  v20.8h,  #4
     683        ld1             {v1.s}[1], [x0], x1
     684        b               L(itx_4x4_end)
     685 1:
     686 .endif
     687        adr             x4,  inv_\txfm1\()_4h_x4_neon
     688        adr             x5,  inv_\txfm2\()_4h_x4_neon
     689        b               inv_txfm_add_4x4_neon
     690 endfunc
     691 .endm
    692 
    // Instantiate all 16 supported row/column transform combinations for 4x4.
    693 def_fn_4x4 dct, dct
    694 def_fn_4x4 identity, identity
    695 def_fn_4x4 dct, adst
    696 def_fn_4x4 dct, flipadst
    697 def_fn_4x4 dct, identity
    698 def_fn_4x4 adst, dct
    699 def_fn_4x4 adst, adst
    700 def_fn_4x4 adst, flipadst
    701 def_fn_4x4 flipadst, dct
    702 def_fn_4x4 flipadst, adst
    703 def_fn_4x4 flipadst, flipadst
    704 def_fn_4x4 identity, dct
    705 
    706 def_fn_4x4 adst, identity
    707 def_fn_4x4 flipadst, identity
    708 def_fn_4x4 identity, adst
    709 def_fn_4x4 identity, flipadst
    710 
    711 .macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7, sz, szb
        // 8-point inverse DCT, in place in registers \r0-\r7.
        // Even inputs (\r0,\r2,\r4,\r6) go through the nested 4-point idct;
        // odd inputs form t4a-t7a via the idct_coeffs already loaded in v0.
        // \sz is the element arrangement (.8h or .4h), \szb the matching
        // byte arrangement for register moves. Clobbers v2-v7.
    712        idct_4          \r0, \r2, \r4, \r6, \sz
    713 
    714        smull_smlsl     v2,  v3,  \r1, \r7, v0.h[4], v0.h[5], \sz // -> t4a
    715        smull_smlal     v4,  v5,  \r1, \r7, v0.h[5], v0.h[4], \sz // -> t7a
    716        smull_smlsl     v6,  v7,  \r5, \r3, v0.h[6], v0.h[7], \sz // -> t5a
    717        sqrshrn_sz      \r1, v2,  v3,  #12, \sz                   // t4a
    718        sqrshrn_sz      \r7, v4,  v5,  #12, \sz                   // t7a
    719        smull_smlal     v2,  v3,  \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a
    720        sqrshrn_sz      \r3, v6,  v7,  #12, \sz                   // t5a
    721        sqrshrn_sz      \r5, v2,  v3,  #12, \sz                   // t6a
    722 
    723        sqadd           v2\sz,   \r1\sz,  \r3\sz // t4
    724        sqsub           \r1\sz,  \r1\sz,  \r3\sz // t5a
    725        sqadd           v3\sz,   \r7\sz,  \r5\sz // t7
    726        sqsub           \r3\sz,  \r7\sz,  \r5\sz // t6a
    727 
    728        smull_smlsl     v4,  v5,  \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5
    729        smull_smlal     v6,  v7,  \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6
    730        sqrshrn_sz      v4,  v4,  v5,  #12, \sz // t5
    731        sqrshrn_sz      v5,  v6,  v7,  #12, \sz // t6
    732 
        // Final butterfly: combine the 4-point idct outputs (\r0,\r2,\r4,\r6)
        // with t4/t5/t6/t7 into out0-out7.
    733        sqsub           \r7\sz,  \r0\sz,  v3\sz // out7
    734        sqadd           \r0\sz,  \r0\sz,  v3\sz // out0
    735        sqadd           \r1\sz,  \r2\sz,  v5\sz // out1
    736        sqsub           v6\sz,   \r2\sz,  v5\sz // out6
    737        sqadd           \r2\sz,  \r4\sz,  v4\sz // out2
    738        sqsub           \r5\sz,  \r4\sz,  v4\sz // out5
    739        sqadd           \r3\sz,  \r6\sz,  v2\sz // out3
    740        sqsub           \r4\sz,  \r6\sz,  v2\sz // out4
    741        mov             \r6\szb, v6\szb         // out6
    742 .endm
    743 
    744 function inv_dct_8h_x8_neon, export=1
        // 8-point inverse DCT over 8 lanes; data in/out in v16-v23.
    745        movrel          x16, idct_coeffs
    746        ld1             {v0.8h}, [x16]
    747        idct_8          v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b
    748        ret
    749 endfunc
    750 
    751 function inv_dct_4h_x8_neon, export=1
        // 8-point inverse DCT over 4 lanes; data in/out in v16-v23.
    752        movrel          x16, idct_coeffs
    753        ld1             {v0.8h}, [x16]
    754        idct_8          v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b
    755        ret
    756 endfunc
    757 
    758 .macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7, sz
        // 8-point inverse ADST. Inputs in v16-v23; outputs are written to the
        // registers named by \o0-\o7, so passing them in reverse order yields
        // the flipadst variant with no extra code. Clobbers v2-v7.
    759        movrel          x16, iadst8_coeffs
    760        ld1             {v0.8h, v1.8h}, [x16]
    761 
        // Stage 1: eight rotations producing t0a-t7a.
    762        smull_smlal     v2,  v3,  v23, v16, v0.h[0], v0.h[1], \sz
    763        smull_smlsl     v4,  v5,  v23, v16, v0.h[1], v0.h[0], \sz
    764        smull_smlal     v6,  v7,  v21, v18, v0.h[2], v0.h[3], \sz
    765        sqrshrn_sz      v16, v2,  v3,  #12, \sz  // t0a
    766        sqrshrn_sz      v23, v4,  v5,  #12, \sz  // t1a
    767        smull_smlsl     v2,  v3,  v21, v18, v0.h[3], v0.h[2], \sz
    768        smull_smlal     v4,  v5,  v19, v20, v0.h[4], v0.h[5], \sz
    769        sqrshrn_sz      v18, v6,  v7,  #12, \sz  // t2a
    770        sqrshrn_sz      v21, v2,  v3,  #12, \sz  // t3a
    771        smull_smlsl     v6,  v7,  v19, v20, v0.h[5], v0.h[4], \sz
    772        smull_smlal     v2,  v3,  v17, v22, v0.h[6], v0.h[7], \sz
    773        sqrshrn_sz      v20, v4,  v5,  #12, \sz  // t4a
    774        sqrshrn_sz      v19, v6,  v7,  #12, \sz  // t5a
    775        smull_smlsl     v4,  v5,  v17, v22, v0.h[7], v0.h[6], \sz
    776        sqrshrn_sz      v22, v2,  v3,  #12, \sz  // t6a
    777        sqrshrn_sz      v17, v4,  v5,  #12, \sz  // t7a
    778 
        // Stage 2: butterflies t0-t7.
    779        sqadd           v2\sz,   v16\sz,  v20\sz // t0
    780        sqsub           v3\sz,   v16\sz,  v20\sz // t4
    781        sqadd           v4\sz,   v23\sz,  v19\sz // t1
    782        sqsub           v5\sz,   v23\sz,  v19\sz // t5
    783        sqadd           v6\sz,   v18\sz,  v22\sz // t2
    784        sqsub           v7\sz,   v18\sz,  v22\sz // t6
    785        sqadd           v18\sz,  v21\sz,  v17\sz // t3
    786        sqsub           v19\sz,  v21\sz,  v17\sz // t7
    787 
        // Stage 3: rotations on the odd half producing t4a-t7a.
    788        smull_smlal     v16, v17, v3,  v5,  v1.h[3], v1.h[2], \sz
    789        smull_smlsl     v20, v21, v3,  v5,  v1.h[2], v1.h[3], \sz
    790        smull_smlsl     v22, v23, v19, v7,  v1.h[3], v1.h[2], \sz
    791 
    792        sqrshrn_sz      v3,  v16, v17, #12, \sz  // t4a
    793        sqrshrn_sz      v5,  v20, v21, #12, \sz  // t5a
    794 
    795        smull_smlal     v16, v17, v19, v7,  v1.h[2], v1.h[3], \sz
    796 
    797        sqrshrn_sz      v7,  v22, v23, #12, \sz  // t6a
    798        sqrshrn_sz      v19, v16, v17, #12, \sz  // t7a
    799 
        // Stage 4: final butterflies; odd-index outputs are negated below.
    800        sqadd           \o0\()\sz, v2\sz, v6\sz  // out0
    801        sqsub           v2\sz,     v2\sz, v6\sz  // t2
    802        sqadd           \o7\()\sz, v4\sz, v18\sz // out7
    803        sqsub           v4\sz,     v4\sz, v18\sz // t3
    804        sqneg           \o7\()\sz, \o7\()\sz     // out7
    805 
    806        sqadd           \o1\()\sz, v3\sz, v7\sz  // out1
    807        sqsub           v3\sz,     v3\sz, v7\sz  // t6
    808        sqadd           \o6\()\sz, v5\sz, v19\sz // out6
    809        sqsub           v5\sz,     v5\sz, v19\sz // t7
    810        sqneg           \o1\()\sz, \o1\()\sz     // out1
    811 
        // Stage 5: the middle four outputs are (t +/- t) * 2896/4096 rotations.
    812        smull_smlal     v18, v19, v2,  v4,  v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20)
    813        smull_smlsl     v6,  v7,  v2,  v4,  v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19)
    814        smull_smlsl     v20, v21, v3,  v5,  v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18)
    815        sqrshrn_sz      v2,  v18, v19, #12, \sz // out3
    816        smull_smlal     v18, v19, v3,  v5,  v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21)
    817        sqrshrn_sz      v3,  v20, v21, #12, \sz // out5
    818        sqrshrn_sz      \o2, v18, v19, #12, \sz // out2 (v18 or v21)
    819        sqrshrn_sz      \o4, v6,  v7,  #12, \sz // out4 (v20 or v19)
    820 
    821        sqneg           \o3\()\sz, v2\sz     // out3
    822        sqneg           \o5\()\sz, v3\sz     // out5
    823 .endm
    824 
    825 function inv_adst_8h_x8_neon, export=1
        // 8-point inverse ADST over 8 lanes; data in/out in v16-v23.
    826        iadst_8         v16, v17, v18, v19, v20, v21, v22, v23, .8h
    827        ret
    828 endfunc
    829 
    830 function inv_flipadst_8h_x8_neon, export=1
        // 8-point inverse flip-ADST over 8 lanes: ADST with the output
        // registers passed in reverse order.
    831        iadst_8         v23, v22, v21, v20, v19, v18, v17, v16, .8h
    832        ret
    833 endfunc
    834 
    835 function inv_adst_4h_x8_neon, export=1
        // 8-point inverse ADST over 4 lanes; data in/out in v16-v23.
    836        iadst_8         v16, v17, v18, v19, v20, v21, v22, v23, .4h
    837        ret
    838 endfunc
    839 
    840 function inv_flipadst_4h_x8_neon, export=1
        // 8-point inverse flip-ADST over 4 lanes (reversed output order).
    841        iadst_8         v23, v22, v21, v20, v19, v18, v17, v16, .4h
    842        ret
    843 endfunc
    844 
    845 function inv_identity_8h_x8_neon, export=1
        // 8-point identity transform over 8 lanes: saturating multiply by 2
        // (the identity8 scale factor) on v16-v23 in place.
    846        sqshl           v16.8h,  v16.8h,  #1
    847        sqshl           v17.8h,  v17.8h,  #1
    848        sqshl           v18.8h,  v18.8h,  #1
    849        sqshl           v19.8h,  v19.8h,  #1
    850        sqshl           v20.8h,  v20.8h,  #1
    851        sqshl           v21.8h,  v21.8h,  #1
    852        sqshl           v22.8h,  v22.8h,  #1
    853        sqshl           v23.8h,  v23.8h,  #1
    854        ret
    855 endfunc
    856 
    857 function inv_identity_4h_x8_neon, export=1
        // 8-point identity transform over 4 lanes: saturating multiply by 2
        // on v16-v23 in place.
    858        sqshl           v16.4h,  v16.4h,  #1
    859        sqshl           v17.4h,  v17.4h,  #1
    860        sqshl           v18.4h,  v18.4h,  #1
    861        sqshl           v19.4h,  v19.4h,  #1
    862        sqshl           v20.4h,  v20.4h,  #1
    863        sqshl           v21.4h,  v21.4h,  #1
    864        sqshl           v22.4h,  v22.4h,  #1
    865        sqshl           v23.4h,  v23.4h,  #1
    866        ret
    867 endfunc
    868 
    869 .macro def_fn_8x8_base variant
        // Emit the shared 8x8 helper. With \variant == identity_ the first
        // pass is skipped entirely (see comment below); otherwise the first
        // pass is called through x4. In both cases x5 is the second-pass
        // transform and x15 holds the return address.
    870 function inv_txfm_\variant\()add_8x8_neon
    871        movi            v28.8h,  #0
    872        movi            v29.8h,  #0
    873        movi            v30.8h,  #0
    874        movi            v31.8h,  #0
        // Load all 8 rows into v16-v23, zeroing the coeff buffer as we go.
    875        ld1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x2]
    876        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x2], #64
    877        ld1             {v20.8h,v21.8h,v22.8h,v23.8h}, [x2]
    878        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]
    879 
    880 .ifc \variant, identity_
    881        // The identity shl #1 and downshift srshr #1 cancel out
    882 
    883        b               L(itx_8x8_epilog)
    884 .else
    885        blr             x4                      // first pass (rows)
    886 
        // Inter-pass rounding by 1 bit.
    887        srshr           v16.8h,  v16.8h,  #1
    888        srshr           v17.8h,  v17.8h,  #1
    889        srshr           v18.8h,  v18.8h,  #1
    890        srshr           v19.8h,  v19.8h,  #1
    891        srshr           v20.8h,  v20.8h,  #1
    892        srshr           v21.8h,  v21.8h,  #1
    893        srshr           v22.8h,  v22.8h,  #1
    894        srshr           v23.8h,  v23.8h,  #1
    895 
    896 L(itx_8x8_epilog):
    897        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
    898 
    899        blr             x5                      // second pass (columns)
    900 
    901        load_add_store_8x8 x0, x7
    902        ret             x15
    903 .endif
    904 endfunc
    905 .endm
    906 
    // Emit both 8x8 helper variants (identity-row fast path and generic).
    907 def_fn_8x8_base identity_
    908 def_fn_8x8_base
    909 
    910 .macro def_fn_8x8 txfm1, txfm2
        // Emit the exported 8x8 entry point for \txfm1 (rows) / \txfm2
        // (columns). dct_dct gets the idct_dc DC-only fast path; identity
        // rows branch to the helper variant that skips the first pass.
    911 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
    912        mov             x15, x30
    913 
    914 .ifc \txfm1\()_\txfm2, dct_dct
    915        idct_dc         8,   8,   1
    916 .endif
    917        adr             x5,  inv_\txfm2\()_8h_x8_neon
    918 .ifc \txfm1, identity
    919        b               inv_txfm_identity_add_8x8_neon
    920 .else
    921        adr             x4,  inv_\txfm1\()_8h_x8_neon
    922        b               inv_txfm_add_8x8_neon
    923 .endif
    924 endfunc
    925 .endm
    926 
    // Instantiate all 16 supported row/column transform combinations for 8x8.
    927 def_fn_8x8 dct, dct
    928 def_fn_8x8 identity, identity
    929 def_fn_8x8 dct, adst
    930 def_fn_8x8 dct, flipadst
    931 def_fn_8x8 dct, identity
    932 def_fn_8x8 adst, dct
    933 def_fn_8x8 adst, adst
    934 def_fn_8x8 adst, flipadst
    935 def_fn_8x8 flipadst, dct
    936 def_fn_8x8 flipadst, adst
    937 def_fn_8x8 flipadst, flipadst
    938 def_fn_8x8 identity, dct
    939 def_fn_8x8 adst, identity
    940 def_fn_8x8 flipadst, identity
    941 def_fn_8x8 identity, adst
    942 def_fn_8x8 identity, flipadst
    943 
    944 function inv_txfm_add_8x4_neon
        // Generic 8x4 inverse transform + add. x4 = first-pass (8-point, 4h)
        // transform, x5 = second-pass (4-point, 8h) transform, x15 = return.
        // Coefficients are pre-scaled by 2896/4096 via sqrdmulh (the
        // rectangular-transform scaling; presumably the rect2 1/sqrt(2)
        // factor — confirm against the C reference).
    945        movi            v30.8h,  #0
    946        movi            v31.8h,  #0
    947        mov             w16, #2896*8
    948        dup             v0.4h,   w16
    949        ld1             {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
    950        st1             {v30.8h,v31.8h}, [x2], #32
    951        ld1             {v20.4h,v21.4h,v22.4h,v23.4h}, [x2]
    952        st1             {v30.8h,v31.8h}, [x2]
    953 
    954        scale_input     .4h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
    955 
    956        blr             x4                      // first pass (rows)
    957 
        // Transpose both 4x4 halves and merge into four 8h registers so the
        // 4-point second pass can run on all 8 columns at once.
    958        transpose_4x4h  v16, v17, v18, v19, v4,  v5,  v6,  v7
    959        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
    960        ins             v16.d[1], v20.d[0]
    961        ins             v17.d[1], v21.d[0]
    962        ins             v18.d[1], v22.d[0]
    963        ins             v19.d[1], v23.d[0]
    964 
    965        blr             x5                      // second pass (columns)
    966 
    967        load_add_store_8x4 x0, x7
    968        ret             x15
    969 endfunc
    970 
    971 function inv_txfm_add_4x8_neon
        // Generic 4x8 inverse transform + add. x4 = first-pass (4-point, 8h)
        // transform, x5 = second-pass (8-point, 4h) transform, x15 = return.
        // Same 2896/4096 rectangular pre-scaling as the 8x4 helper.
    972        movi            v28.8h,  #0
    973        movi            v29.8h,  #0
    974        movi            v30.8h,  #0
    975        movi            v31.8h,  #0
    976        mov             w16, #2896*8
    977        dup             v0.4h,   w16
    978        ld1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x2]
    979        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]
    980 
    981        scale_input     .8h, v0.h[0], v16, v17, v18, v19
    982 
    983        blr             x4                      // first pass (rows)
    984 
        // Transpose and split the 8h registers into eight 4h rows
        // (v16-v23) for the 8-point second pass.
    985        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
    986        ins             v20.d[0], v16.d[1]
    987        ins             v21.d[0], v17.d[1]
    988        ins             v22.d[0], v18.d[1]
    989        ins             v23.d[0], v19.d[1]
    990 
    991        blr             x5                      // second pass (columns)
    992 
    993        load_add_store_4x8 x0, x7
    994        ret             x15
    995 endfunc
    996 
    997 .macro def_fn_48 w, h, txfm1, txfm2
        // Emit the exported \w x \h (4x8 or 8x4) entry point: pick the
        // per-pass transform functions by name and tail-call the matching
        // shared helper. dct_dct gets the DC-only fast path (no extra shift).
    998 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
    999        mov             x15, x30
   1000 
   1001 .ifc \txfm1\()_\txfm2, dct_dct
   1002        idct_dc         \w,  \h,  0
   1003 .endif
   1004        adr             x4,  inv_\txfm1\()_\h\()h_x\w\()_neon
   1005        adr             x5,  inv_\txfm2\()_\w\()h_x\h\()_neon
   1006        b               inv_txfm_add_\w\()x\h\()_neon
   1007 endfunc
   1008 .endm
   1009 
   1010 .macro def_fns_48 w, h
        // Instantiate all 16 transform combinations for one \w x \h size.
   1011 def_fn_48 \w, \h, dct, dct
   1012 def_fn_48 \w, \h, identity, identity
   1013 def_fn_48 \w, \h, dct, adst
   1014 def_fn_48 \w, \h, dct, flipadst
   1015 def_fn_48 \w, \h, dct, identity
   1016 def_fn_48 \w, \h, adst, dct
   1017 def_fn_48 \w, \h, adst, adst
   1018 def_fn_48 \w, \h, adst, flipadst
   1019 def_fn_48 \w, \h, flipadst, dct
   1020 def_fn_48 \w, \h, flipadst, adst
   1021 def_fn_48 \w, \h, flipadst, flipadst
   1022 def_fn_48 \w, \h, identity, dct
   1023 def_fn_48 \w, \h, adst, identity
   1024 def_fn_48 \w, \h, flipadst, identity
   1025 def_fn_48 \w, \h, identity, adst
   1026 def_fn_48 \w, \h, identity, flipadst
   1027 .endm
   1028 
    // Emit all 4x8 and 8x4 entry points.
   1029 def_fns_48 4, 8
   1030 def_fns_48 8, 4
   1031 
   1032 
   1033 .macro idct_16 sz, szb
        // 16-point inverse DCT, in place in v16-v31 (even inputs through the
        // nested 8-point idct, odd inputs form t8a-t15a). Expects idct_coeffs
        // preloaded into v0/v1. \sz = element arrangement (.8h/.4h), \szb =
        // matching byte arrangement for moves. Clobbers v2-v7.
   1034        idct_8          v16, v18, v20, v22, v24, v26, v28, v30, \sz, \szb
   1035 
        // Stage: rotations of the odd inputs producing t8a-t15a.
   1036        smull_smlsl     v2,  v3,  v17, v31, v1.h[0], v1.h[1], \sz // -> t8a
   1037        smull_smlal     v4,  v5,  v17, v31, v1.h[1], v1.h[0], \sz // -> t15a
   1038        smull_smlsl     v6,  v7,  v25, v23, v1.h[2], v1.h[3], \sz // -> t9a
   1039        sqrshrn_sz      v17, v2,  v3,  #12, \sz                   // t8a
   1040        sqrshrn_sz      v31, v4,  v5,  #12, \sz                   // t15a
   1041        smull_smlal     v2,  v3,  v25, v23, v1.h[3], v1.h[2], \sz // -> t14a
   1042        smull_smlsl     v4,  v5,  v21, v27, v1.h[4], v1.h[5], \sz // -> t10a
   1043        sqrshrn_sz      v23, v6,  v7,  #12, \sz                   // t9a
   1044        sqrshrn_sz      v25, v2,  v3,  #12, \sz                   // t14a
   1045        smull_smlal     v6,  v7,  v21, v27, v1.h[5], v1.h[4], \sz // -> t13a
   1046        smull_smlsl     v2,  v3,  v29, v19, v1.h[6], v1.h[7], \sz // -> t11a
   1047        sqrshrn_sz      v21, v4,  v5,  #12, \sz                   // t10a
   1048        sqrshrn_sz      v27, v6,  v7,  #12, \sz                   // t13a
   1049        smull_smlal     v4,  v5,  v29, v19, v1.h[7], v1.h[6], \sz // -> t12a
   1050        sqrshrn_sz      v19, v2,  v3,  #12, \sz                   // t11a
   1051        sqrshrn_sz      v29, v4,  v5,  #12, \sz                   // t12a
   1052 
        // Stage: butterflies t8-t15.
   1053        sqsub           v2\sz,   v17\sz,  v23\sz  // t9
   1054        sqadd           v17\sz,  v17\sz,  v23\sz  // t8
   1055        sqsub           v3\sz,   v31\sz,  v25\sz  // t14
   1056        sqadd           v31\sz,  v31\sz,  v25\sz  // t15
   1057        sqsub           v23\sz,  v19\sz,  v21\sz  // t10
   1058        sqadd           v19\sz,  v19\sz,  v21\sz  // t11
   1059        sqadd           v25\sz,  v29\sz,  v27\sz  // t12
   1060        sqsub           v29\sz,  v29\sz,  v27\sz  // t13
   1061 
   1062        smull_smlsl     v4,  v5,  v3,  v2,  v0.h[2], v0.h[3], \sz // -> t9a
   1063        smull_smlal     v6,  v7,  v3,  v2,  v0.h[3], v0.h[2], \sz // -> t14a
   1064        sqrshrn_sz      v21, v4,  v5,  #12, \sz                   // t9a
   1065        sqrshrn_sz      v27, v6,  v7,  #12, \sz                   // t14a
   1066 
   1067        smull_smlsl     v4,  v5,  v29, v23, v0.h[2], v0.h[3], \sz // -> t13a
   1068        smull_smlal     v6,  v7,  v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
   1069        sqrshrn_sz      v29, v4,  v5,  #12, \sz                   // t13a
        // t10a needs the negated product; negate the 32-bit accumulators
        // before narrowing (only one register pair for the .4h variant).
   1070        neg             v6.4s,   v6.4s
   1071 .ifc \sz, .8h
   1072        neg             v7.4s,   v7.4s
   1073 .endif
   1074        sqrshrn_sz      v23, v6,  v7,  #12, \sz                   // t10a
   1075 
   1076        sqsub           v2\sz,   v17\sz,  v19\sz  // t11a
   1077        sqadd           v17\sz,  v17\sz,  v19\sz  // t8a
   1078        sqsub           v3\sz,   v31\sz,  v25\sz  // t12a
   1079        sqadd           v31\sz,  v31\sz,  v25\sz  // t15a
   1080        sqadd           v19\sz,  v21\sz,  v23\sz  // t9
   1081        sqsub           v21\sz,  v21\sz,  v23\sz  // t10
   1082        sqsub           v25\sz,  v27\sz,  v29\sz  // t13
   1083        sqadd           v27\sz,  v27\sz,  v29\sz  // t14
   1084 
        // (t +/- t) * 2896/4096 rotations for t10a-t13a / t11-t12.
   1085        smull_smlsl     v4,  v5,  v3,  v2,  v0.h[0], v0.h[0], \sz // -> t11
   1086        smull_smlal     v6,  v7,  v3,  v2,  v0.h[0], v0.h[0], \sz // -> t12
   1087        smull_smlsl     v2,  v3,  v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
   1088 
   1089        sqrshrn_sz      v4,  v4,  v5,  #12, \sz   // t11
   1090        sqrshrn_sz      v5,  v6,  v7,  #12, \sz   // t12
   1091        smull_smlal     v6,  v7,  v25, v21, v0.h[0], v0.h[0], \sz // -> t13a
   1092        sqrshrn_sz      v2,  v2,  v3,  #12, \sz   // t10a
   1093        sqrshrn_sz      v3,  v6,  v7,  #12, \sz   // t13a
   1094 
        // Final butterfly with the 8-point idct outputs -> out0-out15.
   1095        sqadd           v6\sz,   v16\sz,  v31\sz  // out0
   1096        sqsub           v31\sz,  v16\sz,  v31\sz  // out15
   1097        mov             v16\szb, v6\szb
   1098        sqadd           v23\sz,  v30\sz,  v17\sz  // out7
   1099        sqsub           v7\sz,   v30\sz,  v17\sz  // out8
   1100        sqadd           v17\sz,  v18\sz,  v27\sz  // out1
   1101        sqsub           v30\sz,  v18\sz,  v27\sz  // out14
   1102        sqadd           v18\sz,  v20\sz,  v3\sz   // out2
   1103        sqsub           v29\sz,  v20\sz,  v3\sz   // out13
   1104        sqadd           v3\sz,   v28\sz,  v19\sz  // out6
   1105        sqsub           v25\sz,  v28\sz,  v19\sz  // out9
   1106        sqadd           v19\sz,  v22\sz,  v5\sz   // out3
   1107        sqsub           v28\sz,  v22\sz,  v5\sz   // out12
   1108        sqadd           v20\sz,  v24\sz,  v4\sz   // out4
   1109        sqsub           v27\sz,  v24\sz,  v4\sz   // out11
   1110        sqadd           v21\sz,  v26\sz,  v2\sz   // out5
   1111        sqsub           v26\sz,  v26\sz,  v2\sz   // out10
   1112        mov             v24\szb, v7\szb
   1113        mov             v22\szb, v3\szb
   1114 .endm
   1115 
   1116 function inv_dct_8h_x16_neon, export=1
        // 16-point inverse DCT over 8 lanes; data in/out in v16-v31.
   1117        movrel          x16, idct_coeffs
   1118        ld1             {v0.8h, v1.8h}, [x16]
   1119        idct_16         .8h, .16b
   1120        ret
   1121 endfunc
   1122 
   1123 function inv_dct_4h_x16_neon, export=1
        // 16-point inverse DCT over 4 lanes; data in/out in v16-v31.
   1124        movrel          x16, idct_coeffs
   1125        ld1             {v0.8h, v1.8h}, [x16]
   1126        idct_16         .4h, .8b
   1127        ret
   1128 endfunc
   1129 
   1130 .macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15, sz, szb
        // 16-point inverse ADST. Inputs in v16-v31; outputs go to the
        // registers named by \o0-\o15, so passing them reversed yields the
        // flipadst variant. Uses iadst16_coeffs for stage 1, then reloads
        // idct_coeffs into v0 for the later rotation stages. The .ifc
        // branches pick move-free register assignments for whichever output
        // order was requested. Clobbers v2-v7.
   1131        movrel          x16, iadst16_coeffs
   1132        ld1             {v0.8h, v1.8h}, [x16]
   1133        movrel          x16, idct_coeffs
   1134 
        // Stage 1: sixteen rotations producing t0-t15.
   1135        smull_smlal     v2,  v3,  v31, v16, v0.h[0], v0.h[1], \sz // -> t0
   1136        smull_smlsl     v4,  v5,  v31, v16, v0.h[1], v0.h[0], \sz // -> t1
   1137        smull_smlal     v6,  v7,  v29, v18, v0.h[2], v0.h[3], \sz // -> t2
   1138        sqrshrn_sz      v16, v2,  v3,  #12, \sz   // t0
   1139        sqrshrn_sz      v31, v4,  v5,  #12, \sz   // t1
   1140        smull_smlsl     v2,  v3,  v29, v18, v0.h[3], v0.h[2], \sz // -> t3
   1141        smull_smlal     v4,  v5,  v27, v20, v0.h[4], v0.h[5], \sz // -> t4
   1142        sqrshrn_sz      v18, v6,  v7,  #12, \sz   // t2
   1143        sqrshrn_sz      v29, v2,  v3,  #12, \sz   // t3
   1144        smull_smlsl     v6,  v7,  v27, v20, v0.h[5], v0.h[4], \sz // -> t5
   1145        smull_smlal     v2,  v3,  v25, v22, v0.h[6], v0.h[7], \sz // -> t6
   1146        sqrshrn_sz      v20, v4,  v5,  #12, \sz   // t4
   1147        sqrshrn_sz      v27, v6,  v7,  #12, \sz   // t5
   1148        smull_smlsl     v4,  v5,  v25, v22, v0.h[7], v0.h[6], \sz // -> t7
   1149        smull_smlal     v6,  v7,  v23, v24, v1.h[0], v1.h[1], \sz // -> t8
   1150        sqrshrn_sz      v22, v2,  v3,  #12, \sz   // t6
   1151        sqrshrn_sz      v25, v4,  v5,  #12, \sz   // t7
   1152        smull_smlsl     v2,  v3,  v23, v24, v1.h[1], v1.h[0], \sz // -> t9
   1153        smull_smlal     v4,  v5,  v21, v26, v1.h[2], v1.h[3], \sz // -> t10
   1154        sqrshrn_sz      v23, v6,  v7,  #12, \sz   // t8
   1155        sqrshrn_sz      v24, v2,  v3,  #12, \sz   // t9
   1156        smull_smlsl     v6,  v7,  v21, v26, v1.h[3], v1.h[2], \sz // -> t11
   1157        smull_smlal     v2,  v3,  v19, v28, v1.h[4], v1.h[5], \sz // -> t12
   1158        sqrshrn_sz      v21, v4,  v5,  #12, \sz   // t10
   1159        sqrshrn_sz      v26, v6,  v7,  #12, \sz   // t11
   1160        smull_smlsl     v4,  v5,  v19, v28, v1.h[5], v1.h[4], \sz // -> t13
   1161        smull_smlal     v6,  v7,  v17, v30, v1.h[6], v1.h[7], \sz // -> t14
   1162        sqrshrn_sz      v19, v2,  v3,  #12, \sz   // t12
   1163        sqrshrn_sz      v28, v4,  v5,  #12, \sz   // t13
   1164        smull_smlsl     v2,  v3,  v17, v30, v1.h[7], v1.h[6], \sz // -> t15
   1165        sqrshrn_sz      v17, v6,  v7,  #12, \sz   // t14
   1166        sqrshrn_sz      v30, v2,  v3,  #12, \sz   // t15
   1167 
        // Switch to the idct coefficient table for the remaining stages.
   1168        ld1             {v0.8h}, [x16]
   1169 
        // Stage 2: butterflies t0a-t15a.
   1170        sqsub           v2\sz,   v16\sz,  v23\sz // t8a
   1171        sqadd           v16\sz,  v16\sz,  v23\sz // t0a
   1172        sqsub           v3\sz,   v31\sz,  v24\sz // t9a
   1173        sqadd           v31\sz,  v31\sz,  v24\sz // t1a
   1174        sqadd           v23\sz,  v18\sz,  v21\sz // t2a
   1175        sqsub           v18\sz,  v18\sz,  v21\sz // t10a
   1176        sqadd           v24\sz,  v29\sz,  v26\sz // t3a
   1177        sqsub           v29\sz,  v29\sz,  v26\sz // t11a
   1178        sqadd           v21\sz,  v20\sz,  v19\sz // t4a
   1179        sqsub           v20\sz,  v20\sz,  v19\sz // t12a
   1180        sqadd           v26\sz,  v27\sz,  v28\sz // t5a
   1181        sqsub           v27\sz,  v27\sz,  v28\sz // t13a
   1182        sqadd           v19\sz,  v22\sz,  v17\sz // t6a
   1183        sqsub           v22\sz,  v22\sz,  v17\sz // t14a
   1184        sqadd           v28\sz,  v25\sz,  v30\sz // t7a
   1185        sqsub           v25\sz,  v25\sz,  v30\sz // t15a
   1186 
        // Stage 3: rotations on t8a-t15a.
   1187        smull_smlal     v4,  v5,  v2,  v3,  v0.h[5], v0.h[4], \sz // -> t8
   1188        smull_smlsl     v6,  v7,  v2,  v3,  v0.h[4], v0.h[5], \sz // -> t9
   1189        smull_smlal     v2,  v3,  v18, v29, v0.h[7], v0.h[6], \sz // -> t10
   1190        sqrshrn_sz      v17, v4,  v5,  #12, \sz  // t8
   1191        sqrshrn_sz      v30, v6,  v7,  #12, \sz  // t9
   1192        smull_smlsl     v4,  v5,  v18, v29, v0.h[6], v0.h[7], \sz // -> t11
   1193        smull_smlsl     v6,  v7,  v27, v20, v0.h[5], v0.h[4], \sz // -> t12
   1194        sqrshrn_sz      v18, v2,  v3,  #12, \sz  // t10
   1195        sqrshrn_sz      v29, v4,  v5,  #12, \sz  // t11
   1196        smull_smlal     v2,  v3,  v27, v20, v0.h[4], v0.h[5], \sz // -> t13
   1197        smull_smlsl     v4,  v5,  v25, v22, v0.h[7], v0.h[6], \sz // -> t14
   1198        sqrshrn_sz      v27, v6,  v7,  #12, \sz  // t12
   1199        sqrshrn_sz      v20, v2,  v3,  #12, \sz  // t13
   1200        smull_smlal     v6,  v7,  v25, v22, v0.h[6], v0.h[7], \sz // -> t15
   1201        sqrshrn_sz      v25, v4,  v5,  #12, \sz  // t14
   1202        sqrshrn_sz      v22, v6,  v7,  #12, \sz  // t15
   1203 
        // Stage 4: butterflies.
   1204        sqsub           v2\sz,   v16\sz,  v21\sz // t4
   1205        sqadd           v16\sz,  v16\sz,  v21\sz // t0
   1206        sqsub           v3\sz,   v31\sz,  v26\sz // t5
   1207        sqadd           v31\sz,  v31\sz,  v26\sz // t1
   1208        sqadd           v21\sz,  v23\sz,  v19\sz // t2
   1209        sqsub           v23\sz,  v23\sz,  v19\sz // t6
   1210        sqadd           v26\sz,  v24\sz,  v28\sz // t3
   1211        sqsub           v24\sz,  v24\sz,  v28\sz // t7
   1212        sqadd           v19\sz,  v17\sz,  v27\sz // t8a
   1213        sqsub           v17\sz,  v17\sz,  v27\sz // t12a
   1214        sqadd           v28\sz,  v30\sz,  v20\sz // t9a
   1215        sqsub           v30\sz,  v30\sz,  v20\sz // t13a
   1216        sqadd           v27\sz,  v18\sz,  v25\sz // t10a
   1217        sqsub           v18\sz,  v18\sz,  v25\sz // t14a
   1218        sqadd           v20\sz,  v29\sz,  v22\sz // t11a
   1219        sqsub           v29\sz,  v29\sz,  v22\sz // t15a
   1220 
        // Stage 5: rotations producing t4a-t7a and t12-t15.
   1221        smull_smlal     v4,  v5,  v2,  v3,  v0.h[3], v0.h[2], \sz // -> t4a
   1222        smull_smlsl     v6,  v7,  v2,  v3,  v0.h[2], v0.h[3], \sz // -> t5a
   1223        smull_smlsl     v2,  v3,  v24, v23, v0.h[3], v0.h[2], \sz // -> t6a
   1224        sqrshrn_sz      v22, v4,  v5,  #12, \sz // t4a
   1225        sqrshrn_sz      v25, v6,  v7,  #12, \sz // t5a
   1226        smull_smlal     v4,  v5,  v24, v23, v0.h[2], v0.h[3], \sz // -> t7a
   1227        smull_smlal     v6,  v7,  v17, v30, v0.h[3], v0.h[2], \sz // -> t12
   1228        sqrshrn_sz      v24, v2,  v3,  #12, \sz // t6a
   1229        sqrshrn_sz      v23, v4,  v5,  #12, \sz // t7a
   1230        smull_smlsl     v2,  v3,  v17, v30, v0.h[2], v0.h[3], \sz // -> t13
   1231        smull_smlsl     v4,  v5,  v29, v18, v0.h[3], v0.h[2], \sz // -> t14
   1232        sqrshrn_sz      v17, v6,  v7,  #12, \sz // t12
   1233        smull_smlal     v6,  v7,  v29, v18, v0.h[2], v0.h[3], \sz // -> t15
   1234        sqrshrn_sz      v29, v2,  v3,  #12, \sz // t13
   1235        sqrshrn_sz      v30, v4,  v5,  #12, \sz // t14
   1236        sqrshrn_sz      v18, v6,  v7,  #12, \sz // t15
   1237 
        // Stage 6: final butterflies; out0/out15 need a scratch register
        // unless the requested output order already matches (adst order).
   1238        sqsub           v2\sz,   v16\sz,  v21\sz // t2a
   1239 .ifc \o0, v16
   1240        sqadd           \o0\sz,  v16\sz,  v21\sz // out0
   1241        sqsub           v21\sz,  v31\sz,  v26\sz // t3a
   1242        sqadd           \o15\sz, v31\sz,  v26\sz // out15
   1243 .else
   1244        sqadd           v4\sz,   v16\sz,  v21\sz // out0
   1245        sqsub           v21\sz,  v31\sz,  v26\sz // t3a
   1246        sqadd           \o15\sz, v31\sz,  v26\sz // out15
   1247        mov             \o0\szb, v4\szb
   1248 .endif
   1249        sqneg           \o15\sz, \o15\sz         // out15
   1250 
   1251        sqsub           v3\sz,   v29\sz,  v18\sz // t15a
   1252        sqadd           \o13\sz, v29\sz,  v18\sz // out13
   1253        sqadd           \o2\sz,  v17\sz,  v30\sz // out2
   1254        sqsub           v26\sz,  v17\sz,  v30\sz // t14a
   1255        sqneg           \o13\sz, \o13\sz         // out13
   1256 
   1257        sqadd           \o1\sz,  v19\sz,  v27\sz // out1
   1258        sqsub           v27\sz,  v19\sz,  v27\sz // t10
   1259        sqadd           \o14\sz, v28\sz,  v20\sz // out14
   1260        sqsub           v20\sz,  v28\sz,  v20\sz // t11
   1261        sqneg           \o1\sz,  \o1\sz          // out1
   1262 
   1263        sqadd           \o3\sz,  v22\sz,  v24\sz // out3
   1264        sqsub           v22\sz,  v22\sz,  v24\sz // t6
   1265        sqadd           \o12\sz, v25\sz,  v23\sz // out12
   1266        sqsub           v23\sz,  v25\sz,  v23\sz // t7
   1267        sqneg           \o3\sz,  \o3\sz          // out3
   1268 
        // Stage 7: (t +/- t) * 2896/4096 rotations for the middle outputs.
   1269        smull_smlsl     v24, v25, v2,  v21, v0.h[0], v0.h[0], \sz // -> out8 (v24 or v23)
   1270        smull_smlal     v4,  v5,  v2,  v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24)
   1271        smull_smlal     v6,  v7,  v26, v3,  v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26)
   1272 
   1273        sqrshrn_sz      v24, v24, v25, #12, \sz // out8
   1274        sqrshrn_sz      v4,  v4,  v5,  #12, \sz // out7
   1275        sqrshrn_sz      v5,  v6,  v7,  #12, \sz // out5
   1276        smull_smlsl     v6,  v7,  v26, v3,  v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21)
   1277        smull_smlal     v2,  v3,  v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27)
   1278        sqrshrn_sz      v26, v6,  v7,  #12, \sz // out10
   1279 
   1280        smull_smlsl     v6,  v7,  v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20)
   1281        smull_smlal     v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25)
   1282        smull_smlsl     v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22)
   1283 
   1284        sqrshrn_sz      \o4, v2,  v3,  #12, \sz // out4
   1285        sqrshrn_sz      v6,  v6,  v7,  #12, \sz // out11
   1286        sqrshrn_sz      v7,  v21, v25, #12, \sz // out9
   1287        sqrshrn_sz      \o6, v22, v23, #12, \sz // out6
   1288 
        // In adst order, out8/out10 still sit in scratch registers; move
        // them into place (in flipadst order they already landed correctly).
   1289 .ifc \o8, v23
   1290        mov             \o8\szb,  v24\szb
   1291        mov             \o10\szb, v26\szb
   1292 .endif
   1293 
   1294        sqneg           \o7\sz,  v4\sz // out7
   1295        sqneg           \o5\sz,  v5\sz // out5
   1296        sqneg           \o11\sz, v6\sz // out11
   1297        sqneg           \o9\sz,  v7\sz // out9
   1298 .endm
   1299 
   1300 function inv_adst_8h_x16_neon, export=1
        // 16-point inverse ADST over 8 lanes; data in/out in v16-v31.
   1301        iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b
   1302        ret
   1303 endfunc
   1304 
   1305 function inv_flipadst_8h_x16_neon, export=1
        // 16-point inverse flip-ADST over 8 lanes (reversed output order).
   1306        iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b
   1307        ret
   1308 endfunc
   1309 
   1310 function inv_adst_4h_x16_neon, export=1
        // 16-point inverse ADST over 4 lanes; data in/out in v16-v31.
   1311        iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b
   1312        ret
   1313 endfunc
   1314 
// 16-point inverse flipped ADST over 4 lanes: register list reversed,
// otherwise identical to inv_adst_4h_x16_neon.
function inv_flipadst_4h_x16_neon, export=1
        iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b
        ret
endfunc
   1319 
// 16-point identity "transform": scale each coefficient by 2*5793/4096
// (~2*sqrt(2)), computed as 2*x plus the fractional remainder, with
// saturating arithmetic. Operates in place on v16-v31 (.8h).
function inv_identity_8h_x16_neon, export=1
        // sqrdmulh computes (a*b*2 + 0x8000) >> 16, so this constant yields
        // x*(5793-4096)/2048 = the fraction x*2*(5793/4096 - 1).
        mov             w16, #2*(5793-4096)*8
        dup             v0.4h,   w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        sqrdmulh        v2.8h,      v\i\().8h,  v0.h[0]   // fractional part
        sqadd           v\i\().8h,  v\i\().8h,  v\i\().8h // 2*x, saturating
        sqadd           v\i\().8h,  v\i\().8h,  v2.8h     // 2*x + frac = x*2*5793/4096
.endr
        ret
endfunc
   1330 
// 4-lane (.4h) variant of inv_identity_8h_x16_neon: in-place scale of
// v16-v31 by 2*5793/4096 with saturating doubling plus fraction.
function inv_identity_4h_x16_neon, export=1
        mov             w16, #2*(5793-4096)*8
        dup             v0.4h,   w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        sqrdmulh        v2.4h,      v\i\().4h,  v0.h[0]   // fractional part
        sqadd           v\i\().4h,  v\i\().4h,  v\i\().4h // 2*x, saturating
        sqadd           v\i\().4h,  v\i\().4h,  v2.4h     // = x*2*5793/4096
.endr
        ret
endfunc
   1341 
// Identity-16 multiply on v16-v31 (.8h) with a combined >>2 downshift:
// result = (x*2*5793/4096) >> 2, folded into one rounding-halving add.
// \c is the sqrdmulh multiplier (2*(5793-4096)*8 in v0.h[0] at call sites).
.macro identity_8x16_shift2 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        sqrdmulh        v2.8h,   \i,      \c     // fractional part
        sshr            v2.8h,   v2.8h,   #1     // halve it (truncating)
        srhadd          \i,      \i,      v2.8h  // (x + frac/2 + 1) >> 1
.endr
.endm
   1349 
// Identity-16 multiply on v16-v31 (.8h) with a combined >>1 downshift:
// result = x + round(frac/2), i.e. (x*2*5793/4096) >> 1, saturating.
.macro identity_8x16_shift1 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        sqrdmulh        v2.8h,   \i,      \c     // fractional part
        srshr           v2.8h,   v2.8h,   #1     // rounding halve
        sqadd           \i,      \i,      v2.8h  // x + frac/2
.endr
.endm
   1357 
// Same computation as identity_8x16_shift1, but only over the eight
// registers v16-v23 (used when just half the vectors hold coefficients).
.macro identity_8x8_shift1 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        sqrdmulh        v2.8h,   \i,      \c     // fractional part
        srshr           v2.8h,   v2.8h,   #1     // rounding halve
        sqadd           \i,      \i,      v2.8h  // x + frac/2
.endr
.endm
   1365 
// Identity multiply on v16-v23 with no downshift:
// result = 2*x + sqrdmulh(x, \c), all saturating.
.macro identity_8x8 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        sqrdmulh        v2.8h,   \i,      \c     // fractional part
        sqadd           \i,      \i,      \i     // 2*x, saturating
        sqadd           \i,      \i,      v2.8h  // 2*x + frac
.endr
.endm
   1373 
// Define a horizontal 16x8 first-pass helper: loads a 16x8 slab of
// coefficients from x7 (row stride x8, zeroing the source as it reads),
// optionally pre-scales by 2896/4096 (~1/sqrt(2)), applies either an
// inline identity transform or the 1-D transform function in x4, applies
// the downshift, transposes, and stores the result contiguously at x6.
//   scale:    pre-scale input (used for rectangular transforms)
//   identity: use the inline identity-16 path instead of blr x4
//   shift:    post-transform rounding downshift
.macro def_horz_16 scale=0, identity=0, shift=2, suffix
function inv_txfm_horz\suffix\()_16x8_neon
        AARCH64_VALID_CALL_TARGET
        mov             x14, x30                // save return address
        movi            v7.8h,  #0              // zero vector for clearing coeffs
.if \identity
        mov             w16, #2*(5793-4096)*8   // identity-16 multiplier
        dup             v0.4h,   w16
.elseif \scale
        mov             w16, #2896*8            // 2896/4096 ~= 1/sqrt(2)
        dup             v0.4h,   w16
.endif
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        ld1             {\i}, [x7]
        st1             {v7.8h}, [x7], x8       // zero the coeffs as we go
.endr
.if \scale
        scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
.if \identity
        identity_8x16_shift2 v0.h[0]
        b               L(horz_16x8_epilog)     // share epilog of the shift=2 variant
.else
        blr             x4                      // the actual 1-D transform
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        srshr           \i,  \i,  #\shift
.endr
.if \shift == 1
        b               L(horz_16x8_epilog)     // label lives in the shift=2 variant
.else
// Shared epilog: the label is only emitted by the shift=2 instantiation
// and branched to from the shift=1 and identity instantiations above.
L(horz_16x8_epilog):
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
        transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v4, v5

// Store rows interleaved (even/odd half pairs) contiguously at x6.
.irp i, v16.8h, v24.8h, v17.8h, v25.8h, v18.8h, v26.8h, v19.8h, v27.8h, v20.8h, v28.8h, v21.8h, v29.8h, v22.8h, v30.8h, v23.8h, v31.8h
        st1             {\i}, [x6], #16
.endr

        ret             x14
.endif
.endif
endfunc
.endm
   1418 
// Instantiate the three horizontal helpers; the plain shift=2 variant
// must exist since it defines the shared L(horz_16x8_epilog) label.
def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
def_horz_16 scale=0, identity=1, shift=0, suffix=_identity
def_horz_16 scale=0, identity=0, shift=2
   1422 
// Vertical second pass for an 8x16 column slab: loads 16 rows of 8
// coefficients from x7 (stride x8), runs the 1-D vertical transform in
// x5, then adds the result into the destination via load_add_store_8x16.
function inv_txfm_add_vert_8x16_neon
        mov             x14, x30                // save return address
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        blr             x5                      // vertical transform
        load_add_store_8x16 x6, x7
        ret             x14
endfunc
   1432 
// Full 16x16 inverse transform + add. Two horizontal 16x8 passes (via
// the function pointer in x9) write into a 512-byte (16*16*2) stack
// buffer; if the eob (w3) is below the threshold in w13, the second
// horizontal pass is skipped and that half of the buffer is zero-filled.
// Then two vertical 8x16 passes (x5) add the result into dst at x0.
function inv_txfm_add_16x16_neon
        mov             x15, x30                // save return address
        sub             sp,  sp,  #512          // 16x16 int16 scratch buffer
        mov             x8,  #16*2              // row stride in bytes
.irp i, 0, 8
        add             x6,  sp,  #(\i*16*2)    // output half in scratch
.if \i == 8
        cmp             w3,  w13                // eob below threshold?
        b.lt            1f                      // -> skip 2nd pass, zero-fill
.endif
        add             x7,  x2,  #(\i*2)       // input column offset
        blr             x9                      // horizontal 16x8 pass
.endr
        b               2f
1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
2:
.irp i, 0, 8
        add             x6,  x0,  #(\i)         // dst column offset
        add             x7,  sp,  #(\i*2)       // scratch column offset
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp,  sp,  #512
        ret             x15
endfunc
   1465 
// Define an exported 16x16 entry point for one txfm1 (horizontal) /
// txfm2 (vertical) combination. dct_dct gets the dc-only fast path;
// identity first passes use the dedicated horizontal helper (no x4).
// \eob_half is the eob threshold below which the second horizontal
// pass is skipped (goes into x13, compared in inv_txfm_add_16x16_neon).
.macro def_fn_16x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         16,  16,  2
.endif
.ifc \txfm1, identity
        adr             x9,  inv_txfm_horz_identity_16x8_neon
.else
        adr             x9,  inv_txfm_horz_16x8_neon
        adr             x4,  inv_\txfm1\()_8h_x16_neon
.endif
        adr             x5,  inv_\txfm2\()_8h_x16_neon
        mov             x13, #\eob_half
        b               inv_txfm_add_16x16_neon
endfunc
.endm
   1482 
// All supported 16x16 transform combinations and their eob thresholds.
def_fn_16x16 dct, dct, 36
def_fn_16x16 identity, identity, 36
def_fn_16x16 dct, adst, 36
def_fn_16x16 dct, flipadst, 36
def_fn_16x16 dct, identity, 8
def_fn_16x16 adst, dct, 36
def_fn_16x16 adst, adst, 36
def_fn_16x16 adst, flipadst, 36
def_fn_16x16 flipadst, dct, 36
def_fn_16x16 flipadst, adst, 36
def_fn_16x16 flipadst, flipadst, 36
def_fn_16x16 identity, dct, 8
   1495 
// Define the shared 16x4 and 4x16 add helpers. Instantiated twice:
// with \variant = identity_ (inline identity first pass, no x4 call)
// and with an empty \variant (generic path, via blr x4). The generic
// instantiation also defines the shared epilog labels that the
// identity_ instantiation branches to.
// In: x2 = coeffs (zeroed as read), x0 = dst, x4 = horizontal transform,
//     x5 = vertical transform, w3 = eob, w13 = eob threshold (4x16 only).
.macro def_fn_416_base variant
function inv_txfm_\variant\()add_16x4_neon
        mov             x15, x30                // save return address
        movi            v4.8h,  #0              // zero vector for clearing coeffs

.ifc \variant, identity_
// Identity path: load the 16x4 coeffs as pairs of 4h halves packed into
// the d[0]/d[1] halves of v16-v23, then apply the identity multiply.
.irp i, v16.4h, v17.4h, v18.4h, v19.4h
        ld1             {\i},    [x2]
        st1             {v4.4h}, [x2], #8
.endr
.irp i, v16.d, v17.d, v18.d, v19.d
        ld1             {\i}[1], [x2]
        st1             {v4.4h}, [x2], #8
.endr
        mov             w16, #2*(5793-4096)*8   // identity-16 multiplier
        dup             v0.4h,   w16
.irp i, v20.4h, v21.4h, v22.4h, v23.4h
        ld1             {\i},    [x2]
        st1             {v4.4h}, [x2], #8
.endr
.irp i, v20.d, v21.d, v22.d, v23.d
        ld1             {\i}[1], [x2]
        st1             {v4.4h}, [x2], #8
.endr

        identity_8x16_shift1 v0.h[0]

        b               L(itx_16x4_epilog)      // epilog defined by generic variant
.else
// Generic path: load all 16 columns as 4h vectors, run the horizontal
// transform, then repack pairs of 4h results into 8h vectors and round.
.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
        ld1             {\i},    [x2]
        st1             {v4.4h}, [x2], #8
.endr

        blr             x4                      // horizontal transform

        ins             v16.d[1], v20.d[0]
        ins             v17.d[1], v21.d[0]
        ins             v18.d[1], v22.d[0]
        ins             v19.d[1], v23.d[0]
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
        srshr           \i,  \i,  #1
.endr

        ins             v24.d[1], v28.d[0]
        ins             v25.d[1], v29.d[0]
        ins             v26.d[1], v30.d[0]
        ins             v27.d[1], v31.d[0]
        srshr           v20.8h,  v24.8h,  #1
        srshr           v21.8h,  v25.8h,  #1
        srshr           v22.8h,  v26.8h,  #1
        srshr           v23.8h,  v27.8h,  #1

// Shared epilog: transpose, run the vertical transform twice (left and
// right 8-column halves) and add into dst.
L(itx_16x4_epilog):
        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
        blr             x5                      // vertical transform, left half
        mov             x6,  x0
        load_add_store_8x4 x6, x7

        transpose_4x8h_mov v20, v21, v22, v23, v2,  v3,  v4,  v5, v16, v17, v18, v19
        blr             x5                      // vertical transform, right half
        add             x6,  x0,  #8
        load_add_store_8x4 x6, x7

        ret             x15
.endif
endfunc

function inv_txfm_\variant\()add_4x16_neon
        mov             x15, x30                // save return address
        movi            v2.8h,   #0             // zero vector for clearing coeffs

        mov             x11, #32                // coeff row stride in bytes
        cmp             w3,  w13                // eob below threshold?
        b.lt            1f                      // -> lower half is all zero

// Process the lower 8 input rows (second half of the coeffs at x2+16).
        add             x6,  x2,  #16
.ifc \variant, identity_
.irp i, v24.8h, v25.8h, v26.8h, v27.8h
        ld1             {\i},    [x6]
        st1             {v2.8h}, [x6], x11
.endr
        mov             w16, #(5793-4096)*8     // identity-4 multiplier
        dup             v0.4h,   w16
        identity_8x4_shift1 v24, v25, v26, v27, v0.h[0]
.else
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
        ld1             {\i},    [x6]
        st1             {v2.8h}, [x6], x11
.endr
        blr             x4                      // horizontal transform
        srshr           v24.8h,  v16.8h,  #1
        srshr           v25.8h,  v17.8h,  #1
        srshr           v26.8h,  v18.8h,  #1
        srshr           v27.8h,  v19.8h,  #1
.endif
        transpose_4x8h  v24, v25, v26, v27, v4,  v5,  v6,  v7
        ins             v28.d[0], v24.d[1]
        ins             v29.d[0], v25.d[1]
        ins             v30.d[0], v26.d[1]
        ins             v31.d[0], v27.d[1]

        b               2f
1:
// eob below threshold: lower half of the column vectors is zero.
.irp i, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
        movi            \i,  #0
.endr
2:
// Process the upper 8 input rows.
        movi            v2.8h,   #0
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
        ld1             {\i},    [x2]
        st1             {v2.8h}, [x2], x11
.endr
.ifc \variant, identity_
        mov             w16, #(5793-4096)*8     // identity-4 multiplier
        dup             v0.4h,   w16
        identity_8x4_shift1 v16, v17, v18, v19, v0.h[0]

        b               L(itx_4x16_epilog)      // epilog defined by generic variant
.else
        blr             x4                      // horizontal transform
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
        srshr           \i,  \i,  #1
.endr
// Shared epilog: transpose, split halves into v20-v23, run the vertical
// transform over all 16 rows and add into dst.
L(itx_4x16_epilog):
        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
        ins             v20.d[0], v16.d[1]
        ins             v21.d[0], v17.d[1]
        ins             v22.d[0], v18.d[1]
        ins             v23.d[0], v19.d[1]

        blr             x5                      // vertical transform

        load_add_store_4x16 x0, x6

        ret             x15
.endif
endfunc
.endm
   1635 
// Instantiate both variants; the empty-variant one defines the shared
// L(itx_16x4_epilog)/L(itx_4x16_epilog) labels.
def_fn_416_base identity_
def_fn_416_base
   1638 
// Define an exported 4x16 / 16x4 entry point: picks the 4h/8h transform
// helper widths according to orientation, sets the eob threshold (w13,
// only meaningful for w==4), and tail-calls the shared add helper.
// identity first passes dispatch to the identity_ helper instead of
// loading a transform pointer into x4.
.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         \w,  \h,  1
.endif
.if \w == 4
.ifnc \txfm1, identity
        adr             x4,  inv_\txfm1\()_8h_x\w\()_neon
.endif
        adr             x5,  inv_\txfm2\()_4h_x\h\()_neon
        mov             w13, #\eob_half
.else
.ifnc \txfm1, identity
        adr             x4,  inv_\txfm1\()_4h_x\w\()_neon
.endif
        adr             x5,  inv_\txfm2\()_8h_x\h\()_neon
.endif
.ifc \txfm1, identity
        b               inv_txfm_identity_add_\w\()x\h\()_neon
.else
        b               inv_txfm_add_\w\()x\h\()_neon
.endif
endfunc
.endm
   1663 
// Emit all supported transform combinations for one 4x16/16x4 shape.
.macro def_fns_416 w, h
def_fn_416 \w, \h, dct, dct, 29
def_fn_416 \w, \h, identity, identity, 29
def_fn_416 \w, \h, dct, adst, 29
def_fn_416 \w, \h, dct, flipadst, 29
def_fn_416 \w, \h, dct, identity, 8
def_fn_416 \w, \h, adst, dct, 29
def_fn_416 \w, \h, adst, adst, 29
def_fn_416 \w, \h, adst, flipadst, 29
def_fn_416 \w, \h, flipadst, dct, 29
def_fn_416 \w, \h, flipadst, adst, 29
def_fn_416 \w, \h, flipadst, flipadst, 29
def_fn_416 \w, \h, identity, dct, 32
def_fn_416 \w, \h, adst, identity, 8
def_fn_416 \w, \h, flipadst, identity, 8
def_fn_416 \w, \h, identity, adst, 32
def_fn_416 \w, \h, identity, flipadst, 32
.endm
   1682 
// Both rectangular 4:16 orientations.
def_fns_416 4, 16
def_fns_416 16, 4
   1685 
   1686 
// Define the shared 16x8 and 8x16 add helpers. As with def_fn_416_base,
// instantiated twice (identity_ and generic); the generic instantiation
// defines the shared L(itx_16x8_epilog)/L(itx_8x16_epilog) labels.
// Rectangular 8:16 shapes pre-scale the input by 2896/4096 (~1/sqrt(2)).
// In: x2 = coeffs (zeroed as read), x0 = dst, x4 = horizontal transform,
//     x5 = vertical transform, w3 = eob, w13 = eob threshold (8x16 only).
.macro def_fn_816_base variant
function inv_txfm_\variant\()add_16x8_neon
        mov             x15, x30                // save return address
        movi            v4.8h,  #0              // zero vector for clearing coeffs
        mov             w16, #2896*8            // 2896/4096 ~= 1/sqrt(2)
        dup             v0.4h,   w16

.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        ld1             {\i},    [x2]
        st1             {v4.8h}, [x2], #16
.endr

        scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
.ifc \variant, identity_
        mov             w16, #2*(5793-4096)*8   // identity-16 multiplier
        dup             v0.4h,   w16
        identity_8x16_shift1 v0.h[0]

        b               L(itx_16x8_epilog)      // epilog defined by generic variant
.else
        blr             x4                      // horizontal transform

.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        srshr           \i,  \i,  #1
.endr

// Shared epilog: vertical transform + add for the left, then the right,
// 8-column half.
L(itx_16x8_epilog):
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3

        blr             x5                      // vertical transform, left half

        mov             x6,  x0
        load_add_store_8x8 x6, x7

        transpose_8x8h_mov v24, v25, v26, v27, v28, v29, v30, v31, v2, v3, v16, v17, v18, v19, v20, v21, v22, v23

        blr             x5                      // vertical transform, right half

        add             x0,  x0,  #8
        load_add_store_8x8 x0, x7

        ret             x15
.endif
endfunc

function inv_txfm_\variant\()add_8x16_neon
        mov             x15, x30                // save return address
        movi            v4.8h,   #0             // zero vector for clearing coeffs
        mov             w16, #2896*8            // 2896/4096 ~= 1/sqrt(2)
        dup             v0.4h,   w16
        mov             x11, #32                // coeff row stride in bytes

        cmp             w3,  w13                // eob below threshold?
        b.lt            1f                      // -> lower half is all zero

// Process the lower 8 input rows (second half of the coeffs at x2+16).
        add             x6,  x2,  #16
.ifc \variant, identity_
.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        ld1             {\i},    [x6]
        st1             {v4.8h}, [x6], x11
.endr
        scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
        // The identity shl #1 and downshift srshr #1 cancel out
.else
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        ld1             {\i},    [x6]
        st1             {v4.8h}, [x6], x11
.endr
        scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr             x4                      // horizontal transform

        srshr           v24.8h,  v16.8h,  #1
        srshr           v25.8h,  v17.8h,  #1
        srshr           v26.8h,  v18.8h,  #1
        srshr           v27.8h,  v19.8h,  #1
        srshr           v28.8h,  v20.8h,  #1
        srshr           v29.8h,  v21.8h,  #1
        srshr           v30.8h,  v22.8h,  #1
        srshr           v31.8h,  v23.8h,  #1
.endif
        transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        b               2f

1:
// eob below threshold: lower 8 rows are zero.
.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        movi            \i,  #0
.endr

2:
// Process the upper 8 input rows.
        movi            v4.8h,   #0
        mov             w16, #2896*8            // reload scale constant
        dup             v0.4h,   w16

.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        ld1             {\i},    [x2]
        st1             {v4.8h}, [x2], x11
.endr
        scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
.ifc \variant, identity_
        // The identity shl #1 and downshift srshr #1 cancel out

        b               L(itx_8x16_epilog)      // epilog defined by generic variant
.else
        blr             x4                      // horizontal transform

.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        srshr           \i,  \i,  #1
.endr

// Shared epilog: transpose and run the vertical 16-point transform,
// then add the result into dst.
L(itx_8x16_epilog):
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3

        blr             x5                      // vertical transform

        load_add_store_8x16 x0, x6

        ret             x15
.endif
endfunc
.endm
   1809 
// Instantiate both variants; the empty-variant one defines the shared
// L(itx_16x8_epilog)/L(itx_8x16_epilog) labels.
def_fn_816_base identity_
def_fn_816_base
   1812 
// Define an exported 8x16 / 16x8 entry point: loads the 1-D transform
// pointers (x4 horizontal unless identity, x5 vertical), sets the eob
// threshold for the 8-wide orientation, and tail-calls the add helper.
.macro def_fn_816 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         \w,  \h,  1
.endif
.ifnc \txfm1, identity
        adr             x4,  inv_\txfm1\()_8h_x\w\()_neon
.endif
        adr             x5,  inv_\txfm2\()_8h_x\h\()_neon
.if \w == 8
        mov             x13, #\eob_half
.endif
.ifc \txfm1, identity
        b               inv_txfm_identity_add_\w\()x\h\()_neon
.else
        b               inv_txfm_add_\w\()x\h\()_neon
.endif
endfunc
.endm
   1832 
// Emit all supported transform combinations for one 8x16/16x8 shape.
.macro def_fns_816 w, h
def_fn_816 \w, \h, dct, dct, 43
def_fn_816 \w, \h, identity, identity, 43
def_fn_816 \w, \h, dct, adst, 43
def_fn_816 \w, \h, dct, flipadst, 43
def_fn_816 \w, \h, dct, identity, 8
def_fn_816 \w, \h, adst, dct, 43
def_fn_816 \w, \h, adst, adst, 43
def_fn_816 \w, \h, adst, flipadst, 43
def_fn_816 \w, \h, flipadst, dct, 43
def_fn_816 \w, \h, flipadst, adst, 43
def_fn_816 \w, \h, flipadst, flipadst, 43
def_fn_816 \w, \h, identity, dct, 64
def_fn_816 \w, \h, adst, identity, 8
def_fn_816 \w, \h, flipadst, identity, 8
def_fn_816 \w, \h, identity, adst, 64
def_fn_816 \w, \h, identity, flipadst, 64
.endm
   1851 
// Both rectangular 8:16 orientations.
def_fns_816 8, 16
def_fns_816 16, 8
   1854 
// Odd half of the 32-point inverse DCT: transforms the 16 odd-indexed
// input coefficients (in v16-v31, .8h) into outputs out16-out31
// (returned in v16-v31). Coefficients are loaded from idct_coeffs + 32,
// then the first 16 bytes of the table are reloaded for the later
// butterfly stages. smull_smlal/smull_smlsl accumulate in 32 bits;
// sqrshrn_sz narrows back with a rounding >>12. Where a result needs
// negating, the 32-bit accumulators are negated before the narrow
// (the "neg v*.4s" pairs) instead of negating afterwards.
function inv_dct32_odd_8h_x16_neon, export=1
        movrel          x16, idct_coeffs, 2*16
        ld1             {v0.8h, v1.8h}, [x16]   // coeffs for the first stage
        sub             x16, x16, #2*16         // rewind for the reload below

        smull_smlsl     v2,  v3,  v16, v31, v0.h[0], v0.h[1], .8h // -> t16a
        smull_smlal     v4,  v5,  v16, v31, v0.h[1], v0.h[0], .8h // -> t31a
        smull_smlsl     v6,  v7,  v24, v23, v0.h[2], v0.h[3], .8h // -> t17a
        sqrshrn_sz      v16, v2,  v3,  #12, .8h                   // t16a
        sqrshrn_sz      v31, v4,  v5,  #12, .8h                   // t31a
        smull_smlal     v2,  v3,  v24, v23, v0.h[3], v0.h[2], .8h // -> t30a
        smull_smlsl     v4,  v5,  v20, v27, v0.h[4], v0.h[5], .8h // -> t18a
        sqrshrn_sz      v24, v6,  v7,  #12, .8h                   // t17a
        sqrshrn_sz      v23, v2,  v3,  #12, .8h                   // t30a
        smull_smlal     v6,  v7,  v20, v27, v0.h[5], v0.h[4], .8h // -> t29a
        smull_smlsl     v2,  v3,  v28, v19, v0.h[6], v0.h[7], .8h // -> t19a
        sqrshrn_sz      v20, v4,  v5,  #12, .8h                   // t18a
        sqrshrn_sz      v27, v6,  v7,  #12, .8h                   // t29a
        smull_smlal     v4,  v5,  v28, v19, v0.h[7], v0.h[6], .8h // -> t28a
        smull_smlsl     v6,  v7,  v18, v29, v1.h[0], v1.h[1], .8h // -> t20a
        sqrshrn_sz      v28, v2,  v3,  #12, .8h                   // t19a
        sqrshrn_sz      v19, v4,  v5,  #12, .8h                   // t28a
        smull_smlal     v2,  v3,  v18, v29, v1.h[1], v1.h[0], .8h // -> t27a
        smull_smlsl     v4,  v5,  v26, v21, v1.h[2], v1.h[3], .8h // -> t21a
        sqrshrn_sz      v18, v6,  v7,  #12, .8h                   // t20a
        sqrshrn_sz      v29, v2,  v3,  #12, .8h                   // t27a
        smull_smlal     v6,  v7,  v26, v21, v1.h[3], v1.h[2], .8h // -> t26a
        smull_smlsl     v2,  v3,  v22, v25, v1.h[4], v1.h[5], .8h // -> t22a
        sqrshrn_sz      v26, v4,  v5,  #12, .8h                   // t21a
        sqrshrn_sz      v21, v6,  v7,  #12, .8h                   // t26a
        smull_smlal     v4,  v5,  v22, v25, v1.h[5], v1.h[4], .8h // -> t25a
        smull_smlsl     v6,  v7,  v30, v17, v1.h[6], v1.h[7], .8h // -> t23a
        sqrshrn_sz      v22, v2,  v3,  #12, .8h                   // t22a
        sqrshrn_sz      v25, v4,  v5,  #12, .8h                   // t25a
        smull_smlal     v2,  v3,  v30, v17, v1.h[7], v1.h[6], .8h // -> t24a
        sqrshrn_sz      v30, v6,  v7,  #12, .8h                   // t23a
        sqrshrn_sz      v17, v2,  v3,  #12, .8h                   // t24a

        ld1             {v0.8h}, [x16]          // reload first 8 idct_coeffs

        sqsub           v2.8h,   v16.8h,  v24.8h // t17
        sqadd           v16.8h,  v16.8h,  v24.8h // t16
        sqsub           v3.8h,   v31.8h,  v23.8h // t30
        sqadd           v31.8h,  v31.8h,  v23.8h // t31
        sqsub           v24.8h,  v28.8h,  v20.8h // t18
        sqadd           v28.8h,  v28.8h,  v20.8h // t19
        sqadd           v23.8h,  v18.8h,  v26.8h // t20
        sqsub           v18.8h,  v18.8h,  v26.8h // t21
        sqsub           v20.8h,  v30.8h,  v22.8h // t22
        sqadd           v30.8h,  v30.8h,  v22.8h // t23
        sqadd           v26.8h,  v17.8h,  v25.8h // t24
        sqsub           v17.8h,  v17.8h,  v25.8h // t25
        sqsub           v22.8h,  v29.8h,  v21.8h // t26
        sqadd           v29.8h,  v29.8h,  v21.8h // t27
        sqadd           v25.8h,  v19.8h,  v27.8h // t28
        sqsub           v19.8h,  v19.8h,  v27.8h // t29

        smull_smlsl     v4,  v5,  v3,  v2,  v0.h[4], v0.h[5], .8h // -> t17a
        smull_smlal     v6,  v7,  v3,  v2,  v0.h[5], v0.h[4], .8h // -> t30a
        smull_smlal     v2,  v3,  v19, v24, v0.h[5], v0.h[4], .8h // -> t18a
        sqrshrn_sz      v21, v4,  v5,  #12, .8h                   // t17a
        sqrshrn_sz      v27, v6,  v7,  #12, .8h                   // t30a
        neg             v2.4s,   v2.4s                            // -> t18a
        neg             v3.4s,   v3.4s                            // -> t18a
        smull_smlsl     v4,  v5,  v19, v24, v0.h[4], v0.h[5], .8h // -> t29a
        smull_smlsl     v6,  v7,  v22, v18, v0.h[6], v0.h[7], .8h // -> t21a
        sqrshrn_sz      v19, v2,  v3,  #12, .8h                   // t18a
        sqrshrn_sz      v24, v4,  v5,  #12, .8h                   // t29a
        smull_smlal     v2,  v3,  v22, v18, v0.h[7], v0.h[6], .8h // -> t26a
        smull_smlal     v4,  v5,  v17, v20, v0.h[7], v0.h[6], .8h // -> t22a
        sqrshrn_sz      v22, v6,  v7,  #12, .8h                   // t21a
        sqrshrn_sz      v18, v2,  v3,  #12, .8h                   // t26a
        neg             v4.4s,   v4.4s                            // -> t22a
        neg             v5.4s,   v5.4s                            // -> t22a
        smull_smlsl     v6,  v7,  v17, v20, v0.h[6], v0.h[7], .8h // -> t25a
        sqrshrn_sz      v17, v4,  v5,  #12, .8h                   // t22a
        sqrshrn_sz      v20, v6,  v7,  #12, .8h                   // t25a

        sqsub           v2.8h,   v27.8h,  v24.8h // t29
        sqadd           v27.8h,  v27.8h,  v24.8h // t30
        sqsub           v3.8h,   v21.8h,  v19.8h // t18
        sqadd           v21.8h,  v21.8h,  v19.8h // t17
        sqsub           v24.8h,  v16.8h,  v28.8h // t19a
        sqadd           v16.8h,  v16.8h,  v28.8h // t16a
        sqsub           v19.8h,  v30.8h,  v23.8h // t20a
        sqadd           v30.8h,  v30.8h,  v23.8h // t23a
        sqsub           v28.8h,  v17.8h,  v22.8h // t21
        sqadd           v17.8h,  v17.8h,  v22.8h // t22
        sqadd           v23.8h,  v26.8h,  v29.8h // t24a
        sqsub           v26.8h,  v26.8h,  v29.8h // t27a
        sqadd           v22.8h,  v20.8h,  v18.8h // t25
        sqsub           v20.8h,  v20.8h,  v18.8h // t26
        sqsub           v29.8h,  v31.8h,  v25.8h // t28a
        sqadd           v31.8h,  v31.8h,  v25.8h // t31a

        smull_smlsl     v4,  v5,  v2,  v3,  v0.h[2], v0.h[3], .8h // -> t18a
        smull_smlal     v6,  v7,  v2,  v3,  v0.h[3], v0.h[2], .8h // -> t29a
        smull_smlsl     v2,  v3,  v29, v24, v0.h[2], v0.h[3], .8h // -> t19
        sqrshrn_sz      v18, v4,  v5,  #12, .8h                   // t18a
        sqrshrn_sz      v25, v6,  v7,  #12, .8h                   // t29a
        smull_smlal     v4,  v5,  v29, v24, v0.h[3], v0.h[2], .8h // -> t28
        smull_smlal     v6,  v7,  v26, v19, v0.h[3], v0.h[2], .8h // -> t20
        sqrshrn_sz      v29, v2,  v3,  #12, .8h                   // t19
        sqrshrn_sz      v24, v4,  v5,  #12, .8h                   // t28
        neg             v6.4s,   v6.4s                            // -> t20
        neg             v7.4s,   v7.4s                            // -> t20
        smull_smlsl     v2,  v3,  v26, v19, v0.h[2], v0.h[3], .8h // -> t27
        smull_smlal     v4,  v5,  v20, v28, v0.h[3], v0.h[2], .8h // -> t21a
        sqrshrn_sz      v26, v6,  v7,  #12, .8h                   // t20
        sqrshrn_sz      v19, v2,  v3,  #12, .8h                   // t27
        neg             v4.4s,   v4.4s                            // -> t21a
        neg             v5.4s,   v5.4s                            // -> t21a
        smull_smlsl     v6,  v7,  v20, v28, v0.h[2], v0.h[3], .8h // -> t26a
        sqrshrn_sz      v20, v4,  v5,  #12, .8h                   // t21a
        sqrshrn_sz      v28, v6,  v7,  #12, .8h                   // t26a

        sqsub           v2.8h,   v16.8h,  v30.8h // t23
        sqadd           v16.8h,  v16.8h,  v30.8h // t16 = out16
        sqsub           v3.8h,   v31.8h,  v23.8h // t24
        sqadd           v31.8h,  v31.8h,  v23.8h // t31 = out31
        sqsub           v23.8h,  v21.8h,  v17.8h // t22a
        sqadd           v17.8h,  v21.8h,  v17.8h // t17a = out17
        sqadd           v30.8h,  v27.8h,  v22.8h // t30a = out30
        sqsub           v21.8h,  v27.8h,  v22.8h // t25a
        sqsub           v27.8h,  v18.8h,  v20.8h // t21
        sqadd           v18.8h,  v18.8h,  v20.8h // t18 = out18
        sqadd           v4.8h,   v29.8h,  v26.8h // t19a = out19
        sqsub           v26.8h,  v29.8h,  v26.8h // t20a
        sqadd           v29.8h,  v25.8h,  v28.8h // t29 = out29
        sqsub           v25.8h,  v25.8h,  v28.8h // t26
        sqadd           v28.8h,  v24.8h,  v19.8h // t28a = out28
        sqsub           v24.8h,  v24.8h,  v19.8h // t27a
        mov             v19.16b, v4.16b          // out19

        // Final stage: the middle terms are rotated by 1/sqrt(2)
        // (v0.h[0] = 2896) in pairs.
        smull_smlsl     v4,  v5,  v24, v26, v0.h[0], v0.h[0], .8h // -> t20
        smull_smlal     v6,  v7,  v24, v26, v0.h[0], v0.h[0], .8h // -> t27
        sqrshrn_sz      v20, v4,  v5,  #12, .8h   // t20
        sqrshrn_sz      v22, v6,  v7,  #12, .8h   // t27

        smull_smlal     v4,  v5,  v25, v27, v0.h[0], v0.h[0], .8h // -> t26a
        smull_smlsl     v6,  v7,  v25, v27, v0.h[0], v0.h[0], .8h // -> t21a
        mov             v27.16b,  v22.16b         // t27
        sqrshrn_sz      v26, v4,  v5,  #12, .8h   // t26a

        smull_smlsl     v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22
        smull_smlal     v4,  v5,  v21, v23, v0.h[0], v0.h[0], .8h // -> t25
        sqrshrn_sz      v21, v6,  v7,  #12, .8h   // t21a
        sqrshrn_sz      v22, v24, v25, #12, .8h   // t22
        sqrshrn_sz      v25, v4,  v5,  #12, .8h   // t25

        smull_smlsl     v4,  v5,  v3,  v2,  v0.h[0], v0.h[0], .8h // -> t23a
        smull_smlal     v6,  v7,  v3,  v2,  v0.h[0], v0.h[0], .8h // -> t24a
        sqrshrn_sz      v23, v4,  v5,  #12, .8h   // t23a
        sqrshrn_sz      v24, v6,  v7,  #12, .8h   // t24a

        ret
endfunc
   2012 
// Define inv_txfm_horz\suffix\()_dct_32x8_neon: a horizontal 32-point
// inverse DCT applied to 8 rows of coefficients.
//   x6:  output buffer, written as 8 rows of 32 int16 coefficients
//   x7:  input coefficient buffer; every vector loaded is cleared to zero
//   x8:  input row stride in bytes (doubled on entry)
// \scale: if set, pre-scale the inputs by 2896/4096 (~1/sqrt(2) in Q12)
// \shift: rounding right-shift applied when writing the combined output
// Clobbers v0-v7 and v16-v31; saves the return address in x14.
.macro def_horz_32 scale=0, shift=2, suffix
function inv_txfm_horz\suffix\()_dct_32x8_neon
        mov             x14, x30                // save link register across bl
        movi            v7.8h,  #0              // zero vector used to clear input
        lsl             x8,  x8,  #1            // stride between even-index rows
.if \scale
        mov             w16, #2896*8            // scale factor, premultiplied by 8
        dup             v0.4h,   w16
.endif

        // Load the 16 even-index input rows, clearing each as it is read.
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        ld1             {\i}, [x7]
        st1             {v7.8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4    // rewind the 16 rows just read
        add             x7,  x7,  x8, lsr #1    // step to the odd-index rows
.if \scale
        scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
        bl              inv_dct_8h_x16_neon     // even half: 16-point inverse DCT
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
        transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v4, v5

        // Store the 16 even-half results per row, leaving a 32-byte gap
        // per row into which the odd half is folded by store2 below.
.macro store1 r0, r1
        st1             {\r0}, [x6], #16
        st1             {\r1}, [x6], #16
        add             x6,  x6,  #32
.endm
        store1          v16.8h,  v24.8h
        store1          v17.8h,  v25.8h
        store1          v18.8h,  v26.8h
        store1          v19.8h,  v27.8h
        store1          v20.8h,  v28.8h
        store1          v21.8h,  v29.8h
        store1          v22.8h,  v30.8h
        store1          v23.8h,  v31.8h
.purgem store1
        sub             x6,  x6,  #64*8         // rewind to the start of the output

        movi            v7.8h,  #0
        // Load the 16 odd-index input rows, clearing each as it is read.
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        ld1             {\i}, [x7]
        st1             {v7.8h}, [x7], x8
.endr
.if \scale
        // This relies on the fact that the idct also leaves the right coeff in v0.h[1]
        scale_input     .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31
.endif
        bl              inv_dct32_odd_8h_x16_neon // odd half of the 32-point DCT
        transpose_8x8h  v31, v30, v29, v28, v27, v26, v25, v24, v4, v5
        transpose_8x8h  v23, v22, v21, v20, v19, v18, v17, v16, v4, v5
        // Butterfly: out[n] = (even[n] + odd[n]) >> shift for the first 16
        // columns and out[31-n] = (even[n] - odd[n]) >> shift for the last 16,
        // with rev64 + ext reversing the element order of the difference half.
.macro store2 r0, r1, shift
        ld1             {v4.8h, v5.8h}, [x6]
        sqsub           v7.8h,   v4.8h,   \r0
        sqsub           v6.8h,   v5.8h,   \r1
        sqadd           v4.8h,   v4.8h,   \r0
        sqadd           v5.8h,   v5.8h,   \r1
        rev64           v6.8h,   v6.8h
        rev64           v7.8h,   v7.8h
        srshr           v4.8h,   v4.8h,   #\shift
        srshr           v5.8h,   v5.8h,   #\shift
        srshr           v6.8h,   v6.8h,   #\shift
        srshr           v7.8h,   v7.8h,   #\shift
        ext             v6.16b,  v6.16b,  v6.16b,  #8
        st1             {v4.8h, v5.8h}, [x6], #32
        ext             v7.16b,  v7.16b,  v7.16b,  #8
        st1             {v6.8h, v7.8h}, [x6], #32
.endm

        store2          v31.8h,  v23.8h, \shift
        store2          v30.8h,  v22.8h, \shift
        store2          v29.8h,  v21.8h, \shift
        store2          v28.8h,  v20.8h, \shift
        store2          v27.8h,  v19.8h, \shift
        store2          v26.8h,  v18.8h, \shift
        store2          v25.8h,  v17.8h, \shift
        store2          v24.8h,  v16.8h, \shift
.purgem store2
        ret             x14
endfunc
.endm
   2096 
// Instantiate the plain and the pre-scaled (rectangular-transform) variants.
def_horz_32 scale=0, shift=2
def_horz_32 scale=1, shift=1, suffix=_scale
   2099 
// Vertical 32-point inverse DCT over an 8-pixel-wide column strip,
// adding the result to the destination.
//   x6: destination pixel pointer (8 bytes wide per row)
//   x1: destination stride in bytes
//   x7: coefficient column buffer (int16)
//   x8: coefficient row stride in bytes (doubled on entry)
// Clobbers v2-v7 and v16-v31; saves the return address in x14.
function inv_txfm_add_vert_dct_8x32_neon
        mov             x14, x30
        lsl             x8,  x8,  #1            // stride between even-index rows

        // Even half: load the 16 even-index rows and run the 16-point DCT.
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4

        bl              inv_dct_8h_x16_neon

        // Store the even-half results back in place, then step to the
        // odd-index rows.
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        st1             {v\i\().8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4
        add             x7,  x7,  x8, lsr #1

        // Odd half: load the 16 odd-index rows and run the odd 16-point stage.
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4
        sub             x7,  x7,  x8, lsr #1
        bl              inv_dct32_odd_8h_x16_neon

        neg             x9,  x8                 // negative stride for the mirrored half
        mov             x10, x6                 // second dst read pointer
        // Combine four rows: load even-half values from [x7], \op (sqadd or
        // sqsub) in the odd-half register, round-shift by 4, widen-add the
        // destination pixels and store back with saturating narrow.
.macro combine r0, r1, r2, r3, op, stride
        ld1             {v5.8h}, [x7],    \stride
        ld1             {v2.8b}, [x10],   x1
        ld1             {v6.8h}, [x7],    \stride
        ld1             {v3.8b}, [x10],   x1
        \op             v5.8h,   v5.8h,   \r0
        ld1             {v7.8h}, [x7],    \stride
        ld1             {v4.8b}, [x10],   x1
        srshr           v5.8h,   v5.8h,   #4
        \op             v6.8h,   v6.8h,   \r1
        uaddw           v5.8h,   v5.8h,   v2.8b
        srshr           v6.8h,   v6.8h,   #4
        \op             v7.8h,   v7.8h,   \r2
        sqxtun          v2.8b,   v5.8h
        ld1             {v5.8h}, [x7],    \stride
        uaddw           v6.8h,   v6.8h,   v3.8b
        srshr           v7.8h,   v7.8h,   #4
        \op             v5.8h,   v5.8h,   \r3
        st1             {v2.8b}, [x6],    x1
        ld1             {v2.8b}, [x10],   x1
        sqxtun          v3.8b,   v6.8h
        uaddw           v7.8h,   v7.8h,   v4.8b
        srshr           v5.8h,   v5.8h,   #4
        st1             {v3.8b}, [x6],    x1
        sqxtun          v4.8b,   v7.8h
        uaddw           v5.8h,   v5.8h,   v2.8b
        st1             {v4.8b}, [x6],    x1
        sqxtun          v2.8b,   v5.8h
        st1             {v2.8b}, [x6],    x1
.endm
        // Rows 0-15: even + odd (odd half consumed in descending order).
        combine         v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
        combine         v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
        combine         v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
        combine         v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
        sub             x7,  x7,  x8            // step back onto the last even row
        // Rows 16-31: even - odd, walking the even half backwards (x9 < 0).
        combine         v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
        combine         v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
        combine         v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
        combine         v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
.purgem combine

        ret             x14
endfunc
   2169 
// End-of-block thresholds: after each 8-line pass the eob (w3) is compared
// against the next entry to decide whether more passes carry nonzero coeffs.
const eob_32x32
        .short 36, 136, 300, 1024
endconst
   2173 
// End-of-block thresholds for the 16x32 transform passes.
const eob_16x32
        .short 36, 151, 279, 512
endconst
   2177 
// End-of-block thresholds along the short side of the 16x32/32x16
// identity transforms (only one intermediate cutoff).
const eob_16x32_shortside
        .short 36, 512
endconst
   2181 
// End-of-block thresholds for the 8x32/32x8 transform passes.
const eob_8x32
        .short 43, 107, 171, 256
endconst
   2185 
// 32x32 identity-identity inverse transform, 8bpc. The identity transform
// needs no butterflies; each 8x8 tile is just loaded, transposed, shifted
// by 2 and added to the destination. Per the exported-function convention
// (see the signature comment at the top of the file): x0 = dst,
// x1 = dst stride, x2 = coeff buffer (zeroed as consumed), w3 = eob.
// Tiles are skipped once w3 drops below the eob_32x32 thresholds.
function inv_txfm_add_identity_identity_32x32_8bpc_neon
        movi            v0.8h,  #0              // zero, for clearing the coeffs
        movrel          x13, eob_32x32          // per-row-band thresholds
        mov             x8,  #2*32              // coeff row stride in bytes
1:
        mov             w9,  #0                 // column progress within this band
        movrel          x12, eob_32x32          // per-column-band thresholds
2:
        add             w9,  w9,  #8
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().8h}, [x2]
        st1             {v0.8h}, [x2], x8       // clear the tile as it is read
.endr
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

        load_add_store_8x8 x0, x7, shiftbits=2
        ldrh            w11, [x12], #2
        sub             x0,  x0,  x1, lsl #3    // back to the top of the tile rows
        add             x0,  x0,  #8            // advance 8 pixels right
        cmp             w3,  w11                // more nonzero columns in this band?
        b.ge            2b

        ldrh            w11, [x13], #2
        cmp             w3,  w11                // more nonzero row bands?
        b.lt            9f

        sub             x0,  x0,  w9, uxtw      // rewind the columns walked
        add             x0,  x0,  x1, lsl #3    // down 8 destination rows
        msub            x2,  x8,  x9,  x2       // rewind the coeff columns walked
        add             x2,  x2,  #2*8          // advance 8 coeff rows
        b               1b
9:
        ret
endfunc
   2221 
// Apply shift instruction \op (e.g. sqshl/srshr) with immediate \shift
// to the eight registers v16-v23.
.macro shift_8_regs op, shift
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        \op             \i,  \i,  #\shift
.endr
.endm
   2227 
// Define the 16x32 / 32x16 identity-identity inverse transform functions.
// Rectangular transforms are pre-scaled by 2896/4096 (~1/sqrt(2), v1.h[0]);
// the 16-point identity multiplies by 2*(5793-4096)/4096 (v1.h[1]), with
// the "+1x" part of 2*5793/4096 folded into identity_8x8_shift1 / sqshl.
// x0 = dst, x1 = dst stride, x2 = coeff buffer (zeroed as consumed),
// w3 = eob; \wshort/\hshort pick the eob threshold tables per dimension.
.macro def_identity_1632 w, h, wshort, hshort
function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
        mov             w16, #2896*8
        mov             w17, #2*(5793-4096)*8
        dup             v1.4h,   w16
        movi            v0.8h,   #0             // zero, for clearing the coeffs
        mov             v1.h[1], w17
        movrel          x13, eob_16x32\hshort

        mov             x8,  #2*\h              // coeff row stride in bytes
1:
        mov             w9,  #0                 // column progress within this band
        movrel          x12, eob_16x32\wshort
2:
        add             w9,  w9,  #8
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        ld1             {\i}, [x2]
        st1             {v0.8h}, [x2], x8       // clear the tile as it is read
.endr
        scale_input     .8h, v1.h[0], v16, v17, v18, v19, v20, v21, v22, v23

.if \w == 16
        // 16x32
        identity_8x8_shift1 v1.h[1]
.else
        // 32x16
        shift_8_regs    sqshl, 1
        identity_8x8    v1.h[1]
.endif

        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

.if \w == 16
        load_add_store_8x8 x0, x7, shiftbits=2
.else
        load_add_store_8x8 x0, x7, shiftbits=4
.endif
        ldrh            w11, [x12], #2
        sub             x0,  x0,  x1, lsl #3    // back to the top of the tile rows
        add             x0,  x0,  #8            // advance 8 pixels right
        cmp             w3,  w11                // more nonzero columns in this band?
        b.ge            2b

        ldrh            w11, [x13], #2
        cmp             w3,  w11                // more nonzero row bands?
        b.lt            9f

        sub             x0,  x0,  w9, uxtw      // rewind the columns walked
        add             x0,  x0,  x1, lsl #3    // down 8 destination rows
        msub            x2,  x8,  x9,  x2       // rewind the coeff columns walked
        add             x2,  x2,  #2*8          // advance 8 coeff rows
        b               1b
9:
        ret
endfunc
.endm
   2284 
// The short-side threshold table goes with the 32-entry dimension.
def_identity_1632 16, 32, _shortside,
def_identity_1632 32, 16, , _shortside
   2287 
// Define the 8x32 / 32x8 identity-identity inverse transform functions.
// No coefficient scaling is needed for the 8-point identity; the 8x32
// variant only applies an extra >>1 between the two identity passes.
// x0 = dst, x1 = dst stride, x2 = coeff buffer (zeroed as consumed),
// w3 = eob.
.macro def_identity_832 w, h
function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
        movi            v0.8h,  #0              // zero, for clearing the coeffs
        movrel          x13, eob_8x32

        mov             w8,  #2*\h              // coeff row stride in bytes
1:
        ldrh            w12, [x13], #2          // next eob threshold
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        ld1             {\i}, [x2]
        st1             {v0.8h}, [x2], x8       // clear the tile as it is read
.endr

.if \w == 8
        // 8x32
        shift_8_regs    srshr, 1
.endif

        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

        cmp             w3,  w12
.if \w == 8
        load_add_store_8x8 x0, x7, shiftbits=2
.else
        load_add_store_8x8 x0, x7, shiftbits=3
.endif

        b.lt            9f                      // eob below threshold: done
.if \w == 8
        sub             x2,  x2,  x8, lsl #3    // rewind coeff rows, advance 8 cols
        add             x2,  x2,  #2*8
.else
        sub             x0,  x0,  x1, lsl #3    // rewind dst rows, advance 8 pixels
        add             x0,  x0,  #8
.endif
        b               1b

9:
        ret
endfunc
.endm
   2329 
def_identity_832 8, 32
def_identity_832 32, 8
   2332 
// 32x32 DCT-DCT inverse transform + add, 8bpc.
// x0 = dst, x1 = dst stride, x2 = coeff buffer (zeroed as consumed),
// w3 = eob. idct_dc handles (and returns from) the dc-only case.
// Strategy: horizontal 32-point passes over four 8-row bands into a
// 32*32*2 = 2048-byte stack buffer (bands beyond the eob are zero-filled
// instead of transformed), then vertical 32-point passes over four
// 8-column strips, adding into the destination.
function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1
        idct_dc         32,  32,  2

        mov             x15, x30
        sub             sp,  sp,  #2048         // 32x32 int16 intermediate buffer
        movrel          x13, eob_32x32
        ldrh            w12, [x13], #2

.irp i, 0, 8, 16, 24
        add             x6,  sp,  #(\i*32*2)    // output band in the temp buffer
.if \i > 0
        mov             w8,  #(32 - \i)         // rows left to zero-fill at 1f
        cmp             w3,  w12
        b.lt            1f                      // rest of the rows are all zero
.if \i < 24
        ldrh            w12, [x13], #2
.endif
.endif
        add             x7,  x2,  #(\i*2)
        mov             x8,  #32*2
        bl              inv_txfm_horz_dct_32x8_neon
.endr
        b               3f

1:      // Zero-fill the remaining w8 rows of the temp buffer.
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:      // Vertical pass over four 8-pixel-wide strips.
.irp i, 0, 8, 16, 24
        add             x6,  x0,  #(\i)
        add             x7,  sp,  #(\i*2)
        mov             x8,  #32*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp,  sp,  #2048
        ret             x15
endfunc
   2380 
// 16x32 DCT-DCT inverse transform + add, 8bpc (rectangular, so the
// horizontal pass uses the pre-scaled variant).
// x0 = dst, x1 = dst stride, x2 = coeff buffer (zeroed as consumed),
// w3 = eob. idct_dc handles the dc-only case.
// x4 carries the first-pass transform for inv_txfm_horz_scale_16x8_neon.
function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1
        idct_dc         16,  32,  1

        mov             x15, x30
        sub             sp,  sp,  #1024         // 16x32 int16 intermediate buffer
        movrel          x13, eob_16x32
        ldrh            w12, [x13], #2
        adr             x4,  inv_dct_8h_x16_neon // horizontal transform callback

.irp i, 0, 8, 16, 24
        add             x6,  sp,  #(\i*16*2)
        add             x7,  x2,  #(\i*2)
.if \i > 0
        mov             w8,  #(32 - \i)         // rows left to zero-fill at 1f
        cmp             w3,  w12
        b.lt            1f                      // rest of the rows are all zero
.if \i < 24
        ldrh            w12, [x13], #2
.endif
.endif
        mov             x8,  #2*32
        bl              inv_txfm_horz_scale_16x8_neon
.endr
        b               3f

1:      // Zero-fill the remaining w8 rows of the temp buffer.
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #8
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:      // Vertical pass over two 8-pixel-wide strips.
.irp i, 0, 8
        add             x6,  x0,  #(\i)
        add             x7,  sp,  #(\i*2)
        mov             x8,  #16*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp,  sp,  #1024
        ret             x15
endfunc
   2429 
// 32x16 DCT-DCT inverse transform + add, 8bpc (rectangular, so the
// horizontal pass uses the pre-scaled 32-point variant).
// x0 = dst, x1 = dst stride, x2 = coeff buffer (zeroed as consumed),
// w3 = eob. idct_dc handles the dc-only case.
// x5 carries the vertical transform for inv_txfm_add_vert_8x16_neon.
function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1
        idct_dc         32,  16,  1

        mov             x15, x30
        sub             sp,  sp,  #1024         // 32x16 int16 intermediate buffer

        adr             x5,  inv_dct_8h_x16_neon // vertical transform callback

.irp i, 0, 8
        add             x6,  sp,  #(\i*32*2)
        add             x7,  x2,  #(\i*2)
.if \i > 0
        mov             w8,  #(16 - \i)         // rows left to zero-fill at 1f
        cmp             w3,  #36                // single eob cutoff for 2 bands
        b.lt            1f
.endif
        mov             x8,  #2*16
        bl              inv_txfm_horz_scale_dct_32x8_neon
.endr
        b               3f

1:      // Zero-fill the remaining w8 rows of the temp buffer.
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:      // Vertical pass over four 8-pixel-wide strips.
        mov             x8,  #32*2
.irp i, 0, 8, 16, 24
        add             x6,  x0,  #(\i)
        add             x7,  sp,  #(\i*2)
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp,  sp,  #1024
        ret             x15
endfunc
   2474 
// 8x32 DCT-DCT inverse transform + add, 8bpc.
// x0 = dst, x1 = dst stride, x2 = coeff buffer (zeroed as consumed),
// w3 = eob. idct_dc handles the dc-only case.
// Horizontal 8-point DCT over up to four 8-row bands (rounded by >>2)
// into a 512-byte stack buffer, zero-filling skipped bands, then one
// vertical 8x32 pass into the destination.
function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1
        idct_dc         8,   32, 2

        mov             x15, x30
        sub             sp,  sp,  #512          // 8x32 int16 intermediate buffer

        movrel          x13, eob_8x32

        movi            v28.8h,  #0             // zero, for clearing the coeffs
        mov             x8,  #2*32              // coeff row stride in bytes
        mov             w9,  #32                // rows not yet written to the buffer
        mov             x6,  sp
1:
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().8h}, [x2]
        st1             {v28.8h}, [x2], x8      // clear the band as it is read
.endr
        ldrh            w12, [x13], #2          // next eob threshold
        sub             x2,  x2,  x8, lsl #3    // rewind rows, advance 8 coeff cols
        sub             w9,  w9,  #8
        add             x2,  x2,  #2*8

        bl              inv_dct_8h_x8_neon      // horizontal 8-point DCT

.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        srshr           v\i\().8h,  v\i\().8h,  #2
.endr

        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25

        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
        cmp             w3,  w12                // more nonzero bands?
        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64

        b.ge            1b
        cbz             w9,  3f                 // all 32 rows produced

        // Zero-fill the remaining w9 rows of the temp buffer.
        movi            v29.8h,  #0
        movi            v30.8h,  #0
        movi            v31.8h,  #0
2:
        subs            w9,  w9,  #8
.rept 2
        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
.endr
        b.gt            2b

3:      // Single vertical 32-point pass over the 8-pixel-wide strip.
        mov             x6,  x0
        mov             x7,  sp
        mov             x8,  #8*2
        bl              inv_txfm_add_vert_dct_8x32_neon

        add             sp,  sp,  #512
        ret             x15
endfunc
   2531 
// 32x8 DCT-DCT inverse transform + add, 8bpc.
// x0 = dst, x1 = dst stride, x2 = coeff buffer (zeroed as consumed),
// w3 = eob. idct_dc handles the dc-only case.
// One horizontal 32-point pass into a 512-byte stack buffer, then
// vertical 8-point DCTs over four 8-column strips, added to dst.
function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1
        idct_dc         32,  8,   2

        mov             x15, x30
        sub             sp,  sp,  #512          // 32x8 int16 intermediate buffer

        mov             x6,  sp
        mov             x7,  x2
        mov             x8,  #8*2
        bl              inv_txfm_horz_dct_32x8_neon

        mov             x8,  #2*32              // temp buffer row stride in bytes
        mov             w9,  #0                 // column offset within the 32
1:
        add             x6,  x0,  x9
        add             x7,  sp,  x9, lsl #1 // #(\i*2)

.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().8h}, [x7], x8
.endr
        add             w9,  w9,  #8

        bl              inv_dct_8h_x8_neon      // vertical 8-point DCT

        cmp             w9,  #32

        load_add_store_8x8 x6, x7

        b.lt            1b                      // next 8-column strip

        add             sp,  sp,  #512
        ret             x15
endfunc
   2565 
// First stage of the 64-point inverse DCT for one of four symmetric input
// groups: expands 4 pre-scaled inputs in v16-v19 into the 8 intermediate
// values t32..t63 of that group and stores them.
//   x17: pointer into idct64_coeffs, advanced by 32 bytes per call
//   x6:  output buffer, 8 vectors (128 bytes) written, pointer advanced
// Clobbers v0-v7 and v16-v31.
function inv_dct64_step1_neon
        // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
        // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
        // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
        // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a

        ld1             {v0.8h, v1.8h}, [x17], #32 // per-group cos/sin constants

        sqrdmulh        v23.8h,  v16.8h,  v0.h[1]   // t63a
        sqrdmulh        v16.8h,  v16.8h,  v0.h[0]   // t32a
        sqrdmulh        v22.8h,  v17.8h,  v0.h[2]   // t62a
        sqrdmulh        v17.8h,  v17.8h,  v0.h[3]   // t33a
        sqrdmulh        v21.8h,  v18.8h,  v0.h[5]   // t61a
        sqrdmulh        v18.8h,  v18.8h,  v0.h[4]   // t34a
        sqrdmulh        v20.8h,  v19.8h,  v0.h[6]   // t60a
        sqrdmulh        v19.8h,  v19.8h,  v0.h[7]   // t35a

        sqadd           v24.8h,  v16.8h,  v17.8h    // t32
        sqsub           v25.8h,  v16.8h,  v17.8h    // t33
        sqsub           v26.8h,  v19.8h,  v18.8h    // t34
        sqadd           v27.8h,  v19.8h,  v18.8h    // t35
        sqadd           v28.8h,  v20.8h,  v21.8h    // t60
        sqsub           v29.8h,  v20.8h,  v21.8h    // t61
        sqsub           v30.8h,  v23.8h,  v22.8h    // t62
        sqadd           v31.8h,  v23.8h,  v22.8h    // t63

        smull_smlal     v2,  v3,  v29, v26, v1.h[0], v1.h[1], .8h // -> t34a
        smull_smlsl     v4,  v5,  v29, v26, v1.h[1], v1.h[0], .8h // -> t61a
        neg             v2.4s,   v2.4s              // t34a (rotation uses -cos)
        neg             v3.4s,   v3.4s              // t34a
        smull_smlsl     v6,  v7,  v30, v25, v1.h[1], v1.h[0], .8h // -> t33a
        sqrshrn_sz      v26, v2,  v3,  #12, .8h     // t34a
        smull_smlal     v2,  v3,  v30, v25, v1.h[0], v1.h[1], .8h // -> t62a
        sqrshrn_sz      v29, v4,  v5,  #12, .8h     // t61a
        sqrshrn_sz      v25, v6,  v7,  #12, .8h     // t33a
        sqrshrn_sz      v30, v2,  v3,  #12, .8h     // t62a

        sqadd           v16.8h,  v24.8h,  v27.8h    // t32a
        sqsub           v19.8h,  v24.8h,  v27.8h    // t35a
        sqadd           v17.8h,  v25.8h,  v26.8h    // t33
        sqsub           v18.8h,  v25.8h,  v26.8h    // t34
        sqsub           v20.8h,  v31.8h,  v28.8h    // t60a
        sqadd           v23.8h,  v31.8h,  v28.8h    // t63a
        sqsub           v21.8h,  v30.8h,  v29.8h    // t61
        sqadd           v22.8h,  v30.8h,  v29.8h    // t62

        smull_smlal     v2,  v3,  v21, v18, v1.h[2], v1.h[3], .8h // -> t61a
        smull_smlsl     v4,  v5,  v21, v18, v1.h[3], v1.h[2], .8h // -> t34a
        smull_smlal     v6,  v7,  v20, v19, v1.h[2], v1.h[3], .8h // -> t60
        sqrshrn_sz      v21, v2,  v3,  #12, .8h     // t61a
        sqrshrn_sz      v18, v4,  v5,  #12, .8h     // t34a
        smull_smlsl     v2,  v3,  v20, v19, v1.h[3], v1.h[2], .8h // -> t35
        sqrshrn_sz      v20, v6,  v7,  #12, .8h     // t60
        sqrshrn_sz      v19, v2,  v3,  #12, .8h     // t35

        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64

        ret
endfunc
   2626 
// Second stage of the 64-point inverse DCT: merges the four t32..t63
// groups produced by inv_dct64_step1_neon, in place.
//   x6: forward-walking pointer into the intermediate buffer
//   x9: backward-walking pointer; the loop runs until they meet
// Uses the first 4 idct_coeffs entries (v0.4h). Clobbers v0, v2-v7,
// v16-v31.
function inv_dct64_step2_neon
        movrel          x16, idct_coeffs
        ld1             {v0.4h}, [x16]
1:
        // t32a/33/34a/35/60/61a/62/63a
        // t56a/57/58a/59/36/37a/38/39a
        // t40a/41/42a/43/52/53a/54/55a
        // t48a/49/50a/51/44/45a/46/47a
        ldr             q16, [x6, #2*8*0]  // t32a
        ldr             q17, [x9, #2*8*8]  // t39a
        ldr             q18, [x9, #2*8*0]  // t63a
        ldr             q19, [x6, #2*8*8]  // t56a
        ldr             q20, [x6, #2*8*16] // t40a
        ldr             q21, [x9, #2*8*24] // t47a
        ldr             q22, [x9, #2*8*16] // t55a
        ldr             q23, [x6, #2*8*24] // t48a

        sqadd           v24.8h,  v16.8h, v17.8h // t32
        sqsub           v25.8h,  v16.8h, v17.8h // t39
        sqadd           v26.8h,  v18.8h, v19.8h // t63
        sqsub           v27.8h,  v18.8h, v19.8h // t56
        sqsub           v28.8h,  v21.8h, v20.8h // t40
        sqadd           v29.8h,  v21.8h, v20.8h // t47
        sqadd           v30.8h,  v23.8h, v22.8h // t48
        sqsub           v31.8h,  v23.8h, v22.8h // t55

        smull_smlal     v2,  v3,  v27, v25, v0.h[3], v0.h[2], .8h // -> t56a
        smull_smlsl     v4,  v5,  v27, v25, v0.h[2], v0.h[3], .8h // -> t39a
        smull_smlal     v6,  v7,  v31, v28, v0.h[3], v0.h[2], .8h // -> t40a
        sqrshrn_sz      v25, v2,  v3,  #12, .8h     // t56a
        sqrshrn_sz      v27, v4,  v5,  #12, .8h     // t39a
        neg             v6.4s,   v6.4s              // t40a (rotation uses -cos)
        neg             v7.4s,   v7.4s              // t40a
        smull_smlsl     v2,  v3,  v31, v28, v0.h[2], v0.h[3], .8h // -> t55a
        sqrshrn_sz      v31, v6,  v7,  #12, .8h     // t40a
        sqrshrn_sz      v28, v2,  v3,  #12, .8h     // t55a

        sqadd           v16.8h,  v24.8h,  v29.8h    // t32a
        sqsub           v19.8h,  v24.8h,  v29.8h    // t47a
        sqadd           v17.8h,  v27.8h,  v31.8h    // t39
        sqsub           v18.8h,  v27.8h,  v31.8h    // t40
        sqsub           v20.8h,  v26.8h,  v30.8h    // t48a
        sqadd           v23.8h,  v26.8h,  v30.8h    // t63a
        sqsub           v21.8h,  v25.8h,  v28.8h    // t55
        sqadd           v22.8h,  v25.8h,  v28.8h    // t56

        // Final rotations by cos(pi/4) (v0.h[0] = 2896).
        smull_smlsl     v2,  v3,  v21, v18, v0.h[0], v0.h[0], .8h // -> t40a
        smull_smlal     v4,  v5,  v21, v18, v0.h[0], v0.h[0], .8h // -> t55a
        smull_smlsl     v6,  v7,  v20, v19, v0.h[0], v0.h[0], .8h // -> t47
        sqrshrn_sz      v18, v2,  v3,  #12, .8h     // t40a
        sqrshrn_sz      v21, v4,  v5,  #12, .8h     // t55a
        smull_smlal     v2,  v3,  v20, v19, v0.h[0], v0.h[0], .8h // -> t48
        sqrshrn_sz      v19, v6,  v7,  #12, .8h     // t47
        sqrshrn_sz      v20, v2,  v3,  #12, .8h     // t48

        str             q16, [x6, #2*8*0]  // t32a
        str             q17, [x9, #2*8*0]  // t39
        str             q18, [x6, #2*8*8]  // t40a
        str             q19, [x9, #2*8*8]  // t47
        str             q20, [x6, #2*8*16] // t48
        str             q21, [x9, #2*8*16] // t55a
        str             q22, [x6, #2*8*24] // t56
        str             q23, [x9, #2*8*24] // t63a

        add             x6,  x6,  #2*8     // walk the pointers towards each other
        sub             x9,  x9,  #2*8
        cmp             x6,  x9
        b.lt            1b
        ret
endfunc
   2697 
// Load 8 vectors (v16-v23) from \src with stride \strd; if \clear is set,
// also overwrite each loaded vector in memory with \zero.
.macro load8 src, strd, zero, clear
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
.if \clear
        ld1             {\i}, [\src]
        st1             {\zero}, [\src], \strd
.else
        ld1             {\i}, [\src], \strd
.endif
.endr
.endm
   2708 
// Store all 16 vectors v16-v31 contiguously at \dst, advancing \dst.
.macro store16 dst
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        st1             {\i}, [\dst], #16
.endr
.endm
   2714 
// Zero the upper eight transform registers v24-v31.
.macro clear_upper8
.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        movi            \i,  #0
.endr
.endm
   2720 
// Conditionally (at assembly time) emit a movi of \val into \reg.
.macro movi_if reg, val, cond
.if \cond
        movi            \reg, \val
.endif
.endm
   2726 
// Conditionally (at assembly time) load the immediate \val into \gpr
// and broadcast it into vector register \reg.
.macro movdup_if reg, gpr, val, cond
.if \cond
        mov             \gpr, \val
        dup             \reg, \gpr
.endif
.endm
   2733 
// Conditionally (at assembly time) emit an st1 of \regs to \dst.
.macro st1_if regs, dst, cond
.if \cond
        st1             \regs, \dst
.endif
.endm
   2739 
// Conditionally (at assembly time) emit a str of \reg to \dst.
.macro str_if reg, dst, cond
.if \cond
        str             \reg, \dst
.endif
.endm
   2745 
// Conditionally (at assembly time) emit a str of \reg to \dst, \dstoff.
.macro stroff_if reg, dst, dstoff, cond
.if \cond
        str             \reg, \dst, \dstoff
.endif
.endm
   2751 
// Conditionally (at assembly time) scale registers \r0-\r7 by constant \c.
.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
.if \cond
        scale_input     .8h, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endif
.endm
   2757 
   2758 .macro def_dct64_func suffix, clear=0, scale=0
   2759 function inv_txfm_dct\suffix\()_8h_x64_neon, export=1
   2760        mov             x14, x30
   2761        mov             x6,  sp
   2762        lsl             x8,  x8,  #2
   2763 
   2764        movdup_if       v0.4h, w16, #2896*8, \scale
   2765        movi_if         v7.8h,  #0, \clear
   2766        load8           x7,  x8,  v7.8h, \clear
   2767        clear_upper8
   2768        sub             x7,  x7,  x8, lsl #3
   2769        add             x7,  x7,  x8, lsr #1
   2770        scale_if        \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
   2771 
   2772        bl              inv_dct_8h_x16_neon
   2773 
   2774        store16         x6
   2775 
   2776        movdup_if       v0.4h, w16, #2896*8, \scale
   2777        movi_if         v7.8h,  #0, \clear
   2778        load8           x7,  x8,  v7.8h, \clear
   2779        clear_upper8
   2780        sub             x7,  x7,  x8, lsl #3
   2781        lsr             x8,  x8,  #1
   2782        sub             x7,  x7,  x8, lsr #1
   2783        scale_if        \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
   2784 
   2785        bl              inv_dct32_odd_8h_x16_neon
   2786 
   2787        add             x10, x6,  #16*15
   2788        sub             x6,  x6,  #16*16
   2789 
   2790        mov             x9,  #-16
   2791 
   2792 .macro store_addsub r0, r1, r2, r3
   2793        ld1             {v2.8h}, [x6], #16
   2794        ld1             {v3.8h}, [x6], #16
   2795        sqadd           v6.8h,  v2.8h,  \r0
   2796        sqsub           \r0,    v2.8h,  \r0
   2797        ld1             {v4.8h}, [x6], #16
   2798        sqadd           v7.8h,  v3.8h,  \r1
   2799        sqsub           \r1,    v3.8h,  \r1
   2800        ld1             {v5.8h}, [x6], #16
   2801        sqadd           v2.8h,  v4.8h,  \r2
   2802        sub             x6,  x6,  #16*4
   2803        sqsub           \r2,    v4.8h,  \r2
   2804        st1             {v6.8h}, [x6], #16
   2805        st1             {\r0},   [x10], x9
   2806        sqadd           v3.8h,  v5.8h,  \r3
   2807        sqsub           \r3,    v5.8h,  \r3
   2808        st1             {v7.8h}, [x6], #16
   2809        st1             {\r1},   [x10], x9
   2810        st1             {v2.8h}, [x6], #16
   2811        st1             {\r2},   [x10], x9
   2812        st1             {v3.8h}, [x6], #16
   2813        st1             {\r3},   [x10], x9
   2814 .endm
   2815        store_addsub    v31.8h, v30.8h, v29.8h, v28.8h
   2816        store_addsub    v27.8h, v26.8h, v25.8h, v24.8h
   2817        store_addsub    v23.8h, v22.8h, v21.8h, v20.8h
   2818        store_addsub    v19.8h, v18.8h, v17.8h, v16.8h
   2819 .purgem store_addsub
   2820 
   2821        add             x6,  x6,  #2*8*16
   2822 
   2823        movrel          x17, idct64_coeffs
   2824        movdup_if       v0.4h, w16, #2896*8, \scale
   2825        movi_if         v7.8h,  #0, \clear
   2826        add             x9,  x7,  x8, lsl #4 // offset 16
   2827        add             x10, x7,  x8, lsl #3 // offset 8
   2828        sub             x9,  x9,  x8         // offset 15
   2829        sub             x11, x10, x8         // offset 7
   2830        ld1             {v16.8h}, [x7]  // in1  (offset 0)
   2831        ld1             {v17.8h}, [x9]  // in31 (offset 15)
   2832        ld1             {v18.8h}, [x10] // in17 (offset 8)
   2833        ld1             {v19.8h}, [x11] // in15 (offset 7)
   2834        st1_if          {v7.8h}, [x7],  \clear
   2835        st1_if          {v7.8h}, [x9],  \clear
   2836        st1_if          {v7.8h}, [x10], \clear
   2837        st1_if          {v7.8h}, [x11], \clear
   2838        scale_if        \scale, v0.h[0], v16, v17, v18, v19
   2839        bl              inv_dct64_step1_neon
   2840        movdup_if       v0.4h, w16, #2896*8, \scale
   2841        movi_if         v7.8h,  #0, \clear
   2842        add             x7,  x7,  x8, lsl #2 // offset 4
   2843        sub             x9,  x9,  x8, lsl #2 // offset 11
   2844        sub             x10, x7,  x8         // offset 3
   2845        add             x11, x9,  x8         // offset 12
   2846        ld1             {v16.8h}, [x10] // in7  (offset 3)
   2847        ld1             {v17.8h}, [x11] // in25 (offset 12)
   2848        ld1             {v18.8h}, [x9]  // in23 (offset 11)
   2849        ld1             {v19.8h}, [x7]  // in9  (offset 4)
   2850        st1_if          {v7.8h}, [x7],  \clear
   2851        st1_if          {v7.8h}, [x9],  \clear
   2852        st1_if          {v7.8h}, [x10], \clear
   2853        st1_if          {v7.8h}, [x11], \clear
   2854        scale_if        \scale, v0.h[0], v16, v17, v18, v19
   2855        bl              inv_dct64_step1_neon
   2856        movdup_if       v0.4h, w16, #2896*8, \scale
   2857        movi_if         v7.8h,  #0, \clear
   2858        sub             x10, x10, x8, lsl #1 // offset 1
   2859        sub             x9,  x9,  x8, lsl #1 // offset 9
   2860        add             x7,  x7,  x8         // offset 5
   2861        add             x11, x11, x8         // offset 13
   2862        ldr             q16, [x10, x8] // in5  (offset 2)
   2863        ldr             q17, [x11]     // in27 (offset 13)
   2864        ldr             q18, [x9,  x8] // in21 (offset 10)
   2865        ldr             q19, [x7]      // in11 (offset 5)
   2866        stroff_if       q7,  [x10, x8], \clear
   2867        str_if          q7,  [x11],     \clear
   2868        stroff_if       q7,  [x9,  x8], \clear
   2869        str_if          q7,  [x7],      \clear
   2870        scale_if        \scale, v0.h[0], v16, v17, v18, v19
   2871        bl              inv_dct64_step1_neon
   2872        movdup_if       v0.4h, w16, #2896*8, \scale
   2873        movi_if         v7.8h,  #0, \clear
   2874        ldr             q16, [x10]     // in3  (offset 1)
   2875        ldr             q17, [x11, x8] // in29 (offset 14)
   2876        ldr             q18, [x9]      // in19 (offset 9)
   2877        ldr             q19, [x7,  x8] // in13 (offset 6)
   2878        str_if          q7,  [x10],     \clear
   2879        stroff_if       q7,  [x11, x8], \clear
   2880        str_if          q7,  [x9],      \clear
   2881        stroff_if       q7,  [x7,  x8], \clear
   2882        scale_if        \scale, v0.h[0], v16, v17, v18, v19
   2883        bl              inv_dct64_step1_neon
   2884 
   2885        sub             x6,  x6,  #2*8*32
   2886        add             x9,  x6,  #2*8*7
   2887 
   2888        bl              inv_dct64_step2_neon
   2889 
   2890        ret             x14
   2891 endfunc
   2892 .endm
   2893 
   2894 def_dct64_func                                   // inv_txfm_dct_8h_x64_neon: plain 64-point DCT (vertical pass)
   2895 def_dct64_func _clear, clear=1                   // inv_txfm_dct_clear_8h_x64_neon: also zeroes consumed input coeffs (horizontal pass)
   2896 def_dct64_func _clear_scale, clear=1, scale=1    // inv_txfm_dct_clear_scale_8h_x64_neon: clears and pre-scales by 2896/2048 (rectangular transforms)
   2897 
   2898 
   2899 function inv_txfm_horz_dct_64x8_neon
        // Horizontal half of the 64-point inverse DCT for one strip of
        // 8 rows.
        // In:  x6  - output buffer (rows of 64 int16 coefficients)
        //      w12 - negative shift amount; srshl with a negative count
        //            performs a rounding right shift (callers pass -1/-2)
        //      sp  - scratch holding the 64 vectors produced by the
        //            inv_txfm_dct*_8h_x64_neon routines
        // The first 32 vectors are read forwards from sp (x7), the last
        // 32 backwards from the end (x8, step x11 = -4 vectors); the
        // final add/sub butterfly pairs them into outputs i and 63-i.
   2900        mov             x14, x30        // save return address (returned via ret x14)
   2901 
   2902        mov             x7,  sp
   2903        add             x8,  sp,  #2*8*(64 - 4)
   2904        add             x9,  x6,  #2*56 // x9 -> mirrored (high) half of each output row
   2905        mov             x10, #2*64      // output row stride in bytes
   2906        mov             x11, #-2*8*4    // backward step: 4 vectors of 8 halfwords
   2907 
   2908        dup             v7.8h,  w12     // per-lane shift count for srshl
   2909 1:
   2910        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
   2911        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
   2912        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
   2913        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
        // Transpose both 8x8 halves; the second is given with its rows
        // reversed so v31..v24 line up as mirror partners of v16..v23.
   2914        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
   2915        transpose_8x8h  v31, v30, v29, v28, v27, v26, v25, v24, v4, v5
   2916 
        // out[i]    = (a + b) >> shift, stored forwards via x6;
        // out[63-i] = (a - b) >> shift, stored backwards via x9.
        // rev64 + ext #8 reverses the 8 halfword lanes of the
        // difference vectors so the mirrored half lands in order.
   2917 .macro store_addsub src0, src1, src2, src3
   2918        sqsub           v1.8h,   \src0,   \src1
   2919        sqadd           v0.8h,   \src0,   \src1
   2920        sqsub           v3.8h,   \src2,   \src3
   2921        srshl           v1.8h,   v1.8h,   v7.8h
   2922        sqadd           v2.8h,   \src2,   \src3
   2923        srshl           v0.8h,   v0.8h,   v7.8h
   2924        srshl           v3.8h,   v3.8h,   v7.8h
   2925        rev64           v1.8h,   v1.8h
   2926        srshl           v2.8h,   v2.8h,   v7.8h
   2927        rev64           v3.8h,   v3.8h
   2928        ext             v1.16b,  v1.16b,  v1.16b,  #8
   2929        st1             {v0.8h},  [x6], x10
   2930        ext             v3.16b,  v3.16b,  v3.16b,  #8
   2931        st1             {v1.8h},  [x9], x10
   2932        st1             {v2.8h},  [x6], x10
   2933        st1             {v3.8h},  [x9], x10
   2934 .endm
   2935        store_addsub    v16.8h,  v31.8h,  v17.8h,  v30.8h
   2936        store_addsub    v18.8h,  v29.8h,  v19.8h,  v28.8h
   2937        store_addsub    v20.8h,  v27.8h,  v21.8h,  v26.8h
   2938        store_addsub    v22.8h,  v25.8h,  v23.8h,  v24.8h
   2939 .purgem store_addsub
   2940        sub             x6,  x6,  x10, lsl #3   // rewind the 8 rows just written
   2941        sub             x9,  x9,  x10, lsl #3
   2942        add             x6,  x6,  #16           // advance to the next 8 output columns
   2943        sub             x9,  x9,  #16
   2944 
   2945        cmp             x7,  x8                 // loop until fwd/bwd pointers meet
   2946        b.lt            1b
   2947        ret             x14
   2948 endfunc
   2949 
   2950 function inv_txfm_add_vert_dct_8x64_neon
        // Vertical half of the 64-point inverse DCT for one strip of
        // 8 columns, including the final add to the destination pixels.
        // In:  x6 - destination pointer, x1 - destination stride
        //      sp - scratch holding the 64 vectors produced by the
        //           inv_txfm_dct*_8h_x64_neon routines
        // x6 walks down from the top row while x9 walks up from the
        // bottom row (x10 = -x1), matching the out[i]/out[63-i]
        // butterfly pairs; x7/x8 walk the scratch like the horz pass.
   2951        mov             x14, x30        // save return address (returned via ret x14)
        // NOTE(review): x8 is overwritten by the add two instructions
        // below, so this lsl looks like dead code - verify against
        // upstream dav1d before removing.
   2952        lsl             x8,  x8,  #1
   2953 
   2954        mov             x7,  sp
   2955        add             x8,  sp,  #2*8*(64 - 4)
   2956        add             x9,  x6,  x1, lsl #6    // x9 = dst + 63*stride (bottom row)...
   2957        sub             x9,  x9,  x1            // ...after subtracting one stride
   2958        neg             x10, x1                 // negative stride: walk upwards
   2959        mov             x11, #-2*8*4            // backward step: 4 vectors of 8 halfwords
   2960 
   2961 1:
   2962        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
   2963        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
   2964        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
   2965        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
   2966 
        // Butterfly two mirror-row pairs, round (srshr #4), widen-add
        // the existing 8-bit destination pixels (uaddw) and store back
        // with unsigned saturation (sqxtun).
   2967 .macro add_dest_addsub src0, src1, src2, src3
   2968        ld1             {v0.8b}, [x6], x1
   2969        ld1             {v1.8b}, [x9], x10
   2970        sqadd           v4.8h,   \src0,   \src1
   2971        ld1             {v2.8b}, [x6]
   2972        sqsub           v5.8h,   \src0,   \src1
   2973        ld1             {v3.8b}, [x9]
   2974        sqadd           v6.8h,   \src2,   \src3
   2975        sqsub           v7.8h,   \src2,   \src3
   2976        sub             x6,  x6,  x1
   2977        sub             x9,  x9,  x10
   2978        srshr           v4.8h,   v4.8h,   #4
   2979        srshr           v5.8h,   v5.8h,   #4
   2980        srshr           v6.8h,   v6.8h,   #4
   2981        uaddw           v4.8h,   v4.8h,   v0.8b
   2982        srshr           v7.8h,   v7.8h,   #4
   2983        uaddw           v5.8h,   v5.8h,   v1.8b
   2984        uaddw           v6.8h,   v6.8h,   v2.8b
   2985        sqxtun          v0.8b,   v4.8h
   2986        uaddw           v7.8h,   v7.8h,   v3.8b
   2987        sqxtun          v1.8b,   v5.8h
   2988        st1             {v0.8b}, [x6], x1
   2989        sqxtun          v2.8b,   v6.8h
   2990        st1             {v1.8b}, [x9], x10
   2991        sqxtun          v3.8b,   v7.8h
   2992        st1             {v2.8b}, [x6], x1
   2993        st1             {v3.8b}, [x9], x10
   2994 .endm
   2995        add_dest_addsub v16.8h,  v31.8h,  v17.8h,  v30.8h
   2996        add_dest_addsub v18.8h,  v29.8h,  v19.8h,  v28.8h
   2997        add_dest_addsub v20.8h,  v27.8h,  v21.8h,  v26.8h
   2998        add_dest_addsub v22.8h,  v25.8h,  v23.8h,  v24.8h
   2999 .purgem add_dest_addsub
   3000        cmp             x7,  x8                 // loop until fwd/bwd pointers meet
   3001        b.lt            1b
   3002 
   3003        ret             x14
   3004 endfunc
   3005 
   3006 function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
        // 64x64 DCT/DCT inverse transform + add to destination, 8bpc.
        // x0 = dst, x1 = stride, x2 = coeffs, w3 = eob (presumably,
        // per the exported-function signature described at file top).
   3007        idct_dc         64,  64,  2     // DC-only fast path (idct_dc macro)
   3008 
   3009        mov             x15, x30
   3010 
        // Stack: 64x32 int16 row buffer (above x5) + 64x8 int16 scratch at sp.
   3011        sub_sp          64*32*2+64*8*2
   3012        add             x5,  sp, #64*8*2
   3013 
   3014        movrel          x13, eob_32x32  // per-strip eob thresholds
   3015 
        // First (horizontal) pass: 4 strips of 8 rows each.
   3016 .irp i, 0, 8, 16, 24
   3017        add             x6,  x5,  #(\i*64*2)
   3018 .if \i > 0
   3019        mov             w8,  #(32 - \i) // rows left to zero-fill if we stop here
   3020        cmp             w3,  w12        // eob below this strip's threshold?
   3021        b.lt            1f              // then the rest of the input is zero
   3022 .endif
   3023        add             x7,  x2,  #(\i*2)
   3024        mov             x8,  #32*2      // input row stride in bytes
   3025        mov             x12, #-2 // shift
   3026        bl              inv_txfm_dct_clear_8h_x64_neon
   3027        add             x6,  x5,  #(\i*64*2)
   3028        bl              inv_txfm_horz_dct_64x8_neon
   3029 .if \i < 24
   3030        ldrh            w12, [x13], #2  // threshold for the next strip
   3031 .endif
   3032 .endr
   3033        b               3f
   3034 
        // Zero-fill the remaining w8 rows of the 64-wide buffer;
        // each iteration clears 2 rows (4 x 64 bytes).
   3035 1:
   3036        movi            v4.8h,  #0
   3037        movi            v5.8h,  #0
   3038        movi            v6.8h,  #0
   3039        movi            v7.8h,  #0
   3040 2:
   3041        subs            w8,  w8,  #2
   3042 .rept 4
   3043        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
   3044 .endr
   3045        b.gt            2b
   3046 
        // Second (vertical) pass: 8 strips of 8 columns each.
   3047 3:
   3048 .irp i, 0, 8, 16, 24, 32, 40, 48, 56
   3049        add             x7,  x5,  #(\i*2)
   3050        mov             x8,  #64*2      // row stride of the intermediate buffer
   3051        bl              inv_txfm_dct_8h_x64_neon
   3052        add             x6,  x0,  #(\i)
   3053        bl              inv_txfm_add_vert_dct_8x64_neon
   3054 .endr
   3055 
   3056        add             sp,  x5,  #64*32*2      // release the whole allocation
   3057        ret             x15
   3058 endfunc
   3059 
   3060 function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1
        // 64x32 DCT/DCT inverse transform + add to destination, 8bpc.
        // Rectangular size: uses the clear_scale variant (extra 2896/2048
        // input scaling) and a -1 shift in the horizontal pass.
   3061        idct_dc         64,  32,  1     // DC-only fast path (idct_dc macro)
   3062 
   3063        mov             x15, x30
   3064 
        // Stack: 64x32 int16 row buffer (above x5) + 64x8 int16 scratch at sp.
   3065        sub_sp          64*32*2+64*8*2
   3066        add             x5,  sp, #64*8*2
   3067 
   3068        movrel          x13, eob_32x32  // per-strip eob thresholds
   3069 
        // First (horizontal) pass: 4 strips of 8 rows each.
   3070 .irp i, 0, 8, 16, 24
   3071        add             x6,  x5,  #(\i*64*2)
   3072 .if \i > 0
   3073        mov             w8,  #(32 - \i) // rows left to zero-fill if we stop here
   3074        cmp             w3,  w12        // eob below this strip's threshold?
   3075        b.lt            1f
   3076 .endif
   3077        add             x7,  x2,  #(\i*2)
   3078        mov             x8,  #32*2      // input row stride in bytes
   3079        mov             x12, #-1 // shift
   3080        bl              inv_txfm_dct_clear_scale_8h_x64_neon
   3081        add             x6,  x5,  #(\i*64*2)
   3082        bl              inv_txfm_horz_dct_64x8_neon
   3083 .if \i < 24
   3084        ldrh            w12, [x13], #2  // threshold for the next strip
   3085 .endif
   3086 .endr
   3087        b               3f
   3088 
        // Zero-fill the remaining w8 rows of the 64-wide buffer;
        // each iteration clears 2 rows (4 x 64 bytes).
   3089 1:
   3090        movi            v4.8h,  #0
   3091        movi            v5.8h,  #0
   3092        movi            v6.8h,  #0
   3093        movi            v7.8h,  #0
   3094 2:
   3095        subs            w8,  w8,  #2
   3096 .rept 4
   3097        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
   3098 .endr
   3099        b.gt            2b
   3100 
        // Second (vertical) pass: 32-point columns, 8 strips of 8.
   3101 3:
   3102 .irp i, 0, 8, 16, 24, 32, 40, 48, 56
   3103        add             x6,  x0,  #(\i)
   3104        add             x7,  x5,  #(\i*2)
   3105        mov             x8,  #64*2      // row stride of the intermediate buffer
   3106        bl              inv_txfm_add_vert_dct_8x32_neon
   3107 .endr
   3108 
   3109        add             sp,  x5,  #64*32*2      // release the whole allocation
   3110        ret             x15
   3111 endfunc
   3112 
   3113 function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
        // 32x64 DCT/DCT inverse transform + add to destination, 8bpc.
        // Only the first 32 input rows can hold coefficients, so the
        // intermediate buffer is 32x32.
   3114        idct_dc         32,  64,  1     // DC-only fast path (idct_dc macro)
   3115 
   3116        mov             x15, x30
   3117 
        // Stack: 32x32 int16 row buffer (above x5) + 64x8 int16 scratch at sp.
   3118        sub_sp          32*32*2+64*8*2
   3119        add             x5,  sp, #64*8*2
   3120 
   3121        movrel          x13, eob_32x32  // per-strip eob thresholds
   3122        ldrh            w12, [x13], #2  // threshold for the second strip
   3123 
        // First (horizontal) pass: scaled 32-point rows, 4 strips of 8.
   3124 .irp i, 0, 8, 16, 24
   3125        add             x6,  x5,  #(\i*32*2)
   3126 .if \i > 0
   3127        mov             w8,  #(32 - \i) // rows left to zero-fill if we stop here
   3128        cmp             w3,  w12        // eob below this strip's threshold?
   3129        b.lt            1f
   3130 .if \i < 24
   3131        ldrh            w12, [x13], #2  // threshold for the next strip
   3132 .endif
   3133 .endif
   3134        add             x7,  x2,  #(\i*2)
   3135        mov             x8,  #32*2      // input row stride in bytes
   3136        bl              inv_txfm_horz_scale_dct_32x8_neon
   3137 .endr
   3138        b               3f
   3139 
        // Zero-fill the remaining w8 rows of the 32-wide buffer;
        // each iteration clears 4 rows (4 x 64 bytes).
   3140 1:
   3141        movi            v4.8h,  #0
   3142        movi            v5.8h,  #0
   3143        movi            v6.8h,  #0
   3144        movi            v7.8h,  #0
   3145 2:
   3146        subs            w8,  w8,  #4
   3147 .rept 4
   3148        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
   3149 .endr
   3150        b.gt            2b
   3151 
        // Second (vertical) pass: 64-point columns, 4 strips of 8.
   3152 3:
   3153 .irp i, 0, 8, 16, 24
   3154        add             x7,  x5,  #(\i*2)
   3155        mov             x8,  #32*2      // row stride of the intermediate buffer
   3156        bl              inv_txfm_dct_8h_x64_neon
   3157        add             x6,  x0,  #(\i)
   3158        bl              inv_txfm_add_vert_dct_8x64_neon
   3159 .endr
   3160 
   3161        add             sp,  x5,  #32*32*2      // release the whole allocation
   3162        ret             x15
   3163 endfunc
   3164 
   3165 function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
        // 64x16 DCT/DCT inverse transform + add to destination, 8bpc.
   3166        idct_dc         64,  16,  2     // DC-only fast path (idct_dc macro)
   3167 
   3168        mov             x15, x30
   3169 
        // Stack: 64x16 int16 row buffer (above x4) + 64x8 int16 scratch at sp.
   3170        sub_sp          64*16*2+64*8*2
   3171        add             x4,  sp, #64*8*2
   3172 
   3173        movrel          x13, eob_16x32  // per-strip eob thresholds
   3174 
        // First (horizontal) pass: 2 strips of 8 rows each.
   3175 .irp i, 0, 8
   3176        add             x6,  x4,  #(\i*64*2)
   3177 .if \i > 0
   3178        mov             w8,  #(16 - \i) // rows left to zero-fill if we stop here
   3179        cmp             w3,  w12        // eob below this strip's threshold?
   3180        b.lt            1f
   3181 .endif
   3182        add             x7,  x2,  #(\i*2)
   3183        mov             x8,  #16*2      // input row stride in bytes
   3184        mov             x12, #-2 // shift
   3185        bl              inv_txfm_dct_clear_8h_x64_neon
   3186        add             x6,  x4,  #(\i*64*2)
   3187        bl              inv_txfm_horz_dct_64x8_neon
   3188 .if \i < 8
   3189        ldrh            w12, [x13], #2  // threshold for the next strip
   3190 .endif
   3191 .endr
   3192        b               3f
   3193 
        // Zero-fill the remaining w8 rows of the 64-wide buffer;
        // each iteration clears 2 rows (4 x 64 bytes).
   3194 1:
   3195        movi            v4.8h,  #0
   3196        movi            v5.8h,  #0
   3197        movi            v6.8h,  #0
   3198        movi            v7.8h,  #0
   3199 2:
   3200        subs            w8,  w8,  #2
   3201 .rept 4
   3202        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
   3203 .endr
   3204        b.gt            2b
   3205 
        // Second (vertical) pass: 16-point columns via the generic
        // helper; x5 carries the address of the transform routine.
   3206 3:
   3207        adr             x5,  inv_dct_8h_x16_neon
   3208        mov             x8,  #64*2      // row stride of the intermediate buffer
   3209 .irp i, 0, 8, 16, 24, 32, 40, 48, 56
   3210        add             x6,  x0,  #(\i)
   3211        add             x7,  x4,  #(\i*2)
   3212        bl              inv_txfm_add_vert_8x16_neon
   3213 .endr
   3214 
   3215        add             sp,  x4,  #64*16*2      // release the whole allocation
   3216        ret             x15
   3217 endfunc
   3218 
   3219 function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
        // 16x64 DCT/DCT inverse transform + add to destination, 8bpc.
        // Only the first 32 input rows can hold coefficients, so the
        // intermediate buffer is 16x32.
   3220        idct_dc         16,  64,  2     // DC-only fast path (idct_dc macro)
   3221 
   3222        mov             x15, x30
   3223 
        // Stack: 16x32 int16 row buffer (above x5) + 64x8 int16 scratch at sp.
   3224        sub_sp          16*32*2+64*8*2
   3225        add             x5,  sp, #64*8*2
   3226 
   3227        movrel          x13, eob_16x32  // per-strip eob thresholds
   3228        ldrh            w12, [x13], #2  // threshold for the second strip
   3229 
        // First (horizontal) pass: 16-point rows via the generic helper;
        // x4 carries the address of the transform routine.
   3230        adr             x4,  inv_dct_8h_x16_neon
   3231 .irp i, 0, 8, 16, 24
   3232        add             x6,  x5,  #(\i*16*2)
   3233 .if \i > 0
   3234        mov             w8,  #(32 - \i) // rows left to zero-fill if we stop here
   3235        cmp             w3,  w12        // eob below this strip's threshold?
   3236        b.lt            1f
   3237 .if \i < 24
   3238        ldrh            w12, [x13], #2  // threshold for the next strip
   3239 .endif
   3240 .endif
   3241        add             x7,  x2,  #(\i*2)
   3242        mov             x8,  #32*2      // input row stride in bytes
   3243        bl              inv_txfm_horz_16x8_neon
   3244 .endr
   3245        b               3f
   3246 
        // Zero-fill the remaining w8 rows of the 16-wide buffer;
        // each iteration clears 8 rows (4 x 64 bytes).
   3247 1:
   3248        movi            v4.8h,  #0
   3249        movi            v5.8h,  #0
   3250        movi            v6.8h,  #0
   3251        movi            v7.8h,  #0
   3252 2:
   3253        subs            w8,  w8,  #8
   3254 .rept 4
   3255        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
   3256 .endr
   3257        b.gt            2b
   3258 
        // Second (vertical) pass: 64-point columns, 2 strips of 8.
   3259 3:
   3260 .irp i, 0, 8
   3261        add             x7,  x5,  #(\i*2)
   3262        mov             x8,  #16*2      // row stride of the intermediate buffer
   3263        bl              inv_txfm_dct_8h_x64_neon
   3264        add             x6,  x0,  #(\i)
   3265        bl              inv_txfm_add_vert_dct_8x64_neon
   3266 .endr
   3267 
   3268        add             sp,  x5,  #16*32*2      // release the whole allocation
   3269        ret             x15
   3270 endfunc