tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

itx16.S (134855B)


      1 /******************************************************************************
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2020, Martin Storsjo
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 *****************************************************************************/
     27 
     28 #include "src/arm/asm.S"
     29 #include "util.S"
     30 
     31 // The exported functions in this file have got the following signature:
     32 // void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob,
     33 //                int bitdepth_max);
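         // (In this 16 bpc build the coefficients are loaded as .4s vectors below, so
         // coef is a 32-bit type, and bitdepth_max in w4 is the pixel clamp value
         // (1 << bitdepth) - 1, e.g. 0x3ff for 10-bit content.)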
     34 
     35 // Most of the functions use the following register layout:
     36 // x0-x3  external parameters
     37 // x4     function pointer to first transform
     38 // x5     function pointer to second transform
     39 // x6     output parameter for helper function
     40 // x7     input parameter for helper function
     41 // x8     input stride for helper function
     42 // x9-x12 scratch variables for helper functions
     43 // x13    pointer to list of eob thresholds
     44 // x14    return pointer for helper function
     45 // x15    return pointer for main function
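         // (x30 is saved into x15/x14 because the transform helpers are reached with
         // blr, which clobbers the link register; the drivers return with "ret x15".)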
     46 
     47 // The SIMD registers most often use the following layout:
     48 // v0-v1   multiplication coefficients
     49 // v2-v7   scratch registers
     50 // v8-v15  unused
     51 // v16-v31 inputs/outputs of transforms
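         // (v8-v15 are callee-saved in AAPCS64, so leaving them unused avoids any
         // save/restore of SIMD registers in these functions.)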
     52 
     53 const idct_coeffs, align=4
     54        // idct4
     55        .int            2896, 2896*8*(1<<16), 1567, 3784
     56        // idct8
     57        .int            799, 4017, 3406, 2276
     58        // idct16
     59        .int            401, 4076, 3166, 2598
     60        .int            1931, 3612, 3920, 1189
     61        // idct32
     62        .int            201, 4091, 3035, 2751
     63        .int            1751, 3703, 3857, 1380
     64        .int            995, 3973, 3513, 2106
     65        .int            2440, 3290, 4052, 601
     66 endconst
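         // The rows above appear to be the AV1 cospi constants, round(cos(k*pi/128) * 2^12):
         // 2896 ~ cospi32, 3784 ~ cospi16, 1567 ~ cospi48, 4017 ~ cospi8, 799 ~ cospi56, etc.
         // Entries of the form c*8*(1<<16) (= c << 19) are pre-scaled for sqrdmulh, which
         // computes (2*a*b + (1<<31)) >> 32; with b = c << 19 that gives (a*c + (1<<11)) >> 12,
         // the same rounding as the explicit mul/mla + srshr #12 sequences used below.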
     67 
     68 const idct64_coeffs, align=4
     69        .int            101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16)
     70        .int            1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16)
     71        .int            4076, 401, 4017, 799
     72 
     73        .int            4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16)
     74        .int            3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16)
     75        .int            -3166, -2598, -799, -4017
     76 
     77        .int            501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16)
     78        .int            2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16)
     79        .int            3612, 1931, 2276, 3406
     80 
     81        .int            4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16)
     82        .int            3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16)
     83        .int            -3920, -1189, -3406, -2276
     84 endconst
     85 
     86 const iadst4_coeffs, align=4
     87        .int            1321, 3803, 2482, 3344
     88 endconst
     89 
     90 const iadst8_coeffs, align=4
     91        .int            4076, 401, 3612, 1931
     92        .int            2598, 3166, 1189, 3920
     93        // idct_coeffs
     94        .int            2896, 0, 1567, 3784
     95 endconst
     96 
     97 const iadst16_coeffs, align=4
     98        .int            4091, 201, 3973, 995
     99        .int            3703, 1751, 3290, 2440
    100        .int            2751, 3035, 2106, 3513
    101        .int            1380, 3857, 601, 4052
    102 endconst
    103 
    104 .macro mul_mla d, s0, s1, c0, c1
    105        mul             \d\().4s, \s0\().4s, \c0
    106        mla             \d\().4s, \s1\().4s, \c1
    107 .endm
    108 
    109 .macro mul_mls d, s0, s1, c0, c1
    110        mul             \d\().4s, \s0\().4s, \c0
    111        mls             \d\().4s, \s1\().4s, \c1
    112 .endm
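         // Both helpers compute a 32-bit d = s0*c0 +/- s1*c1; callers follow up with
         // srshr #12 to drop the 2^12 scale carried by the coefficients.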
    113 
     114 .macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7
    115        sqrdmulh        \r0\sz,  \r0\sz,  \c
    116        sqrdmulh        \r1\sz,  \r1\sz,  \c
    117        sqrdmulh        \r2\sz,  \r2\sz,  \c
    118        sqrdmulh        \r3\sz,  \r3\sz,  \c
    119 .ifnb \r4
    120        sqrdmulh        \r4\sz,  \r4\sz,  \c
    121        sqrdmulh        \r5\sz,  \r5\sz,  \c
    122        sqrdmulh        \r6\sz,  \r6\sz,  \c
    123        sqrdmulh        \r7\sz,  \r7\sz,  \c
    124 .endif
    125 .endm
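         // scale_input multiplies 4 or 8 registers by an sqrdmulh constant; the
         // rectangular 4x8/8x4 drivers below use it with 2896*8*(1<<16), i.e. a
         // multiply by 2896/4096 ~ 1/sqrt(2), the extra scale applied to non-square
         // transform sizes.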
    126 
    127 .macro smin_4s r0, r1, r2
    128        smin            \r0\().4s, \r1\().4s, \r2\().4s
    129 .endm
    130 .macro smax_4s r0, r1, r2
    131        smax            \r0\().4s, \r1\().4s, \r2\().4s
    132 .endm
    133 
    134 .macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4
    135 .ifnb \load
    136        ld1             {\load},  [\src], x1
    137 .endif
    138 .ifnb \shift
    139        srshr           \shift,  \shift,  #\shiftbits
    140 .endif
    141 .ifnb \addsrc
    142        usqadd          \adddst, \addsrc
    143 .endif
    144 .ifnb \min
    145        smin            \min,  \min,  v7.8h
    146 .endif
    147 .ifnb \store
    148        st1             {\store},  [\dst], x1
    149 .endif
    150 .endm
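         // One step of the final reconstruction: load a row of destination pixels,
         // round-shift the transform output right by \shiftbits, accumulate it into
         // the pixels with unsigned saturation (usqadd), clamp to the bitdepth
         // maximum kept in v7, and store. The wrappers below stagger these stages
         // across iterations to hide latency; v7 is set to 0x3ff throughout, so this
         // path presumably assumes 10-bit content.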
    151 .macro load_add_store_8x16 dst, src
    152        mov             \src, \dst
    153        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
    154        load_add_store  v2.8h,  v16.8h,       ,      ,       ,       ,  \dst, \src
    155        load_add_store  v3.8h,  v17.8h,       ,      ,       ,       ,  \dst, \src
    156        load_add_store  v4.8h,  v18.8h, v16.8h, v2.8h,       ,       ,  \dst, \src
    157        load_add_store  v5.8h,  v19.8h, v17.8h, v3.8h,  v2.8h,       ,  \dst, \src
    158        load_add_store  v16.8h, v20.8h, v18.8h, v4.8h,  v3.8h,  v2.8h,  \dst, \src
    159        load_add_store  v17.8h, v21.8h, v19.8h, v5.8h,  v4.8h,  v3.8h,  \dst, \src
    160        load_add_store  v18.8h, v22.8h, v20.8h, v16.8h, v5.8h,  v4.8h,  \dst, \src
    161        load_add_store  v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h,  \dst, \src
    162        load_add_store  v20.8h, v24.8h, v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src
    163        load_add_store  v21.8h, v25.8h, v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src
    164        load_add_store  v22.8h, v26.8h, v24.8h, v20.8h, v19.8h, v18.8h, \dst, \src
    165        load_add_store  v23.8h, v27.8h, v25.8h, v21.8h, v20.8h, v19.8h, \dst, \src
    166        load_add_store  v24.8h, v28.8h, v26.8h, v22.8h, v21.8h, v20.8h, \dst, \src
    167        load_add_store  v25.8h, v29.8h, v27.8h, v23.8h, v22.8h, v21.8h, \dst, \src
    168        load_add_store  v26.8h, v30.8h, v28.8h, v24.8h, v23.8h, v22.8h, \dst, \src
    169        load_add_store  v27.8h, v31.8h, v29.8h, v25.8h, v24.8h, v23.8h, \dst, \src
    170        load_add_store        ,       , v30.8h, v26.8h, v25.8h, v24.8h, \dst, \src
    171        load_add_store        ,       , v31.8h, v27.8h, v26.8h, v25.8h, \dst, \src
    172        load_add_store        ,       ,       ,       , v27.8h, v26.8h, \dst, \src
    173        load_add_store        ,       ,       ,       ,       , v27.8h, \dst, \src
    174 .endm
    175 .macro load_add_store_8x8 dst, src, shiftbits=4
    176        mov             \src, \dst
    177        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
    178        load_add_store  v2.8h,  v16.8h,       ,      ,       ,       ,  \dst, \src, \shiftbits
    179        load_add_store  v3.8h,  v17.8h,       ,      ,       ,       ,  \dst, \src, \shiftbits
    180        load_add_store  v4.8h,  v18.8h, v16.8h, v2.8h,       ,       ,  \dst, \src, \shiftbits
    181        load_add_store  v5.8h,  v19.8h, v17.8h, v3.8h,  v2.8h,       ,  \dst, \src, \shiftbits
    182        load_add_store  v16.8h, v20.8h, v18.8h, v4.8h,  v3.8h,  v2.8h,  \dst, \src, \shiftbits
    183        load_add_store  v17.8h, v21.8h, v19.8h, v5.8h,  v4.8h,  v3.8h,  \dst, \src, \shiftbits
    184        load_add_store  v18.8h, v22.8h, v20.8h, v16.8h, v5.8h,  v4.8h,  \dst, \src, \shiftbits
    185        load_add_store  v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h,  \dst, \src, \shiftbits
    186        load_add_store        ,       , v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
    187        load_add_store        ,       , v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
    188        load_add_store        ,       ,       ,       , v19.8h, v18.8h, \dst, \src, \shiftbits
    189        load_add_store        ,       ,       ,       ,       , v19.8h, \dst, \src, \shiftbits
    190 .endm
    191 .macro load_add_store_8x4 dst, src, shiftbits=4
    192        mov             \src, \dst
    193        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
    194        load_add_store  v2.8h, v16.8h,       ,      ,      ,      , \dst, \src, \shiftbits
    195        load_add_store  v3.8h, v17.8h,       ,      ,      ,      , \dst, \src, \shiftbits
    196        load_add_store  v4.8h, v18.8h, v16.8h, v2.8h,      ,      , \dst, \src, \shiftbits
    197        load_add_store  v5.8h, v19.8h, v17.8h, v3.8h, v2.8h,      , \dst, \src, \shiftbits
    198        load_add_store       ,       , v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits
    199        load_add_store       ,       , v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits
    200        load_add_store       ,       ,       ,      , v5.8h, v4.8h, \dst, \src, \shiftbits
    201        load_add_store       ,       ,       ,      ,      , v5.8h, \dst, \src, \shiftbits
    202 .endm
    203 .macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, min, store, dst, src
    204 .ifnb \load
    205        ld1             {\load}[0],  [\src], x1
    206 .endif
    207 .ifnb \inssrc
    208        ins             \insdst\().d[1],   \inssrc\().d[0]
    209 .endif
    210 .ifnb \shift
    211        srshr           \shift,  \shift,  #4
    212 .endif
    213 .ifnb \load
    214        ld1             {\load}[1],  [\src], x1
    215 .endif
    216 .ifnb \addsrc
    217        usqadd          \adddst, \addsrc
    218 .endif
    219 .ifnb \store
    220        st1             {\store}[0],  [\dst], x1
    221 .endif
    222 .ifnb \min
    223        smin            \min,  \min,  v7.8h
    224 .endif
    225 .ifnb \store
    226        st1             {\store}[1],  [\dst], x1
    227 .endif
    228 .endm
    229 .macro load_add_store_4x16 dst, src
    230        mov             \src, \dst
    231        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
    232        load_add_store4 v0.d,  v17, v16,       ,       ,      ,       ,      ,  \dst, \src
    233        load_add_store4 v1.d,  v19, v18,       ,       ,      ,       ,      ,  \dst, \src
    234        load_add_store4 v2.d,  v21, v20, v16.8h,       ,      ,       ,      ,  \dst, \src
    235        load_add_store4 v3.d,  v23, v22, v18.8h, v16.8h, v0.8h,       ,      ,  \dst, \src
    236        load_add_store4 v17.d, v25, v24, v20.8h, v18.8h, v1.8h,  v0.8h,      ,  \dst, \src
    237        load_add_store4 v19.d, v27, v26, v22.8h, v20.8h, v2.8h,  v1.8h,  v0.d,  \dst, \src
    238        load_add_store4 v21.d, v29, v28, v24.8h, v22.8h, v3.8h,  v2.8h,  v1.d,  \dst, \src
    239        load_add_store4 v23.d, v31, v30, v26.8h, v24.8h, v17.8h, v3.8h,  v2.d,  \dst, \src
    240        load_add_store4      ,    ,    , v28.8h, v26.8h, v19.8h, v17.8h, v3.d,  \dst, \src
    241        load_add_store4      ,    ,    , v30.8h, v28.8h, v21.8h, v19.8h, v17.d, \dst, \src
    242        load_add_store4      ,    ,    ,       , v30.8h, v23.8h, v21.8h, v19.d, \dst, \src
    243        load_add_store4      ,    ,    ,       ,      ,        , v23.8h, v21.d, \dst, \src
    244        load_add_store4      ,    ,    ,       ,      ,        ,       , v23.d, \dst, \src
    245 .endm
    246 .macro load_add_store_4x8 dst, src
    247        mov             \src, \dst
    248        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
    249        load_add_store4 v0.d, v17, v16,       ,       ,      ,      ,     , \dst, \src
    250        load_add_store4 v1.d, v19, v18,       ,       ,      ,      ,     , \dst, \src
    251        load_add_store4 v2.d, v21, v20, v16.8h,       ,      ,      ,     , \dst, \src
    252        load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h,      ,     , \dst, \src
    253        load_add_store4     ,    ,    , v20.8h, v18.8h, v1.8h, v0.8h,     , \dst, \src
    254        load_add_store4     ,    ,    , v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src
    255        load_add_store4     ,    ,    ,       , v22.8h, v3.8h, v2.8h, v1.d, \dst, \src
    256        load_add_store4     ,    ,    ,       ,       ,      , v3.8h, v2.d, \dst, \src
    257        load_add_store4     ,    ,    ,       ,       ,      ,      , v3.d, \dst, \src
    258 .endm
    259 
    260 .macro idct_dc w, h, shift
    261        cbnz            w3,  1f
    262        movz            w16, #2896*8, lsl #16
    263        ld1r            {v16.4s}, [x2]
    264        dup             v0.2s,   w16
    265        sqrdmulh        v20.4s,  v16.4s,  v0.s[0]
    266        str             wzr, [x2]
    267 .if (\w == 2*\h) || (2*\w == \h)
    268        sqrdmulh        v20.4s,  v20.4s,  v0.s[0]
    269 .endif
    270 .if \shift > 0
    271        sqrshrn         v16.4h,  v20.4s,  #\shift
    272        sqrshrn2        v16.8h,  v20.4s,  #\shift
    273 .else
    274        sqxtn           v16.4h,  v20.4s
    275        sqxtn2          v16.8h,  v20.4s
    276 .endif
    277        sqrdmulh        v16.8h,  v16.8h,  v0.h[1]
    278        srshr           v16.8h,  v16.8h,  #4
    279        mov             w4,  #\h
    280        b               idct_dc_w\w\()_neon
    281 1:
    282 .endm
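         // DC-only fast path, taken when eob == 0: the single coefficient is scaled
         // by 2896/4096 (twice for 2:1 rectangular sizes), narrowed to 16 bits,
         // scaled by 2896/4096 once more and rounding-shifted right by #4, then
         // broadcast-added to the block by one of the width-specialized loops below.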
    283 
    284 function idct_dc_w4_neon
    285        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
    286 1:
    287        ld1             {v0.d}[0], [x0], x1
    288        ld1             {v0.d}[1], [x0], x1
    289        ld1             {v1.d}[0], [x0], x1
    290        subs            w4,  w4,  #4
    291        ld1             {v1.d}[1], [x0], x1
    292        usqadd          v0.8h,   v16.8h
    293        sub             x0,  x0,  x1, lsl #2
    294        usqadd          v1.8h,   v16.8h
    295        smin            v0.8h,   v0.8h,   v31.8h
    296        st1             {v0.d}[0], [x0], x1
    297        smin            v1.8h,   v1.8h,   v31.8h
    298        st1             {v0.d}[1], [x0], x1
    299        st1             {v1.d}[0], [x0], x1
    300        st1             {v1.d}[1], [x0], x1
    301        b.gt            1b
    302        ret
    303 endfunc
    304 
    305 function idct_dc_w8_neon
    306        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
    307 1:
    308        ld1             {v0.8h}, [x0], x1
    309        subs            w4,  w4,  #4
    310        ld1             {v1.8h}, [x0], x1
    311        usqadd          v0.8h,   v16.8h
    312        ld1             {v2.8h}, [x0], x1
    313        usqadd          v1.8h,   v16.8h
    314        ld1             {v3.8h}, [x0], x1
    315        usqadd          v2.8h,   v16.8h
    316        usqadd          v3.8h,   v16.8h
    317        sub             x0,  x0,  x1, lsl #2
    318        smin            v0.8h,   v0.8h,   v31.8h
    319        smin            v1.8h,   v1.8h,   v31.8h
    320        st1             {v0.8h}, [x0], x1
    321        smin            v2.8h,   v2.8h,   v31.8h
    322        st1             {v1.8h}, [x0], x1
    323        smin            v3.8h,   v3.8h,   v31.8h
    324        st1             {v2.8h}, [x0], x1
    325        st1             {v3.8h}, [x0], x1
    326        b.gt            1b
    327        ret
    328 endfunc
    329 
    330 function idct_dc_w16_neon
    331        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
    332 1:
    333        ld1             {v0.8h, v1.8h}, [x0], x1
    334        subs            w4,  w4,  #2
    335        ld1             {v2.8h, v3.8h}, [x0], x1
    336        usqadd          v0.8h,   v16.8h
    337        usqadd          v1.8h,   v16.8h
    338        sub             x0,  x0,  x1, lsl #1
    339        usqadd          v2.8h,   v16.8h
    340        usqadd          v3.8h,   v16.8h
    341        smin            v0.8h,   v0.8h,   v31.8h
    342        smin            v1.8h,   v1.8h,   v31.8h
    343        smin            v2.8h,   v2.8h,   v31.8h
    344        st1             {v0.8h, v1.8h}, [x0], x1
    345        smin            v3.8h,   v3.8h,   v31.8h
    346        st1             {v2.8h, v3.8h}, [x0], x1
    347        b.gt            1b
    348        ret
    349 endfunc
    350 
    351 function idct_dc_w32_neon
    352        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
    353 1:
    354        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
    355        subs            w4,  w4,  #1
    356        usqadd          v0.8h,   v16.8h
    357        usqadd          v1.8h,   v16.8h
    358        usqadd          v2.8h,   v16.8h
    359        usqadd          v3.8h,   v16.8h
    360        smin            v0.8h,   v0.8h,   v31.8h
    361        smin            v1.8h,   v1.8h,   v31.8h
    362        smin            v2.8h,   v2.8h,   v31.8h
    363        smin            v3.8h,   v3.8h,   v31.8h
    364        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    365        b.gt            1b
    366        ret
    367 endfunc
    368 
    369 function idct_dc_w64_neon
    370        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
    371        sub             x1,  x1,  #64
    372 1:
    373        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
    374        subs            w4,  w4,  #1
    375        usqadd          v0.8h,   v16.8h
    376        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0]
    377        usqadd          v1.8h,   v16.8h
    378        sub             x0,  x0,  #64
    379        usqadd          v2.8h,   v16.8h
    380        usqadd          v3.8h,   v16.8h
    381        usqadd          v4.8h,   v16.8h
    382        usqadd          v5.8h,   v16.8h
    383        usqadd          v6.8h,   v16.8h
    384        usqadd          v7.8h,   v16.8h
    385        smin            v0.8h,   v0.8h,   v31.8h
    386        smin            v1.8h,   v1.8h,   v31.8h
    387        smin            v2.8h,   v2.8h,   v31.8h
    388        smin            v3.8h,   v3.8h,   v31.8h
    389        smin            v4.8h,   v4.8h,   v31.8h
    390        smin            v5.8h,   v5.8h,   v31.8h
    391        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
    392        smin            v6.8h,   v6.8h,   v31.8h
    393        smin            v7.8h,   v7.8h,   v31.8h
    394        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
    395        b.gt            1b
    396        ret
    397 endfunc
    398 
    399 .macro iwht4
    400        add             v16.4s,  v16.4s,  v17.4s
    401        sub             v21.4s,  v18.4s,  v19.4s
    402        sub             v20.4s,  v16.4s,  v21.4s
    403        sshr            v20.4s,  v20.4s,  #1
    404        sub             v18.4s,  v20.4s,  v17.4s
    405        sub             v17.4s,  v20.4s,  v19.4s
    406        add             v19.4s,  v21.4s,  v18.4s
    407        sub             v16.4s,  v16.4s,  v17.4s
    408 .endm
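         // 4-point inverse Walsh-Hadamard transform (the lossless-mode transform);
         // it needs only adds, subtracts and one shift, so no coefficient table.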
    409 
    410 .macro idct_4 r0, r1, r2, r3
    411        mul_mla         v6,  \r1, \r3, v0.s[3], v0.s[2]
    412        mul_mla         v2,  \r0, \r2, v0.s[0], v0.s[0]
    413        mul_mls         v4,  \r1, \r3, v0.s[2], v0.s[3]
    414        mul_mls         v3,  \r0, \r2, v0.s[0], v0.s[0]
    415        srshr           v6.4s,  v6.4s,  #12
    416        srshr           v2.4s,  v2.4s,  #12
    417        srshr           v7.4s,  v4.4s,  #12
    418        srshr           v3.4s,  v3.4s,  #12
    419        sqadd           \r0\().4s,  v2.4s,   v6.4s
    420        sqsub           \r3\().4s,  v2.4s,   v6.4s
    421        sqadd           \r1\().4s,  v3.4s,   v7.4s
    422        sqsub           \r2\().4s,  v3.4s,   v7.4s
    423 .endm
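         // 4-point inverse DCT butterfly: the even pair (r0, r2) is combined via
         // cospi32 (v0.s[0]) and the odd pair (r1, r3) via cospi16/cospi48
         // (v0.s[3]/v0.s[2]); the outputs are the saturating sums and differences.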
    424 
    425 function inv_dct_4s_x4_neon
    426        AARCH64_VALID_CALL_TARGET
    427        movrel          x16, idct_coeffs
    428        ld1             {v0.4s}, [x16]
    429        idct_4          v16, v17, v18, v19
    430        ret
    431 endfunc
    432 
    433 .macro iadst_4x4 o0, o1, o2, o3
    434        movrel          x16, iadst4_coeffs
    435        ld1             {v0.4s}, [x16]
    436 
    437        sub             v3.4s,   v16.4s,  v18.4s
    438        mul             v4.4s,   v16.4s,  v0.s[0]
    439        mla             v4.4s,   v18.4s,  v0.s[1]
    440        mla             v4.4s,   v19.4s,  v0.s[2]
    441        mul             v7.4s,   v17.4s,  v0.s[3]
    442        add             v3.4s,   v3.4s,   v19.4s
    443        mul             v5.4s,   v16.4s,  v0.s[2]
    444        mls             v5.4s,   v18.4s,  v0.s[0]
    445        mls             v5.4s,   v19.4s,  v0.s[1]
    446 
    447        add             \o3\().4s, v4.4s,     v5.4s
    448        mul             \o2\().4s, v3.4s,     v0.s[3]
    449        add             \o0\().4s, v4.4s,     v7.4s
    450        add             \o1\().4s, v5.4s,     v7.4s
    451        sub             \o3\().4s, \o3\().4s, v7.4s
    452 
    453        srshr           \o0\().4s, \o0\().4s, #12
    454        srshr           \o2\().4s, \o2\().4s, #12
    455        srshr           \o1\().4s, \o1\().4s, #12
    456        srshr           \o3\().4s, \o3\().4s, #12
    457 .endm
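         // 4-point inverse ADST using the iadst4_coeffs values above, which are
         // roughly (2/3)*sqrt(2)*sin(k*pi/9)*2^12; products are again rounded with
         // srshr #12.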
    458 
    459 function inv_adst_4s_x4_neon
    460        AARCH64_VALID_CALL_TARGET
    461        iadst_4x4       v16, v17, v18, v19
    462        ret
    463 endfunc
    464 
    465 function inv_flipadst_4s_x4_neon
    466        AARCH64_VALID_CALL_TARGET
    467        iadst_4x4       v19, v18, v17, v16
    468        ret
    469 endfunc
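         // flipadst is the same transform with the output registers listed in
         // reverse order.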
    470 
    471 function inv_identity_4s_x4_neon
    472        AARCH64_VALID_CALL_TARGET
    473        movz            w16, #(5793-4096)*8, lsl #16
    474        dup             v0.2s,   w16
    475        sqrdmulh        v4.4s,   v16.4s,  v0.s[0]
    476        sqrdmulh        v5.4s,   v17.4s,  v0.s[0]
    477        sqrdmulh        v6.4s,   v18.4s,  v0.s[0]
    478        sqrdmulh        v7.4s,   v19.4s,  v0.s[0]
    479        sqadd           v16.4s,  v16.4s,  v4.4s
    480        sqadd           v17.4s,  v17.4s,  v5.4s
    481        sqadd           v18.4s,  v18.4s,  v6.4s
    482        sqadd           v19.4s,  v19.4s,  v7.4s
    483        ret
    484 endfunc
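         // The 4-point identity transform is a scale by sqrt(2) ~ 5793/4096. Since
         // 5793*8*(1<<16) would overflow int32, the code computes x + x*1697/4096
         // (1697 = 5793 - 4096) via sqrdmulh + sqadd, which equals x*5793/4096.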
    485 
    486 function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
    487        mov             x15, x30
    488        movi            v30.4s,  #0
    489        movi            v31.4s,  #0
    490        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
    491        st1             {v30.4s, v31.4s}, [x2], #32
    492 
    493        sshr            v16.4s,  v16.4s,  #2
    494        sshr            v17.4s,  v17.4s,  #2
    495        sshr            v18.4s,  v18.4s,  #2
    496        sshr            v19.4s,  v19.4s,  #2
    497 
    498        iwht4
    499 
    500        st1             {v30.4s, v31.4s}, [x2], #32
    501        transpose_4x4s  v16, v17, v18, v19, v20, v21, v22, v23
    502 
    503        iwht4
    504 
    505        ld1             {v0.d}[0], [x0], x1
    506        sqxtn           v16.4h,  v16.4s
    507        ld1             {v0.d}[1], [x0], x1
    508        sqxtn2          v16.8h,  v17.4s
    509        ld1             {v1.d}[0], [x0], x1
    510        sqxtn           v18.4h,  v18.4s
    511        ld1             {v1.d}[1], [x0], x1
    512        sqxtn2          v18.8h,  v19.4s
    513 
    514        b               L(itx_4x4_end)
    515 endfunc
    516 
    517 // HBD inv_txfm_add_4x4_neon deviates from the common pattern with registers
    518 // x0-x4  external parameters
    519 // x5     function pointer to first transform
    520 // x6     function pointer to second transform
    521 function inv_txfm_add_4x4_neon
    522        movi            v30.4s,  #0
    523        movi            v31.4s,  #0
    524        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
    525        st1             {v30.4s, v31.4s}, [x2], #32
    526 
    527        blr             x5
    528 
    529        st1             {v30.4s, v31.4s}, [x2], #32
    530        sqxtn           v16.4h,  v16.4s
    531        sqxtn           v17.4h,  v17.4s
    532        sqxtn           v18.4h,  v18.4s
    533        sqxtn           v19.4h,  v19.4s
    534        transpose_4x4h  v16, v17, v18, v19, v20, v21, v22, v23
    535 
    536        blr             x6
    537 
    538        ld1             {v0.d}[0], [x0], x1
    539        ld1             {v0.d}[1], [x0], x1
    540        ins             v16.d[1], v17.d[0]
    541        ins             v18.d[1], v19.d[0]
    542        ld1             {v1.d}[0], [x0], x1
    543        ld1             {v1.d}[1], [x0], x1
    544        srshr           v16.8h,  v16.8h,  #4
    545        srshr           v18.8h,  v18.8h,  #4
    546 
    547 L(itx_4x4_end):
    548        dup             v31.8h,  w4
    549        sub             x0,  x0,  x1, lsl #2
    550        usqadd          v0.8h,   v16.8h
    551        usqadd          v1.8h,   v18.8h
    552        smin            v0.8h,   v0.8h,   v31.8h
    553        st1             {v0.d}[0], [x0], x1
    554        smin            v1.8h,   v1.8h,   v31.8h
    555        st1             {v0.d}[1], [x0], x1
    556        st1             {v1.d}[0], [x0], x1
    557        st1             {v1.d}[1], [x0], x1
    558 
    559        ret             x15
    560 endfunc
    561 
    562 .macro def_fn_4x4 txfm1, txfm2
    563 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
    564        mov             x15, x30
    565 
    566 .ifc \txfm1\()_\txfm2, dct_dct
    567        cbnz            w3,  1f
    568        movz            w16, #2896*8, lsl #16
    569        ld1r            {v16.4s}, [x2]
    570        dup             v4.2s,   w16
    571        str             wzr, [x2]
    572        sqrdmulh        v16.4s,  v16.4s,  v4.s[0]
    573        ld1             {v0.d}[0], [x0], x1
    574        sqxtn           v20.4h,  v16.4s
    575        sqxtn2          v20.8h,  v16.4s
    576        ld1             {v0.d}[1], [x0], x1
    577        sqrdmulh        v20.8h,  v20.8h,  v4.h[1]
    578        ld1             {v1.d}[0], [x0], x1
    579        srshr           v16.8h,  v20.8h,  #4
    580        ld1             {v1.d}[1], [x0], x1
    581        srshr           v18.8h,  v20.8h,  #4
    582        movi            v30.8h,  #0
    583        b               L(itx_4x4_end)
    584 1:
    585 .endif
    586        adr             x5,  inv_\txfm1\()_4s_x4_neon
    587        movrel          x6,  X(inv_\txfm2\()_4h_x4_neon)
    588        b               inv_txfm_add_4x4_neon
    589 endfunc
    590 .endm
    591 
    592 def_fn_4x4 dct, dct
    593 def_fn_4x4 identity, identity
    594 def_fn_4x4 dct, adst
    595 def_fn_4x4 dct, flipadst
    596 def_fn_4x4 dct, identity
    597 def_fn_4x4 adst, dct
    598 def_fn_4x4 adst, adst
    599 def_fn_4x4 adst, flipadst
    600 def_fn_4x4 flipadst, dct
    601 def_fn_4x4 flipadst, adst
    602 def_fn_4x4 flipadst, flipadst
    603 def_fn_4x4 identity, dct
    604 
    605 def_fn_4x4 adst, identity
    606 def_fn_4x4 flipadst, identity
    607 def_fn_4x4 identity, adst
    608 def_fn_4x4 identity, flipadst
    609 
    610 .macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
    611        idct_4          \r0, \r2, \r4, \r6
    612 
    613        movi            v5.4s,  #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
    614        mvni            v4.4s,  #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
    615 .irp r, \r0, \r2, \r4, \r6
    616        smin_4s         \r, \r, v5
    617 .endr
    618 .irp r, \r0, \r2, \r4, \r6
    619        smax_4s         \r, \r, v4
    620 .endr
    621 
    622        mul_mls         v2,  \r1, \r7, v1.s[0], v1.s[1]  // -> t4a
    623        mul_mla         v3,  \r1, \r7, v1.s[1], v1.s[0]  // -> t7a
    624        mul_mls         v6,  \r5, \r3, v1.s[2], v1.s[3]  // -> t5a
    625        mul_mla         v7,  \r5, \r3, v1.s[3], v1.s[2]  // -> t6a
    626        srshr           \r1\().4s, v2.4s,  #12           // t4a
    627        srshr           \r7\().4s, v3.4s,  #12           // t7a
    628        srshr           \r3\().4s, v6.4s,  #12           // t5a
    629        srshr           \r5\().4s, v7.4s,  #12           // t6a
    630 
    631        sqadd           v2.4s,     \r1\().4s,  \r3\().4s // t4
    632        sqsub           \r1\().4s, \r1\().4s,  \r3\().4s // t5a
    633        sqadd           v3.4s,     \r7\().4s,  \r5\().4s // t7
    634        sqsub           \r3\().4s, \r7\().4s,  \r5\().4s // t6a
    635 
    636 .irp r, v2, \r1, v3, \r3
    637        smin_4s         \r, \r, v5
    638 .endr
    639 .irp r, v2, \r1, v3, \r3
    640        smax_4s         \r, \r, v4
    641 .endr
    642 
    643        mul_mls         v7,  \r3, \r1, v0.s[0], v0.s[0]  // -> t5
    644        mul_mla         v6,  \r3, \r1, v0.s[0], v0.s[0]  // -> t6
    645        srshr           v7.4s,  v7.4s,  #12              // t5
    646        srshr           v6.4s,  v6.4s,  #12              // t6
    647 
    648        sqsub           \r7\().4s,  \r0\().4s,  v3.4s    // out7
    649        sqadd           \r0\().4s,  \r0\().4s,  v3.4s    // out0
    650        sqadd           \r1\().4s,  \r2\().4s,  v6.4s    // out1
    651        sqsub           v6.4s,      \r2\().4s,  v6.4s    // out6
    652        sqadd           \r2\().4s,  \r4\().4s,  v7.4s    // out2
    653        sqsub           \r5\().4s,  \r4\().4s,  v7.4s    // out5
    654        sqadd           \r3\().4s,  \r6\().4s,  v2.4s    // out3
    655        sqsub           \r4\().4s,  \r6\().4s,  v2.4s    // out4
    656        mov             \r6\().16b, v6.16b               // out6
    657 .endm
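         // 8-point inverse DCT: idct_4 handles the even inputs, then the odd inputs
         // are rotated by cospi8/cospi56 and cospi40/cospi24 (v1) and merged.
         // Intermediates are clipped to [-(1<<17), (1<<17)-1] (v4/v5), which for
         // 10-bit input matches the (8 + bitdepth)-bit range allowed between stages.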
    658 
    659 function inv_dct_4s_x8_neon
    660        AARCH64_VALID_CALL_TARGET
    661        movrel          x16, idct_coeffs
    662        ld1             {v0.4s, v1.4s}, [x16]
    663        idct_8          v16, v17, v18, v19, v20, v21, v22, v23
    664        ret
    665 endfunc
    666 
    667 .macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
    668        movrel          x16, iadst8_coeffs
    669        ld1             {v0.4s, v1.4s}, [x16], #32
    670 
    671        mul_mla         v2,  v23, v16, v0.s[0], v0.s[1]
    672        mul_mls         v4,  v23, v16, v0.s[1], v0.s[0]
    673        mul_mla         v6,  v21, v18, v0.s[2], v0.s[3]
    674        srshr           v16.4s, v2.4s,  #12  // t0a
    675        srshr           v23.4s, v4.4s,  #12  // t1a
    676        mul_mls         v2,  v21, v18, v0.s[3], v0.s[2]
    677        mul_mla         v4,  v19, v20, v1.s[0], v1.s[1]
    678        srshr           v18.4s, v6.4s,  #12  // t2a
    679        srshr           v21.4s, v2.4s,  #12  // t3a
    680        mul_mls         v6,  v19, v20, v1.s[1], v1.s[0]
    681        mul_mla         v2,  v17, v22, v1.s[2], v1.s[3]
    682        srshr           v20.4s, v4.4s,  #12  // t4a
    683        srshr           v19.4s, v6.4s,  #12  // t5a
    684        mul_mls         v4,  v17, v22, v1.s[3], v1.s[2]
    685        srshr           v22.4s, v2.4s,  #12  // t6a
    686        srshr           v17.4s, v4.4s,  #12  // t7a
    687 
    688        ld1             {v0.4s}, [x16]
    689 
    690        movi            v1.4s,   #1, msl #16     // row_clip_max = ~(~bdmax << 7), 0x1ffff
    691 
    692        sqadd           v2.4s,   v16.4s,  v20.4s // t0
    693        sqsub           v3.4s,   v16.4s,  v20.4s // t4
    694        mvni            v20.4s,  #1, msl #16     // row_clip_min = (~bdmax << 7), 0xfffe0000
    695        sqadd           v4.4s,   v23.4s,  v19.4s // t1
    696        sqsub           v5.4s,   v23.4s,  v19.4s // t5
    697        sqadd           v6.4s,   v18.4s,  v22.4s // t2
    698        sqsub           v7.4s,   v18.4s,  v22.4s // t6
    699        sqadd           v18.4s,  v21.4s,  v17.4s // t3
    700        sqsub           v19.4s,  v21.4s,  v17.4s // t7
    701 
    702 .irp r, v2, v3, v4, v5, v6, v7, v18, v19
    703        smin_4s         \r, \r, v1
    704 .endr
    705 .irp r, v2, v3, v4, v5, v6, v7, v18, v19
    706        smax_4s         \r, \r, v20
    707 .endr
    708 
    709        mul_mla         v16, v3,  v5,  v0.s[3], v0.s[2]
    710        mul_mls         v20, v3,  v5,  v0.s[2], v0.s[3]
    711        mul_mls         v22, v19, v7,  v0.s[3], v0.s[2]
    712 
    713        srshr           v3.4s,  v16.4s, #12  // t4a
    714        srshr           v5.4s,  v20.4s, #12  // t5a
    715 
    716        mul_mla         v16, v19, v7,  v0.s[2], v0.s[3]
    717 
    718        srshr           v7.4s,  v22.4s, #12  // t6a
    719        srshr           v19.4s, v16.4s, #12  // t7a
    720 
    721        sqadd           \o0\().4s, v2.4s, v6.4s  // out0
    722        sqsub           v2.4s,     v2.4s, v6.4s  // t2
    723        sqadd           \o7\().4s, v4.4s, v18.4s // out7
    724        sqsub           v4.4s,     v4.4s, v18.4s // t3
    725 
    726        mvni            v18.4s,  #1, msl #16     // row_clip_min = (~bdmax << 7), 0xfffe0000
    727 
    728        sqadd           \o1\().4s, v3.4s, v7.4s  // out1
    729        sqsub           v3.4s,     v3.4s, v7.4s  // t6
    730        sqadd           \o6\().4s, v5.4s, v19.4s // out6
    731        sqsub           v5.4s,     v5.4s, v19.4s // t7
    732 
    733        // Not clipping the output registers, as they will be downshifted and
    734        // narrowed afterwards anyway.
    735 .irp r, v2, v4, v3, v5
    736        smin_4s         \r, \r, v1
    737 .endr
    738 .irp r, v2, v4, v3, v5
    739        smax_4s         \r, \r, v18
    740 .endr
    741 
    742        sqneg           \o7\().4s, \o7\().4s     // out7
    743        sqneg           \o1\().4s, \o1\().4s     // out1
    744 
    745        mul_mla         v18, v2,  v4,  v0.s[0], v0.s[0] // -> out3 (v19 or v20)
    746        mul_mls         v6,  v2,  v4,  v0.s[0], v0.s[0] // -> out4 (v20 or v19)
    747        mul_mls         v20, v3,  v5,  v0.s[0], v0.s[0] // -> out5 (v21 or v18)
    748        srshr           v2.4s,  v18.4s, #12 // out3
    749        mul_mla         v18, v3,  v5,  v0.s[0], v0.s[0] // -> out2 (v18 or v21)
    750        srshr           v3.4s,  v20.4s, #12 // out5
    751        srshr           \o2\().4s, v18.4s, #12 // out2 (v18 or v21)
    752        srshr           \o4\().4s, v6.4s,  #12 // out4 (v20 or v19)
    753 
    754        sqneg           \o3\().4s, v2.4s     // out3
    755        sqneg           \o5\().4s, v3.4s     // out5
    756 .endm
    757 
    758 function inv_adst_4s_x8_neon
    759        AARCH64_VALID_CALL_TARGET
    760        iadst_8         v16, v17, v18, v19, v20, v21, v22, v23
    761        ret
    762 endfunc
    763 
    764 function inv_flipadst_4s_x8_neon
    765        AARCH64_VALID_CALL_TARGET
    766        iadst_8         v23, v22, v21, v20, v19, v18, v17, v16
    767        ret
    768 endfunc
    769 
    770 function inv_identity_4s_x8_neon
    771        AARCH64_VALID_CALL_TARGET
    772        sqshl           v16.4s,  v16.4s,  #1
    773        sqshl           v17.4s,  v17.4s,  #1
    774        sqshl           v18.4s,  v18.4s,  #1
    775        sqshl           v19.4s,  v19.4s,  #1
    776        sqshl           v20.4s,  v20.4s,  #1
    777        sqshl           v21.4s,  v21.4s,  #1
    778        sqshl           v22.4s,  v22.4s,  #1
    779        sqshl           v23.4s,  v23.4s,  #1
    780        ret
    781 endfunc
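         // For the 8-point identity transform the scale factor is exactly 2, hence
         // the saturating left shift by 1.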
    782 
    783 function inv_txfm_add_8x8_neon
    784        movi            v31.4s,  #0
    785 
    786        cmp             w3,  w13
    787        mov             x11, #32
    788        b.lt            1f
    789 
    790        add             x6,  x2,  #16
    791 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
    792        ld1             {\i},     [x6]
    793        st1             {v31.4s}, [x6], x11
    794 .endr
    795 
    796        blr             x4
    797 
    798        sqrshrn         v24.4h,  v16.4s,  #1
    799        sqrshrn         v25.4h,  v17.4s,  #1
    800        sqrshrn         v26.4h,  v18.4s,  #1
    801        sqrshrn         v27.4h,  v19.4s,  #1
    802        sqrshrn2        v24.8h,  v20.4s,  #1
    803        sqrshrn2        v25.8h,  v21.4s,  #1
    804        sqrshrn2        v26.8h,  v22.4s,  #1
    805        sqrshrn2        v27.8h,  v23.4s,  #1
    806 
    807        transpose_4x8h  v24, v25, v26, v27, v2, v3, v4, v5
    808 
    809        b               2f
    810 
    811 1:
    812 .irp i, v24.8h, v25.8h, v26.8h, v27.8h
    813        movi            \i,  #0
    814 .endr
    815 
    816 2:
    817 
    818 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
    819        ld1             {\i},     [x2]
    820        st1             {v31.4s}, [x2], x11
    821 .endr
    822 
    823        blr             x4
    824 
    825        sqrshrn         v16.4h,  v16.4s,  #1
    826        sqrshrn         v17.4h,  v17.4s,  #1
    827        sqrshrn         v18.4h,  v18.4s,  #1
    828        sqrshrn         v19.4h,  v19.4s,  #1
    829        sqrshrn2        v16.8h,  v20.4s,  #1
    830        sqrshrn2        v17.8h,  v21.4s,  #1
    831        sqrshrn2        v18.8h,  v22.4s,  #1
    832        sqrshrn2        v19.8h,  v23.4s,  #1
    833 
    834        transpose_4x8h  v16, v17, v18, v19, v20, v21, v22, v23
    835 
    836        mov             v20.16b, v24.16b
    837        mov             v21.16b, v25.16b
    838        mov             v22.16b, v26.16b
    839        mov             v23.16b, v27.16b
    840 
    841        blr             x5
    842 
    843        load_add_store_8x8 x0, x7
    844        ret             x15
    845 endfunc
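         // Shared 8x8 driver: if eob is below the threshold in w13, the second group
         // of coefficients is known to be all zero and its first-pass call is skipped
         // (v24-v27 are simply zeroed). Otherwise both halves run the 32-bit first
         // transform (x4), get narrowed to 16 bits with a rounding shift of 1,
         // transposed, passed through the 16-bit second transform (x5) and written
         // back with load_add_store_8x8.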
    846 
    847 .macro def_fn_8x8 txfm1, txfm2, eob_half
    848 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
    849        mov             x15, x30
    850 
    851 .ifc \txfm1\()_\txfm2, dct_dct
    852        idct_dc         8,   8,   1
    853 .endif
    854        movrel          x5,  X(inv_\txfm2\()_8h_x8_neon)
    855        mov             w13, #\eob_half
    856        adr             x4,  inv_\txfm1\()_4s_x8_neon
    857        b               inv_txfm_add_8x8_neon
    858 endfunc
    859 .endm
    860 
    861 def_fn_8x8 dct, dct, 10
    862 def_fn_8x8 identity, identity, 10
    863 def_fn_8x8 dct, adst, 10
    864 def_fn_8x8 dct, flipadst, 10
    865 def_fn_8x8 dct, identity, 4
    866 def_fn_8x8 adst, dct, 10
    867 def_fn_8x8 adst, adst, 10
    868 def_fn_8x8 adst, flipadst, 10
    869 def_fn_8x8 flipadst, dct, 10
    870 def_fn_8x8 flipadst, adst, 10
    871 def_fn_8x8 flipadst, flipadst, 10
    872 def_fn_8x8 identity, dct, 4
    873 def_fn_8x8 adst, identity, 4
    874 def_fn_8x8 flipadst, identity, 4
    875 def_fn_8x8 identity, adst, 4
    876 def_fn_8x8 identity, flipadst, 4
    877 
    878 function inv_txfm_add_8x4_neon
    879        movi            v28.4s,  #0
    880        movi            v29.4s,  #0
    881        movi            v30.4s,  #0
    882        movi            v31.4s,  #0
    883        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
    884        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
    885        movz            w16, #2896*8, lsl #16
    886        dup             v0.2s,   w16
    887        ld1             {v20.4s,v21.4s,v22.4s,v23.4s}, [x2]
    888        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2]
    889 
    890        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
    891 
    892        blr             x4
    893 
    894        sqxtn           v16.4h,  v16.4s
    895        sqxtn           v17.4h,  v17.4s
    896        sqxtn           v18.4h,  v18.4s
    897        sqxtn           v19.4h,  v19.4s
    898        sqxtn           v20.4h,  v20.4s
    899        sqxtn           v21.4h,  v21.4s
    900        sqxtn           v22.4h,  v22.4s
    901        sqxtn           v23.4h,  v23.4s
    902 
    903        transpose_4x4h  v16, v17, v18, v19, v4,  v5,  v6,  v7
    904        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
    905        ins             v16.d[1], v20.d[0]
    906        ins             v17.d[1], v21.d[0]
    907        ins             v18.d[1], v22.d[0]
    908        ins             v19.d[1], v23.d[0]
    909 
    910        blr             x5
    911 
    912        load_add_store_8x4 x0, x7
    913        ret             x15
    914 endfunc
    915 
    916 function inv_txfm_add_4x8_neon
    917        movz            w16, #2896*8, lsl #16
    918        movi            v31.4s,  #0
    919        dup             v30.2s,  w16
    920 
    921        cmp             w3,  w13
    922        mov             x11, #32
    923        b.lt            1f
    924 
    925        add             x6,  x2,  #16
    926 .irp i, v16.4s, v17.4s, v18.4s, v19.4s
    927        ld1             {\i},     [x6]
    928        st1             {v31.4s}, [x6], x11
    929 .endr
    930        scale_input     .4s, v30.s[0], v16, v17, v18, v19
    931        blr             x4
    932        sqxtn           v20.4h,  v16.4s
    933        sqxtn           v21.4h,  v17.4s
    934        sqxtn           v22.4h,  v18.4s
    935        sqxtn           v23.4h,  v19.4s
    936        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
    937 
    938        b               2f
    939 
    940 1:
    941 .irp i, v20, v21, v22, v23
    942        movi            \i\().4h, #0
    943 .endr
    944 
    945 2:
    946 
    947 .irp i, v16.4s, v17.4s, v18.4s, v19.4s
    948        ld1             {\i},     [x2]
    949        st1             {v31.4s}, [x2], x11
    950 .endr
    951        scale_input     .4s, v30.s[0], v16, v17, v18, v19
    952        blr             x4
    953        sqxtn           v16.4h,  v16.4s
    954        sqxtn           v17.4h,  v17.4s
    955        sqxtn           v18.4h,  v18.4s
    956        sqxtn           v19.4h,  v19.4s
    957        transpose_4x4h  v16, v17, v18, v19, v4,  v5,  v6,  v7
    958 
    959        blr             x5
    960 
    961        load_add_store_4x8 x0, x7
    962        ret             x15
    963 endfunc
    964 
    965 .macro def_fn_48 w, h, txfm1, txfm2, eob_half
    966 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
    967        mov             x15, x30
    968 
    969 .ifc \txfm1\()_\txfm2, dct_dct
    970        idct_dc         \w,  \h,  0
    971 .endif
    972        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
    973 .if \w == 4
    974        mov             w13, #\eob_half
    975 .endif
    976        movrel          x5,  X(inv_\txfm2\()_\w\()h_x\h\()_neon)
    977        b               inv_txfm_add_\w\()x\h\()_neon
    978 endfunc
    979 .endm
    980 
    981 .macro def_fns_48 w, h
    982 def_fn_48 \w, \h, dct, dct, 13
    983 def_fn_48 \w, \h, identity, identity, 13
    984 def_fn_48 \w, \h, dct, adst, 13
    985 def_fn_48 \w, \h, dct, flipadst, 13
    986 def_fn_48 \w, \h, dct, identity, 4
    987 def_fn_48 \w, \h, adst, dct, 13
    988 def_fn_48 \w, \h, adst, adst, 13
    989 def_fn_48 \w, \h, adst, flipadst, 13
    990 def_fn_48 \w, \h, flipadst, dct, 13
    991 def_fn_48 \w, \h, flipadst, adst, 13
    992 def_fn_48 \w, \h, flipadst, flipadst, 13
    993 def_fn_48 \w, \h, identity, dct, 16
    994 def_fn_48 \w, \h, adst, identity, 4
    995 def_fn_48 \w, \h, flipadst, identity, 4
    996 def_fn_48 \w, \h, identity, adst, 16
    997 def_fn_48 \w, \h, identity, flipadst, 16
    998 .endm
    999 
   1000 def_fns_48 4, 8
   1001 def_fns_48 8, 4
   1002 
   1003 
   1004 function inv_dct_4s_x16_neon
   1005        AARCH64_VALID_CALL_TARGET
   1006        movrel          x16, idct_coeffs
   1007        ld1             {v0.4s, v1.4s}, [x16], #32
   1008 
   1009        idct_8          v16, v18, v20, v22, v24, v26, v28, v30
   1010 
   1011        // idct_8 leaves the row_clip_max/min constants in v5 and v4
   1012 .irp r, v16, v18, v20, v22, v24, v26, v28, v30
   1013        smin            \r\().4s,  \r\().4s,  v5.4s
   1014 .endr
   1015 .irp r, v16, v18, v20, v22, v24, v26, v28, v30
   1016        smax            \r\().4s,  \r\().4s,  v4.4s
   1017 .endr
   1018 
   1019        ld1             {v0.4s, v1.4s}, [x16]
   1020        sub             x16, x16, #32
   1021 
   1022        mul_mls         v2,  v17, v31, v0.s[0], v0.s[1] // -> t8a
   1023        mul_mla         v3,  v17, v31, v0.s[1], v0.s[0] // -> t15a
   1024        mul_mls         v6,  v25, v23, v0.s[2], v0.s[3] // -> t9a
   1025        srshr           v17.4s, v2.4s,  #12             // t8a
   1026        srshr           v31.4s, v3.4s,  #12             // t15a
   1027        mul_mla         v2,  v25, v23, v0.s[3], v0.s[2] // -> t14a
   1028        mul_mls         v3,  v21, v27, v1.s[0], v1.s[1] // -> t10a
   1029        srshr           v23.4s, v6.4s,  #12             // t9a
   1030        srshr           v25.4s, v2.4s,  #12             // t14a
   1031        mul_mla         v6,  v21, v27, v1.s[1], v1.s[0] // -> t13a
   1032        mul_mls         v2,  v29, v19, v1.s[2], v1.s[3] // -> t11a
   1033        srshr           v21.4s, v3.4s,  #12             // t10a
   1034        srshr           v27.4s, v6.4s,  #12             // t13a
   1035        mul_mla         v3,  v29, v19, v1.s[3], v1.s[2] // -> t12a
   1036        srshr           v19.4s, v2.4s,  #12             // t11a
   1037        srshr           v29.4s, v3.4s,  #12             // t12a
   1038 
   1039        ld1             {v0.4s}, [x16]
   1040 
   1041        sqsub           v2.4s,   v17.4s,  v23.4s  // t9
   1042        sqadd           v17.4s,  v17.4s,  v23.4s  // t8
   1043        sqsub           v3.4s,   v31.4s,  v25.4s  // t14
   1044        sqadd           v31.4s,  v31.4s,  v25.4s  // t15
   1045        sqsub           v23.4s,  v19.4s,  v21.4s  // t10
   1046        sqadd           v19.4s,  v19.4s,  v21.4s  // t11
   1047        sqadd           v25.4s,  v29.4s,  v27.4s  // t12
   1048        sqsub           v29.4s,  v29.4s,  v27.4s  // t13
   1049 
   1050 .irp r, v2, v17, v3, v31, v23, v19, v25, v29
   1051        smin            \r\().4s,  \r\().4s,  v5.4s
   1052 .endr
   1053 .irp r, v2, v17, v3, v31, v23, v19, v25, v29
   1054        smax            \r\().4s,  \r\().4s,  v4.4s
   1055 .endr
   1056 
   1057        mul_mls         v7,  v3,  v2,  v0.s[2], v0.s[3] // -> t9a
   1058        mul_mla         v6,  v3,  v2,  v0.s[3], v0.s[2] // -> t14a
   1059        srshr           v21.4s, v7.4s,  #12             // t9a
   1060        srshr           v27.4s, v6.4s,  #12             // t14a
   1061 
   1062        mul_mls         v7,  v29, v23, v0.s[2], v0.s[3] // -> t13a
   1063        mul_mla         v6,  v29, v23, v0.s[3], v0.s[2] // -> t10a
   1064        srshr           v29.4s, v7.4s,  #12             // t13a
   1065        neg             v6.4s,   v6.4s
   1066        srshr           v23.4s, v6.4s,  #12             // t10a
   1067 
   1068        sqsub           v2.4s,   v17.4s,  v19.4s  // t11a
   1069        sqadd           v17.4s,  v17.4s,  v19.4s  // t8a
   1070        sqsub           v3.4s,   v31.4s,  v25.4s  // t12a
   1071        sqadd           v31.4s,  v31.4s,  v25.4s  // t15a
   1072        sqadd           v19.4s,  v21.4s,  v23.4s  // t9
   1073        sqsub           v21.4s,  v21.4s,  v23.4s  // t10
   1074        sqsub           v25.4s,  v27.4s,  v29.4s  // t13
   1075        sqadd           v27.4s,  v27.4s,  v29.4s  // t14
   1076 
   1077 .irp r, v2, v17, v3, v31, v19, v21, v25, v27
   1078        smin            \r\().4s,  \r\().4s,  v5.4s
   1079 .endr
   1080 .irp r, v2, v17, v3, v31, v19, v21, v25, v27
   1081        smax            \r\().4s,  \r\().4s,  v4.4s
   1082 .endr
   1083 
   1084        mul_mls         v7,  v3,  v2,  v0.s[0], v0.s[0] // -> t11
   1085        mul_mla         v6,  v3,  v2,  v0.s[0], v0.s[0] // -> t12
   1086        mul_mls         v2,  v25, v21, v0.s[0], v0.s[0] // -> t10a
   1087 
   1088        srshr           v7.4s,  v7.4s,  #12   // t11
   1089        srshr           v6.4s,  v6.4s,  #12   // t12
   1090        mul_mla         v3,  v25, v21, v0.s[0], v0.s[0] // -> t13a
   1091        srshr           v2.4s,  v2.4s,  #12   // t10a
   1092        srshr           v3.4s,  v3.4s,  #12   // t13a
   1093 
   1094        sqadd           v1.4s,   v16.4s,  v31.4s  // out0
   1095        sqsub           v31.4s,  v16.4s,  v31.4s  // out15
   1096        mov             v16.16b, v1.16b
   1097        sqadd           v23.4s,  v30.4s,  v17.4s  // out7
   1098        sqsub           v1.4s,   v30.4s,  v17.4s  // out8
   1099        sqadd           v17.4s,  v18.4s,  v27.4s  // out1
   1100        sqsub           v30.4s,  v18.4s,  v27.4s  // out14
   1101        sqadd           v18.4s,  v20.4s,  v3.4s   // out2
   1102        sqsub           v29.4s,  v20.4s,  v3.4s   // out13
   1103        sqadd           v3.4s,   v28.4s,  v19.4s  // out6
   1104        sqsub           v25.4s,  v28.4s,  v19.4s  // out9
   1105        sqadd           v19.4s,  v22.4s,  v6.4s   // out3
   1106        sqsub           v28.4s,  v22.4s,  v6.4s   // out12
   1107        sqadd           v20.4s,  v24.4s,  v7.4s   // out4
   1108        sqsub           v27.4s,  v24.4s,  v7.4s   // out11
   1109        sqadd           v21.4s,  v26.4s,  v2.4s   // out5
   1110        sqsub           v26.4s,  v26.4s,  v2.4s   // out10
   1111        mov             v24.16b, v1.16b
   1112        mov             v22.16b, v3.16b
   1113 
   1114        ret
   1115 endfunc
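         // The 16-point DCT follows the same pattern as idct_8: the even inputs go
         // through idct_8, then the odd inputs are rotated with the idct16
         // coefficients, with the same intermediate clipping between stages.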
   1116 
   1117 .macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
   1118        movrel          x16, iadst16_coeffs
   1119        ld1             {v0.4s, v1.4s}, [x16], #32
   1120 
   1121        mul_mla         v2,  v31, v16, v0.s[0], v0.s[1] // -> t0
   1122        mul_mls         v4,  v31, v16, v0.s[1], v0.s[0] // -> t1
   1123        mul_mla         v6,  v29, v18, v0.s[2], v0.s[3] // -> t2
   1124        srshr           v16.4s, v2.4s,  #12             // t0
   1125        srshr           v31.4s, v4.4s,  #12             // t1
   1126        mul_mls         v2,  v29, v18, v0.s[3], v0.s[2] // -> t3
   1127        mul_mla         v4,  v27, v20, v1.s[0], v1.s[1] // -> t4
   1128        srshr           v18.4s, v6.4s,  #12             // t2
   1129        srshr           v29.4s, v2.4s,  #12             // t3
   1130        mul_mls         v6,  v27, v20, v1.s[1], v1.s[0] // -> t5
   1131        mul_mla         v2,  v25, v22, v1.s[2], v1.s[3] // -> t6
   1132        srshr           v20.4s, v4.4s,  #12             // t4
   1133        srshr           v27.4s, v6.4s,  #12             // t5
   1134        mul_mls         v4,  v25, v22, v1.s[3], v1.s[2] // -> t7
   1135        ld1             {v0.4s, v1.4s}, [x16]
   1136        movrel          x16, idct_coeffs
   1137        mul_mla         v6,  v23, v24, v0.s[0], v0.s[1] // -> t8
   1138        srshr           v22.4s, v2.4s,  #12             // t6
   1139        srshr           v25.4s, v4.4s,  #12             // t7
   1140        mul_mls         v2,  v23, v24, v0.s[1], v0.s[0] // -> t9
   1141        mul_mla         v4,  v21, v26, v0.s[2], v0.s[3] // -> t10
   1142        srshr           v23.4s, v6.4s,  #12             // t8
   1143        srshr           v24.4s, v2.4s,  #12             // t9
   1144        mul_mls         v6,  v21, v26, v0.s[3], v0.s[2] // -> t11
   1145        mul_mla         v2,  v19, v28, v1.s[0], v1.s[1] // -> t12
   1146        srshr           v21.4s, v4.4s,  #12             // t10
   1147        srshr           v26.4s, v6.4s,  #12             // t11
   1148        mul_mls         v4,  v19, v28, v1.s[1], v1.s[0] // -> t13
   1149        mul_mla         v6,  v17, v30, v1.s[2], v1.s[3] // -> t14
   1150        srshr           v19.4s, v2.4s,  #12             // t12
   1151        srshr           v28.4s, v4.4s,  #12             // t13
   1152        mul_mls         v2,  v17, v30, v1.s[3], v1.s[2] // -> t15
   1153        srshr           v17.4s, v6.4s,  #12             // t14
   1154        srshr           v30.4s, v2.4s,  #12             // t15
   1155 
   1156        ld1             {v0.4s, v1.4s}, [x16]
   1157 
   1158        movi            v5.4s,   #1, msl #16     // row_clip_max = ~(~bdmax << 7), 0x1ffff
   1159        mvni            v7.4s,   #1, msl #16     // row_clip_min = (~bdmax << 7), 0xfffe0000
   1160 
   1161        sqsub           v2.4s,   v16.4s,  v23.4s // t8a
   1162        sqadd           v16.4s,  v16.4s,  v23.4s // t0a
   1163        sqsub           v3.4s,   v31.4s,  v24.4s // t9a
   1164        sqadd           v31.4s,  v31.4s,  v24.4s // t1a
   1165        sqadd           v23.4s,  v18.4s,  v21.4s // t2a
   1166        sqsub           v18.4s,  v18.4s,  v21.4s // t10a
   1167        sqadd           v24.4s,  v29.4s,  v26.4s // t3a
   1168        sqsub           v29.4s,  v29.4s,  v26.4s // t11a
   1169        sqadd           v21.4s,  v20.4s,  v19.4s // t4a
   1170        sqsub           v20.4s,  v20.4s,  v19.4s // t12a
   1171        sqadd           v26.4s,  v27.4s,  v28.4s // t5a
   1172        sqsub           v27.4s,  v27.4s,  v28.4s // t13a
   1173        sqadd           v19.4s,  v22.4s,  v17.4s // t6a
   1174        sqsub           v22.4s,  v22.4s,  v17.4s // t14a
   1175        sqadd           v28.4s,  v25.4s,  v30.4s // t7a
   1176        sqsub           v25.4s,  v25.4s,  v30.4s // t15a
   1177 
   1178 .irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
   1179        smin_4s         \r, \r, v5
   1180 .endr
   1181 .irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
   1182        smax_4s         \r, \r, v7
   1183 .endr
   1184 
   1185        mul_mla         v4,  v2,  v3,  v1.s[1], v1.s[0] // -> t8
   1186        mul_mls         v6,  v2,  v3,  v1.s[0], v1.s[1] // -> t9
   1187        mul_mla         v2,  v18, v29, v1.s[3], v1.s[2] // -> t10
   1188        srshr           v17.4s, v4.4s,  #12             // t8
   1189        srshr           v30.4s, v6.4s,  #12             // t9
   1190        mul_mls         v4,  v18, v29, v1.s[2], v1.s[3] // -> t11
   1191        mul_mls         v6,  v27, v20, v1.s[1], v1.s[0] // -> t12
   1192        srshr           v18.4s, v2.4s,  #12             // t10
   1193        srshr           v29.4s, v4.4s,  #12             // t11
   1194        mul_mla         v2,  v27, v20, v1.s[0], v1.s[1] // -> t13
   1195        mul_mls         v4,  v25, v22, v1.s[3], v1.s[2] // -> t14
   1196        srshr           v27.4s, v6.4s,  #12             // t12
   1197        srshr           v20.4s, v2.4s,  #12             // t13
   1198        mul_mla         v6,  v25, v22, v1.s[2], v1.s[3] // -> t15
   1199        srshr           v25.4s, v4.4s,  #12             // t14
   1200        srshr           v22.4s, v6.4s,  #12             // t15
   1201 
   1202        sqsub           v2.4s,   v16.4s,  v21.4s // t4
   1203        sqadd           v16.4s,  v16.4s,  v21.4s // t0
   1204        sqsub           v3.4s,   v31.4s,  v26.4s // t5
   1205        sqadd           v31.4s,  v31.4s,  v26.4s // t1
   1206        sqadd           v21.4s,  v23.4s,  v19.4s // t2
   1207        sqsub           v23.4s,  v23.4s,  v19.4s // t6
   1208        sqadd           v26.4s,  v24.4s,  v28.4s // t3
   1209        sqsub           v24.4s,  v24.4s,  v28.4s // t7
   1210        sqadd           v19.4s,  v17.4s,  v27.4s // t8a
   1211        sqsub           v17.4s,  v17.4s,  v27.4s // t12a
   1212        sqadd           v28.4s,  v30.4s,  v20.4s // t9a
   1213        sqsub           v30.4s,  v30.4s,  v20.4s // t13a
   1214        sqadd           v27.4s,  v18.4s,  v25.4s // t10a
   1215        sqsub           v18.4s,  v18.4s,  v25.4s // t14a
   1216        sqadd           v20.4s,  v29.4s,  v22.4s // t11a
   1217        sqsub           v29.4s,  v29.4s,  v22.4s // t15a
   1218 
   1219 .irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
   1220        smin_4s         \r, \r, v5
   1221 .endr
   1222 .irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
   1223        smax_4s         \r, \r, v7
   1224 .endr
   1225 
   1226        mul_mla         v4,  v2,  v3,  v0.s[3], v0.s[2] // -> t4a
   1227        mul_mls         v6,  v2,  v3,  v0.s[2], v0.s[3] // -> t5a
   1228        mul_mls         v2,  v24, v23, v0.s[3], v0.s[2] // -> t6a
   1229        srshr           v22.4s, v4.4s,  #12             // t4a
   1230        srshr           v25.4s, v6.4s,  #12             // t5a
   1231        mul_mla         v4,  v24, v23, v0.s[2], v0.s[3] // -> t7a
   1232        mul_mla         v6,  v17, v30, v0.s[3], v0.s[2] // -> t12
   1233        srshr           v24.4s, v2.4s,  #12             // t6a
   1234        srshr           v23.4s, v4.4s,  #12             // t7a
   1235        mul_mls         v2,  v17, v30, v0.s[2], v0.s[3] // -> t13
   1236        mul_mls         v4,  v29, v18, v0.s[3], v0.s[2] // -> t14
   1237        srshr           v17.4s, v6.4s,  #12             // t12
   1238        mul_mla         v6,  v29, v18, v0.s[2], v0.s[3] // -> t15
   1239        srshr           v29.4s, v2.4s,  #12             // t13
   1240        srshr           v30.4s, v4.4s,  #12             // t14
   1241        srshr           v18.4s, v6.4s,  #12             // t15
   1242 
   1243        sqsub           v2.4s,   v16.4s,  v21.4s // t2a
   1244 .ifc \o0, v16
   1245        sqadd           \o0\().4s,  v16.4s,  v21.4s // out0
   1246        sqsub           v21.4s,     v31.4s,  v26.4s // t3a
   1247        sqadd           \o15\().4s, v31.4s,  v26.4s // out15
   1248 .else
   1249        sqadd           v4.4s,      v16.4s,  v21.4s // out0
   1250        sqsub           v21.4s,     v31.4s,  v26.4s // t3a
   1251        sqadd           \o15\().4s, v31.4s,  v26.4s // out15
   1252        mov             \o0\().16b, v4.16b
   1253 .endif
   1254 
   1255        sqsub           v3.4s,      v29.4s,  v18.4s // t15a
   1256        sqadd           \o13\().4s, v29.4s,  v18.4s // out13
   1257        sqadd           \o2\().4s,  v17.4s,  v30.4s // out2
   1258        sqsub           v26.4s,     v17.4s,  v30.4s // t14a
   1259 
   1260        sqadd           \o1\().4s,  v19.4s,  v27.4s // out1
   1261        sqsub           v27.4s,     v19.4s,  v27.4s // t10
   1262        sqadd           \o14\().4s, v28.4s,  v20.4s // out14
   1263        sqsub           v20.4s,     v28.4s,  v20.4s // t11
   1264 
   1265        sqadd           \o3\().4s,  v22.4s,  v24.4s // out3
   1266        sqsub           v22.4s,     v22.4s,  v24.4s // t6
   1267        sqadd           \o12\().4s, v25.4s,  v23.4s // out12
   1268        sqsub           v23.4s,     v25.4s,  v23.4s // t7
   1269 
   1270        // Not clipping the output registers, as they will be downshifted and
   1271        // narrowed afterwards anyway.
   1272 .irp r, v2, v21, v3, v26, v27, v20, v22, v23
   1273        smin_4s         \r, \r, v5
   1274 .endr
   1275 .irp r, v2, v21, v3, v26, v27, v20, v22, v23
   1276        smax_4s         \r, \r, v7
   1277 .endr
   1278 
   1279        sqneg           \o15\().4s, \o15\().4s      // out15
   1280        sqneg           \o13\().4s, \o13\().4s      // out13
   1281        sqneg           \o1\().4s,  \o1\().4s       // out1
   1282        sqneg           \o3\().4s,  \o3\().4s       // out3
   1283 
   1284        mul_mls         v24, v2,  v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
   1285        mul_mla         v4,  v2,  v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24)
   1286        mul_mla         v6,  v26, v3,  v0.s[0], v0.s[0] // -> out5 (v21 or v26)
   1287 
   1288        srshr           v24.4s, v24.4s, #12             // out8
   1289        srshr           v4.4s,  v4.4s,  #12             // out7
   1290        srshr           v5.4s,  v6.4s,  #12             // out5
   1291        mul_mls         v6,  v26, v3,  v0.s[0], v0.s[0] // -> out10 (v26 or v21)
   1292        mul_mla         v2,  v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27)
   1293        srshr           v26.4s, v6.4s,  #12             // out10
   1294 
   1295        mul_mls         v6,  v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20)
   1296        mul_mla         v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25)
   1297        mul_mls         v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22)
   1298 
   1299        srshr           \o4\().4s,   v2.4s,  #12        // out4
   1300        srshr           v6.4s,       v6.4s,  #12        // out11
   1301        srshr           v7.4s,       v21.4s, #12        // out9
   1302        srshr           \o6\().4s,   v22.4s, #12        // out6
   1303 
   1304 .ifc \o8, v23
   1305        mov             \o8\().16b,  v24.16b
   1306        mov             \o10\().16b, v26.16b
   1307 .endif
   1308 
   1309        sqneg           \o7\().4s,   v4.4s // out7
   1310        sqneg           \o5\().4s,   v5.4s // out5
   1311        sqneg           \o11\().4s,  v6.4s // out11
   1312        sqneg           \o9\().4s,   v7.4s // out9
   1313 .endm
   1314 
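// Note (editorial comment): the two wrappers below simply expand iadst_16 with
// different output register orders - inv_flipadst passes v31..v16 instead of
// v16..v31 - so the flipped variant falls out of the same macro body with no
// extra work.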
   1315 function inv_adst_4s_x16_neon
   1316        AARCH64_VALID_CALL_TARGET
   1317        iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
   1318        ret
   1319 endfunc
   1320 
   1321 function inv_flipadst_4s_x16_neon
   1322        AARCH64_VALID_CALL_TARGET
   1323        iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16
   1324        ret
   1325 endfunc
   1326 
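// Note (editorial comment): the 16-point identity transform scales by
// 2*sqrt(2) ~= 5793/2048. It is computed as 2*x + x*(5793-4096)/2048, with the
// fractional part coming from sqrdmulh and the doubling from saturating adds,
// so intermediates cannot wrap.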
   1327 function inv_identity_4s_x16_neon
   1328        AARCH64_VALID_CALL_TARGET
   1329        movz            w16, #2*(5793-4096)*8, lsl #16
   1330        dup             v0.2s,   w16
   1331 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   1332        sqrdmulh        v2.4s,      v\i\().4s,  v0.s[0]
   1333        sqadd           v\i\().4s,  v\i\().4s,  v\i\().4s
   1334        sqadd           v\i\().4s,  v\i\().4s,  v2.4s
   1335 .endr
   1336        ret
   1337 endfunc
   1338 
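// Note (editorial comment): the two macros below apply the same identity
// scaling to v16-v31. identity_4x16 gives the full 2*sqrt(2) factor;
// identity_4x16_shift1 additionally folds in a >>1, yielding ~sqrt(2)*x
// overall.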
   1339 .macro identity_4x16_shift1 c
   1340 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
   1341        sqrdmulh        v3.4s,   \i,      \c
   1342        srshr           v3.4s,   v3.4s,   #1
   1343        sqadd           \i,      \i,      v3.4s
   1344 .endr
   1345 .endm
   1346 
   1347 .macro identity_4x16 c
   1348 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
   1349        sqrdmulh        v3.4s,   \i,      \c
   1350        sqadd           \i,      \i,      \i
   1351        sqadd           \i,      \i,      v3.4s
   1352 .endr
   1353 .endm
   1354 
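// Note (editorial comment): first-pass helper for one 16x4 strip. The rows are
// loaded from [x7] (and cleared as they are read), optionally pre-scaled by
// 2896/4096 (~1/sqrt(2), used for the rectangular transform sizes), run through
// the first transform via x4, rounded down by \shift into 16 bit, transposed
// and stored to the intermediate buffer at x6. The scale variant branches into
// the shared L(horz_16x4_epilog) instead of duplicating the epilog.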
   1355 .macro def_horz_16 scale=0, shift=2, suffix
   1356 function inv_txfm_horz\suffix\()_16x4_neon
   1357        mov             x14, x30
   1358        movi            v7.4s,  #0
   1359 .if \scale
   1360        movz            w16, #2896*8, lsl #16
   1361        dup             v0.2s,   w16
   1362 .endif
   1363 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
   1364        ld1             {\i}, [x7]
   1365        st1             {v7.4s}, [x7], x8
   1366 .endr
   1367 .if \scale
   1368        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
   1369        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
   1370 .endif
   1371        blr             x4
   1372        sqrshrn         v16.4h,  v16.4s,  #\shift
   1373        sqrshrn         v17.4h,  v17.4s,  #\shift
   1374        sqrshrn         v18.4h,  v18.4s,  #\shift
   1375        sqrshrn         v19.4h,  v19.4s,  #\shift
   1376        sqrshrn2        v16.8h,  v20.4s,  #\shift
   1377        sqrshrn2        v17.8h,  v21.4s,  #\shift
   1378        sqrshrn2        v18.8h,  v22.4s,  #\shift
   1379        sqrshrn2        v19.8h,  v23.4s,  #\shift
   1380        sqrshrn         v20.4h,  v24.4s,  #\shift
   1381        sqrshrn         v21.4h,  v25.4s,  #\shift
   1382        sqrshrn         v22.4h,  v26.4s,  #\shift
   1383        sqrshrn         v23.4h,  v27.4s,  #\shift
   1384        sqrshrn2        v20.8h,  v28.4s,  #\shift
   1385        sqrshrn2        v21.8h,  v29.4s,  #\shift
   1386        sqrshrn2        v22.8h,  v30.4s,  #\shift
   1387        sqrshrn2        v23.8h,  v31.4s,  #\shift
   1388 .if \scale
   1389        b               L(horz_16x4_epilog)
   1390 .else
   1391 L(horz_16x4_epilog):
   1392        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
   1393        transpose_4x8h  v20, v21, v22, v23, v4,  v5,  v6,  v7
   1394 
   1395 .irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h
   1396        st1             {\i}, [x6], #16
   1397 .endr
   1398 
   1399        ret             x14
   1400 .endif
   1401 endfunc
   1402 .endm
   1403 
   1404 def_horz_16 scale=1, shift=1, suffix=_scale
   1405 def_horz_16 scale=0, shift=2
   1406 
   1407 function inv_txfm_add_vert_8x16_neon
   1408        mov             x14, x30
   1409 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   1410        ld1             {v\i\().8h}, [x7], x8
   1411 .endr
   1412        blr             x5
   1413        load_add_store_8x16 x6, x7
   1414        ret             x14
   1415 endfunc
   1416 
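// Note (editorial comment): full 16x16 add. The first pass writes up to four
// 16x4 strips into a 512 byte (16x16 int16) buffer on the stack, comparing the
// eob in w3 against the thresholds at x13 so strips beyond the last nonzero
// coefficient are simply zero-filled. The second pass then runs the 8h
// transform over two 8x16 column halves and accumulates into the destination.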
   1417 function inv_txfm_add_16x16_neon
   1418        mov             x15, x30
   1419        sub             sp,  sp,  #512
   1420        ldrh            w12, [x13], #2
   1421 .irp i, 0, 4, 8, 12
   1422        add             x6,  sp,  #(\i*16*2)
   1423 .if \i > 0
   1424        mov             w8,  #(16 - \i)
   1425        cmp             w3,  w12
   1426        b.lt            1f
   1427 .if \i < 12
   1428        ldrh            w12, [x13], #2
   1429 .endif
   1430 .endif
   1431        add             x7,  x2,  #(\i*4)
   1432        mov             x8,  #16*4
   1433        bl              inv_txfm_horz_16x4_neon
   1434 .endr
   1435        b               3f
   1436 1:
   1437        movi            v4.8h,  #0
   1438        movi            v5.8h,  #0
   1439        movi            v6.8h,  #0
   1440        movi            v7.8h,  #0
   1441 2:
   1442        subs            w8,  w8,  #4
   1443 .rept 2
   1444        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
   1445 .endr
   1446        b.gt            2b
   1447 3:
   1448 .irp i, 0, 8
   1449        add             x6,  x0,  #(\i*2)
   1450        add             x7,  sp,  #(\i*2)
   1451        mov             x8,  #32
   1452        bl              inv_txfm_add_vert_8x16_neon
   1453 .endr
   1454 
   1455        add             sp,  sp,  #512
   1456        ret             x15
   1457 endfunc
   1458 
   1459 const eob_16x16
   1460        .short 10, 36, 78, 256
   1461 endconst
   1462 
   1463 const eob_16x16_identity
   1464        .short 4, 8, 12, 256
   1465 endconst
   1466 
   1467 .macro def_fn_16x16 txfm1, txfm2
   1468 function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1
   1469 .ifc \txfm1\()_\txfm2, dct_dct
   1470        idct_dc         16,  16,  2
   1471 .endif
   1472        adr             x4,  inv_\txfm1\()_4s_x16_neon
   1473        movrel          x5,  X(inv_\txfm2\()_8h_x16_neon)
   1474 .ifc \txfm1, identity
   1475 .ifc \txfm2, identity
   1476        movrel          x13, eob_16x16
   1477 .else
   1478        movrel          x13, eob_16x16_identity
   1479 .endif
   1480 .else
   1481 .ifc \txfm2, identity
   1482        movrel          x13, eob_16x16_identity
   1483 .else
   1484        movrel          x13, eob_16x16
   1485 .endif
   1486 .endif
   1487        b               inv_txfm_add_16x16_neon
   1488 endfunc
   1489 .endm
   1490 
   1491 def_fn_16x16 dct, dct
   1492 def_fn_16x16 identity, identity
   1493 def_fn_16x16 dct, adst
   1494 def_fn_16x16 dct, flipadst
   1495 def_fn_16x16 dct, identity
   1496 def_fn_16x16 adst, dct
   1497 def_fn_16x16 adst, adst
   1498 def_fn_16x16 adst, flipadst
   1499 def_fn_16x16 flipadst, dct
   1500 def_fn_16x16 flipadst, adst
   1501 def_fn_16x16 flipadst, flipadst
   1502 def_fn_16x16 identity, dct
   1503 
   1504 function inv_txfm_add_16x4_neon
   1505        mov             x15, x30
   1506        movi            v4.4s,  #0
   1507 
   1508 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
   1509        ld1             {\i},    [x2]
   1510        st1             {v4.4s}, [x2], #16
   1511 .endr
   1512 
   1513        blr             x4
   1514 
   1515        sqrshrn         v16.4h,  v16.4s,  #1
   1516        sqrshrn         v17.4h,  v17.4s,  #1
   1517        sqrshrn         v18.4h,  v18.4s,  #1
   1518        sqrshrn         v19.4h,  v19.4s,  #1
   1519        sqrshrn2        v16.8h,  v20.4s,  #1
   1520        sqrshrn2        v17.8h,  v21.4s,  #1
   1521        sqrshrn2        v18.8h,  v22.4s,  #1
   1522        sqrshrn2        v19.8h,  v23.4s,  #1
   1523        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
   1524        blr             x5
   1525        mov             x6,  x0
   1526        load_add_store_8x4 x6, x7
   1527 
   1528        sqrshrn         v16.4h,  v24.4s,  #1
   1529        sqrshrn         v17.4h,  v25.4s,  #1
   1530        sqrshrn         v18.4h,  v26.4s,  #1
   1531        sqrshrn         v19.4h,  v27.4s,  #1
   1532        sqrshrn2        v16.8h,  v28.4s,  #1
   1533        sqrshrn2        v17.8h,  v29.4s,  #1
   1534        sqrshrn2        v18.8h,  v30.4s,  #1
   1535        sqrshrn2        v19.8h,  v31.4s,  #1
   1536        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
   1537        blr             x5
   1538        add             x6,  x0,  #16
   1539        load_add_store_8x4 x6, x7
   1540 
   1541        ret             x15
   1542 endfunc
   1543 
   1544 function inv_txfm_add_4x16_neon
   1545        ldrh            w12, [x13, #4]
   1546        mov             x15, x30
   1547 
   1548        mov             x11, #64
   1549 
   1550        cmp             w3,  w12
   1551        ldrh            w12, [x13, #2]
   1552        b.lt            1f
   1553 
   1554        add             x6,  x2,  #48
   1555        movi            v2.4s,   #0
   1556 .irp i, v16.4s, v17.4s, v18.4s, v19.4s
   1557        ld1             {\i},    [x6]
   1558        st1             {v2.4s}, [x6], x11
   1559 .endr
   1560        blr             x4
   1561        sqrshrn         v28.4h,  v16.4s,  #1
   1562        sqrshrn         v29.4h,  v17.4s,  #1
   1563        sqrshrn         v30.4h,  v18.4s,  #1
   1564        sqrshrn         v31.4h,  v19.4s,  #1
   1565        transpose_4x4h  v28, v29, v30, v31, v4,  v5,  v6,  v7
   1566 
   1567        b               2f
   1568 1:
   1569 .irp i, v28.4h, v29.4h, v30.4h, v31.4h
   1570        movi            \i,  #0
   1571 .endr
   1572 2:
   1573        cmp             w3,  w12
   1574        ldrh            w12, [x13, #0]
   1575        b.lt            1f
   1576 
   1577        add             x6,  x2,  #32
   1578        movi            v2.4s,   #0
   1579 .irp i, v16.4s, v17.4s, v18.4s, v19.4s
   1580        ld1             {\i},    [x6]
   1581        st1             {v2.4s}, [x6], x11
   1582 .endr
   1583        blr             x4
   1584        sqrshrn         v24.4h,  v16.4s,  #1
   1585        sqrshrn         v25.4h,  v17.4s,  #1
   1586        sqrshrn         v26.4h,  v18.4s,  #1
   1587        sqrshrn         v27.4h,  v19.4s,  #1
   1588        transpose_4x4h  v24, v25, v26, v27, v4,  v5,  v6,  v7
   1589 
   1590        b               2f
   1591 1:
   1592 .irp i, v24.4h, v25.4h, v26.4h, v27.4h
   1593        movi            \i,  #0
   1594 .endr
   1595 2:
   1596        cmp             w3,  w12
   1597        b.lt            1f
   1598 
   1599        add             x6,  x2,  #16
   1600        movi            v2.4s,   #0
   1601 .irp i, v16.4s, v17.4s, v18.4s, v19.4s
   1602        ld1             {\i},    [x6]
   1603        st1             {v2.4s}, [x6], x11
   1604 .endr
   1605        blr             x4
   1606        sqrshrn         v20.4h,  v16.4s,  #1
   1607        sqrshrn         v21.4h,  v17.4s,  #1
   1608        sqrshrn         v22.4h,  v18.4s,  #1
   1609        sqrshrn         v23.4h,  v19.4s,  #1
   1610        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
   1611 
   1612        b               2f
   1613 1:
   1614 .irp i, v20.4h, v21.4h, v22.4h, v23.4h
   1615        movi            \i,  #0
   1616 .endr
   1617 2:
   1618 
   1619        movi            v2.4s,   #0
   1620 .irp i, v16.4s, v17.4s, v18.4s, v19.4s
   1621        ld1             {\i},    [x2]
   1622        st1             {v2.4s}, [x2], x11
   1623 .endr
   1624        blr             x4
   1625        sqrshrn         v16.4h,  v16.4s,  #1
   1626        sqrshrn         v17.4h,  v17.4s,  #1
   1627        sqrshrn         v18.4h,  v18.4s,  #1
   1628        sqrshrn         v19.4h,  v19.4s,  #1
   1629        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
   1630 
   1631        blr             x5
   1632 
   1633        load_add_store_4x16 x0, x6
   1634 
   1635        ret             x15
   1636 endfunc
   1637 
   1638 const eob_4x16
   1639        .short 13, 29, 45, 64
   1640 endconst
   1641 
   1642 const eob_4x16_identity1
   1643        .short 16, 32, 48, 64
   1644 endconst
   1645 
   1646 const eob_4x16_identity2
   1647        .short 4, 8, 12, 64
   1648 endconst
   1649 
   1650 .macro def_fn_416 w, h, txfm1, txfm2
   1651 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
   1652 .ifc \txfm1\()_\txfm2, dct_dct
   1653        idct_dc         \w,  \h,  1
   1654 .endif
   1655 .if \w == 4
   1656        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
   1657        movrel          x5,  X(inv_\txfm2\()_4h_x\h\()_neon)
   1658 .ifc \txfm1, identity
   1659 .ifc \txfm2, identity
   1660        movrel          x13, eob_4x16
   1661 .else
   1662        movrel          x13, eob_4x16_identity1
   1663 .endif
   1664 .else
   1665 .ifc \txfm2, identity
   1666        movrel          x13, eob_4x16_identity2
   1667 .else
   1668        movrel          x13, eob_4x16
   1669 .endif
   1670 .endif
   1671 .else
   1672        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
   1673        movrel          x5,  X(inv_\txfm2\()_8h_x\h\()_neon)
   1674 .endif
   1675        b               inv_txfm_add_\w\()x\h\()_neon
   1676 endfunc
   1677 .endm
   1678 
   1679 .macro def_fns_416 w, h
   1680 def_fn_416 \w, \h, dct, dct
   1681 def_fn_416 \w, \h, identity, identity
   1682 def_fn_416 \w, \h, dct, adst
   1683 def_fn_416 \w, \h, dct, flipadst
   1684 def_fn_416 \w, \h, dct, identity
   1685 def_fn_416 \w, \h, adst, dct
   1686 def_fn_416 \w, \h, adst, adst
   1687 def_fn_416 \w, \h, adst, flipadst
   1688 def_fn_416 \w, \h, flipadst, dct
   1689 def_fn_416 \w, \h, flipadst, adst
   1690 def_fn_416 \w, \h, flipadst, flipadst
   1691 def_fn_416 \w, \h, identity, dct
   1692 def_fn_416 \w, \h, adst, identity
   1693 def_fn_416 \w, \h, flipadst, identity
   1694 def_fn_416 \w, \h, identity, adst
   1695 def_fn_416 \w, \h, identity, flipadst
   1696 .endm
   1697 
   1698 def_fns_416 4, 16
   1699 def_fns_416 16, 4
   1700 
   1701 
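// Note (editorial comment): 16x8 keeps the whole first-pass output in
// registers instead of an intermediate buffer - one transposed half is parked
// in v8-v15 across the second-transform calls, hence the d8-d15 saves below.
// w13 holds the single eob threshold (loaded by def_fn_816 for h == 8) that
// decides whether the second half of the input needs processing at all.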
   1702 function inv_txfm_add_16x8_neon
   1703        mov             x15, x30
   1704        stp             d8,  d9,  [sp, #-0x40]!
   1705        stp             d10, d11, [sp, #0x10]
   1706        stp             d12, d13, [sp, #0x20]
   1707        stp             d14, d15, [sp, #0x30]
   1708 
   1709        cmp             w3,  w13
   1710        mov             x11, #32
   1711        b.lt            1f
   1712 
   1713        movi            v4.4s,  #0
   1714        movz            w16, #2896*8, lsl #16
   1715        dup             v0.2s,   w16
   1716 
   1717        add             x6,  x2,  #16
   1718 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
   1719        ld1             {\i},    [x6]
   1720        st1             {v4.4s}, [x6], x11
   1721 .endr
   1722 
   1723        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
   1724        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
   1725        blr             x4
   1726 
   1727        sqrshrn         v8.4h,   v16.4s,  #1
   1728        sqrshrn         v9.4h,   v17.4s,  #1
   1729        sqrshrn         v10.4h,  v18.4s,  #1
   1730        sqrshrn         v11.4h,  v19.4s,  #1
   1731        sqrshrn2        v8.8h,   v20.4s,  #1
   1732        sqrshrn2        v9.8h,   v21.4s,  #1
   1733        sqrshrn2        v10.8h,  v22.4s,  #1
   1734        sqrshrn2        v11.8h,  v23.4s,  #1
   1735        sqrshrn         v12.4h,  v24.4s,  #1
   1736        sqrshrn         v13.4h,  v25.4s,  #1
   1737        sqrshrn         v14.4h,  v26.4s,  #1
   1738        sqrshrn         v15.4h,  v27.4s,  #1
   1739        sqrshrn2        v12.8h,  v28.4s,  #1
   1740        sqrshrn2        v13.8h,  v29.4s,  #1
   1741        sqrshrn2        v14.8h,  v30.4s,  #1
   1742        sqrshrn2        v15.8h,  v31.4s,  #1
   1743 
   1744        transpose_4x8h  v8,  v9,  v10, v11, v2,  v3,  v4,  v5
   1745        transpose_4x8h  v12, v13, v14, v15, v2,  v3,  v4,  v5
   1746 
   1747        b               2f
   1748 1:
   1749 .irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h
   1750        movi            \i,  #0
   1751 .endr
   1752 2:
   1753        movz            w16, #2896*8, lsl #16
   1754        dup             v0.2s,   w16
   1755 
   1756        movi            v4.4s,  #0
   1757 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
   1758        ld1             {\i},    [x2]
   1759        st1             {v4.4s}, [x2], x11
   1760 .endr
   1761 
   1762        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
   1763        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
   1764        blr             x4
   1765 
   1766        sqrshrn         v16.4h,  v16.4s,  #1
   1767        sqrshrn         v17.4h,  v17.4s,  #1
   1768        sqrshrn         v18.4h,  v18.4s,  #1
   1769        sqrshrn         v19.4h,  v19.4s,  #1
   1770        sqrshrn2        v16.8h,  v20.4s,  #1
   1771        sqrshrn2        v17.8h,  v21.4s,  #1
   1772        sqrshrn2        v18.8h,  v22.4s,  #1
   1773        sqrshrn2        v19.8h,  v23.4s,  #1
   1774 
   1775        mov             v20.16b, v8.16b
   1776        mov             v21.16b, v9.16b
   1777        mov             v22.16b, v10.16b
   1778        mov             v23.16b, v11.16b
   1779 
   1780        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
   1781 
   1782        sqrshrn         v8.4h,   v24.4s,  #1
   1783        sqrshrn         v9.4h,   v25.4s,  #1
   1784        sqrshrn         v10.4h,  v26.4s,  #1
   1785        sqrshrn         v11.4h,  v27.4s,  #1
   1786        sqrshrn2        v8.8h,   v28.4s,  #1
   1787        sqrshrn2        v9.8h,   v29.4s,  #1
   1788        sqrshrn2        v10.8h,  v30.4s,  #1
   1789        sqrshrn2        v11.8h,  v31.4s,  #1
   1790 
   1791        transpose_4x8h  v8,  v9, v10, v11, v2,  v3,  v4,  v5
   1792 
   1793        blr             x5
   1794 
   1795        mov             x6,  x0
   1796        load_add_store_8x8 x6, x7
   1797 
   1798        mov             v16.16b, v8.16b
   1799        mov             v17.16b, v9.16b
   1800        mov             v18.16b, v10.16b
   1801        mov             v19.16b, v11.16b
   1802        mov             v20.16b, v12.16b
   1803        mov             v21.16b, v13.16b
   1804        mov             v22.16b, v14.16b
   1805        mov             v23.16b, v15.16b
   1806 
   1807        blr             x5
   1808 
   1809        add             x0,  x0,  #16
   1810        load_add_store_8x8 x0, x7
   1811 
   1812        ldp             d14, d15, [sp, #0x30]
   1813        ldp             d12, d13, [sp, #0x20]
   1814        ldp             d10, d11, [sp, #0x10]
   1815        ldp             d8,  d9,  [sp], 0x40
   1816        ret             x15
   1817 endfunc
   1818 
   1819 function inv_txfm_add_8x16_neon
   1820        mov             x15, x30
   1821        stp             d8,  d9,  [sp, #-0x20]!
   1822        stp             d10, d11, [sp, #0x10]
   1823        ldrh            w12, [x13, #4]
   1824 
   1825        mov             x11, #64
   1826 
   1827        cmp             w3,  w12
   1828        ldrh            w12, [x13, #2]
   1829        b.lt            1f
   1830 
   1831        add             x6,  x2,  #48
   1832        movi            v4.4s,   #0
   1833        movz            w16, #2896*8, lsl #16
   1834        dup             v0.2s,   w16
   1835 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
   1836        ld1             {\i},    [x6]
   1837        st1             {v4.4s}, [x6], x11
   1838 .endr
   1839        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
   1840        blr             x4
   1841 
   1842        sqrshrn         v28.4h,  v16.4s,  #1
   1843        sqrshrn         v29.4h,  v17.4s,  #1
   1844        sqrshrn         v30.4h,  v18.4s,  #1
   1845        sqrshrn         v31.4h,  v19.4s,  #1
   1846        sqrshrn2        v28.8h,  v20.4s,  #1
   1847        sqrshrn2        v29.8h,  v21.4s,  #1
   1848        sqrshrn2        v30.8h,  v22.4s,  #1
   1849        sqrshrn2        v31.8h,  v23.4s,  #1
   1850        transpose_4x8h  v28, v29, v30, v31, v2, v3, v4, v5
   1851 
   1852        b               2f
   1853 
   1854 1:
   1855 .irp i, v28.8h, v29.8h, v30.8h, v31.8h
   1856        movi            \i,  #0
   1857 .endr
   1858 
   1859 2:
   1860        cmp             w3,  w12
   1861        ldrh            w12, [x13, #0]
   1862        b.lt            1f
   1863 
   1864        add             x6,  x2,  #32
   1865        movi            v4.4s,   #0
   1866        movz            w16, #2896*8, lsl #16
   1867        dup             v0.2s,   w16
   1868 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
   1869        ld1             {\i},    [x6]
   1870        st1             {v4.4s}, [x6], x11
   1871 .endr
   1872        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
   1873        blr             x4
   1874 
   1875        sqrshrn         v24.4h,  v16.4s,  #1
   1876        sqrshrn         v25.4h,  v17.4s,  #1
   1877        sqrshrn         v26.4h,  v18.4s,  #1
   1878        sqrshrn         v27.4h,  v19.4s,  #1
   1879        sqrshrn2        v24.8h,  v20.4s,  #1
   1880        sqrshrn2        v25.8h,  v21.4s,  #1
   1881        sqrshrn2        v26.8h,  v22.4s,  #1
   1882        sqrshrn2        v27.8h,  v23.4s,  #1
   1883        transpose_4x8h  v24, v25, v26, v27, v2, v3, v4, v5
   1884 
   1885        b               2f
   1886 
   1887 1:
   1888 .irp i, v24.8h, v25.8h, v26.8h, v27.8h
   1889        movi            \i,  #0
   1890 .endr
   1891 
   1892 2:
   1893        cmp             w3,  w12
   1894        b.lt            1f
   1895 
   1896        add             x6,  x2,  #16
   1897        movi            v4.4s,   #0
   1898        movz            w16, #2896*8, lsl #16
   1899        dup             v0.2s,   w16
   1900 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
   1901        ld1             {\i},    [x6]
   1902        st1             {v4.4s}, [x6], x11
   1903 .endr
   1904        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
   1905        blr             x4
   1906 
   1907        sqrshrn         v8.4h,   v16.4s,  #1
   1908        sqrshrn         v9.4h,   v17.4s,  #1
   1909        sqrshrn         v10.4h,  v18.4s,  #1
   1910        sqrshrn         v11.4h,  v19.4s,  #1
   1911        sqrshrn2        v8.8h,   v20.4s,  #1
   1912        sqrshrn2        v9.8h,   v21.4s,  #1
   1913        sqrshrn2        v10.8h,  v22.4s,  #1
   1914        sqrshrn2        v11.8h,  v23.4s,  #1
   1915        transpose_4x8h  v8,  v9,  v10, v11, v2, v3, v4, v5
   1916 
   1917        b               2f
   1918 
   1919 1:
   1920 .irp i, v8.8h, v9.8h, v10.8h, v11.8h
   1921        movi            \i,  #0
   1922 .endr
   1923 
   1924 2:
   1925        movi            v4.4s,   #0
   1926        movz            w16, #2896*8, lsl #16
   1927        dup             v0.2s,   w16
   1928 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
   1929        ld1             {\i},    [x2]
   1930        st1             {v4.4s}, [x2], x11
   1931 .endr
   1932        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
   1933        blr             x4
   1934 
   1935        sqrshrn         v16.4h,  v16.4s,  #1
   1936        sqrshrn         v17.4h,  v17.4s,  #1
   1937        sqrshrn         v18.4h,  v18.4s,  #1
   1938        sqrshrn         v19.4h,  v19.4s,  #1
   1939        sqrshrn2        v16.8h,  v20.4s,  #1
   1940        sqrshrn2        v17.8h,  v21.4s,  #1
   1941        sqrshrn2        v18.8h,  v22.4s,  #1
   1942        sqrshrn2        v19.8h,  v23.4s,  #1
   1943        transpose_4x8h  v16, v17, v18, v19, v2, v3, v4, v5
   1944 
   1945        mov             v20.16b, v8.16b
   1946        mov             v21.16b, v9.16b
   1947        mov             v22.16b, v10.16b
   1948        mov             v23.16b, v11.16b
   1949 
   1950        blr             x5
   1951 
   1952        load_add_store_8x16 x0, x6
   1953 
   1954        ldp             d10, d11, [sp, #0x10]
   1955        ldp             d8,  d9,  [sp], 0x20
   1956 
   1957        ret             x15
   1958 endfunc
   1959 
   1960 const eob_8x16
   1961        .short 10, 43, 75, 128
   1962 endconst
   1963 
   1964 const eob_8x16_identity1
   1965        .short 4, 64, 96, 128
   1966 endconst
   1967 
   1968 const eob_8x16_identity2
   1969        .short 4, 8, 12, 128
   1970 endconst
   1971 
   1972 .macro def_fn_816 w, h, txfm1, txfm2
   1973 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
   1974 .ifc \txfm1\()_\txfm2, dct_dct
   1975        idct_dc         \w,  \h,  1
   1976 .endif
   1977        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
   1978        movrel          x5,  X(inv_\txfm2\()_8h_x\h\()_neon)
   1979 .ifc \txfm1, identity
   1980 .ifc \txfm2, identity
   1981        movrel          x13, eob_8x16
   1982 .else
   1983        movrel          x13, eob_8x16_identity1
   1984 .endif
   1985 .else
   1986 .ifc \txfm2, identity
   1987        movrel          x13, eob_8x16_identity2
   1988 .else
   1989        movrel          x13, eob_8x16
   1990 .endif
   1991 .endif
   1992 .if \h == 8
   1993        ldrh            w13, [x13]
   1994 .endif
   1995        b               inv_txfm_add_\w\()x\h\()_neon
   1996 endfunc
   1997 .endm
   1998 
   1999 .macro def_fns_816 w, h
   2000 def_fn_816 \w, \h, dct, dct
   2001 def_fn_816 \w, \h, identity, identity
   2002 def_fn_816 \w, \h, dct, adst
   2003 def_fn_816 \w, \h, dct, flipadst
   2004 def_fn_816 \w, \h, dct, identity
   2005 def_fn_816 \w, \h, adst, dct
   2006 def_fn_816 \w, \h, adst, adst
   2007 def_fn_816 \w, \h, adst, flipadst
   2008 def_fn_816 \w, \h, flipadst, dct
   2009 def_fn_816 \w, \h, flipadst, adst
   2010 def_fn_816 \w, \h, flipadst, flipadst
   2011 def_fn_816 \w, \h, identity, dct
   2012 def_fn_816 \w, \h, adst, identity
   2013 def_fn_816 \w, \h, flipadst, identity
   2014 def_fn_816 \w, \h, identity, adst
   2015 def_fn_816 \w, \h, identity, flipadst
   2016 .endm
   2017 
   2018 def_fns_816 8, 16
   2019 def_fns_816 16, 8
   2020 
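// Note (editorial comment): odd half of the 32-point inverse DCT. The 16
// odd-indexed input coefficients arrive in v16-v31 and the function produces
// the t16-t31 terms; callers combine these with a regular 16-point inverse DCT
// of the even coefficients using saturating add/sub pairs to form all 32
// outputs.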
   2021 function inv_dct32_odd_4s_x16_neon
   2022        movrel          x16, idct_coeffs, 4*16
   2023        ld1             {v0.4s, v1.4s}, [x16], #32
   2024 
   2025        mul_mls         v2,  v16, v31, v0.s[0], v0.s[1] // -> t16a
   2026        mul_mla         v4,  v16, v31, v0.s[1], v0.s[0] // -> t31a
   2027        mul_mls         v6,  v24, v23, v0.s[2], v0.s[3] // -> t17a
   2028        srshr           v16.4s, v2.4s,  #12             // t16a
   2029        srshr           v31.4s, v4.4s,  #12             // t31a
   2030        mul_mla         v2,  v24, v23, v0.s[3], v0.s[2] // -> t30a
   2031        mul_mls         v4,  v20, v27, v1.s[0], v1.s[1] // -> t18a
   2032        srshr           v24.4s, v6.4s,  #12             // t17a
   2033        srshr           v23.4s, v2.4s,  #12             // t30a
   2034        mul_mla         v6,  v20, v27, v1.s[1], v1.s[0] // -> t29a
   2035        mul_mls         v2,  v28, v19, v1.s[2], v1.s[3] // -> t19a
   2036        srshr           v20.4s, v4.4s,  #12             // t18a
   2037        srshr           v27.4s, v6.4s,  #12             // t29a
   2038        mul_mla         v4,  v28, v19, v1.s[3], v1.s[2] // -> t28a
   2039        ld1             {v0.4s, v1.4s}, [x16]
   2040        sub             x16, x16, #4*24
   2041        mul_mls         v6,  v18, v29, v0.s[0], v0.s[1] // -> t20a
   2042        srshr           v28.4s, v2.4s,  #12             // t19a
   2043        srshr           v19.4s, v4.4s,  #12             // t28a
   2044        mul_mla         v2,  v18, v29, v0.s[1], v0.s[0] // -> t27a
   2045        mul_mls         v4,  v26, v21, v0.s[2], v0.s[3] // -> t21a
   2046        srshr           v18.4s, v6.4s,  #12             // t20a
   2047        srshr           v29.4s, v2.4s,  #12             // t27a
   2048        mul_mla         v6,  v26, v21, v0.s[3], v0.s[2] // -> t26a
   2049        mul_mls         v2,  v22, v25, v1.s[0], v1.s[1] // -> t22a
   2050        srshr           v26.4s, v4.4s,  #12             // t21a
   2051        srshr           v21.4s, v6.4s,  #12             // t26a
   2052        mul_mla         v4,  v22, v25, v1.s[1], v1.s[0] // -> t25a
   2053        mul_mls         v6,  v30, v17, v1.s[2], v1.s[3] // -> t23a
   2054        srshr           v22.4s, v2.4s,  #12             // t22a
   2055        srshr           v25.4s, v4.4s,  #12             // t25a
   2056        mul_mla         v2,  v30, v17, v1.s[3], v1.s[2] // -> t24a
   2057        srshr           v30.4s, v6.4s,  #12             // t23a
   2058        srshr           v17.4s, v2.4s,  #12             // t24a
   2059 
   2060        ld1             {v0.4s, v1.4s}, [x16]
   2061 
   2062        movi            v5.4s,   #1, msl #16     // row_clip_max = ~(~bdmax << 7), 0x1ffff
   2063        mvni            v4.4s,   #1, msl #16     // row_clip_min = (~bdmax << 7), 0xfffe0000
   2064 
   2065        sqsub           v2.4s,   v16.4s,  v24.4s // t17
   2066        sqadd           v16.4s,  v16.4s,  v24.4s // t16
   2067        sqsub           v3.4s,   v31.4s,  v23.4s // t30
   2068        sqadd           v31.4s,  v31.4s,  v23.4s // t31
   2069        sqsub           v24.4s,  v28.4s,  v20.4s // t18
   2070        sqadd           v28.4s,  v28.4s,  v20.4s // t19
   2071        sqadd           v23.4s,  v18.4s,  v26.4s // t20
   2072        sqsub           v18.4s,  v18.4s,  v26.4s // t21
   2073        sqsub           v20.4s,  v30.4s,  v22.4s // t22
   2074        sqadd           v30.4s,  v30.4s,  v22.4s // t23
   2075        sqadd           v26.4s,  v17.4s,  v25.4s // t24
   2076        sqsub           v17.4s,  v17.4s,  v25.4s // t25
   2077        sqsub           v22.4s,  v29.4s,  v21.4s // t26
   2078        sqadd           v29.4s,  v29.4s,  v21.4s // t27
   2079        sqadd           v25.4s,  v19.4s,  v27.4s // t28
   2080        sqsub           v19.4s,  v19.4s,  v27.4s // t29
   2081 
   2082 .irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
   2083        smin            \r\().4s, \r\().4s, v5.4s
   2084 .endr
   2085 .irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
   2086        smax            \r\().4s, \r\().4s, v4.4s
   2087 .endr
   2088 
   2089        mul_mls         v7,  v3,  v2,  v1.s[0], v1.s[1] // -> t17a
   2090        mul_mla         v6,  v3,  v2,  v1.s[1], v1.s[0] // -> t30a
   2091        mul_mla         v2,  v19, v24, v1.s[1], v1.s[0] // -> t18a
   2092        srshr           v21.4s, v7.4s,  #12             // t17a
   2093        srshr           v27.4s, v6.4s,  #12             // t30a
   2094        neg             v2.4s,   v2.4s                  // -> t18a
   2095        mul_mls         v7,  v19, v24, v1.s[0], v1.s[1] // -> t29a
   2096        mul_mls         v6,  v22, v18, v1.s[2], v1.s[3] // -> t21a
   2097        srshr           v19.4s, v2.4s,  #12             // t18a
   2098        srshr           v24.4s, v7.4s,  #12             // t29a
   2099        mul_mla         v2,  v22, v18, v1.s[3], v1.s[2] // -> t26a
   2100        mul_mla         v7,  v17, v20, v1.s[3], v1.s[2] // -> t22a
   2101        srshr           v22.4s, v6.4s,  #12             // t21a
   2102        srshr           v18.4s, v2.4s,  #12             // t26a
   2103        neg             v7.4s,   v7.4s                  // -> t22a
   2104        mul_mls         v6,  v17, v20, v1.s[2], v1.s[3] // -> t25a
   2105        srshr           v17.4s, v7.4s,  #12             // t22a
   2106        srshr           v20.4s, v6.4s,  #12             // t25a
   2107 
   2108        sqsub           v2.4s,   v27.4s,  v24.4s // t29
   2109        sqadd           v27.4s,  v27.4s,  v24.4s // t30
   2110        sqsub           v3.4s,   v21.4s,  v19.4s // t18
   2111        sqadd           v21.4s,  v21.4s,  v19.4s // t17
   2112        sqsub           v24.4s,  v16.4s,  v28.4s // t19a
   2113        sqadd           v16.4s,  v16.4s,  v28.4s // t16a
   2114        sqsub           v19.4s,  v30.4s,  v23.4s // t20a
   2115        sqadd           v30.4s,  v30.4s,  v23.4s // t23a
   2116        sqsub           v28.4s,  v17.4s,  v22.4s // t21
   2117        sqadd           v17.4s,  v17.4s,  v22.4s // t22
   2118        sqadd           v23.4s,  v26.4s,  v29.4s // t24a
   2119        sqsub           v26.4s,  v26.4s,  v29.4s // t27a
   2120        sqadd           v22.4s,  v20.4s,  v18.4s // t25
   2121        sqsub           v20.4s,  v20.4s,  v18.4s // t26
   2122        sqsub           v29.4s,  v31.4s,  v25.4s // t28a
   2123        sqadd           v31.4s,  v31.4s,  v25.4s // t31a
   2124 
   2125 .irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
   2126        smin            \r\().4s, \r\().4s, v5.4s
   2127 .endr
   2128 .irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
   2129        smax            \r\().4s, \r\().4s, v4.4s
   2130 .endr
   2131 
   2132        mul_mls         v7,  v2,  v3,  v0.s[2], v0.s[3] // -> t18a
   2133        mul_mla         v6,  v2,  v3,  v0.s[3], v0.s[2] // -> t29a
   2134        mul_mls         v2,  v29, v24, v0.s[2], v0.s[3] // -> t19
   2135        srshr           v18.4s, v7.4s,  #12             // t18a
   2136        srshr           v25.4s, v6.4s,  #12             // t29a
   2137        mul_mla         v7,  v29, v24, v0.s[3], v0.s[2] // -> t28
   2138        mul_mla         v6,  v26, v19, v0.s[3], v0.s[2] // -> t20
   2139        srshr           v29.4s, v2.4s,  #12             // t19
   2140        srshr           v24.4s, v7.4s,  #12             // t28
   2141        neg             v6.4s,   v6.4s                  // -> t20
   2142        mul_mls         v2,  v26, v19, v0.s[2], v0.s[3] // -> t27
   2143        mul_mla         v7,  v20, v28, v0.s[3], v0.s[2] // -> t21a
   2144        srshr           v26.4s, v6.4s,  #12             // t20
   2145        srshr           v19.4s, v2.4s,  #12             // t27
   2146        neg             v7.4s,   v7.4s                  // -> t21a
   2147        mul_mls         v6,  v20, v28, v0.s[2], v0.s[3] // -> t26a
   2148        srshr           v20.4s, v7.4s,  #12             // t21a
   2149        srshr           v28.4s, v6.4s,  #12             // t26a
   2150 
   2151        sqsub           v2.4s,   v16.4s,  v30.4s // t23
   2152        sqadd           v16.4s,  v16.4s,  v30.4s // t16 = out16
   2153        sqsub           v3.4s,   v31.4s,  v23.4s // t24
   2154        sqadd           v31.4s,  v31.4s,  v23.4s // t31 = out31
   2155        sqsub           v23.4s,  v21.4s,  v17.4s // t22a
   2156        sqadd           v17.4s,  v21.4s,  v17.4s // t17a = out17
   2157        sqadd           v30.4s,  v27.4s,  v22.4s // t30a = out30
   2158        sqsub           v21.4s,  v27.4s,  v22.4s // t25a
   2159        sqsub           v27.4s,  v18.4s,  v20.4s // t21
   2160        sqadd           v18.4s,  v18.4s,  v20.4s // t18 = out18
   2161        sqadd           v7.4s,   v29.4s,  v26.4s // t19a = out19
   2162        sqsub           v26.4s,  v29.4s,  v26.4s // t20a
   2163        sqadd           v29.4s,  v25.4s,  v28.4s // t29 = out29
   2164        sqsub           v25.4s,  v25.4s,  v28.4s // t26
   2165        sqadd           v28.4s,  v24.4s,  v19.4s // t28a = out28
   2166        sqsub           v24.4s,  v24.4s,  v19.4s // t27a
   2167        mov             v19.16b, v7.16b          // out19
   2168 
   2169 .irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
   2170        smin            \r\().4s, \r\().4s, v5.4s
   2171 .endr
   2172 .irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
   2173        smax            \r\().4s, \r\().4s, v4.4s
   2174 .endr
   2175 
   2176        mul_mls         v7,  v24, v26, v0.s[0], v0.s[0] // -> t20
   2177        mul_mla         v6,  v24, v26, v0.s[0], v0.s[0] // -> t27
   2178        srshr           v20.4s, v7.4s,  #12             // t20
   2179        srshr           v22.4s, v6.4s,  #12             // t27
   2180 
   2181        mul_mla         v7,  v25, v27, v0.s[0], v0.s[0] // -> t26a
   2182        mul_mls         v6,  v25, v27, v0.s[0], v0.s[0] // -> t21a
   2183        mov             v27.16b,  v22.16b               // t27
   2184        srshr           v26.4s, v7.4s,  #12             // t26a
   2185 
   2186        mul_mls         v24, v21, v23, v0.s[0], v0.s[0] // -> t22
   2187        mul_mla         v7,  v21, v23, v0.s[0], v0.s[0] // -> t25
   2188        srshr           v21.4s, v6.4s,  #12             // t21a
   2189        srshr           v22.4s, v24.4s, #12             // t22
   2190        srshr           v25.4s, v7.4s,  #12             // t25
   2191 
   2192        mul_mls         v7,  v3,  v2,  v0.s[0], v0.s[0] // -> t23a
   2193        mul_mla         v6,  v3,  v2,  v0.s[0], v0.s[0] // -> t24a
   2194        srshr           v23.4s, v7.4s,  #12             // t23a
   2195        srshr           v24.4s, v6.4s,  #12             // t24a
   2196 
   2197        ret
   2198 endfunc
   2199 
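// Note (editorial comment): first-pass helper for a 32x4 strip. The even
// coefficients are run through inv_dct_4s_x16_neon and parked in the buffer at
// x6, the odd coefficients through inv_dct32_odd_4s_x16_neon, and the two
// halves are then combined with saturating add/sub (the second half written
// out reversed) while rounding down into 16 bit.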
   2200 .macro def_horz_32 scale=0, shift=2, suffix
   2201 function inv_txfm_horz\suffix\()_dct_32x4_neon
   2202        mov             x14, x30
   2203        movi            v7.4s,  #0
   2204        lsl             x8,  x8,  #1
   2205 .if \scale
   2206        movz            w16, #2896*8, lsl #16
   2207        dup             v0.2s,   w16
   2208 .endif
   2209 
   2210 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
   2211        ld1             {\i}, [x7]
   2212        st1             {v7.4s}, [x7], x8
   2213 .endr
   2214        sub             x7,  x7,  x8, lsl #4
   2215        add             x7,  x7,  x8, lsr #1
   2216 .if \scale
   2217        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
   2218        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
   2219 .endif
   2220        bl              inv_dct_4s_x16_neon
   2221 
   2222        // idct_16 leaves the row_clip_max/min constants in v5 and v4
   2223 .irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
   2224        smin_4s         \r, \r, v5
   2225 .endr
   2226 .irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
   2227        smax_4s         \r, \r, v4
   2228 .endr
   2229 
   2230        transpose_4x4s  v16, v17, v18, v19, v2,  v3,  v4,  v5
   2231        transpose_4x4s  v20, v21, v22, v23, v2,  v3,  v4,  v5
   2232        transpose_4x4s  v24, v25, v26, v27, v2,  v3,  v4,  v5
   2233        transpose_4x4s  v28, v29, v30, v31, v2,  v3,  v4,  v5
   2234 
   2235 .macro store1 r0, r1, r2, r3
   2236        st1             {\r0}, [x6], #16
   2237        st1             {\r1}, [x6], #16
   2238        st1             {\r2}, [x6], #16
   2239        st1             {\r3}, [x6], #16
   2240 .endm
   2241        store1          v16.4s,  v20.4s,  v24.4s,  v28.4s
   2242        store1          v17.4s,  v21.4s,  v25.4s,  v29.4s
   2243        store1          v18.4s,  v22.4s,  v26.4s,  v30.4s
   2244        store1          v19.4s,  v23.4s,  v27.4s,  v31.4s
   2245 .purgem store1
   2246        sub             x6,  x6,  #64*4
   2247 
   2248        movi            v7.4s,  #0
   2249 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
   2250        ld1             {\i}, [x7]
   2251        st1             {v7.4s}, [x7], x8
   2252 .endr
   2253 .if \scale
   2254        // This relies on the fact that the idct also leaves the right coeff
   2254        // (idct_coeffs[1] = 2896*8 << 16, the same scale constant as above) in v0.s[1]
   2255        scale_input     .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23
   2256        scale_input     .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31
   2257 .endif
   2258        bl              inv_dct32_odd_4s_x16_neon
   2259        transpose_4x4s  v31, v30, v29, v28, v2,  v3,  v4,  v5
   2260        transpose_4x4s  v27, v26, v25, v24, v2,  v3,  v4,  v5
   2261        transpose_4x4s  v23, v22, v21, v20, v2,  v3,  v4,  v5
   2262        transpose_4x4s  v19, v18, v17, v16, v2,  v3,  v4,  v5
   2263 .macro store2 r0, r1, r2, r3, shift
   2264        ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x6]
   2265        sqsub           v4.4s,   v0.4s,   \r0
   2266        sqadd           v0.4s,   v0.4s,   \r0
   2267        sqsub           v5.4s,   v1.4s,   \r1
   2268        sqadd           v1.4s,   v1.4s,   \r1
   2269        sqsub           v6.4s,   v2.4s,   \r2
   2270        sqadd           v2.4s,   v2.4s,   \r2
   2271        sqsub           v7.4s,   v3.4s,   \r3
   2272        sqadd           v3.4s,   v3.4s,   \r3
   2273        sqrshrn         v0.4h,   v0.4s,   #\shift
   2274        sqrshrn2        v0.8h,   v1.4s,   #\shift
   2275        sqrshrn         v1.4h,   v2.4s,   #\shift
   2276        sqrshrn2        v1.8h,   v3.4s,   #\shift
   2277        sqrshrn         v2.4h,   v7.4s,   #\shift
   2278        sqrshrn2        v2.8h,   v6.4s,   #\shift
   2279        sqrshrn         v3.4h,   v5.4s,   #\shift
   2280        sqrshrn2        v3.8h,   v4.4s,   #\shift
   2281        st1             {v0.8h, v1.8h}, [x6], #32
   2282        rev64           v2.8h,   v2.8h
   2283        rev64           v3.8h,   v3.8h
   2284        st1             {v2.8h, v3.8h}, [x6], #32
   2285 .endm
   2286 
   2287        store2          v31.4s,  v27.4s,  v23.4s,  v19.4s,  \shift
   2288        store2          v30.4s,  v26.4s,  v22.4s,  v18.4s,  \shift
   2289        store2          v29.4s,  v25.4s,  v21.4s,  v17.4s,  \shift
   2290        store2          v28.4s,  v24.4s,  v20.4s,  v16.4s,  \shift
   2291 .purgem store2
   2292        ret             x14
   2293 endfunc
   2294 .endm
   2295 
   2296 def_horz_32 scale=0, shift=2
   2297 def_horz_32 scale=1, shift=1, suffix=_scale
   2298 
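// Note (editorial comment): second-pass helper for 32-row columns, 8 pixels
// wide. The even rows of the intermediate buffer go through the 16-point dct
// and are written back in place, the odd rows through the dct32 odd half; the
// combine macro below adds/subtracts the two halves, shifts by 4 and
// accumulates into the destination, clamping against the pixel maximum kept
// in v1.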
   2299 function inv_txfm_add_vert_dct_8x32_neon
   2300        mov             x14, x30
   2301        lsl             x8,  x8,  #1
   2302 
   2303 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   2304        ld1             {v\i\().8h}, [x7], x8
   2305 .endr
   2306        sub             x7,  x7,  x8, lsl #4
   2307 
   2308        bl              X(inv_dct_8h_x16_neon)
   2309 
   2310 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   2311        st1             {v\i\().8h}, [x7], x8
   2312 .endr
   2313        sub             x7,  x7,  x8, lsl #4
   2314        add             x7,  x7,  x8, lsr #1
   2315 
   2316 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   2317        ld1             {v\i\().8h}, [x7], x8
   2318 .endr
   2319        sub             x7,  x7,  x8, lsl #4
   2320        sub             x7,  x7,  x8, lsr #1
   2321        bl              X(inv_dct32_odd_8h_x16_neon)
   2322 
   2323        neg             x9,  x8
   2324        mov             x10, x6
   2325        mvni            v1.8h,   #0xfc, lsl #8 // 0x3ff
   2326 .macro combine r0, r1, r2, r3, op, stride
   2327        ld1             {v5.8h}, [x7],    \stride
   2328        ld1             {v2.8h}, [x10],   x1
   2329        ld1             {v6.8h}, [x7],    \stride
   2330        ld1             {v3.8h}, [x10],   x1
   2331        \op             v5.8h,   v5.8h,   \r0
   2332        ld1             {v7.8h}, [x7],    \stride
   2333        ld1             {v4.8h}, [x10],   x1
   2334        srshr           v5.8h,   v5.8h,   #4
   2335        \op             v6.8h,   v6.8h,   \r1
   2336        usqadd          v2.8h,   v5.8h
   2337        srshr           v6.8h,   v6.8h,   #4
   2338        \op             v7.8h,   v7.8h,   \r2
   2339        ld1             {v5.8h}, [x7],    \stride
   2340        usqadd          v3.8h,   v6.8h
   2341        smin            v2.8h,   v2.8h,   v1.8h
   2342        srshr           v7.8h,   v7.8h,   #4
   2343        \op             v5.8h,   v5.8h,   \r3
   2344        st1             {v2.8h}, [x6],    x1
   2345        ld1             {v2.8h}, [x10],   x1
   2346        usqadd          v4.8h,   v7.8h
   2347        smin            v3.8h,   v3.8h,   v1.8h
   2348        srshr           v5.8h,   v5.8h,   #4
   2349        st1             {v3.8h}, [x6],    x1
   2350        usqadd          v2.8h,   v5.8h
   2351        smin            v4.8h,   v4.8h,   v1.8h
   2352        st1             {v4.8h}, [x6],    x1
   2353        smin            v2.8h,   v2.8h,   v1.8h
   2354        st1             {v2.8h}, [x6],    x1
   2355 .endm
   2356        combine         v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
   2357        combine         v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
   2358        combine         v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
   2359        combine         v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
   2360        sub             x7,  x7,  x8
   2361        combine         v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
   2362        combine         v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
   2363        combine         v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
   2364        combine         v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
   2365 .purgem combine
   2366 
   2367        ret             x14
   2368 endfunc
   2369 
   2370 const eob_32x32
   2371        .short 10, 36, 78, 136, 210, 300, 406, 1024
   2372 endconst
   2373 
   2374 const eob_16x32
   2375        .short 10, 36, 78, 151, 215, 279, 343, 512
   2376 endconst
   2377 
   2378 const eob_16x32_shortside
   2379        .short 10, 36, 78, 512
   2380 endconst
   2381 
   2382 const eob_8x32
   2383        .short 10, 43, 75, 107, 139, 171, 203, 256
   2384 endconst
   2385 
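// Note (editorial comment): 32x32 identity/identity is done without any
// multiplications - coefficients are narrowed to 16 bit, transposed in 8x8
// blocks and accumulated into the destination with a rounding shift. The
// eob_32x32 thresholds (read every other entry, since we step through 8x8
// blocks) bound how many blocks are visited in each direction.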
   2386 function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1
   2387        movi            v0.8h,  #0
   2388        movi            v1.8h,  #0
   2389        movrel          x13, eob_32x32, 2
   2390 
   2391        mov             x8,  #4*32
   2392 1:
   2393        mov             w9,  #0
   2394        movrel          x12, eob_32x32, 2
   2395 2:
   2396        add             w9,  w9,  #8
   2397        ld1             {v16.4s, v17.4s}, [x2]
   2398        st1             {v0.4s, v1.4s},   [x2], x8
   2399        ld1             {v18.4s, v19.4s}, [x2]
   2400        st1             {v0.4s, v1.4s},   [x2], x8
   2401        ld1             {v20.4s, v21.4s}, [x2]
   2402        st1             {v0.4s, v1.4s},   [x2], x8
   2403        ld1             {v22.4s, v23.4s}, [x2]
   2404        st1             {v0.4s, v1.4s},   [x2], x8
   2405        ld1             {v24.4s, v25.4s}, [x2]
   2406        st1             {v0.4s, v1.4s},   [x2], x8
   2407        ld1             {v26.4s, v27.4s}, [x2]
   2408        st1             {v0.4s, v1.4s},   [x2], x8
   2409        ld1             {v28.4s, v29.4s}, [x2]
   2410        st1             {v0.4s, v1.4s},   [x2], x8
   2411        ld1             {v30.4s, v31.4s}, [x2]
   2412        st1             {v0.4s, v1.4s},   [x2], x8
   2413        sqxtn           v16.4h,  v16.4s
   2414        sqxtn2          v16.8h,  v17.4s
   2415        sqxtn           v17.4h,  v18.4s
   2416        sqxtn2          v17.8h,  v19.4s
   2417        sqxtn           v18.4h,  v20.4s
   2418        sqxtn2          v18.8h,  v21.4s
   2419        sqxtn           v19.4h,  v22.4s
   2420        sqxtn2          v19.8h,  v23.4s
   2421        sqxtn           v20.4h,  v24.4s
   2422        sqxtn2          v20.8h,  v25.4s
   2423        sqxtn           v21.4h,  v26.4s
   2424        sqxtn2          v21.8h,  v27.4s
   2425        sqxtn           v22.4h,  v28.4s
   2426        sqxtn2          v22.8h,  v29.4s
   2427        sqxtn           v23.4h,  v30.4s
   2428        sqxtn2          v23.8h,  v31.4s
   2429        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
   2430 
   2431        load_add_store_8x8 x0, x7, shiftbits=2
   2432        ldrh            w11, [x12], #4
   2433        sub             x0,  x0,  x1, lsl #3
   2434        add             x0,  x0,  #2*8
   2435        cmp             w3,  w11
   2436        b.ge            2b
   2437 
   2438        ldrh            w11, [x13], #4
   2439        cmp             w3,  w11
   2440        b.lt            9f
   2441 
   2442        sub             x0,  x0,  w9, uxtw #1
   2443        add             x0,  x0,  x1, lsl #3
   2444        msub            x2,  x8,  x9,  x2
   2445        add             x2,  x2,  #4*8
   2446        b               1b
   2447 9:
   2448        ret
   2449 endfunc
   2450 
   2451 .macro shift_16_regs op, shift
   2452 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
   2453        \op             \i,  \i,  #\shift
   2454 .endr
   2455 .endm
   2456 
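// Note (editorial comment): 16x32 and 32x16 identity/identity share this
// template. Both pre-scale the coefficients by 2896/4096 (~1/sqrt(2)); the
// 16x32 case then uses identity_4x16_shift1 (~sqrt(2) with the downshift
// folded in), while the 32x16 case doubles with sqshl first and uses the plain
// identity_4x16, matching the different final shifts below.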
   2457 .macro def_identity_1632 w, h, wshort, hshort
   2458 function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
   2459        movz            w16, #2896*8, lsl #16
   2460        movz            w17, #2*(5793-4096)*8, lsl #16
   2461        movi            v0.4s,   #0
   2462        movi            v1.4s,   #0
   2463        movrel          x13, eob_16x32\hshort, 2
   2464 
   2465        mov             x8,  #4*\h
   2466 1:
   2467        mov             w9,  #0
   2468        movrel          x12, eob_16x32\wshort, 2
   2469 2:
   2470        add             w9,  w9,  #8
   2471        ld1             {v16.4s, v17.4s}, [x2]
   2472        st1             {v0.4s, v1.4s},   [x2], x8
   2473        dup             v2.2s,   w16
   2474        ld1             {v18.4s, v19.4s}, [x2]
   2475        st1             {v0.4s, v1.4s},   [x2], x8
   2476        mov             v2.s[1], w17
   2477        ld1             {v20.4s, v21.4s}, [x2]
   2478        st1             {v0.4s, v1.4s},   [x2], x8
   2479        ld1             {v22.4s, v23.4s}, [x2]
   2480        st1             {v0.4s, v1.4s},   [x2], x8
   2481        ld1             {v24.4s, v25.4s}, [x2]
   2482        st1             {v0.4s, v1.4s},   [x2], x8
   2483        ld1             {v26.4s, v27.4s}, [x2]
   2484        st1             {v0.4s, v1.4s},   [x2], x8
   2485        ld1             {v28.4s, v29.4s}, [x2]
   2486        st1             {v0.4s, v1.4s},   [x2], x8
   2487        ld1             {v30.4s, v31.4s}, [x2]
   2488        st1             {v0.4s, v1.4s},   [x2], x8
   2489        scale_input     .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23
   2490        scale_input     .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31
   2491 
   2492 .if \w == 16
   2493        // 16x32
   2494        identity_4x16_shift1 v2.s[1]
   2495 .else
   2496        // 32x16
   2497        shift_16_regs   sqshl, 1
   2498        identity_4x16   v2.s[1]
   2499 .endif
   2500        sqxtn           v16.4h,  v16.4s
   2501        sqxtn2          v16.8h,  v17.4s
   2502        sqxtn           v17.4h,  v18.4s
   2503        sqxtn2          v17.8h,  v19.4s
   2504        sqxtn           v18.4h,  v20.4s
   2505        sqxtn2          v18.8h,  v21.4s
   2506        sqxtn           v19.4h,  v22.4s
   2507        sqxtn2          v19.8h,  v23.4s
   2508        sqxtn           v20.4h,  v24.4s
   2509        sqxtn2          v20.8h,  v25.4s
   2510        sqxtn           v21.4h,  v26.4s
   2511        sqxtn2          v21.8h,  v27.4s
   2512        sqxtn           v22.4h,  v28.4s
   2513        sqxtn2          v22.8h,  v29.4s
   2514        sqxtn           v23.4h,  v30.4s
   2515        sqxtn2          v23.8h,  v31.4s
   2516 
   2517        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
   2518 
   2519 .if \w == 16
   2520        load_add_store_8x8 x0, x7, shiftbits=2
   2521 .else
   2522        load_add_store_8x8 x0, x7, shiftbits=4
   2523 .endif
   2524        ldrh            w11, [x12], #4
   2525        sub             x0,  x0,  x1, lsl #3
   2526        add             x0,  x0,  #16
   2527        cmp             w3,  w11
   2528        b.ge            2b
   2529 
   2530        ldrh            w11, [x13], #4
   2531        cmp             w3,  w11
   2532        b.lt            9f
   2533 
   2534        sub             x0,  x0,  w9, uxtw #1
   2535        add             x0,  x0,  x1, lsl #3
   2536        msub            x2,  x8,  x9,  x2
   2537        add             x2,  x2,  #4*8
   2538        b               1b
   2539 9:
   2540        ret
   2541 endfunc
   2542 .endm
   2543 
   2544 def_identity_1632 16, 32, _shortside,
   2545 def_identity_1632 32, 16, , _shortside
   2546 
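         // Identity-identity transforms for the 8x32 and 32x8 cases, also handled
         // in 8x8 blocks. No multiplications are needed here; the identity scaling
         // is folded into the narrowing (sqrshrn #1 vs sqxtn) and the shiftbits
         // passed to load_add_store_8x8.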
   2547 .macro def_identity_832 w, h
   2548 function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
   2549        movi            v0.4s,  #0
   2550        movi            v1.4s,  #0
   2551        // Working on 8x8 blocks, read every other entry from eob_8x32
   2552        movrel          x13, eob_8x32, 2
   2553 
   2554        mov             w8,  #4*\h
   2555 1:
   2556        // Working on 8x8 blocks, read every other entry from eob_8x32
   2557        ldrh            w12, [x13], #4
   2558        ld1             {v16.4s, v17.4s}, [x2]
   2559        st1             {v0.4s, v1.4s},   [x2], x8
   2560        ld1             {v18.4s, v19.4s}, [x2]
   2561        st1             {v0.4s, v1.4s},   [x2], x8
   2562        ld1             {v20.4s, v21.4s}, [x2]
   2563        st1             {v0.4s, v1.4s},   [x2], x8
   2564        ld1             {v22.4s, v23.4s}, [x2]
   2565        st1             {v0.4s, v1.4s},   [x2], x8
   2566        ld1             {v24.4s, v25.4s}, [x2]
   2567        st1             {v0.4s, v1.4s},   [x2], x8
   2568        ld1             {v26.4s, v27.4s}, [x2]
   2569        st1             {v0.4s, v1.4s},   [x2], x8
   2570        ld1             {v28.4s, v29.4s}, [x2]
   2571        st1             {v0.4s, v1.4s},   [x2], x8
   2572        ld1             {v30.4s, v31.4s}, [x2]
   2573        st1             {v0.4s, v1.4s},   [x2], x8
   2574 
   2575 .if \w == 8
   2576        sqrshrn         v16.4h,  v16.4s,  #1
   2577        sqrshrn2        v16.8h,  v17.4s,  #1
   2578        sqrshrn         v17.4h,  v18.4s,  #1
   2579        sqrshrn2        v17.8h,  v19.4s,  #1
   2580        sqrshrn         v18.4h,  v20.4s,  #1
   2581        sqrshrn2        v18.8h,  v21.4s,  #1
   2582        sqrshrn         v19.4h,  v22.4s,  #1
   2583        sqrshrn2        v19.8h,  v23.4s,  #1
   2584        sqrshrn         v20.4h,  v24.4s,  #1
   2585        sqrshrn2        v20.8h,  v25.4s,  #1
   2586        sqrshrn         v21.4h,  v26.4s,  #1
   2587        sqrshrn2        v21.8h,  v27.4s,  #1
   2588        sqrshrn         v22.4h,  v28.4s,  #1
   2589        sqrshrn2        v22.8h,  v29.4s,  #1
   2590        sqrshrn         v23.4h,  v30.4s,  #1
   2591        sqrshrn2        v23.8h,  v31.4s,  #1
   2592 .else
   2593        sqxtn           v16.4h,  v16.4s
   2594        sqxtn2          v16.8h,  v17.4s
   2595        sqxtn           v17.4h,  v18.4s
   2596        sqxtn2          v17.8h,  v19.4s
   2597        sqxtn           v18.4h,  v20.4s
   2598        sqxtn2          v18.8h,  v21.4s
   2599        sqxtn           v19.4h,  v22.4s
   2600        sqxtn2          v19.8h,  v23.4s
   2601        sqxtn           v20.4h,  v24.4s
   2602        sqxtn2          v20.8h,  v25.4s
   2603        sqxtn           v21.4h,  v26.4s
   2604        sqxtn2          v21.8h,  v27.4s
   2605        sqxtn           v22.4h,  v28.4s
   2606        sqxtn2          v22.8h,  v29.4s
   2607        sqxtn           v23.4h,  v30.4s
   2608        sqxtn2          v23.8h,  v31.4s
   2609 .endif
   2610 
   2611        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
   2612 
   2613 
   2614        cmp             w3,  w12
   2615 .if \w == 8
   2616        load_add_store_8x8 x0, x7, shiftbits=2
   2617 .else
   2618        load_add_store_8x8 x0, x7, shiftbits=3
   2619 .endif
   2620 
   2621        b.lt            9f
   2622 .if \w == 8
   2623        sub             x2,  x2,  x8, lsl #3
   2624        add             x2,  x2,  #4*8
   2625 .else
   2626        sub             x0,  x0,  x1, lsl #3
   2627        add             x0,  x0,  #2*8
   2628 .endif
   2629        b               1b
   2630 
   2631 9:
   2632        ret
   2633 endfunc
   2634 .endm
   2635 
   2636 def_identity_832 8, 32
   2637 def_identity_832 32, 8
   2638 
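         // The dct_dct functions below all use the same two pass layout: the first
         // pass transforms the coefficients in 4 row strips into a buffer on the
         // stack, skipping (and zero filling) strips that lie beyond the eob
         // thresholds read from the eob_* tables, and the second pass transforms
         // 8 column strips of that buffer and adds the result to the destination.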
   2639 function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1
   2640        idct_dc         32,  32,  2
   2641 
   2642        mov             x15, x30
   2643        sub             sp,  sp,  #2048
   2644        movrel          x13, eob_32x32
   2645        ldrh            w12, [x13], #2
   2646 
   2647 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
   2648        add             x6,  sp,  #(\i*32*2)
   2649 .if \i > 0
   2650        mov             w8,  #(32 - \i)
   2651        cmp             w3,  w12
   2652        b.lt            1f
   2653 .if \i < 28
   2654        ldrh            w12, [x13], #2
   2655 .endif
   2656 .endif
   2657        add             x7,  x2,  #(\i*4)
   2658        mov             x8,  #32*4
   2659        bl              inv_txfm_horz_dct_32x4_neon
   2660 .endr
   2661        b               3f
   2662 
   2663 1:
   2664        movi            v4.8h,  #0
   2665        movi            v5.8h,  #0
   2666        movi            v6.8h,  #0
   2667        movi            v7.8h,  #0
   2668 2:
   2669        subs            w8,  w8,  #4
   2670 .rept 4
   2671        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
   2672 .endr
   2673        b.gt            2b
   2674 
   2675 3:
   2676 .irp i, 0, 8, 16, 24
   2677        add             x6,  x0,  #(\i*2)
   2678        add             x7,  sp,  #(\i*2)
   2679        mov             x8,  #32*2
   2680        bl              inv_txfm_add_vert_dct_8x32_neon
   2681 .endr
   2682 
   2683        add             sp,  sp,  #2048
   2684        ret             x15
   2685 endfunc
   2686 
   2687 function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1
   2688        idct_dc         16,  32,  1
   2689 
   2690        mov             x15, x30
   2691        sub             sp,  sp,  #1024
   2692        movrel          x13, eob_16x32
   2693        ldrh            w12, [x13], #2
   2694        adr             x4,  inv_dct_4s_x16_neon
   2695 
   2696 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
   2697        add             x6,  sp,  #(\i*16*2)
   2698        add             x7,  x2,  #(\i*4)
   2699 .if \i > 0
   2700        mov             w8,  #(32 - \i)
   2701        cmp             w3,  w12
   2702        b.lt            1f
   2703 .if \i < 28
   2704        ldrh            w12, [x13], #2
   2705 .endif
   2706 .endif
   2707        mov             x8,  #4*32
   2708        bl              inv_txfm_horz_scale_16x4_neon
   2709 .endr
   2710        b               3f
   2711 
   2712 1:
   2713        movi            v4.8h,  #0
   2714        movi            v5.8h,  #0
   2715        movi            v6.8h,  #0
   2716        movi            v7.8h,  #0
   2717 2:
   2718        subs            w8,  w8,  #4
   2719 .rept 2
   2720        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
   2721 .endr
   2722        b.gt            2b
   2723 
   2724 3:
   2725 .irp i, 0, 8
   2726        add             x6,  x0,  #(\i*2)
   2727        add             x7,  sp,  #(\i*2)
   2728        mov             x8,  #16*2
   2729        bl              inv_txfm_add_vert_dct_8x32_neon
   2730 .endr
   2731 
   2732        add             sp,  sp,  #1024
   2733        ret             x15
   2734 endfunc
   2735 
   2736 function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
   2737        idct_dc         32,  16,  1
   2738 
   2739        mov             x15, x30
   2740        sub             sp,  sp,  #1024
   2741 
   2742        movrel          x13, eob_16x32
   2743        movrel          x5,  X(inv_dct_8h_x16_neon)
   2744        ldrh            w12, [x13], #2
   2745 
   2746 .irp i, 0, 4, 8, 12
   2747        add             x6,  sp,  #(\i*32*2)
   2748        add             x7,  x2,  #(\i*4)
   2749 .if \i > 0
   2750        mov             w8,  #(16 - \i)
   2751        cmp             w3,  w12
   2752        b.lt            1f
   2753 .if \i < 12
   2754        ldrh            w12, [x13], #2
   2755 .endif
   2756 .endif
   2757        mov             x8,  #4*16
   2758        bl              inv_txfm_horz_scale_dct_32x4_neon
   2759 .endr
   2760        b               3f
   2761 
   2762 1:
   2763        movi            v4.8h,  #0
   2764        movi            v5.8h,  #0
   2765        movi            v6.8h,  #0
   2766        movi            v7.8h,  #0
   2767 2:
   2768        subs            w8,  w8,  #4
   2769 .rept 4
   2770        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
   2771 .endr
   2772        b.gt            2b
   2773 
   2774 3:
   2775 .irp i, 0, 8, 16, 24
   2776        add             x6,  x0,  #(\i*2)
   2777        add             x7,  sp,  #(\i*2)
   2778        mov             x8,  #32*2
   2779        bl              inv_txfm_add_vert_8x16_neon
   2780 .endr
   2781 
   2782        add             sp,  sp,  #1024
   2783        ret             x15
   2784 endfunc
   2785 
   2786 function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
   2787        idct_dc         8,   32, 2
   2788 
   2789        mov             x15, x30
   2790        sub             sp,  sp,  #512
   2791 
   2792        movrel          x13, eob_8x32
   2793 
   2794        movi            v28.4s,  #0
   2795        mov             x8,  #4*32
   2796        mov             w9,  #32
   2797        mov             x6,  sp
   2798        mov             x7,  x2
   2799 1:
   2800 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
   2801        ld1             {v\i\().4s}, [x7]
   2802        st1             {v28.4s}, [x7], x8
   2803 .endr
   2804        ldrh            w12, [x13], #2
   2805        sub             w9,  w9,  #4
   2806        sub             x7,  x7,  x8, lsl #3
   2807        add             x7,  x7,  #4*4
   2808 
   2809        bl              inv_dct_4s_x8_neon
   2810 
   2811        sqrshrn         v16.4h,  v16.4s,  #2
   2812        sqrshrn         v17.4h,  v17.4s,  #2
   2813        sqrshrn         v18.4h,  v18.4s,  #2
   2814        sqrshrn         v19.4h,  v19.4s,  #2
   2815        sqrshrn2        v16.8h,  v20.4s,  #2
   2816        sqrshrn2        v17.8h,  v21.4s,  #2
   2817        sqrshrn2        v18.8h,  v22.4s,  #2
   2818        sqrshrn2        v19.8h,  v23.4s,  #2
   2819 
   2820        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
   2821 
   2822        cmp             w3,  w12
   2823        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
   2824 
   2825        b.ge            1b
   2826        cbz             w9,  3f
   2827 
   2828        movi            v29.8h,  #0
   2829        movi            v30.8h,  #0
   2830        movi            v31.8h,  #0
   2831 2:
   2832        subs            w9,  w9,  #4
    2833        st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x6], #64
   2834        b.gt            2b
   2835 
   2836 3:
   2837        mov             x6,  x0
   2838        mov             x7,  sp
   2839        mov             x8,  #8*2
   2840        bl              inv_txfm_add_vert_dct_8x32_neon
   2841 
   2842        add             sp,  sp,  #512
   2843        ret             x15
   2844 endfunc
   2845 
   2846 function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
   2847        idct_dc         32,  8,   2
   2848 
   2849        mov             x15, x30
   2850        sub             sp,  sp,  #512
   2851 
   2852 .irp i, 0, 4
   2853        add             x6,  sp,  #(\i*32*2)
   2854        add             x7,  x2,  #(\i*4)
   2855 .if \i > 0
   2856        cmp             w3,  #10
   2857        b.lt            1f
   2858 .endif
   2859        mov             x8,  #8*4
   2860        bl              inv_txfm_horz_dct_32x4_neon
   2861 .endr
   2862        b               2f
   2863 
   2864 1:
   2865        movi            v4.8h,   #0
   2866        movi            v5.8h,   #0
   2867        movi            v6.8h,   #0
   2868        movi            v7.8h,   #0
   2869 .rept 4
   2870        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
   2871 .endr
   2872 
   2873 2:
   2874        mov             x8,  #2*32
   2875        mov             w9,  #0
   2876 1:
   2877        add             x6,  x0,  x9, lsl #1
    2878        add             x7,  sp,  x9, lsl #1 // column offset 2*x9 into the intermediate buffer
   2879 
   2880 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
   2881        ld1             {v\i\().8h}, [x7], x8
   2882 .endr
   2883        add             w9,  w9,  #8
   2884 
   2885        bl              X(inv_dct_8h_x8_neon)
   2886 
   2887        cmp             w9,  #32
   2888 
   2889        load_add_store_8x8 x6, x7
   2890 
   2891        b.lt            1b
   2892 
   2893        add             sp,  sp,  #512
   2894        ret             x15
   2895 endfunc
   2896 
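         // One quarter of the odd half of the 64 point DCT: combines the four
         // input coefficients in v16-v19 with the multipliers loaded from
         // idct64_coeffs at x17, clamping intermediates to the range in v4/v5,
         // and appends eight t32-t63 values at x6 for inv_dct64_step2_neon.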
   2897 function inv_dct64_step1_neon
   2898        // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
   2899        // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
   2900        // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
   2901        // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
   2902 
   2903        ld1             {v0.4s, v1.4s}, [x17], #32
   2904 
   2905        sqrdmulh        v23.4s,  v16.4s,  v0.s[1]       // t63a
   2906        sqrdmulh        v16.4s,  v16.4s,  v0.s[0]       // t32a
   2907        sqrdmulh        v22.4s,  v17.4s,  v0.s[2]       // t62a
   2908        sqrdmulh        v17.4s,  v17.4s,  v0.s[3]       // t33a
   2909        sqrdmulh        v21.4s,  v18.4s,  v1.s[1]       // t61a
   2910        sqrdmulh        v18.4s,  v18.4s,  v1.s[0]       // t34a
   2911        sqrdmulh        v20.4s,  v19.4s,  v1.s[2]       // t60a
   2912        sqrdmulh        v19.4s,  v19.4s,  v1.s[3]       // t35a
   2913 
   2914        ld1             {v0.4s}, [x17], #16
   2915 
   2916        sqadd           v24.4s,  v16.4s,  v17.4s        // t32
   2917        sqsub           v25.4s,  v16.4s,  v17.4s        // t33
   2918        sqsub           v26.4s,  v19.4s,  v18.4s        // t34
   2919        sqadd           v27.4s,  v19.4s,  v18.4s        // t35
   2920        sqadd           v28.4s,  v20.4s,  v21.4s        // t60
   2921        sqsub           v29.4s,  v20.4s,  v21.4s        // t61
   2922        sqsub           v30.4s,  v23.4s,  v22.4s        // t62
   2923        sqadd           v31.4s,  v23.4s,  v22.4s        // t63
   2924 
   2925 .irp r, v24, v25, v26, v27, v28, v29, v30, v31
   2926        smin_4s         \r, \r, v5
   2927 .endr
   2928 .irp r, v24, v25, v26, v27, v28, v29, v30, v31
   2929        smax_4s         \r, \r, v4
   2930 .endr
   2931 
   2932        mul_mla         v2,  v29, v26, v0.s[0], v0.s[1] // -> t34a
   2933        mul_mls         v7,  v29, v26, v0.s[1], v0.s[0] // -> t61a
   2934        neg             v2.4s,   v2.4s                  // t34a
   2935        mul_mls         v6,  v30, v25, v0.s[1], v0.s[0] // -> t33a
   2936        srshr           v26.4s, v2.4s,  #12             // t34a
   2937        mul_mla         v2,  v30, v25, v0.s[0], v0.s[1] // -> t62a
   2938        srshr           v29.4s, v7.4s,  #12             // t61a
   2939        srshr           v25.4s, v6.4s,  #12             // t33a
   2940        srshr           v30.4s, v2.4s,  #12             // t62a
   2941 
   2942        sqadd           v16.4s,  v24.4s,  v27.4s        // t32a
   2943        sqsub           v19.4s,  v24.4s,  v27.4s        // t35a
   2944        sqadd           v17.4s,  v25.4s,  v26.4s        // t33
   2945        sqsub           v18.4s,  v25.4s,  v26.4s        // t34
   2946        sqsub           v20.4s,  v31.4s,  v28.4s        // t60a
   2947        sqadd           v23.4s,  v31.4s,  v28.4s        // t63a
   2948        sqsub           v21.4s,  v30.4s,  v29.4s        // t61
   2949        sqadd           v22.4s,  v30.4s,  v29.4s        // t62
   2950 
   2951 .irp r, v16, v19, v17, v18, v20, v23, v21, v22
   2952        smin_4s         \r, \r, v5
   2953 .endr
   2954 .irp r, v16, v19, v17, v18, v20, v23, v21, v22
   2955        smax_4s         \r, \r, v4
   2956 .endr
   2957 
   2958        mul_mla         v2,  v21, v18, v0.s[2], v0.s[3] // -> t61a
   2959        mul_mls         v7,  v21, v18, v0.s[3], v0.s[2] // -> t34a
   2960        mul_mla         v6,  v20, v19, v0.s[2], v0.s[3] // -> t60
   2961        srshr           v21.4s, v2.4s,  #12             // t61a
   2962        srshr           v18.4s, v7.4s,  #12             // t34a
   2963        mul_mls         v2,  v20, v19, v0.s[3], v0.s[2] // -> t35
   2964        srshr           v20.4s, v6.4s,  #12             // t60
   2965        srshr           v19.4s, v2.4s,  #12             // t35
   2966 
   2967        st1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64
   2968        st1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64
   2969 
   2970        ret
   2971 endfunc
   2972 
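         // Final combination stage for the odd half of the 64 point DCT: merges
         // the t32-t63 values written by the four inv_dct64_step1_neon calls,
         // walking x6 forwards and x9 backwards over the stored intermediates.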
   2973 function inv_dct64_step2_neon
   2974        movrel          x16, idct_coeffs
   2975        ld1             {v0.4s}, [x16]
   2976 1:
   2977        // t32a/33/34a/35/60/61a/62/63a
   2978        // t56a/57/58a/59/36/37a/38/39a
   2979        // t40a/41/42a/43/52/53a/54/55a
   2980        // t48a/49/50a/51/44/45a/46/47a
   2981        ldr             q16, [x6, #4*4*0]  // t32a
   2982        ldr             q17, [x9, #4*4*8]  // t39a
   2983        ldr             q18, [x9, #4*4*0]  // t63a
   2984        ldr             q19, [x6, #4*4*8]  // t56a
   2985        ldr             q20, [x6, #4*4*16] // t40a
   2986        ldr             q21, [x9, #4*4*24] // t47a
   2987        ldr             q22, [x9, #4*4*16] // t55a
   2988        ldr             q23, [x6, #4*4*24] // t48a
   2989 
   2990        sqadd           v24.4s,  v16.4s, v17.4s         // t32
   2991        sqsub           v25.4s,  v16.4s, v17.4s         // t39
   2992        sqadd           v26.4s,  v18.4s, v19.4s         // t63
   2993        sqsub           v27.4s,  v18.4s, v19.4s         // t56
   2994        sqsub           v28.4s,  v21.4s, v20.4s         // t40
   2995        sqadd           v29.4s,  v21.4s, v20.4s         // t47
   2996        sqadd           v30.4s,  v23.4s, v22.4s         // t48
   2997        sqsub           v31.4s,  v23.4s, v22.4s         // t55
   2998 
   2999 .irp r, v24, v25, v26, v27, v28, v29, v30, v31
   3000        smin_4s         \r, \r, v5
   3001 .endr
   3002 .irp r, v24, v25, v26, v27, v28, v29, v30, v31
   3003        smax_4s         \r, \r, v4
   3004 .endr
   3005 
   3006        mul_mla         v2,  v27, v25, v0.s[3], v0.s[2] // -> t56a
   3007        mul_mls         v7,  v27, v25, v0.s[2], v0.s[3] // -> t39a
   3008        mul_mla         v6,  v31, v28, v0.s[3], v0.s[2] // -> t40a
   3009        srshr           v25.4s, v2.4s,  #12             // t56a
   3010        srshr           v27.4s, v7.4s,  #12             // t39a
   3011        neg             v6.4s,   v6.4s                  // t40a
   3012        mul_mls         v2,  v31, v28, v0.s[2], v0.s[3] // -> t55a
   3013        srshr           v31.4s, v6.4s,  #12             // t40a
   3014        srshr           v28.4s, v2.4s,  #12             // t55a
   3015 
   3016        sqadd           v16.4s,  v24.4s,  v29.4s        // t32a
   3017        sqsub           v19.4s,  v24.4s,  v29.4s        // t47a
   3018        sqadd           v17.4s,  v27.4s,  v31.4s        // t39
   3019        sqsub           v18.4s,  v27.4s,  v31.4s        // t40
   3020        sqsub           v20.4s,  v26.4s,  v30.4s        // t48a
   3021        sqadd           v23.4s,  v26.4s,  v30.4s        // t63a
   3022        sqsub           v21.4s,  v25.4s,  v28.4s        // t55
   3023        sqadd           v22.4s,  v25.4s,  v28.4s        // t56
   3024 
   3025 .irp r, v16, v19, v17, v18, v20, v23, v21, v22
   3026        smin_4s         \r, \r, v5
   3027 .endr
   3028 .irp r, v16, v19, v17, v18, v20, v23, v21, v22
   3029        smax_4s         \r, \r, v4
   3030 .endr
   3031 
   3032        mul_mls         v2,  v21, v18, v0.s[0], v0.s[0] // -> t40a
   3033        mul_mla         v7,  v21, v18, v0.s[0], v0.s[0] // -> t55a
   3034        mul_mls         v6,  v20, v19, v0.s[0], v0.s[0] // -> t47
   3035        srshr           v18.4s, v2.4s,  #12             // t40a
   3036        srshr           v21.4s, v7.4s,  #12             // t55a
   3037        mul_mla         v2,  v20, v19, v0.s[0], v0.s[0] // -> t48
   3038        srshr           v19.4s, v6.4s,  #12             // t47
   3039        srshr           v20.4s, v2.4s,  #12             // t48
   3040 
   3041        str             q16, [x6, #4*4*0]  // t32a
   3042        str             q17, [x9, #4*4*0]  // t39
   3043        str             q18, [x6, #4*4*8]  // t40a
   3044        str             q19, [x9, #4*4*8]  // t47
   3045        str             q20, [x6, #4*4*16] // t48
   3046        str             q21, [x9, #4*4*16] // t55a
   3047        str             q22, [x6, #4*4*24] // t56
   3048        str             q23, [x9, #4*4*24] // t63a
   3049 
   3050        add             x6,  x6,  #4*4
   3051        sub             x9,  x9,  #4*4
   3052        cmp             x6,  x9
   3053        b.lt            1b
   3054        ret
   3055 endfunc
   3056 
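         // Helper macros for the 64 point DCT functions below. load8, store16 and
         // clear_upper8 move or zero blocks of registers, and the *_if macros
         // only emit their instruction when the \clear/\scale argument of
         // def_dct64_func is nonzero.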
   3057 .macro load8 src, strd, zero, clear
   3058 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
   3059 .if \clear
   3060        ld1             {\i}, [\src]
   3061        st1             {\zero}, [\src], \strd
   3062 .else
   3063        ld1             {\i}, [\src], \strd
   3064 .endif
   3065 .endr
   3066 .endm
   3067 
   3068 .macro store16 dst
   3069 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
   3070        st1             {\i}, [\dst], #16
   3071 .endr
   3072 .endm
   3073 
   3074 .macro clear_upper8
   3075 .irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
   3076        movi            \i,  #0
   3077 .endr
   3078 .endm
   3079 
   3080 .macro movi_if reg, val, cond
   3081 .if \cond
   3082        movi            \reg, \val
   3083 .endif
   3084 .endm
   3085 
   3086 .macro movz16dup_if reg, gpr, val, cond
   3087 .if \cond
   3088        movz            \gpr, \val, lsl #16
   3089        dup             \reg, \gpr
   3090 .endif
   3091 .endm
   3092 
   3093 .macro st1_if regs, dst, cond
   3094 .if \cond
   3095        st1             \regs, \dst
   3096 .endif
   3097 .endm
   3098 
   3099 .macro str_if reg, dst, cond
   3100 .if \cond
   3101        str             \reg, \dst
   3102 .endif
   3103 .endm
   3104 
   3105 .macro stroff_if reg, dst, dstoff, cond
   3106 .if \cond
   3107        str             \reg, \dst, \dstoff
   3108 .endif
   3109 .endm
   3110 
   3111 .macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
   3112 .if \cond
   3113        scale_input     .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
   3114 .endif
   3115 .endm
   3116 
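         // Defines inv_txfm_dct_clear_4s_x64_neon and
         // inv_txfm_dct_clear_scale_4s_x64_neon, a 64 point inverse DCT on a
         // 4 column slice: the even half is computed with inv_dct_4s_x16_neon
         // plus inv_dct32_odd_4s_x16_neon, the odd half with four
         // inv_dct64_step1_neon calls followed by inv_dct64_step2_neon. \clear
         // zeroes the coefficients as they are read; \scale premultiplies the
         // input by 2896 (~1/sqrt(2)) for the rectangular sizes.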
   3117 .macro def_dct64_func suffix, clear=0, scale=0
   3118 function inv_txfm_dct\suffix\()_4s_x64_neon
   3119        mov             x14, x30
   3120        mov             x6,  sp
   3121        lsl             x8,  x8,  #2
   3122 
   3123        movz16dup_if    v0.2s, w16, #2896*8, \scale
   3124        movi_if         v7.4s,  #0, \clear
   3125        load8           x7,  x8,  v7.4s, \clear
   3126        clear_upper8
   3127        sub             x7,  x7,  x8, lsl #3
   3128        add             x7,  x7,  x8, lsr #1
   3129        scale_if        \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
   3130 
   3131        bl              inv_dct_4s_x16_neon
   3132 
   3133        // idct_16 leaves the row_clip_max/min constants in v5 and v4
   3134 .irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
   3135        smin_4s         \r, \r, v5
   3136 .endr
   3137 .irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
   3138        smax_4s         \r, \r, v4
   3139 .endr
   3140 
   3141        store16         x6
   3142 
   3143        movz16dup_if    v0.2s, w16, #2896*8, \scale
   3144        movi_if         v7.8h,  #0, \clear
   3145        load8           x7,  x8,  v7.4s, \clear
   3146        clear_upper8
   3147        sub             x7,  x7,  x8, lsl #3
   3148        lsr             x8,  x8,  #1
   3149        sub             x7,  x7,  x8, lsr #1
   3150        scale_if        \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
   3151 
   3152        bl              inv_dct32_odd_4s_x16_neon
   3153 
   3154        add             x10, x6,  #16*15
   3155        sub             x6,  x6,  #16*16
   3156 
   3157        mov             x9,  #-16
   3158 
   3159        movi            v1.4s,  #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
   3160        mvni            v0.4s,  #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
   3161 
   3162 .macro store_addsub r0, r1, r2, r3
   3163        ld1             {v2.4s}, [x6], #16
   3164        ld1             {v3.4s}, [x6], #16
   3165        sqadd           v6.4s,  v2.4s,  \r0
   3166        sqsub           \r0,    v2.4s,  \r0
   3167        ld1             {v4.4s}, [x6], #16
   3168        sqadd           v7.4s,  v3.4s,  \r1
   3169        sqsub           \r1,    v3.4s,  \r1
   3170        smin            v6.4s,  v6.4s,  v1.4s
   3171        smin            \r0,    \r0,    v1.4s
   3172        ld1             {v5.4s}, [x6], #16
   3173        sqadd           v2.4s,  v4.4s,  \r2
   3174        sub             x6,  x6,  #16*4
   3175        smax            v6.4s,  v6.4s,  v0.4s
   3176        smax            \r0,    \r0,    v0.4s
   3177        sqsub           \r2,    v4.4s,  \r2
   3178        smin            v7.4s,  v7.4s,  v1.4s
   3179        smin            \r1,    \r1,    v1.4s
   3180        st1             {v6.4s}, [x6], #16
   3181        st1             {\r0},   [x10], x9
   3182        smin            v2.4s,  v2.4s,  v1.4s
   3183        smin            \r2,    \r2,    v1.4s
   3184        smax            v7.4s,  v7.4s,  v0.4s
   3185        smax            \r1,    \r1,    v0.4s
   3186        sqadd           v3.4s,  v5.4s,  \r3
   3187        sqsub           \r3,    v5.4s,  \r3
   3188        smax            v2.4s,  v2.4s,  v0.4s
   3189        smax            \r2,    \r2,    v0.4s
   3190        smin            v3.4s,  v3.4s,  v1.4s
   3191        smin            \r3,    \r3,    v1.4s
   3192        st1             {v7.4s}, [x6], #16
   3193        st1             {\r1},   [x10], x9
   3194        smax            v3.4s,  v3.4s,  v0.4s
   3195        smax            \r3,    \r3,    v0.4s
   3196        st1             {v2.4s}, [x6], #16
   3197        st1             {\r2},   [x10], x9
   3198        st1             {v3.4s}, [x6], #16
   3199        st1             {\r3},   [x10], x9
   3200 .endm
   3201        store_addsub    v31.4s, v30.4s, v29.4s, v28.4s
   3202        store_addsub    v27.4s, v26.4s, v25.4s, v24.4s
   3203        store_addsub    v23.4s, v22.4s, v21.4s, v20.4s
   3204        store_addsub    v19.4s, v18.4s, v17.4s, v16.4s
   3205 .purgem store_addsub
   3206 
   3207        add             x6,  x6,  #4*4*16
   3208 
   3209        movrel          x17, idct64_coeffs
   3210        movi            v5.4s,  #1, msl #16  // row_clip_max = ~(~bdmax << 7), 0x1ffff
   3211        mvni            v4.4s,  #1, msl #16  // row_clip_min = (~bdmax << 7), 0xfffe0000
   3212        movz16dup_if    v0.2s, w16, #2896*8, \scale
   3213        movi_if         v7.4s,  #0, \clear
   3214        add             x9,  x7,  x8, lsl #4 // offset 16
   3215        add             x10, x7,  x8, lsl #3 // offset 8
   3216        sub             x9,  x9,  x8         // offset 15
   3217        sub             x11, x10, x8         // offset 7
   3218        ld1             {v16.4s}, [x7]  // in1  (offset 0)
   3219        ld1             {v17.4s}, [x9]  // in31 (offset 15)
   3220        ld1             {v18.4s}, [x10] // in17 (offset 8)
   3221        ld1             {v19.4s}, [x11] // in15 (offset 7)
   3222        st1_if          {v7.4s}, [x7],  \clear
   3223        st1_if          {v7.4s}, [x9],  \clear
   3224        st1_if          {v7.4s}, [x10], \clear
   3225        st1_if          {v7.4s}, [x11], \clear
   3226        scale_if        \scale, v0.s[0], v16, v17, v18, v19
   3227        bl              inv_dct64_step1_neon
   3228        movz16dup_if    v0.2s, w16, #2896*8, \scale
   3229        movi_if         v7.4s,  #0, \clear
   3230        add             x7,  x7,  x8, lsl #2 // offset 4
   3231        sub             x9,  x9,  x8, lsl #2 // offset 11
   3232        sub             x10, x7,  x8         // offset 3
   3233        add             x11, x9,  x8         // offset 12
   3234        ld1             {v16.4s}, [x10] // in7  (offset 3)
   3235        ld1             {v17.4s}, [x11] // in25 (offset 12)
   3236        ld1             {v18.4s}, [x9]  // in23 (offset 11)
   3237        ld1             {v19.4s}, [x7]  // in9  (offset 4)
   3238        st1_if          {v7.4s}, [x7],  \clear
   3239        st1_if          {v7.4s}, [x9],  \clear
   3240        st1_if          {v7.4s}, [x10], \clear
   3241        st1_if          {v7.4s}, [x11], \clear
   3242        scale_if        \scale, v0.s[0], v16, v17, v18, v19
   3243        bl              inv_dct64_step1_neon
   3244        movz16dup_if    v0.2s, w16, #2896*8, \scale
   3245        movi_if         v7.4s,  #0, \clear
   3246        sub             x10, x10, x8, lsl #1 // offset 1
   3247        sub             x9,  x9,  x8, lsl #1 // offset 9
   3248        add             x7,  x7,  x8         // offset 5
   3249        add             x11, x11, x8         // offset 13
   3250        ldr             q16, [x10, x8] // in5  (offset 2)
   3251        ldr             q17, [x11]     // in27 (offset 13)
   3252        ldr             q18, [x9,  x8] // in21 (offset 10)
   3253        ldr             q19, [x7]      // in11 (offset 5)
   3254        stroff_if       q7,  [x10, x8], \clear
   3255        str_if          q7,  [x11],     \clear
   3256        stroff_if       q7,  [x9,  x8], \clear
   3257        str_if          q7,  [x7],      \clear
   3258        scale_if        \scale, v0.s[0], v16, v17, v18, v19
   3259        bl              inv_dct64_step1_neon
   3260        movz16dup_if    v0.2s, w16, #2896*8, \scale
   3261        movi_if         v7.4s,  #0, \clear
   3262        ldr             q16, [x10]     // in3  (offset 1)
   3263        ldr             q17, [x11, x8] // in29 (offset 14)
   3264        ldr             q18, [x9]      // in19 (offset 9)
   3265        ldr             q19, [x7,  x8] // in13 (offset 6)
   3266        str_if          q7,  [x10],     \clear
   3267        stroff_if       q7,  [x11, x8], \clear
   3268        str_if          q7,  [x9],      \clear
   3269        stroff_if       q7,  [x7,  x8], \clear
   3270        scale_if        \scale, v0.s[0], v16, v17, v18, v19
   3271        bl              inv_dct64_step1_neon
   3272 
   3273        sub             x6,  x6,  #4*4*32
   3274        add             x9,  x6,  #4*4*7
   3275 
   3276        bl              inv_dct64_step2_neon
   3277 
   3278        ret             x14
   3279 endfunc
   3280 .endm
   3281 
   3282 def_dct64_func _clear, clear=1
   3283 def_dct64_func _clear_scale, clear=1, scale=1
   3284 
   3285 
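         // First pass helper for the 64 point transforms: reads the two mirrored
         // 32 row halves left on the stack by inv_txfm_dct*_4s_x64_neon, does the
         // final add/sub, applies the rounding shift passed in w12, narrows to
         // 16 bit and stores a 4x64 strip at x6.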
   3286 function inv_txfm_horz_dct_64x4_neon
   3287        mov             x14, x30
   3288 
   3289        mov             x7,  sp
   3290        add             x8,  sp,  #4*4*(64 - 4)
   3291        add             x9,  x6,  #2*56
   3292        mov             x10, #2*64
   3293        mov             x11, #-4*4*4
   3294 
   3295        dup             v7.4s,  w12
   3296 1:
   3297        ld1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64
   3298        ld1             {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11
   3299        ld1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64
   3300        ld1             {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11
   3301        transpose_4x4s  v16, v17, v18, v19, v2,  v3,  v4,  v5
   3302        transpose_4x4s  v20, v21, v22, v23, v2,  v3,  v4,  v5
   3303        transpose_4x4s  v31, v30, v29, v28, v2,  v3,  v4,  v5
   3304        transpose_4x4s  v27, v26, v25, v24, v2,  v3,  v4,  v5
   3305 
   3306 .macro store_addsub src0, src1, src2, src3
   3307        sqsub           v1.4s,   \src0,   \src1
   3308        sqadd           v0.4s,   \src0,   \src1
   3309        sqsub           v3.4s,   \src2,   \src3
   3310        srshl           v1.4s,   v1.4s,   v7.4s
   3311        sqadd           v2.4s,   \src2,   \src3
   3312        srshl           v3.4s,   v3.4s,   v7.4s
   3313        srshl           v0.4s,   v0.4s,   v7.4s
   3314        srshl           v2.4s,   v2.4s,   v7.4s
   3315        sqxtn           v3.4h,   v3.4s
   3316        sqxtn2          v3.8h,   v1.4s
   3317        sqxtn           v0.4h,   v0.4s
   3318        sqxtn2          v0.8h,   v2.4s
   3319        rev64           v3.8h,   v3.8h
   3320        st1             {v0.8h},  [x6], x10
   3321        st1             {v3.8h},  [x9], x10
   3322 .endm
   3323        store_addsub    v16.4s,  v31.4s,  v20.4s,  v27.4s
   3324        store_addsub    v17.4s,  v30.4s,  v21.4s,  v26.4s
   3325        store_addsub    v18.4s,  v29.4s,  v22.4s,  v25.4s
   3326        store_addsub    v19.4s,  v28.4s,  v23.4s,  v24.4s
   3327 .purgem store_addsub
   3328        sub             x6,  x6,  x10, lsl #2
   3329        sub             x9,  x9,  x10, lsl #2
   3330        add             x6,  x6,  #16
   3331        sub             x9,  x9,  #16
   3332 
   3333        cmp             x7,  x8
   3334        b.lt            1b
   3335        ret             x14
   3336 endfunc
   3337 
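         // Second pass helper for the 64 point transforms: combines rows 0-31
         // with the mirrored rows 32-63 of the 16 bit intermediate on the stack,
         // rounds by 4 bits, adds the result to the destination with unsigned
         // saturation and clamps to the pixel maximum kept in v7.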
   3338 function inv_txfm_add_vert_dct_8x64_neon
   3339        mov             x14, x30
   3340        lsl             x8,  x8,  #1
   3341 
   3342        mov             x7,  sp
   3343        add             x8,  sp,  #2*8*(64 - 4)
   3344        add             x9,  x6,  x1, lsl #6
   3345        sub             x9,  x9,  x1
   3346        neg             x10, x1
   3347        mov             x11, #-2*8*4
   3348 
   3349 1:
   3350        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
   3351        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
   3352        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
   3353        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
   3354 
   3355        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
   3356 .macro add_dest_addsub src0, src1, src2, src3
   3357        ld1             {v0.8h}, [x6], x1
   3358        ld1             {v1.8h}, [x9], x10
   3359        sqadd           v4.8h,   \src0,   \src1
   3360        ld1             {v2.8h}, [x6]
   3361        sqsub           \src0,   \src0,   \src1
   3362        ld1             {v3.8h}, [x9]
   3363        sqadd           v5.8h,   \src2,   \src3
   3364        sqsub           \src2,   \src2,   \src3
   3365        sub             x6,  x6,  x1
   3366        sub             x9,  x9,  x10
   3367        srshr           v4.8h,   v4.8h,   #4
   3368        srshr           v5.8h,   v5.8h,   #4
   3369        srshr           \src0,   \src0,   #4
   3370        usqadd          v0.8h,   v4.8h
   3371        srshr           \src2,   \src2,   #4
   3372        usqadd          v1.8h,   \src0
   3373        usqadd          v2.8h,   v5.8h
   3374        smin            v0.8h,   v0.8h,   v7.8h
   3375        usqadd          v3.8h,   \src2
   3376        smin            v1.8h,   v1.8h,   v7.8h
   3377        st1             {v0.8h}, [x6], x1
   3378        smin            v2.8h,   v2.8h,   v7.8h
   3379        st1             {v1.8h}, [x9], x10
   3380        smin            v3.8h,   v3.8h,   v7.8h
   3381        st1             {v2.8h}, [x6], x1
   3382        st1             {v3.8h}, [x9], x10
   3383 .endm
   3384        add_dest_addsub v16.8h,  v31.8h,  v17.8h,  v30.8h
   3385        add_dest_addsub v18.8h,  v29.8h,  v19.8h,  v28.8h
   3386        add_dest_addsub v20.8h,  v27.8h,  v21.8h,  v26.8h
   3387        add_dest_addsub v22.8h,  v25.8h,  v23.8h,  v24.8h
   3388 .purgem add_dest_addsub
   3389        cmp             x7,  x8
   3390        b.lt            1b
   3391 
   3392        ret             x14
   3393 endfunc
   3394 
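         // The 64 point dct_dct functions reserve a scratch area of 64*4*4 or
         // 64*8*2 bytes at the bottom of the stack for the 4s/8h x64 DCT helpers,
         // with the main intermediate buffer placed above it at x4/x5.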
   3395 function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
   3396        idct_dc         64,  64,  2
   3397 
   3398        mov             x15, x30
   3399 
   3400        sub_sp          64*32*2+64*4*4
   3401        add             x5,  sp, #64*4*4
   3402 
   3403        movrel          x13, eob_32x32
   3404 
   3405 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
   3406        add             x6,  x5,  #(\i*64*2)
   3407 .if \i > 0
   3408        mov             w8,  #(32 - \i)
   3409        cmp             w3,  w12
   3410        b.lt            1f
   3411 .endif
   3412        add             x7,  x2,  #(\i*4)
   3413        mov             x8,  #32*4
   3414        mov             x12, #-2 // shift
   3415        bl              inv_txfm_dct_clear_4s_x64_neon
   3416        add             x6,  x5,  #(\i*64*2)
   3417        bl              inv_txfm_horz_dct_64x4_neon
   3418 .if \i < 28
   3419        ldrh            w12, [x13], #2
   3420 .endif
   3421 .endr
   3422        b               3f
   3423 
   3424 1:
   3425        movi            v4.8h,  #0
   3426        movi            v5.8h,  #0
   3427        movi            v6.8h,  #0
   3428        movi            v7.8h,  #0
   3429 2:
   3430        subs            w8,  w8,  #2
   3431 .rept 4
   3432        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
   3433 .endr
   3434        b.gt            2b
   3435 
   3436 3:
   3437 .irp i, 0, 8, 16, 24, 32, 40, 48, 56
   3438        add             x7,  x5,  #(\i*2)
   3439        mov             x8,  #64*2
   3440        bl              X(inv_txfm_dct_8h_x64_neon)
   3441        add             x6,  x0,  #(\i*2)
   3442        bl              inv_txfm_add_vert_dct_8x64_neon
   3443 .endr
   3444 
   3445        add             sp,  x5,  #64*32*2
   3446        ret             x15
   3447 endfunc
   3448 
   3449 function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
   3450        idct_dc         64,  32,  1
   3451 
   3452        mov             x15, x30
   3453 
   3454        sub_sp          64*32*2+64*4*4
   3455        add             x5,  sp, #64*4*4
   3456 
   3457        movrel          x13, eob_32x32
   3458 
   3459 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
   3460        add             x6,  x5,  #(\i*64*2)
   3461 .if \i > 0
   3462        mov             w8,  #(32 - \i)
   3463        cmp             w3,  w12
   3464        b.lt            1f
   3465 .endif
   3466        add             x7,  x2,  #(\i*4)
   3467        mov             x8,  #32*4
   3468        mov             x12, #-1 // shift
   3469        bl              inv_txfm_dct_clear_scale_4s_x64_neon
   3470        add             x6,  x5,  #(\i*64*2)
   3471        bl              inv_txfm_horz_dct_64x4_neon
   3472 .if \i < 28
   3473        ldrh            w12, [x13], #2
   3474 .endif
   3475 .endr
   3476        b               3f
   3477 
   3478 1:
   3479        movi            v4.8h,  #0
   3480        movi            v5.8h,  #0
   3481        movi            v6.8h,  #0
   3482        movi            v7.8h,  #0
   3483 2:
   3484        subs            w8,  w8,  #2
   3485 .rept 4
   3486        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
   3487 .endr
   3488        b.gt            2b
   3489 
   3490 3:
   3491 .irp i, 0, 8, 16, 24, 32, 40, 48, 56
   3492        add             x6,  x0,  #(\i*2)
   3493        add             x7,  x5,  #(\i*2)
   3494        mov             x8,  #64*2
   3495        bl              inv_txfm_add_vert_dct_8x32_neon
   3496 .endr
   3497 
   3498        add             sp,  x5,  #64*32*2
   3499        ret             x15
   3500 endfunc
   3501 
   3502 function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
   3503        idct_dc         32,  64,  1
   3504 
   3505        mov             x15, x30
   3506 
   3507        sub_sp          32*32*2+64*8*2
   3508        add             x5,  sp, #64*8*2
   3509 
   3510        movrel          x13, eob_32x32
   3511        ldrh            w12, [x13], #2
   3512 
   3513 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
   3514        add             x6,  x5,  #(\i*32*2)
   3515 .if \i > 0
   3516        mov             w8,  #(32 - \i)
   3517        cmp             w3,  w12
   3518        b.lt            1f
   3519        ldrh            w12, [x13], #2
   3520 .endif
   3521        add             x7,  x2,  #(\i*4)
   3522        mov             x8,  #32*4
   3523        bl              inv_txfm_horz_scale_dct_32x4_neon
   3524 .endr
   3525        b               3f
   3526 
   3527 1:
   3528        movi            v4.8h,  #0
   3529        movi            v5.8h,  #0
   3530        movi            v6.8h,  #0
   3531        movi            v7.8h,  #0
   3532 2:
   3533        subs            w8,  w8,  #4
   3534 .rept 4
   3535        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
   3536 .endr
   3537        b.gt            2b
   3538 
   3539 3:
   3540 .irp i, 0, 8, 16, 24
   3541        add             x7,  x5,  #(\i*2)
   3542        mov             x8,  #32*2
   3543        bl              X(inv_txfm_dct_8h_x64_neon)
   3544        add             x6,  x0,  #(\i*2)
   3545        bl              inv_txfm_add_vert_dct_8x64_neon
   3546 .endr
   3547 
   3548        add             sp,  x5,  #32*32*2
   3549        ret             x15
   3550 endfunc
   3551 
   3552 function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
   3553        idct_dc         64,  16,  2
   3554 
   3555        mov             x15, x30
   3556 
   3557        sub_sp          64*16*2+64*4*4
   3558        add             x4,  sp, #64*4*4
   3559 
   3560        movrel          x13, eob_16x32
   3561 
   3562 .irp i, 0, 4, 8, 12
   3563        add             x6,  x4,  #(\i*64*2)
   3564 .if \i > 0
   3565        mov             w8,  #(16 - \i)
   3566        cmp             w3,  w12
   3567        b.lt            1f
   3568 .endif
   3569        add             x7,  x2,  #(\i*4)
   3570        mov             x8,  #16*4
   3571        mov             x12, #-2 // shift
   3572        bl              inv_txfm_dct_clear_4s_x64_neon
   3573        add             x6,  x4,  #(\i*64*2)
   3574        bl              inv_txfm_horz_dct_64x4_neon
   3575 .if \i < 12
   3576        ldrh            w12, [x13], #2
   3577 .endif
   3578 .endr
   3579        b               3f
   3580 
   3581 1:
   3582        movi            v4.8h,  #0
   3583        movi            v5.8h,  #0
   3584        movi            v6.8h,  #0
   3585        movi            v7.8h,  #0
   3586 2:
   3587        subs            w8,  w8,  #2
   3588 .rept 4
   3589        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
   3590 .endr
   3591        b.gt            2b
   3592 
   3593 3:
   3594        movrel          x5,  X(inv_dct_8h_x16_neon)
   3595 .irp i, 0, 8, 16, 24, 32, 40, 48, 56
   3596        add             x6,  x0,  #(\i*2)
   3597        add             x7,  x4,  #(\i*2)
   3598        mov             x8,  #64*2
   3599        bl              inv_txfm_add_vert_8x16_neon
   3600 .endr
   3601 
   3602        add             sp,  x4,  #64*16*2
   3603        ret             x15
   3604 endfunc
   3605 
   3606 function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
   3607        idct_dc         16,  64,  2
   3608 
   3609        mov             x15, x30
   3610 
   3611        sub_sp          16*32*2+64*8*2
   3612        add             x5,  sp, #64*8*2
   3613 
   3614        movrel          x13, eob_16x32
   3615        ldrh            w12, [x13], #2
   3616 
   3617        adr             x4,  inv_dct_4s_x16_neon
   3618 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
   3619        add             x6,  x5,  #(\i*16*2)
   3620 .if \i > 0
   3621        mov             w8,  #(32 - \i)
   3622        cmp             w3,  w12
   3623        b.lt            1f
   3624 .if \i < 28
   3625        ldrh            w12, [x13], #2
   3626 .endif
   3627 .endif
   3628        add             x7,  x2,  #(\i*4)
   3629        mov             x8,  #32*4
   3630        bl              inv_txfm_horz_16x4_neon
   3631 .endr
   3632        b               3f
   3633 
   3634 1:
   3635        movi            v4.8h,  #0
   3636        movi            v5.8h,  #0
   3637        movi            v6.8h,  #0
   3638        movi            v7.8h,  #0
   3639 2:
   3640        subs            w8,  w8,  #4
   3641 .rept 2
   3642        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
   3643 .endr
   3644        b.gt            2b
   3645 
   3646 3:
   3647 .irp i, 0, 8
   3648        add             x7,  x5,  #(\i*2)
   3649        mov             x8,  #16*2
   3650        bl              X(inv_txfm_dct_8h_x64_neon)
   3651        add             x6,  x0,  #(\i*2)
   3652        bl              inv_txfm_add_vert_dct_8x64_neon
   3653 .endr
   3654 
   3655        add             sp,  x5,  #16*32*2
   3656        ret             x15
   3657 endfunc