tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

vp9itxfm_neon.S (63534B)


      1 /*
      2 * Copyright (c) 2016 Google Inc.
      3 *
      4 * This file is part of FFmpeg.
      5 *
      6 * FFmpeg is free software; you can redistribute it and/or
      7 * modify it under the terms of the GNU Lesser General Public
      8 * License as published by the Free Software Foundation; either
      9 * version 2.1 of the License, or (at your option) any later version.
     10 *
     11 * FFmpeg is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14 * Lesser General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU Lesser General Public
     17 * License along with FFmpeg; if not, write to the Free Software
     18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     19 */
     20 
     21 #include "libavutil/aarch64/asm.S"
     22 #include "neon.S"
     23 
      24 const itxfm4_coeffs, align=4
         // 14-bit fixed-point multipliers; every product against these is
         // rounded back down with a ">> 14" by the butterfly macros below.
      25        .short  11585, 0, 6270, 15137
      26 iadst4_coeffs:
      27        .short  5283, 15212, 9929, 13377
      28 endconst
      29 
      30 const iadst8_coeffs, align=4
         // idct_coeffs is placed directly after iadst8_coeffs so the iadst8
         // path can load v1 and then v0 from one post-incremented pointer
         // (see itxfm_func8x8 below).
      31        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
      32 idct_coeffs:
      33        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
      34        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
      35        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
      36        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
      37 endconst
      38 
      39 const iadst16_coeffs, align=4
      40        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
      41        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
      42 endconst
     43 
      44 // out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
      45 // out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14
      46 // in/out are .8h registers; this can do with 4 temp registers, but is
      47 // more efficient if 6 temp registers are available.
         // If neg=1, out1 is multiplied by -v0[0] instead (out2 is unchanged).
      48 .macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
      49 .if \neg > 0
      50        neg             \tmp4\().4h, v0.4h
      51 .endif
      52        add             \tmp1\().8h, \in1\().8h,  \in2\().8h
      53        sub             \tmp2\().8h, \in1\().8h,  \in2\().8h
      54 .if \neg > 0
      55        smull           \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
      56        smull2          \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
      57 .else
      58        smull           \tmp3\().4s, \tmp1\().4h, v0.h[0]
      59        smull2          \tmp4\().4s, \tmp1\().8h, v0.h[0]
      60 .endif
      61 .ifb \tmp5
         // Only 4 temps available: the second multiply has to wait until the
         // first pair of products has been narrowed out of tmp3/tmp4.
      62        rshrn           \out1\().4h, \tmp3\().4s, #14
      63        rshrn2          \out1\().8h, \tmp4\().4s, #14
      64        smull           \tmp3\().4s, \tmp2\().4h, v0.h[0]
      65        smull2          \tmp4\().4s, \tmp2\().8h, v0.h[0]
      66        rshrn           \out2\().4h, \tmp3\().4s, #14
      67        rshrn2          \out2\().8h, \tmp4\().4s, #14
      68 .else
         // 6 temps: both multiplies can be issued before any narrowing,
         // giving the multiplier pipeline more independent work.
      69        smull           \tmp5\().4s, \tmp2\().4h, v0.h[0]
      70        smull2          \tmp6\().4s, \tmp2\().8h, v0.h[0]
      71        rshrn           \out1\().4h, \tmp3\().4s, #14
      72        rshrn2          \out1\().8h, \tmp4\().4s, #14
      73        rshrn           \out2\().4h, \tmp5\().4s, #14
      74        rshrn2          \out2\().8h, \tmp6\().4s, #14
      75 .endif
      76 .endm
     77 
      78 // Same as dmbutterfly0 above, but treating the input in in2 as zero,
      79 // writing the same output into both out1 and out2.
         // With in2 == 0, in1 + in2 == in1 - in2, so one multiply serves both
         // outputs. tmp3-tmp6 are accepted but unused (signature kept parallel
         // to dmbutterfly0).
      80 .macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
      81        smull           \tmp1\().4s,  \in1\().4h,  v0.h[0]
      82        smull2          \tmp2\().4s,  \in1\().8h,  v0.h[0]
      83        rshrn           \out1\().4h,  \tmp1\().4s, #14
      84        rshrn2          \out1\().8h,  \tmp2\().4s, #14
      85        rshrn           \out2\().4h,  \tmp1\().4s, #14
      86        rshrn2          \out2\().8h,  \tmp2\().4s, #14
      87 .endm
     88 
      89 // out1,out2 = in1 * coef1 - in2 * coef2
      90 // out3,out4 = in1 * coef2 + in2 * coef1
      91 // out are 4 x .4s registers, in are 2 x .8h registers
         // No rounding or narrowing here: results stay at full 32-bit width so
         // callers can keep accumulating (see dbutterfly_n below).
      92 .macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
      93        smull           \out1\().4s, \in1\().4h, \coef1
      94        smull2          \out2\().4s, \in1\().8h, \coef1
      95        smull           \out3\().4s, \in1\().4h, \coef2
      96        smull2          \out4\().4s, \in1\().8h, \coef2
      97        smlsl           \out1\().4s, \in2\().4h, \coef2
      98        smlsl2          \out2\().4s, \in2\().8h, \coef2
      99        smlal           \out3\().4s, \in2\().4h, \coef1
     100        smlal2          \out4\().4s, \in2\().8h, \coef1
     101 .endm
    102 
     103 // inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
     104 // inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
     105 // inout are 2 x .8h registers
         // If neg=1, inout2's accumulated products are negated before rounding,
         // i.e. inout2 = (-(inout1 * coef2 + inout2 * coef1) + (1 << 13)) >> 14.
     106 .macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
     107        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
     108 .if \neg > 0
     109        neg             \tmp3\().4s, \tmp3\().4s
     110        neg             \tmp4\().4s, \tmp4\().4s
     111 .endif
     112        rshrn           \inout1\().4h, \tmp1\().4s,  #14
     113        rshrn2          \inout1\().8h, \tmp2\().4s,  #14
     114        rshrn           \inout2\().4h, \tmp3\().4s,  #14
     115        rshrn2          \inout2\().8h, \tmp4\().4s,  #14
     116 .endm
    117 
     118 // Same as dmbutterfly above, but treating the input in inout2 as zero
         // inout1 = (inout1 * coef1 + (1 << 13)) >> 14
         // inout2 = (inout1 * coef2 + (1 << 13)) >> 14
     119 .macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
     120        smull           \tmp1\().4s, \inout1\().4h, \coef1
     121        smull2          \tmp2\().4s, \inout1\().8h, \coef1
     122        smull           \tmp3\().4s, \inout1\().4h, \coef2
     123        smull2          \tmp4\().4s, \inout1\().8h, \coef2
     124        rshrn           \inout1\().4h, \tmp1\().4s, #14
     125        rshrn2          \inout1\().8h, \tmp2\().4s, #14
     126        rshrn           \inout2\().4h, \tmp3\().4s, #14
     127        rshrn2          \inout2\().8h, \tmp4\().4s, #14
     128 .endm
    129 
     130 // Same as dmbutterfly above, but treating the input in inout1 as zero
         // inout1 = (-inout2 * coef2 + (1 << 13)) >> 14
         // inout2 = ( inout2 * coef1 + (1 << 13)) >> 14
         // inout2 is narrowed first since its products are ready before the
         // negation of tmp1/tmp2 completes.
     131 .macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
     132        smull           \tmp1\().4s, \inout2\().4h, \coef2
     133        smull2          \tmp2\().4s, \inout2\().8h, \coef2
     134        smull           \tmp3\().4s, \inout2\().4h, \coef1
     135        smull2          \tmp4\().4s, \inout2\().8h, \coef1
     136        neg             \tmp1\().4s, \tmp1\().4s
     137        neg             \tmp2\().4s, \tmp2\().4s
     138        rshrn           \inout2\().4h, \tmp3\().4s, #14
     139        rshrn2          \inout2\().8h, \tmp4\().4s, #14
     140        rshrn           \inout1\().4h, \tmp1\().4s, #14
     141        rshrn2          \inout1\().8h, \tmp2\().4s, #14
     142 .endm
    143 
         // Double-width multiply: out1,out2 (a .4s pair) = in (.8h) * coef.
     144 .macro dsmull_h out1, out2, in, coef
     145        smull           \out1\().4s, \in\().4h, \coef
     146        smull2          \out2\().4s, \in\().8h, \coef
     147 .endm
     148 
         // Round and narrow the .4s pair in1,in2 back into one .8h register.
     149 .macro drshrn_h out, in1, in2, shift
     150        rshrn           \out\().4h, \in1\().4s, \shift
     151        rshrn2          \out\().8h, \in2\().4s, \shift
     152 .endm
    153 
    154 
     155 // out1 = in1 + in2
     156 // out2 = in1 - in2
         // All operands are .8h registers; out1 may alias in1.
     157 .macro butterfly_8h out1, out2, in1, in2
     158        add             \out1\().8h, \in1\().8h, \in2\().8h
     159        sub             \out2\().8h, \in1\().8h, \in2\().8h
     160 .endm
     161 
     162 // out1 = in1 - in2
     163 // out2 = in1 + in2
         // Reversed variant of butterfly_8h: the difference is written first.
     164 .macro butterfly_8h_r out1, out2, in1, in2
     165        sub             \out1\().8h, \in1\().8h, \in2\().8h
     166        add             \out2\().8h, \in1\().8h, \in2\().8h
     167 .endm
    168 
     169 // out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
     170 // out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
     171 // out are 2 x .8h registers, in are 4 x .4s registers
         // Combines two double-width (.4s pair) values produced by
         // dmbutterfly_l / dsmull_h and narrows the results back to .8h.
     172 .macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
     173        add             \tmp1\().4s, \in1\().4s, \in3\().4s
     174        add             \tmp2\().4s, \in2\().4s, \in4\().4s
     175        sub             \tmp3\().4s, \in1\().4s, \in3\().4s
     176        sub             \tmp4\().4s, \in2\().4s, \in4\().4s
     177        rshrn           \out1\().4h, \tmp1\().4s,  #14
     178        rshrn2          \out1\().8h, \tmp2\().4s,  #14
     179        rshrn           \out2\().4h, \tmp3\().4s,  #14
     180        rshrn2          \out2\().8h, \tmp4\().4s,  #14
     181 .endm
    182 
         // In-place 4-point inverse Walsh-Hadamard transform of the .4h
         // registers c0-c3 (one transform pass); clobbers v16-v17.
     183 .macro iwht4 c0, c1, c2, c3
     184        add             \c0\().4h, \c0\().4h, \c1\().4h
     185        sub             v17.4h,    \c2\().4h, \c3\().4h
     186        sub             v16.4h,    \c0\().4h, v17.4h
     187        sshr            v16.4h,    v16.4h,    #1
     188        sub             \c2\().4h, v16.4h,    \c1\().4h
     189        sub             \c1\().4h, v16.4h,    \c3\().4h
     190        add             \c3\().4h, v17.4h,    \c2\().4h
     191        sub             \c0\().4h, \c0\().4h, \c1\().4h
     192 .endm
    193 
         // In-place 4-point IDCT of the .4h registers c0-c3 (one pass).
         // Coefficients are taken from v0.h[0-3]; clobbers v16-v20 and v22.
     194 .macro idct4 c0, c1, c2, c3
     195        smull           v22.4s,    \c1\().4h, v0.h[3]
     196        smull           v20.4s,    \c1\().4h, v0.h[2]
     197        add             v16.4h,    \c0\().4h, \c2\().4h
     198        sub             v17.4h,    \c0\().4h, \c2\().4h
     199        smlal           v22.4s,    \c3\().4h, v0.h[2]
     200        smull           v18.4s,    v16.4h,    v0.h[0]
     201        smull           v19.4s,    v17.4h,    v0.h[0]
     202        smlsl           v20.4s,    \c3\().4h, v0.h[3]
     203        rshrn           v22.4h,    v22.4s,    #14
     204        rshrn           v18.4h,    v18.4s,    #14
     205        rshrn           v19.4h,    v19.4s,    #14
     206        rshrn           v20.4h,    v20.4s,    #14
     207        add             \c0\().4h, v18.4h,    v22.4h
     208        sub             \c3\().4h, v18.4h,    v22.4h
     209        add             \c1\().4h, v19.4h,    v20.4h
     210        sub             \c2\().4h, v19.4h,    v20.4h
     211 .endm
    212 
         // In-place 4-point inverse ADST of the .4h registers c0-c3 (one pass).
         // Coefficients are taken from v0.h[4-7] (the iadst4_coeffs half of the
         // table); clobbers v16-v21.
     213 .macro iadst4 c0, c1, c2, c3
     214        smull           v16.4s,    \c0\().4h, v0.h[4]
     215        smlal           v16.4s,    \c2\().4h, v0.h[5]
     216        smlal           v16.4s,    \c3\().4h, v0.h[6]
     217        smull           v17.4s,    \c0\().4h, v0.h[6]
     218        smlsl           v17.4s,    \c2\().4h, v0.h[4]
     219        sub             \c0\().4h, \c0\().4h, \c2\().4h
     220        smlsl           v17.4s,    \c3\().4h, v0.h[5]
     221        add             \c0\().4h, \c0\().4h, \c3\().4h
     222        smull           v19.4s,    \c1\().4h, v0.h[7]
     223        smull           v18.4s,    \c0\().4h, v0.h[7]
     224        add             v20.4s,    v16.4s,    v19.4s
     225        add             v21.4s,    v17.4s,    v19.4s
     226        rshrn           \c0\().4h, v20.4s,    #14
     227        add             v16.4s,    v16.4s,    v17.4s
     228        rshrn           \c1\().4h, v21.4s,    #14
     229        sub             v16.4s,    v16.4s,    v19.4s
     230        rshrn           \c2\().4h, v18.4s,    #14
     231        rshrn           \c3\().4h, v16.4s,    #14
     232 .endm
    233 
    234 // The public functions in this file have got the following signature:
    235 // void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
    236 
     237 .macro itxfm_func4x4 txfm1, txfm2
     238 function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
         // void (*)(uint8_t *dst /* x0 */, ptrdiff_t stride /* x1 */,
         //          int16_t *block /* x2 */, int eob /* w3 */)
         // txfm1 is applied to columns, txfm2 to rows (after the transpose).
     239 .ifc \txfm1,\txfm2
         // Same transform in both directions: only the half of the coefficient
         // table that this transform uses needs to be loaded.
     240 .ifc \txfm1,idct
     241        movrel          x4,  itxfm4_coeffs
     242        ld1             {v0.4h}, [x4]
     243 .endif
     244 .ifc \txfm1,iadst
     245        movrel          x4,  iadst4_coeffs
     246        ld1             {v0.d}[1], [x4]
     247 .endif
     248 .else
     249        movrel          x4,  itxfm4_coeffs
     250        ld1             {v0.8h}, [x4]
     251 .endif
     252 
         // v31 = zeros, stored back over the coefficients as they are consumed.
     253        movi            v31.8h, #0
     254 .ifc \txfm1\()_\txfm2,idct_idct
     255        cmp             w3,  #1
     256        b.ne            1f
     257        // DC-only for idct/idct
     258        ld1             {v2.h}[0], [x2]
         // Apply the DC multiplier once per transform pass, rounding each time.
     259        smull           v2.4s,  v2.4h, v0.h[0]
     260        rshrn           v2.4h,  v2.4s, #14
     261        smull           v2.4s,  v2.4h, v0.h[0]
     262        rshrn           v2.4h,  v2.4s, #14
     263        st1             {v31.h}[0], [x2]
     264        dup             v4.4h,  v2.h[0]
     265        mov             v5.16b, v4.16b
     266        mov             v6.16b, v4.16b
     267        mov             v7.16b, v4.16b
     268        b               2f
     269 .endif
     270 
     271 1:
     272        ld1             {v4.4h,v5.4h,v6.4h,v7.4h},  [x2]
     273        st1             {v31.8h}, [x2], #16
     274 
         // iwht input is scaled down by 2 bits here instead of the rounding
         // ">> 4" applied after the second pass (see the .ifnc below).
     275 .ifc \txfm1,iwht
     276        sshr            v4.4h,  v4.4h,  #2
     277        sshr            v5.4h,  v5.4h,  #2
     278        sshr            v6.4h,  v6.4h,  #2
     279        sshr            v7.4h,  v7.4h,  #2
     280 .endif
     281 
     282        \txfm1\()4      v4,  v5,  v6,  v7
     283 
     284        st1             {v31.8h}, [x2], #16
     285        // Transpose 4x4 with 16 bit elements
     286        transpose_4x4H  v4,  v5,  v6,  v7,  v16, v17, v18, v19
     287 
     288        \txfm2\()4      v4,  v5,  v6,  v7
     289 2:
         // Round the output and add it into the 4x4 destination block,
         // saturating back to unsigned 8-bit pixels.
     290        ld1             {v0.s}[0],   [x0], x1
     291        ld1             {v1.s}[0],   [x0], x1
     292 .ifnc \txfm1,iwht
     293        srshr           v4.4h,  v4.4h,  #4
     294        srshr           v5.4h,  v5.4h,  #4
     295        srshr           v6.4h,  v6.4h,  #4
     296        srshr           v7.4h,  v7.4h,  #4
     297 .endif
     298        uaddw           v4.8h,  v4.8h,  v0.8b
     299        uaddw           v5.8h,  v5.8h,  v1.8b
     300        ld1             {v2.s}[0],   [x0], x1
     301        ld1             {v3.s}[0],   [x0], x1
     302        sqxtun          v0.8b,  v4.8h
     303        sqxtun          v1.8b,  v5.8h
         // Rewind x0 to the top of the block for the stores.
     304        sub             x0,  x0,  x1, lsl #2
     305 
     306        uaddw           v6.8h,  v6.8h,  v2.8b
     307        uaddw           v7.8h,  v7.8h,  v3.8b
     308        st1             {v0.s}[0],  [x0], x1
     309        sqxtun          v2.8b,  v6.8h
     310        sqxtun          v3.8b,  v7.8h
     311 
     312        st1             {v1.s}[0],  [x0], x1
     313        st1             {v2.s}[0],  [x0], x1
     314        st1             {v3.s}[0],  [x0], x1
     315 
     316        ret
     317 endfunc
     318 .endm
    319 
         // Instantiate ff_vp9_<txfm1>_<txfm2>_4x4_add_neon for each
         // column/row transform combination used by VP9.
     320 itxfm_func4x4 idct,  idct
     321 itxfm_func4x4 iadst, idct
     322 itxfm_func4x4 idct,  iadst
     323 itxfm_func4x4 iadst, iadst
     324 itxfm_func4x4 iwht,  iwht
    325 
    326 
         // In-place 8-point IDCT of v16-v23 (.8h, one pass); coefficients in
         // v0, clobbers v2-v7 and v24-v31.
     327 .macro idct8
     328        dmbutterfly0    v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
     329        dmbutterfly     v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
     330        dmbutterfly     v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
     331        dmbutterfly     v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
     332 
     333        butterfly_8h    v24, v25, v16, v22 // v24 = t0, v25 = t3
     334        butterfly_8h    v28, v29, v17, v21 // v28 = t4, v29 = t5a
     335        butterfly_8h    v30, v31, v23, v19 // v30 = t7, v31 = t6a
     336        butterfly_8h    v26, v27, v20, v18 // v26 = t1, v27 = t2
     337 
     338        dmbutterfly0    v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5
     339 
     340        butterfly_8h    v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
     341        butterfly_8h    v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
     342        butterfly_8h    v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
     343        butterfly_8h    v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
     344 .endm
    345 
         // In-place 8-point inverse ADST of v16-v23 (.8h, one pass);
         // coefficients in v0 (idct) and v1 (iadst8), clobbers v2-v7 and
         // v24-v31. Negated intermediate outputs are flipped with neg as the
         // last step of each stage.
     346 .macro iadst8
     347        dmbutterfly_l   v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0]   // v24,v25 = t1a, v26,v27 = t0a
     348        dmbutterfly_l   v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2]   // v28,v29 = t3a, v30,v31 = t2a
     349        dmbutterfly_l   v2,  v3,  v4,  v5,  v19, v20, v1.h[5], v1.h[4]   // v2,v3   = t5a, v4,v5   = t4a
     350        dmbutterfly_l   v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6]   // v16,v18 = t7a, v21,v23 = t6a
     351 
     352        dbutterfly_n    v4,  v5,  v26, v27, v4,  v5,  v6,  v7, v26, v27  // v4  = t0, v5  = t4
     353        dbutterfly_n    v2,  v3,  v24, v25, v2,  v3,  v6,  v7, v26, v27  // v2  = t1, v3  = t5
     354        dbutterfly_n    v24, v25, v30, v31, v21, v23, v6,  v7, v26, v27  // v24 = t2, v25 = t6
     355        dbutterfly_n    v30, v31, v28, v29, v16, v18, v6,  v7, v26, v27  // v30 = t3, v31 = t7
     356 
     357        butterfly_8h    v16, v6,  v4, v24 // v16 = out[0],  v6 = t2
     358        butterfly_8h    v23, v7,  v2, v30 // v23 = -out[7], v7 = t3
     359        neg             v23.8h,   v23.8h  // v23 = out[7]
     360 
     361        dmbutterfly0    v19, v20, v6, v7, v24, v26, v27, v28, v29, v30   // v19 = -out[3], v20 = out[4]
     362        neg             v19.8h,   v19.8h  // v19 = out[3]
     363 
     364        dmbutterfly_l   v26, v27, v28, v29, v5,  v3,  v0.h[2], v0.h[3]   // v26,v27 = t5a, v28,v29 = t4a
     365        dmbutterfly_l   v2,  v3,  v4,  v5,  v31, v25, v0.h[3], v0.h[2]   // v2,v3   = t6a, v4,v5   = t7a
     366 
     367        dbutterfly_n    v17, v30, v28, v29, v2,  v3,  v6,  v7,  v24, v25 // v17 = -out[1], v30 = t6
     368        dbutterfly_n    v22, v31, v26, v27, v4,  v5,  v6,  v7,  v24, v25 // v22 = out[6],  v31 = t7
     369        neg             v17.8h,   v17.8h  // v17 = out[1]
     370 
     371        dmbutterfly0    v18, v21, v30, v31, v2,  v3,  v4,  v5,  v6,  v7  // v18 = out[2], v21 = -out[5]
     372        neg             v21.8h,   v21.8h  // v21 = out[5]
     373 .endm
    374 
    375 
     376 .macro itxfm_func8x8 txfm1, txfm2
     377 function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
         // void (*)(uint8_t *dst /* x0 */, ptrdiff_t stride /* x1 */,
         //          int16_t *block /* x2 */, int eob /* w3 */)
         // txfm1 is applied to columns, txfm2 to rows (after the transpose).
     378        // The iadst also uses a few coefficients from
     379        // idct, so those always need to be loaded.
     380 .ifc \txfm1\()_\txfm2,idct_idct
     381        movrel          x4,  idct_coeffs
     382 .else
         // iadst8_coeffs is laid out directly before idct_coeffs, so x4 falls
         // through to the idct table after loading v1.
     383        movrel          x4,  iadst8_coeffs
     384        ld1             {v1.8h}, [x4], #16
     385 .endif
     386        ld1             {v0.8h}, [x4]
     387 
         // v2-v5 = zeros for clearing the coefficient buffer below.
     388        movi            v2.8h, #0
     389        movi            v3.8h, #0
     390        movi            v4.8h, #0
     391        movi            v5.8h, #0
     392 
     393 .ifc \txfm1\()_\txfm2,idct_idct
     394        cmp             w3,  #1
     395        b.ne            1f
     396        // DC-only for idct/idct
     397        ld1             {v2.h}[0],  [x2]
         // Apply the DC multiplier once per transform pass, rounding each time.
     398        smull           v2.4s,  v2.4h, v0.h[0]
     399        rshrn           v2.4h,  v2.4s, #14
     400        smull           v2.4s,  v2.4h, v0.h[0]
     401        rshrn           v2.4h,  v2.4s, #14
     402        st1             {v3.h}[0],  [x2]
     403        dup             v16.8h,  v2.h[0]
     404        mov             v17.16b, v16.16b
     405        mov             v18.16b, v16.16b
     406        mov             v19.16b, v16.16b
     407        mov             v20.16b, v16.16b
     408        mov             v21.16b, v16.16b
     409        mov             v22.16b, v16.16b
     410        mov             v23.16b, v16.16b
     411        b               2f
     412 .endif
     413 1:
     414        ld1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x2], #64
     415        ld1             {v20.8h,v21.8h,v22.8h,v23.8h},  [x2], #64
         // Rewind and zero all 128 bytes of the consumed coefficients.
     416        sub             x2,  x2,  #128
     417        st1             {v2.8h,v3.8h,v4.8h,v5.8h},      [x2], #64
     418        st1             {v2.8h,v3.8h,v4.8h,v5.8h},      [x2], #64
     419 
     420        \txfm1\()8
     421 
     422        // Transpose 8x8 with 16 bit elements
     423        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
     424 
     425        \txfm2\()8
     426 2:
     427        mov             x3,  x0
     428        // Add into the destination
         // Loads, rounding shifts, widening adds and saturating narrows are
         // interleaved to keep the load and ALU pipelines busy.
     429        ld1             {v0.8b},  [x0], x1
     430        srshr           v16.8h, v16.8h, #5
     431        ld1             {v1.8b},  [x0], x1
     432        srshr           v17.8h, v17.8h, #5
     433        ld1             {v2.8b},  [x0], x1
     434        srshr           v18.8h, v18.8h, #5
     435        uaddw           v16.8h, v16.8h, v0.8b
     436        ld1             {v3.8b},  [x0], x1
     437        srshr           v19.8h, v19.8h, #5
     438        uaddw           v17.8h, v17.8h, v1.8b
     439        ld1             {v4.8b},  [x0], x1
     440        srshr           v20.8h, v20.8h, #5
     441        uaddw           v18.8h, v18.8h, v2.8b
     442        sqxtun          v0.8b,  v16.8h
     443        ld1             {v5.8b},  [x0], x1
     444        srshr           v21.8h, v21.8h, #5
     445        uaddw           v19.8h, v19.8h, v3.8b
     446        sqxtun          v1.8b,  v17.8h
     447        ld1             {v6.8b},  [x0], x1
     448        srshr           v22.8h, v22.8h, #5
     449        uaddw           v20.8h, v20.8h, v4.8b
     450        sqxtun          v2.8b,  v18.8h
     451        ld1             {v7.8b},  [x0], x1
     452        srshr           v23.8h, v23.8h, #5
     453        uaddw           v21.8h, v21.8h, v5.8b
     454        sqxtun          v3.8b,  v19.8h
     455 
     456        st1             {v0.8b},  [x3], x1
     457        uaddw           v22.8h, v22.8h, v6.8b
     458        st1             {v1.8b},  [x3], x1
     459        sqxtun          v4.8b,  v20.8h
     460        st1             {v2.8b},  [x3], x1
     461        uaddw           v23.8h, v23.8h, v7.8b
     462        st1             {v3.8b},  [x3], x1
     463        sqxtun          v5.8b,  v21.8h
     464        st1             {v4.8b},  [x3], x1
     465        sqxtun          v6.8b,  v22.8h
     466        st1             {v5.8b},  [x3], x1
     467        sqxtun          v7.8b,  v23.8h
     468 
     469        st1             {v6.8b},  [x3], x1
     470        st1             {v7.8b},  [x3], x1
     471 
     472        ret
     473 endfunc
     474 .endm
    475 
         // Instantiate ff_vp9_<txfm1>_<txfm2>_8x8_add_neon for each
         // column/row transform combination used by VP9 (no iwht at 8x8).
     476 itxfm_func8x8 idct,  idct
     477 itxfm_func8x8 iadst, idct
     478 itxfm_func8x8 idct,  iadst
     479 itxfm_func8x8 iadst, iadst
    480 
    481 
         // Fast path for a 16x16 idct/idct block where only the DC coefficient
         // is nonzero: compute the single output value and add it to all
         // 16x16 destination pixels. x0 = dst, x1 = stride, x2 = block.
     482 function idct16x16_dc_add_neon
     483        movrel          x4,  idct_coeffs
     484        ld1             {v0.4h}, [x4]
     485 
     486        movi            v1.4h,  #0
     487 
     488        ld1             {v2.h}[0], [x2]
         // Apply the DC multiplier once per transform pass, rounding each time.
     489        smull           v2.4s,  v2.4h,  v0.h[0]
     490        rshrn           v2.4h,  v2.4s,  #14
     491        smull           v2.4s,  v2.4h,  v0.h[0]
     492        rshrn           v2.4h,  v2.4s,  #14
     493        dup             v2.8h,  v2.h[0]
         // Clear the consumed DC coefficient.
     494        st1             {v1.h}[0], [x2]
     495 
         // Final output rounding: (dc + 32) >> 6.
     496        srshr           v2.8h,  v2.8h,  #6
     497 
     498        mov             x3,  x0
     499        mov             x4,  #16
     500 1:
     501        // Loop to add the constant from v2 into all 16x16 outputs
         // Two rows per iteration; x4 counts remaining rows.
     502        subs            x4,  x4,  #2
     503        ld1             {v3.16b},  [x0], x1
     504        ld1             {v4.16b},  [x0], x1
     505        uaddw           v16.8h, v2.8h,  v3.8b
     506        uaddw2          v17.8h, v2.8h,  v3.16b
     507        uaddw           v18.8h, v2.8h,  v4.8b
     508        uaddw2          v19.8h, v2.8h,  v4.16b
     509        sqxtun          v3.8b,  v16.8h
     510        sqxtun2         v3.16b, v17.8h
     511        sqxtun          v4.8b,  v18.8h
     512        sqxtun2         v4.16b, v19.8h
     513        st1             {v3.16b},  [x3], x1
     514        st1             {v4.16b},  [x3], x1
     515        b.ne            1b
     516 
     517        ret
     518 endfunc
    519 
         // Shared tail of the idct16* functions below. Note that the macro
         // expands to code ending in "ret", so it must be expanded as the very
         // last thing in a function body.
     520 .macro idct16_end
     521        butterfly_8h    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
     522        butterfly_8h    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
     523        butterfly_8h    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
     524        butterfly_8h    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
     525        butterfly_8h    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
     526        butterfly_8h    v24, v21, v23, v21               // v24 = t9,   v21 = t10
     527        butterfly_8h    v23, v27, v25, v27               // v23 = t14,  v27 = t13
     528        butterfly_8h    v25, v29, v29, v17               // v25 = t15a, v29 = t12a
     529 
     530        dmbutterfly0    v2,  v3,  v27, v21, v2,  v3,  v16, v17, v30, v31 // v2  = t13a, v3  = t10a
     531        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11
     532 
     533        butterfly_8h    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
     534        butterfly_8h    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
     535        butterfly_8h_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
     536        butterfly_8h    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
     537        butterfly_8h    v18, v29, v4,  v2                // v18 = out[2], v29 = out[13]
     538        butterfly_8h    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
     539        butterfly_8h    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
     540        butterfly_8h    v21, v26, v26, v3                // v21 = out[5], v26 = out[10]
     541        ret
     542 .endm
    543 
         // Full 16-point IDCT of v16-v31 (.8h, one pass). Expects idct_coeffs
         // loaded in v0/v1; clobbers v2-v7. Returns via idct16_end's ret.
     544 function idct16
     545        dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
     546        dmbutterfly     v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
     547        dmbutterfly     v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
     548        dmbutterfly     v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
     549        dmbutterfly     v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
     550        dmbutterfly     v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
     551        dmbutterfly     v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
     552        dmbutterfly     v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
     553 
     554        butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
     555        butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
     556        butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
     557        butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
     558        butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
     559        butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
     560        butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
     561        butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14
     562 
     563        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
     564        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
     565        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
     566        idct16_end
     567 endfunc
    568 
         // As idct16, but the first stage uses the _h butterfly variants that
         // treat one input of each pair as zero — for blocks where only part
         // of the input coefficients are nonzero (presumably the first half;
         // confirm with the callers of idct16_half).
     569 function idct16_half
     570        dmbutterfly0_h  v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
     571        dmbutterfly_h1  v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
     572        dmbutterfly_h1  v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
     573        dmbutterfly_h2  v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
     574        dmbutterfly_h1  v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
     575        dmbutterfly_h2  v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
     576        dmbutterfly_h1  v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
     577        dmbutterfly_h2  v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
     578 
     579        butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
     580        butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
     581        butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
     582        butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
     583        butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
     584        butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
     585        butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
     586        butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14
     587 
     588        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
     589        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
     590        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
     591        idct16_end
     592 endfunc
    593 
         // Minimal-input variant of idct16: the first stages are reduced to
         // plain multiplies (dsmull_h/drshrn_h), consistent with most input
         // coefficients being zero (presumably only the first quarter is
         // nonzero — confirm with the callers of idct16_quarter).
     594 function idct16_quarter
     595        dsmull_h        v24, v25, v19, v1.h[7]
     596        dsmull_h        v4,  v5,  v17, v1.h[0]
     597        dsmull_h        v7,  v6,  v18, v0.h[5]
     598        dsmull_h        v30, v31, v18, v0.h[4]
         // v24/v25 gets the negated coefficient (cf. the neg=1 path elsewhere).
     599        neg             v24.4s,  v24.4s
     600        neg             v25.4s,  v25.4s
     601        dsmull_h        v29, v28, v17, v1.h[1]
     602        dsmull_h        v26, v27, v19, v1.h[6]
     603        dsmull_h        v22, v23, v16, v0.h[0]
     604        drshrn_h        v24, v24, v25, #14
     605        drshrn_h        v16, v4,  v5,  #14
     606        drshrn_h        v7,  v7,  v6,  #14
     607        drshrn_h        v6,  v30, v31, #14
     608        drshrn_h        v29, v29, v28, #14
     609        drshrn_h        v17, v26, v27, #14
     610        drshrn_h        v28, v22, v23, #14
     611 
     612        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3]
     613        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3]
     614        neg             v22.4s,  v22.4s
     615        neg             v23.4s,  v23.4s
     616        drshrn_h        v27, v20, v21, #14
     617        drshrn_h        v21, v22, v23, #14
     618        drshrn_h        v23, v18, v19, #14
     619        drshrn_h        v25, v30, v31, #14
         // Replicate v28 into the registers idct16_end expects for t0-t3
         // (with in2 == 0, the sum and difference of each pair are equal).
     620        mov             v4.16b,  v28.16b
     621        mov             v5.16b,  v28.16b
     622        dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
     623        mov             v20.16b, v28.16b
     624        idct16_end
     625 endfunc
    626 
// 16-point inverse ADST of an 8-wide slice.
// In:  v16-v31 = input rows; x11 -> iadst16_coeffs, x10 -> idct_coeffs.
// Out: v16-v31 = output rows.
// Clobbers v2-v15; the 16x16 entry point saves d8-d15 around the
// transforms for this reason.
function iadst16
        ld1             {v0.8h,v1.8h}, [x11]

        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.h[1], v0.h[0]   // v6,v7   = t1,   v4,v5   = t0
        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v0.h[5], v0.h[4]   // v10,v11 = t9,   v8,v9   = t8
        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2]   // v14,v15 = t3,   v12,v13 = t2
        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a

        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v0.h[7], v0.h[6]   // v6,v7   = t11,  v4,v5   = t10
        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v1.h[1], v1.h[0]   // v10,v11 = t5,   v8,v9   = t4
        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a

        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4]   // v14,v15 = t13,  v12,v13 = t12
        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v1.h[3], v1.h[2]   // v6,v7   = t7,   v4,v5   = t6
        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a

        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v1.h[7], v1.h[6]   // v10,v11 = t15,  v8,v9   = t14
        // Reload idct_coeffs for the second-stage rotations.
        ld1             {v0.8h}, [x10]
        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5]   // v14,v15 = t9,   v12,v13 = t8
        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a

        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v0.h[5], v0.h[4]   // v4,v5   = t12,  v6,v7   = t13
        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v0.h[6], v0.h[7]   // v10,v11 = t11,  v8,v9   = t10
        butterfly_8h_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a

        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6]   // v12,v13 = t14,  v14,v15 = t15
        butterfly_8h_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a

        butterfly_8h_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
        butterfly_8h_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3

        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.h[2], v0.h[3]   // v10,v11 = t13,  v8,v9   = t12
        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2]   // v12,v13 = t14,  v14,v15 = t15

        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
        neg             v29.8h, v29.8h                   // v29 = out[13]

        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.h[2], v0.h[3]   // v10,v11 = t5a,  v8,v9   = t4a
        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.h[3], v0.h[2]   // v12,v13 = t6a,  v14,v15 = t7a

        butterfly_8h    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
        butterfly_8h    v3,  v7,  v23, v21               // v3 =-out[1], v7 = t10

        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
        neg             v19.8h, v19.8h                   // v19 = out[3]
        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7

        butterfly_8h    v5,  v8,  v20, v22               // v5 =-out[15],v8 = t3a
        butterfly_8h    v4,  v9,  v24, v26               // v4 = out[14],v9 = t11

        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]

        // Fix up the outputs that were computed negated, and move the
        // ones that landed in scratch registers into v16-v31.
        neg             v31.8h,  v5.8h                    // v31 = out[15]
        neg             v17.8h,  v3.8h                    // v17 = out[1]

        mov             v16.16b, v2.16b
        mov             v30.16b, v4.16b
        ret
endfunc
    698 
    699 // Helper macros; we can't use these expressions directly within
    700 // e.g. .irp due to the extra concatenation \(). Therefore wrap
    701 // them in macros to allow using .irp below.
    702 .macro load i, src, inc
    703        ld1             {v\i\().8h},  [\src], \inc
    704 .endm
    705 .macro store i, dst, inc
    706        st1             {v\i\().8h},  [\dst], \inc
    707 .endm
    708 .macro movi_v i, size, imm
    709        movi            v\i\()\size,  \imm
    710 .endm
    711 .macro load_clear i, src, inc
    712        ld1             {v\i\().8h}, [\src]
    713        st1             {v2.8h},  [\src], \inc
    714 .endm
    715 
// Add eight rows of residuals (coef0-coef7, each rounded right-shifted
// by 6) onto eight rows of destination pixels, saturating back to
// unsigned 8 bit (uaddw + sqxtun). Rows are accessed through the two
// interleaved pointers x0 and x3, each stepping by the doubled stride
// x1; tmp1/tmp2 are 8 byte scratch registers for the last two rows.
// Loads, arithmetic and stores are deliberately interleaved for
// instruction scheduling -- do not reorder casually.
.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
        srshr           \coef0, \coef0, #6
        ld1             {v2.8b},  [x0], x1
        srshr           \coef1, \coef1, #6
        ld1             {v3.8b},  [x3], x1
        srshr           \coef2, \coef2, #6
        ld1             {v4.8b},  [x0], x1
        srshr           \coef3, \coef3, #6
        uaddw           \coef0, \coef0, v2.8b
        ld1             {v5.8b},  [x3], x1
        uaddw           \coef1, \coef1, v3.8b
        srshr           \coef4, \coef4, #6
        ld1             {v6.8b},  [x0], x1
        srshr           \coef5, \coef5, #6
        ld1             {v7.8b},  [x3], x1
        sqxtun          v2.8b,  \coef0
        srshr           \coef6, \coef6, #6
        sqxtun          v3.8b,  \coef1
        srshr           \coef7, \coef7, #6
        uaddw           \coef2, \coef2, v4.8b
        ld1             {\tmp1},  [x0], x1
        uaddw           \coef3, \coef3, v5.8b
        ld1             {\tmp2},  [x3], x1
        sqxtun          v4.8b,  \coef2
        // Rewind both pointers to the first loaded row before storing
        // the summed pixels back.
        sub             x0,  x0,  x1, lsl #2
        sub             x3,  x3,  x1, lsl #2
        sqxtun          v5.8b,  \coef3
        uaddw           \coef4, \coef4, v6.8b
        st1             {v2.8b},  [x0], x1
        uaddw           \coef5, \coef5, v7.8b
        st1             {v3.8b},  [x3], x1
        sqxtun          v6.8b,  \coef4
        st1             {v4.8b},  [x0], x1
        sqxtun          v7.8b,  \coef5
        st1             {v5.8b},  [x3], x1
        uaddw           \coef6, \coef6, \tmp1
        st1             {v6.8b},  [x0], x1
        uaddw           \coef7, \coef7, \tmp2
        st1             {v7.8b},  [x3], x1
        sqxtun          \tmp1,  \coef6
        sqxtun          \tmp2,  \coef7
        st1             {\tmp1},  [x0], x1
        st1             {\tmp2},  [x3], x1
.endm
    760 
    761 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
    762 // transpose into a horizontal 16x8 slice and store.
    763 // x0 = dst (temp buffer)
    764 // x1 = slice offset
    765 // x2 = src
    766 // x9 = input stride
// Instantiate the two one-dimensional pass functions for the given
// transform (\txfm = idct or iadst).
.macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_8x16_pass1_neon
        mov             x14, x30

        // v2 = 0; load_clear stores it back to clear the consumed
        // input coefficients in memory.
        movi            v2.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i,  x2,  x9
.endr

        bl              \txfm\()16

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
        cmp             x1,  #8
        b.eq            1f
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
        store           \i,  x0,  #16
.endr
        ret             x14
1:
        // Special case: For the last input column (x1 == 8),
        // which would be stored as the last row in the temp buffer,
        // don't store the first 8x8 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // last 8x8 block).
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        add             x0,  x0,  #16
        store           \i,  x0,  #16
.endr
        mov             v24.16b, v16.16b
        mov             v25.16b, v17.16b
        mov             v26.16b, v18.16b
        mov             v27.16b, v19.16b
        mov             v28.16b, v20.16b
        mov             v29.16b, v21.16b
        mov             v30.16b, v22.16b
        mov             v31.16b, v23.16b
        ret             x14
endfunc

// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// load the destination pixels (from a similar 8x16 slice), add and store back.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x3 = slice offset
// x9 = temp buffer stride
function \txfm\()16_1d_8x16_pass2_neon
        mov             x14, x30
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i,  x2,  x9
.endr
        // For the first slice (x3 == 0), pass 1 left the second half
        // (v24-v31) in registers; only load it for later slices.
        cbz             x3,  1f
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i,  x2,  x9
.endr
1:

        // x3 = second destination row pointer; x1 doubled so x0/x3
        // cover alternating rows inside load_add_store.
        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              \txfm\()16

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

        ret             x14
endfunc
.endm
    840 
// Instantiate the 8x16 pass1/pass2 helper functions for both transforms.
itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst
    843 
// Emit the exported 16x16 add function for one row/column transform
// combination.
// Registers on entry: x0 = dst, x1 = dst stride, x2 = coefficients,
// w3 = nonzero-coefficient count (presumably the eob -- confirm
// against the C prototype); the dc/quarter/half dispatch below relies
// on that interpretation.
.macro itxfm_func16x16 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #1
        b.eq            idct16x16_dc_add_neon
.endif
        mov             x15, x30
        // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
.ifnc \txfm1\()_\txfm2,idct_idct
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d14, d15, [sp, #0x30]
        stp             d12, d13, [sp, #0x20]
        stp             d10, d11, [sp, #0x10]
.endif

        // 512 byte temp buffer for the 16x16 int16 intermediate
        // between the two passes.
        sub             sp,  sp,  #512

        mov             x4,  x0
        mov             x5,  x1
        mov             x6,  x2

        movrel          x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
        movrel          x11, iadst16_coeffs
.endif
.ifc \txfm1,idct
        ld1             {v0.8h,v1.8h}, [x10]
.endif
        // x9 = input row stride in bytes (16 coefficients * 2).
        mov             x9,  #32

.ifc \txfm1\()_\txfm2,idct_idct
        // Dispatch small-w3 cases to the reduced idct paths.
        cmp             w3,  #10
        b.le            idct16x16_quarter_add_neon
        cmp             w3,  #38
        b.le            idct16x16_half_add_neon
.endif

.irp i, 0, 8
        add             x0,  sp,  #(\i*32)
.ifc \txfm1\()_\txfm2,idct_idct
.if \i == 8
        cmp             w3,  #38
        b.le            1f
.endif
.endif
        mov             x1,  #\i
        add             x2,  x6,  #(\i*2)
        bl              \txfm1\()16_1d_8x16_pass1_neon
.endr
.ifc \txfm1\()_\txfm2,iadst_idct
        // iadst16 overwrote v0/v1; reload the idct coefficients for
        // the column pass.
        ld1             {v0.8h,v1.8h}, [x10]
.endif

.ifc \txfm1\()_\txfm2,idct_idct
        b               3f
1:
        // Set v24-v31 to zero, for the in-register passthrough of
        // coefficients to pass 2. Since we only do two slices, this can
        // only ever happen for the second slice. So we only need to store
        // zeros to the temp buffer for the second half of the buffer.
        // Move x0 to the second half, and use x9 == 32 as increment.
        add             x0,  x0,  #16
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        // v24 is zeroed on the first iteration, so storing it each
        // time writes zeros for every row.
        movi_v          \i,  .16b, #0
        st1             {v24.8h},  [x0], x9
.endr
3:
.endif

.irp i, 0, 8
        add             x0,  x4,  #(\i)
        mov             x1,  x5
        add             x2,  sp,  #(\i*2)
        mov             x3,  #\i
        bl              \txfm2\()16_1d_8x16_pass2_neon
.endr

        add             sp,  sp,  #512
.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d10, d11, [sp, #0x10]
        ldp             d12, d13, [sp, #0x20]
        ldp             d14, d15, [sp, #0x30]
        ldp             d8,  d9,  [sp], #0x40
.endif
        ret             x15
endfunc
.endm
    931 
// Instantiate the exported 16x16 add functions for all four
// combinations of row/column transforms.
itxfm_func16x16 idct,  idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct,  iadst
itxfm_func16x16 iadst, iadst
    936 
// Pass 1 of the reduced (w3 <= 10) 16x16 idct: only the top-left 4x4
// coefficients can be nonzero, so just four input rows are read (and
// cleared in memory via load_clear).
// x0 = dst (temp buffer), x2 = src, x9 = input stride.
function idct16_1d_8x16_pass1_quarter_neon
        mov             x14, x30
        movi            v2.8h, #0
.irp i, 16, 17, 18, 19
        load_clear      \i,  x2,  x9
.endr

        bl              idct16_quarter

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
        // The first 8x8 block is kept in registers for the second pass,
        // store the rest in the temp buffer.
        // Since only a 4x4 part of the input was nonzero, this means that
        // only 4 rows are nonzero after transposing, and the second pass
        // only reads the topmost 4 rows. Therefore only store the topmost
        // 4 rows.
        add             x0,  x0,  #16
.irp i, 24, 25, 26, 27
        store           \i,  x0,  x9
.endr
        ret             x14
endfunc
    965 
// Pass 2 of the reduced (quarter) 16x16 idct: load 4 rows from the
// temp buffer (skipped when x3 == 0, since pass 1 left that slice in
// v16-v19), transform, and add the result into the destination.
// x0 = dst, x1 = dst stride, x2 = src (temp buffer), x3 = slice
// offset, x9 = temp buffer stride.
function idct16_1d_8x16_pass2_quarter_neon
        mov             x14, x30
        cbz             x3,  1f
.irp i, 16, 17, 18, 19
        load            \i,  x2,  x9
.endr
1:

        // x3 = second row pointer; x1 doubled for the interleaved
        // row access in load_add_store.
        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              idct16_quarter

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

        ret             x14
endfunc
    983 
// Pass 1 of the reduced (w3 <= 38) 16x16 idct: only the top-left 8x8
// coefficients can be nonzero, so eight input rows are read (and
// cleared in memory via load_clear).
// x0 = dst (temp buffer), x2 = src, x9 = input stride.
function idct16_1d_8x16_pass1_half_neon
        mov             x14, x30
        movi            v2.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i,  x2,  x9
.endr

        bl              idct16_half

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
        // The first 8x8 block is kept in registers for the second pass,
        // store the rest in the temp buffer.
        add             x0,  x0,  #16
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        store           \i,  x0,  x9
.endr
        ret             x14
endfunc
   1008 
// Pass 2 of the reduced (half) 16x16 idct: load 8 rows from the temp
// buffer (skipped when x3 == 0, since pass 1 left that slice in
// v16-v23), transform, and add the result into the destination.
// x0 = dst, x1 = dst stride, x2 = src (temp buffer), x3 = slice
// offset, x9 = temp buffer stride.
function idct16_1d_8x16_pass2_half_neon
        mov             x14, x30
        cbz             x3,  1f
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i,  x2,  x9
.endr
1:

        // x3 = second row pointer; x1 doubled for the interleaved
        // row access in load_add_store.
        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              idct16_half

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

        ret             x14
endfunc
   1026 
// Top-level driver for the reduced (quarter/half) 16x16 idct paths.
// Branched to (not called) from ff_vp9_idct_idct_16x16_add_neon,
// which already set up x4 = dst, x5 = stride, x6 = coefficients,
// x9 = 32, x15 = return address and reserved the 512 byte temp
// buffer on the stack.
.macro idct16_partial size
function idct16x16_\size\()_add_neon
        add             x0,  sp,  #(0*32)
        add             x2,  x6,  #(0*2)
        bl              idct16_1d_8x16_pass1_\size\()_neon
.irp i, 0, 8
        add             x0,  x4,  #(\i)
        mov             x1,  x5
        add             x2,  sp,  #(\i*2)
        mov             x3,  #\i
        bl              idct16_1d_8x16_pass2_\size\()_neon
.endr

        add             sp,  sp,  #512
        ret             x15
endfunc
.endm
   1044 
// Instantiate the reduced 16x16 idct entry points.
idct16_partial quarter
idct16_partial half
   1047 
// 32x32 idct for the DC-only case: the single input coefficient is
// scaled by 11585 (idct_coeffs[0], in Q14) twice -- once per transform
// dimension -- with rounding, the coefficient in memory is cleared,
// and the resulting constant (with the final rounding shift by 6) is
// added to all 32x32 destination pixels with saturation.
// x0 = dst, x1 = stride, x2 = coefficients.
function idct32x32_dc_add_neon
        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]

        movi            v1.4h,  #0

        ld1             {v2.h}[0], [x2]
        smull           v2.4s,  v2.4h,  v0.h[0]
        rshrn           v2.4h,  v2.4s,  #14
        smull           v2.4s,  v2.4h,  v0.h[0]
        rshrn           v2.4h,  v2.4s,  #14
        dup             v2.8h,  v2.h[0]
        // Clear the consumed DC coefficient.
        st1             {v1.h}[0], [x2]

        srshr           v0.8h,  v2.8h,  #6

        mov             x3,  x0
        mov             x4,  #32
1:
        // Loop to add the constant v0 into all 32x32 outputs
        subs            x4,  x4,  #2
        ld1             {v1.16b,v2.16b},  [x0], x1
        uaddw           v16.8h, v0.8h,  v1.8b
        uaddw2          v17.8h, v0.8h,  v1.16b
        ld1             {v3.16b,v4.16b},  [x0], x1
        uaddw           v18.8h, v0.8h,  v2.8b
        uaddw2          v19.8h, v0.8h,  v2.16b
        uaddw           v20.8h, v0.8h,  v3.8b
        uaddw2          v21.8h, v0.8h,  v3.16b
        uaddw           v22.8h, v0.8h,  v4.8b
        uaddw2          v23.8h, v0.8h,  v4.16b
        sqxtun          v1.8b,  v16.8h
        sqxtun2         v1.16b, v17.8h
        sqxtun          v2.8b,  v18.8h
        sqxtun2         v2.16b, v19.8h
        sqxtun          v3.8b,  v20.8h
        sqxtun2         v3.16b, v21.8h
        st1             {v1.16b,v2.16b},  [x3], x1
        sqxtun          v4.8b,  v22.8h
        sqxtun2         v4.16b, v23.8h
        st1             {v3.16b,v4.16b},  [x3], x1
        b.ne            1b

        ret
endfunc
   1093 
// Shared tail of the idct32_odd* functions: the remaining butterfly
// stages of the odd half of the 32-point idct, operating on v16-v31
// with v2-v7 as scratch.
// NOTE: ends with ret, so this macro must be expanded as the final
// statement of each function that uses it.
.macro idct32_end
        butterfly_8h    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
        butterfly_8h    v17, v20, v23, v20 // v17 = t17,  v20 = t18
        butterfly_8h    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
        butterfly_8h    v19, v21, v22, v21 // v19 = t22,  v21 = t21
        butterfly_8h    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
        butterfly_8h    v23, v26, v25, v26 // v23 = t25,  v26 = t26
        butterfly_8h    v7,  v3,  v29, v31 // v7  = t31a, v3  = t28a
        butterfly_8h    v22, v27, v24, v27 // v22 = t30,  v27 = t29

        dmbutterfly     v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
        dmbutterfly     v3,  v5,  v0.h[2], v0.h[3], v24, v25, v30, v31        // v3  = t19,  v5  = t28
        dmbutterfly     v28, v6,  v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
        dmbutterfly     v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a

        butterfly_8h    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
        butterfly_8h    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
        butterfly_8h_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
        butterfly_8h_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
        butterfly_8h    v18, v21, v27, v21 // v18 = t18,  v21 = t21
        butterfly_8h_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
        butterfly_8h    v29, v26, v20, v26 // v29 = t29,  v26 = t26
        butterfly_8h    v19, v20, v3,  v6  // v19 = t19a, v20 = t20

        dmbutterfly0    v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27,  v20 = t20
        dmbutterfly0    v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
        dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25,  v22 = t22
        dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
        ret
.endm
   1124 
// 16-point transform of the odd-numbered input rows
// (IN(1), IN(3), ..., IN(31)) of the 32-point idct, in v16-v31.
// Coefficients are expected in v0 and v8-v9; v4-v7 (and later
// v16-v19) are used as scratch. Finishes via idct32_end.
function idct32_odd
        dmbutterfly     v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly     v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly     v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly     v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly     v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly     v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly     v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly     v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
        idct32_end
endfunc
   1150 
// Same as idct32_odd, but for the case where only the first 8 input
// rows (v16-v23) are nonzero; the first stage uses the
// dmbutterfly_h1/_h2 variants (defined earlier in the file), which
// are tailored to the halved input.
function idct32_odd_half
        dmbutterfly_h1  v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly_h2  v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly_h1  v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly_h2  v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly_h1  v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly_h2  v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly_h1  v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly_h2  v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
        idct32_end
endfunc
   1176 
// Same as idct32_odd, but for the case where only the first 4 input
// rows (v16-v19) are nonzero. Like in idct16_quarter, the first-stage
// butterflies degenerate into single multiplications (dsmull_h), with
// explicit negation of the subtracted terms.
function idct32_odd_quarter
        dsmull_h        v4,  v5,  v16, v8.h[0]
        dsmull_h        v28, v29, v19, v8.h[7]
        dsmull_h        v30, v31, v16, v8.h[1]
        dsmull_h        v22, v23, v17, v9.h[6]
        dsmull_h        v7,  v6,  v17, v9.h[7]
        dsmull_h        v26, v27, v19, v8.h[6]
        dsmull_h        v20, v21, v18, v9.h[0]
        dsmull_h        v24, v25, v18, v9.h[1]

        // Negate the products that the full butterfly would subtract.
        neg             v28.4s, v28.4s
        neg             v29.4s, v29.4s
        neg             v7.4s,  v7.4s
        neg             v6.4s,  v6.4s

        // Round and narrow the 32 bit products back to 16 bit.
        drshrn_h        v4,  v4,  v5,  #14
        drshrn_h        v5,  v28, v29, #14
        drshrn_h        v29, v30, v31, #14
        drshrn_h        v28, v22, v23, #14
        drshrn_h        v7,  v7,  v6,  #14
        drshrn_h        v31, v26, v27, #14
        drshrn_h        v6,  v20, v21, #14
        drshrn_h        v30, v24, v25, #14

        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v0.h[4], v0.h[5]
        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v0.h[4], v0.h[5]
        drshrn_h        v23, v16, v17, #14
        drshrn_h        v24, v18, v19, #14
        neg             v20.4s, v20.4s
        neg             v21.4s, v21.4s
        drshrn_h        v27, v27, v26, #14
        drshrn_h        v20, v20, v21, #14
        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v0.h[6], v0.h[7]
        drshrn_h        v21, v16, v17, #14
        drshrn_h        v26, v18, v19, #14
        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v0.h[6], v0.h[7]
        drshrn_h        v25, v16, v17, #14
        neg             v18.4s, v18.4s
        neg             v19.4s, v19.4s
        drshrn_h        v22, v18, v19, #14

        idct32_end
endfunc
   1220 
   1221 .macro idct32_funcs suffix
   1222 // Do an 32-point IDCT of a 8x32 slice out of a 32x32 matrix.
   1223 // The 32-point IDCT can be decomposed into two 16-point IDCTs;
   1224 // a normal IDCT16 with every other input component (the even ones, with
   1225 // each output written twice), followed by a separate 16-point IDCT
   1226 // of the odd inputs, added/subtracted onto the outputs of the first idct16.
   1227 // x0 = dst (temp buffer)
   1228 // x1 = unused
   1229 // x2 = src
   1230 // x9 = double input stride
   1231 function idct32_1d_8x32_pass1\suffix\()_neon
   1232        mov             x14, x30
   1233        movi            v2.8h,  #0
   1234 
   1235        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
   1236 .ifb \suffix
   1237 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   1238        load_clear      \i, x2, x9
   1239 .endr
   1240 .endif
   1241 .ifc \suffix,_quarter
   1242 .irp i, 16, 17, 18, 19
   1243        load_clear      \i, x2, x9
   1244 .endr
   1245 .endif
   1246 .ifc \suffix,_half
   1247 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
   1248        load_clear      \i, x2, x9
   1249 .endr
   1250 .endif
   1251 
   1252        bl              idct16\suffix
   1253 
   1254        // Do two 8x8 transposes. Originally, v16-v31 contain the
   1255        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
   1256        // two transposed 8x8 blocks.
   1257        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
   1258        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
   1259 
   1260        // Store the registers a, b horizontally, followed by the
   1261        // same registers b, a mirrored.
   1262 .macro store_rev a, b
   1263        // There's no rev128 instruction, but we reverse each 64 bit
   1264        // half, and then flip them using an ext with 8 bytes offset.
   1265        rev64           v3.8h, \b
   1266        st1             {\a},  [x0], #16
   1267        rev64           v2.8h, \a
   1268        ext             v3.16b, v3.16b, v3.16b, #8
   1269        st1             {\b},  [x0], #16
   1270        ext             v2.16b, v2.16b, v2.16b, #8
   1271        st1             {v3.8h},  [x0], #16
   1272        st1             {v2.8h},  [x0], #16
   1273 .endm
   1274        store_rev       v16.8h, v24.8h
   1275        store_rev       v17.8h, v25.8h
   1276        store_rev       v18.8h, v26.8h
   1277        store_rev       v19.8h, v27.8h
   1278        store_rev       v20.8h, v28.8h
   1279        store_rev       v21.8h, v29.8h
   1280        store_rev       v22.8h, v30.8h
   1281        store_rev       v23.8h, v31.8h
   1282        sub             x0,  x0,  #512
   1283 .purgem store_rev
   1284 
   1285        // Move x2 back to the start of the input, and move
   1286        // to the first odd row
   1287 .ifb \suffix
   1288        sub             x2,  x2,  x9, lsl #4
   1289 .endif
   1290 .ifc \suffix,_quarter
   1291        sub             x2,  x2,  x9, lsl #2
   1292 .endif
   1293 .ifc \suffix,_half
   1294        sub             x2,  x2,  x9, lsl #3
   1295 .endif
   1296        add             x2,  x2,  #64
   1297 
   1298        movi            v2.8h,  #0
   1299        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
   1300 .ifb \suffix
   1301 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   1302        load_clear      \i, x2, x9
   1303 .endr
   1304 .endif
   1305 .ifc \suffix,_quarter
   1306 .irp i, 16, 17, 18, 19
   1307        load_clear      \i, x2, x9
   1308 .endr
   1309 .endif
   1310 .ifc \suffix,_half
   1311 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
   1312        load_clear      \i, x2, x9
   1313 .endr
   1314 .endif
   1315 
   1316        bl              idct32_odd\suffix
   1317 
   1318        transpose_8x8H  v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
   1319        transpose_8x8H  v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
   1320 
   1321        // Store the registers a, b horizontally,
   1322        // adding into the output first, and the mirrored,
   1323        // subtracted from the output.
   1324 .macro store_rev a, b
   1325        ld1             {v4.8h},  [x0]
   1326        rev64           v3.8h, \b
   1327        add             v4.8h, v4.8h, \a
   1328        rev64           v2.8h, \a
   1329        st1             {v4.8h},  [x0], #16
   1330        ext             v3.16b, v3.16b, v3.16b, #8
   1331        ld1             {v5.8h},  [x0]
   1332        ext             v2.16b, v2.16b, v2.16b, #8
   1333        add             v5.8h, v5.8h, \b
   1334        st1             {v5.8h},  [x0], #16
   1335        ld1             {v6.8h},  [x0]
   1336        sub             v6.8h, v6.8h, v3.8h
   1337        st1             {v6.8h},  [x0], #16
   1338        ld1             {v7.8h},  [x0]
   1339        sub             v7.8h, v7.8h, v2.8h
   1340        st1             {v7.8h},  [x0], #16
   1341 .endm
   1342 
   1343        store_rev       v31.8h, v23.8h
   1344        store_rev       v30.8h, v22.8h
   1345        store_rev       v29.8h, v21.8h
   1346        store_rev       v28.8h, v20.8h
   1347        store_rev       v27.8h, v19.8h
   1348        store_rev       v26.8h, v18.8h
   1349        store_rev       v25.8h, v17.8h
   1350        store_rev       v24.8h, v16.8h
   1351 .purgem store_rev
   1352        ret             x14
   1353 endfunc
   1354 
   1355 // This is mostly the same as 8x32_pass1, but without the transpose,
   1356 // and use the source as temp buffer between the two idct passes, and
   1357 // add into the destination.
   1358 // x0 = dst
   1359 // x1 = dst stride
   1360 // x2 = src (temp buffer)
   1361 // x7 = negative double temp buffer stride
   1362 // x9 = double temp buffer stride
   1363 function idct32_1d_8x32_pass2\suffix\()_neon
   1364        mov             x14, x30        // preserve LR; we call idct16/idct32_odd via bl
        // Load the even input rows from the temp buffer. The _quarter/_half
        // variants only load the rows that can be nonzero for their eob range.
   1365        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
   1366 .ifb \suffix
   1367 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   1368        load            \i, x2, x9
   1369 .endr
   1370        sub             x2,  x2,  x9, lsl #4
   1371 .endif
   1372 .ifc \suffix,_quarter
   1373 .irp i, 16, 17, 18, 19
   1374        load            \i, x2, x9
   1375 .endr
   1376        sub             x2,  x2,  x9, lsl #2
   1377 .endif
   1378 .ifc \suffix,_half
   1379 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
   1380        load            \i, x2, x9
   1381 .endr
   1382        sub             x2,  x2,  x9, lsl #3
   1383 .endif
   1384 
   1385        bl              idct16\suffix
   1386 
        // Park the idct16 (even half) output back in the temp buffer;
        // it is combined with the idct32_odd output further down.
   1387 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   1388        store           \i, x2, x9
   1389 .endr
   1390 
   1391        sub             x2,  x2,  x9, lsl #4
   1392        add             x2,  x2,  #64   // step to the odd rows (x9 is a double stride)
   1393 
   1394        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
   1395 .ifb \suffix
   1396 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   1397        load            \i, x2, x9
   1398 .endr
   1399        sub             x2,  x2,  x9, lsl #4
   1400 .endif
   1401 .ifc \suffix,_quarter
   1402 .irp i, 16, 17, 18, 19
   1403        load            \i, x2, x9
   1404 .endr
   1405        sub             x2,  x2,  x9, lsl #2
   1406 .endif
   1407 .ifc \suffix,_half
   1408 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
   1409        load            \i, x2, x9
   1410 .endr
   1411        sub             x2,  x2,  x9, lsl #3
   1412 .endif
   1413        sub             x2,  x2,  #64   // back to the start of the stored even output
   1414 
   1415        bl              idct32_odd\suffix
   1416 
        // Load 4 rows of the stored idct16 output from the temp buffer and
        // combine them with the idct32_odd results \a-\d: added for the
        // first 16 output rows (neg=0, stepping forwards via x9), subtracted
        // for the mirrored second 16 (neg=1, stepping backwards via the
        // negative stride x7). The sums are rounded down by 6 bits (srshr),
        // added to the destination pixels (uaddw) and stored with unsigned
        // saturation (sqxtun).
   1417 .macro load_acc_store a, b, c, d, neg=0
   1418 .if \neg == 0
   1419        ld1             {v4.8h},  [x2], x9
   1420        ld1             {v5.8h},  [x2], x9
   1421        add             v4.8h, v4.8h, \a
   1422        ld1             {v6.8h},  [x2], x9
   1423        add             v5.8h, v5.8h, \b
   1424        ld1             {v7.8h},  [x2], x9
   1425        add             v6.8h, v6.8h, \c
   1426        add             v7.8h, v7.8h, \d
   1427 .else
   1428        ld1             {v4.8h},  [x2], x7
   1429        ld1             {v5.8h},  [x2], x7
   1430        sub             v4.8h, v4.8h, \a
   1431        ld1             {v6.8h},  [x2], x7
   1432        sub             v5.8h, v5.8h, \b
   1433        ld1             {v7.8h},  [x2], x7
   1434        sub             v6.8h, v6.8h, \c
   1435        sub             v7.8h, v7.8h, \d
   1436 .endif
   1437        ld1             {v10.8b}, [x0], x1
   1438        ld1             {v11.8b}, [x0], x1
   1439        srshr           v4.8h, v4.8h, #6
   1440        ld1             {v2.8b}, [x0], x1
   1441        srshr           v5.8h, v5.8h, #6
   1442        uaddw           v4.8h, v4.8h, v10.8b
   1443        ld1             {v3.8b}, [x0], x1
   1444        srshr           v6.8h, v6.8h, #6
   1445        uaddw           v5.8h, v5.8h, v11.8b
   1446        srshr           v7.8h, v7.8h, #6
   1447        sub             x0,  x0,  x1, lsl #2    // rewind dst to the first of the 4 rows
   1448        uaddw           v6.8h, v6.8h, v2.8b
   1449        sqxtun          v4.8b, v4.8h
   1450        uaddw           v7.8h, v7.8h, v3.8b
   1451        sqxtun          v5.8b, v5.8h
   1452        st1             {v4.8b}, [x0], x1
   1453        sqxtun          v6.8b, v6.8h
   1454        st1             {v5.8b}, [x0], x1
   1455        sqxtun          v7.8b, v7.8h
   1456        st1             {v6.8b}, [x0], x1
   1457        st1             {v7.8b}, [x0], x1
   1458 .endm
   1459        load_acc_store  v31.8h, v30.8h, v29.8h, v28.8h
   1460        load_acc_store  v27.8h, v26.8h, v25.8h, v24.8h
   1461        load_acc_store  v23.8h, v22.8h, v21.8h, v20.8h
   1462        load_acc_store  v19.8h, v18.8h, v17.8h, v16.8h
   1463        sub             x2,  x2,  x9    // step back one row; the mirrored half walks backwards
   1464        load_acc_store  v16.8h, v17.8h, v18.8h, v19.8h, 1
   1465        load_acc_store  v20.8h, v21.8h, v22.8h, v23.8h, 1
   1466        load_acc_store  v24.8h, v25.8h, v26.8h, v27.8h, 1
   1467        load_acc_store  v28.8h, v29.8h, v30.8h, v31.8h, 1
   1468 .purgem load_acc_store
   1469        ret             x14
   1470 endfunc
   1471 .endm
   1472 
   1473 idct32_funcs                    // full variant: any input row/column may be nonzero
   1474 idct32_funcs _quarter           // eob <= 34: nonzero coefficients only in the top-left 8x8 corner
   1475 idct32_funcs _half              // eob <= 135: nonzero coefficients only in the top-left 16x16 corner
   1476 
   1477 const min_eob_idct_idct_32, align=4
        // Per-slice eob thresholds for pass 1: entry i belongs to the 8-column
        // slice starting at column 8*i. If the actual eob is <= the entry,
        // that slice (and all following ones) contains no nonzero
        // coefficients, and its pass-1 output is just zero-filled instead.
   1478        .short  0, 34, 135, 336
   1479 endconst
   1480 
   1481 function ff_vp9_idct_idct_32x32_add_neon, export=1
        // void ff_vp9_idct_idct_32x32_add_neon(uint8_t *dst, ptrdiff_t stride,
        //                                      int16_t *block, int eob)
        // x0 = dst, x1 = stride, x2 = coefficients, w3 = eob.
        // eob == 1 means only the DC coefficient is set.
   1482        cmp             w3,  #1
   1483        b.eq            idct32x32_dc_add_neon
   1484 
   1485        movrel          x10, idct_coeffs
   1486 
   1487        mov             x15, x30        // save LR; also used by the branched-to partial variants
   1488 
        // v8-v15 (low 64 bits) are callee-saved under AAPCS64; v8/v9 hold
        // transform coefficients below, v10/v11 are used as scratch in pass 2.
   1489        stp             d8,  d9,  [sp, #-0x20]!
   1490        stp             d10, d11, [sp, #0x10]
   1491 
   1492        sub             sp,  sp,  #2048 // 32x32 int16 temp buffer between the two passes
   1493 
        // Stash dst/stride/src in registers that survive the bl calls.
   1494        mov             x4,  x0
   1495        mov             x5,  x1
   1496        mov             x6,  x2
   1497 
   1498        // Double stride of the input, since we only read every other line
   1499        mov             x9,  #128
   1500        neg             x7,  x9         // negative double stride, for walking backwards
   1501 
   1502        ld1             {v0.8h,v1.8h}, [x10], #32
   1503        ld1             {v8.8h,v9.8h}, [x10]
   1504 
        // Dispatch to the reduced variants when the eob shows that only the
        // top-left corner has nonzero coefficients. These are branched to
        // (not called); they share this function's epilogue and return via x15.
   1505        cmp             w3,  #34
   1506        b.le            idct32x32_quarter_add_neon
   1507        cmp             w3,  #135
   1508        b.le            idct32x32_half_add_neon
   1509 
   1510        movrel          x12, min_eob_idct_idct_32, 2    // +2: skip entry 0 (slice 0 always runs)
   1511 
        // Pass 1: one iteration per 8-column input slice. Slices past the eob
        // threshold are known all-zero; skip the transform and zero-fill the
        // rest of the temp buffer instead.
   1512 .irp i, 0, 8, 16, 24
   1513        add             x0,  sp,  #(\i*64)
   1514 .if \i > 0
   1515        ldrh            w1,  [x12], #2
   1516        cmp             w3,  w1
   1517        mov             x1,  #(32 - \i)/4       // zero-fill loop count if we branch to 1f
   1518        b.le            1f
   1519 .endif
   1520        add             x2,  x6,  #(\i*2)
   1521        bl              idct32_1d_8x32_pass1_neon
   1522 .endr
   1523        b               3f
   1524 
   1525 1:
   1526        // Write zeros to the temp buffer for pass 2
   1527        movi            v16.8h,  #0
   1528        movi            v17.8h,  #0
   1529        movi            v18.8h,  #0
   1530        movi            v19.8h,  #0
        // Each iteration clears 4*64 = 256 bytes; x1 iterations cover the
        // whole remaining part of the temp buffer (all later slices are
        // also below their thresholds, so pass 1 is done after this).
   1531 2:
   1532        subs            x1,  x1,  #1
   1533 .rept 4
   1534        st1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x0], #64
   1535 .endr
   1536        b.ne            2b
   1537 3:
        // Pass 2: one iteration per 8-column slice of the destination,
        // reading the temp buffer and accumulating into dst.
   1538 .irp i, 0, 8, 16, 24
   1539        add             x0,  x4,  #(\i)
   1540        mov             x1,  x5
   1541        add             x2,  sp,  #(\i*2)
   1542        bl              idct32_1d_8x32_pass2_neon
   1543 .endr
   1544 
   1545        add             sp,  sp,  #2048 // release the temp buffer
   1546 
   1547        ldp             d10, d11, [sp, #0x10]
   1548        ldp             d8,  d9,  [sp], #0x20
   1549 
   1550        ret             x15
   1551 endfunc
   1552 
        // Reduced 32x32 idct for small eob values. Entered by a branch from
        // ff_vp9_idct_idct_32x32_add_neon with everything already set up:
        // x4 = dst, x5 = stride, x6 = src, x9/x7 = +/- double stride,
        // coefficients in v0/v1/v8/v9, the 2048-byte temp buffer on sp, and
        // the caller's LR in x15 — so it shares that function's epilogue.
        // quarter (eob <= 34): only input slice 0 can be nonzero;
        // half (eob <= 135): only input slices 0 and 8.
   1553 .macro idct32_partial size
   1554 function idct32x32_\size\()_add_neon
   1555        add             x0,  sp,  #(0*64)
   1556        add             x2,  x6,  #(0*2)
   1557        bl              idct32_1d_8x32_pass1_\size\()_neon
   1558 .ifc \size,half
   1559        add             x0,  sp,  #(8*64)
   1560        add             x2,  x6,  #(8*2)
   1561        bl              idct32_1d_8x32_pass1_\size\()_neon
   1562 .endif
        // Pass 2 still covers all four 8-column output slices.
   1563 .irp i, 0, 8, 16, 24
   1564        add             x0,  x4,  #(\i)
   1565        mov             x1,  x5
   1566        add             x2,  sp,  #(\i*2)
   1567        bl              idct32_1d_8x32_pass2_\size\()_neon
   1568 .endr
   1569 
   1570        add             sp,  sp,  #2048 // release the temp buffer allocated by the caller
   1571 
   1572        ldp             d10, d11, [sp, #0x10]
   1573        ldp             d8,  d9,  [sp], #0x20
   1574 
   1575        ret             x15
   1576 endfunc
   1577 .endm
   1578 
   1579 idct32_partial quarter          // handles eob <= 34
   1580 idct32_partial half             // handles eob <= 135