tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

vp9itxfm_16bpp_neon.S (81568B)


      1 /*
      2 * Copyright (c) 2017 Google Inc.
      3 *
      4 * This file is part of FFmpeg.
      5 *
      6 * FFmpeg is free software; you can redistribute it and/or
      7 * modify it under the terms of the GNU Lesser General Public
      8 * License as published by the Free Software Foundation; either
      9 * version 2.1 of the License, or (at your option) any later version.
     10 *
     11 * FFmpeg is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14 * Lesser General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU Lesser General Public
     17 * License along with FFmpeg; if not, write to the Free Software
     18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     19 */
     20 
     21 #include "libavutil/aarch64/asm.S"
     22 #include "neon.S"
     23 
// Q14 (16384-scaled) fixed-point coefficients for the 4x4 transforms;
// every product using them is rounded and narrowed with a ">> 14" below.
     24 const itxfm4_coeffs, align=4
     25        .short  11585, 0, 6270, 15137 // idct4: used as v0.s[0], -, v0.s[2], v0.s[3]
     26 iadst4_coeffs:
     27        .short  5283, 15212, 9929, 13377 // iadst4: used as v1.s[0..3]
     28 endconst
     29 
// Q14 coefficients for the 8-point transforms. iadst8_coeffs is placed
// directly before idct_coeffs so the iadst setup can load both with one
// incrementing pointer (see the itxfm_func8x8 coefficient loads below).
     30 const iadst8_coeffs, align=4
     31        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
     32 idct_coeffs:
     33        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
// The extra idct_coeffs rows feed the larger transforms defined later in
// the file (not all visible in this chunk).
     34        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
     35        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
     36        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst
     38 
// Q14 coefficients for the 16-point iadst (used further down the file,
// outside this chunk).
     39 const iadst16_coeffs, align=4
     40        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
     41        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
     42 endconst
     43 
// Transpose a 4x4 matrix of 32 bit elements, input and output in r0-r3.
// r4-r7 are scratch, holding the pairwise-interleaved intermediate rows.
     44 .macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
     45        trn1            \r4\().4s,  \r0\().4s,  \r1\().4s
     46        trn2            \r5\().4s,  \r0\().4s,  \r1\().4s
     47        trn1            \r6\().4s,  \r2\().4s,  \r3\().4s
     48        trn2            \r7\().4s,  \r2\().4s,  \r3\().4s
       // Second stage: swap 64 bit halves to complete the transpose.
     49        trn1            \r0\().2d,  \r4\().2d,  \r6\().2d
     50        trn2            \r2\().2d,  \r4\().2d,  \r6\().2d
     51        trn1            \r1\().2d,  \r5\().2d,  \r7\().2d
     52        trn2            \r3\().2d,  \r5\().2d,  \r7\().2d
     53 .endm
     54 
     55 // Transpose an 8x8 matrix of 32 bit elements, where each row is spread out
     56 // over two registers: row i lives in r(2i) and r(2i+1); t0-t3 are scratch.
     57 .macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
       // The two diagonal 4x4 blocks transpose in place.
     58        transpose_4x4s  \r0,  \r2,  \r4,  \r6,  \t0, \t1, \t2, \t3
     59        transpose_4x4s  \r9,  \r11, \r13, \r15, \t0, \t1, \t2, \t3
     60 
     61        // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
     62        // while swapping the two 4x4 matrices between each other
     63 
     64        // First step of the 4x4 transpose of r1-r7, into t0-t3
     65        trn1            \t0\().4s,  \r1\().4s,  \r3\().4s
     66        trn2            \t1\().4s,  \r1\().4s,  \r3\().4s
     67        trn1            \t2\().4s,  \r5\().4s,  \r7\().4s
     68        trn2            \t3\().4s,  \r5\().4s,  \r7\().4s
     69 
     70        // First step of the 4x4 transpose of r8-r14, into r1-r7
     71        trn1            \r1\().4s,  \r8\().4s,  \r10\().4s
     72        trn2            \r3\().4s,  \r8\().4s,  \r10\().4s
     73        trn1            \r5\().4s,  \r12\().4s, \r14\().4s
     74        trn2            \r7\().4s,  \r12\().4s, \r14\().4s
     75 
     76        // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r14
     77        trn1            \r8\().2d,  \t0\().2d,  \t2\().2d
     78        trn2            \r12\().2d, \t0\().2d,  \t2\().2d
     79        trn1            \r10\().2d, \t1\().2d,  \t3\().2d
     80        trn2            \r14\().2d, \t1\().2d,  \t3\().2d
     81 
     82        // Second step of the 4x4 transpose of r8-r14 (now in r1-r7), in place as far as possible
     83        trn1            \t0\().2d,  \r1\().2d,  \r5\().2d
     84        trn2            \r5\().2d,  \r1\().2d,  \r5\().2d
     85        trn1            \t1\().2d,  \r3\().2d,  \r7\().2d
     86        trn2            \r7\().2d,  \r3\().2d,  \r7\().2d
     87 
     88        // Move the outputs of trn1 back in place
     89        mov             \r1\().16b,  \t0\().16b
     90        mov             \r3\().16b,  \t1\().16b
     91 .endm
     92 
     93 // out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
     94 // out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
     95 // in/out are .4s registers; this can do with 4 temp registers, but is
     96 // more efficient if 6 temp registers are available.
// If neg=1, the coefficient v0.s[0] is negated before the multiplies.
     97 .macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
     98 .if \neg > 0
     99        neg             \tmp4\().4s, v0.4s
    100 .endif
    101        add             \tmp1\().4s, \in1\().4s,  \in2\().4s
    102        sub             \tmp2\().4s, \in1\().4s,  \in2\().4s
    103 .if \neg > 0
       // Only lane 0 of tmp4 (the negated coefficient) is used here.
    104        smull           \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
    105        smull2          \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
    106 .else
    107        smull           \tmp3\().2d, \tmp1\().2s, v0.s[0]
    108        smull2          \tmp4\().2d, \tmp1\().4s, v0.s[0]
    109 .endif
    110 .ifb \tmp5
       // Only 4 temps supplied: reuse tmp3/tmp4 for the second product
       // (serializes the two multiplies).
    111        rshrn           \out1\().2s, \tmp3\().2d, #14
    112        rshrn2          \out1\().4s, \tmp4\().2d, #14
    113        smull           \tmp3\().2d, \tmp2\().2s, v0.s[0]
    114        smull2          \tmp4\().2d, \tmp2\().4s, v0.s[0]
    115        rshrn           \out2\().2s, \tmp3\().2d, #14
    116        rshrn2          \out2\().4s, \tmp4\().2d, #14
    117 .else
       // 6 temps: both products can be computed independently.
    118        smull           \tmp5\().2d, \tmp2\().2s, v0.s[0]
    119        smull2          \tmp6\().2d, \tmp2\().4s, v0.s[0]
    120        rshrn           \out1\().2s, \tmp3\().2d, #14
    121        rshrn2          \out1\().4s, \tmp4\().2d, #14
    122        rshrn           \out2\().2s, \tmp5\().2d, #14
    123        rshrn2          \out2\().4s, \tmp6\().2d, #14
    124 .endif
    125 .endm
    126 
    127 // Same as dmbutterfly0 above, but treating the input in in2 as zero,
    128 // writing the same output into both out1 and out2.
// (in2 and tmp3-tmp6 are accepted but unused, keeping the argument list
// interchangeable with dmbutterfly0.)
    129 .macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
    130        smull           \tmp1\().2d, \in1\().2s,  v0.s[0]
    131        smull2          \tmp2\().2d, \in1\().4s,  v0.s[0]
    132        rshrn           \out1\().2s, \tmp1\().2d, #14
    133        rshrn2          \out1\().4s, \tmp2\().2d, #14
    134        rshrn           \out2\().2s, \tmp1\().2d, #14
    135        rshrn2          \out2\().4s, \tmp2\().2d, #14
    136 .endm
    137 
    138 // out1,out2 = in1 * coef1 - in2 * coef2
    139 // out3,out4 = in1 * coef2 + in2 * coef1
    140 // out are 4 x .2d registers, in are 2 x .4s registers
// coef1/coef2 are scalar lane references (e.g. v0.s[2]); the 64 bit
// products are left unrounded for the caller to combine and narrow.
    141 .macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
    142        smull           \out1\().2d, \in1\().2s, \coef1
    143        smull2          \out2\().2d, \in1\().4s, \coef1
    144        smull           \out3\().2d, \in1\().2s, \coef2
    145        smull2          \out4\().2d, \in1\().4s, \coef2
    146        smlsl           \out1\().2d, \in2\().2s, \coef2
    147        smlsl2          \out2\().2d, \in2\().4s, \coef2
    148        smlal           \out3\().2d, \in2\().2s, \coef1
    149        smlal2          \out4\().2d, \in2\().4s, \coef1
    150 .endm
    151 
    152 // inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
    153 // inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
    154 // inout are 2 x .4s registers
// If neg=1, the second result is negated before the rounding shift.
    155 .macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
    156        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
    157 .if \neg > 0
    158        neg             \tmp3\().2d, \tmp3\().2d
    159        neg             \tmp4\().2d, \tmp4\().2d
    160 .endif
    161        rshrn           \inout1\().2s, \tmp1\().2d,  #14
    162        rshrn2          \inout1\().4s, \tmp2\().2d,  #14
    163        rshrn           \inout2\().2s, \tmp3\().2d,  #14
    164        rshrn2          \inout2\().4s, \tmp4\().2d,  #14
    165 .endm
    166 
    167 // Same as dmbutterfly above, but treating the input in inout2 as zero
// i.e. inout1 = (inout1 * coef1 + (1 << 13)) >> 14,
//      inout2 = (inout1 * coef2 + (1 << 13)) >> 14
    168 .macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
    169        smull           \tmp1\().2d, \inout1\().2s, \coef1
    170        smull2          \tmp2\().2d, \inout1\().4s, \coef1
    171        smull           \tmp3\().2d, \inout1\().2s, \coef2
    172        smull2          \tmp4\().2d, \inout1\().4s, \coef2
    173        rshrn           \inout1\().2s, \tmp1\().2d, #14
    174        rshrn2          \inout1\().4s, \tmp2\().2d, #14
    175        rshrn           \inout2\().2s, \tmp3\().2d, #14
    176        rshrn2          \inout2\().4s, \tmp4\().2d, #14
    177 .endm
    178 
    179 // Same as dmbutterfly above, but treating the input in inout1 as zero
// i.e. inout1 = (-inout2 * coef2 + (1 << 13)) >> 14,
//      inout2 = ( inout2 * coef1 + (1 << 13)) >> 14
    180 .macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
    181        smull           \tmp1\().2d, \inout2\().2s, \coef2
    182        smull2          \tmp2\().2d, \inout2\().4s, \coef2
    183        smull           \tmp3\().2d, \inout2\().2s, \coef1
    184        smull2          \tmp4\().2d, \inout2\().4s, \coef1
       // Negate the first product before (not after) the rounding shift.
    185        neg             \tmp1\().2d, \tmp1\().2d
    186        neg             \tmp2\().2d, \tmp2\().2d
    187        rshrn           \inout2\().2s, \tmp3\().2d, #14
    188        rshrn2          \inout2\().4s, \tmp4\().2d, #14
    189        rshrn           \inout1\().2s, \tmp1\().2d, #14
    190        rshrn2          \inout1\().4s, \tmp2\().2d, #14
    191 .endm
    192 
// Widening multiply: out1,out2 (.2d pair) = in (.4s) * coef (scalar lane).
    193 .macro dsmull_h out1, out2, in, coef
    194        smull           \out1\().2d, \in\().2s, \coef
    195        smull2          \out2\().2d, \in\().4s, \coef
    196 .endm
    197 
// Rounding narrow: out (.4s) = (in1,in2 (.2d pair) + rounding) >> shift.
    198 .macro drshrn_h out, in1, in2, shift
    199        rshrn           \out\().2s, \in1\().2d, \shift
    200        rshrn2          \out\().4s, \in2\().2d, \shift
    201 .endm
    202 
    203 
    204 // out1 = in1 + in2
    205 // out2 = in1 - in2
// out2 may alias in1 or in2 (the add reads both before the sub writes),
// but out1 must be distinct from both inputs.
    206 .macro butterfly_4s out1, out2, in1, in2
    207        add             \out1\().4s, \in1\().4s, \in2\().4s
    208        sub             \out2\().4s, \in1\().4s, \in2\().4s
    209 .endm
    210 
    211 // out1 = in1 - in2
    212 // out2 = in1 + in2
// As with butterfly_4s, out1 must be distinct from both inputs; out2
// may alias either input.
    213 .macro butterfly_4s_r out1, out2, in1, in2
    214        sub             \out1\().4s, \in1\().4s, \in2\().4s
    215        add             \out2\().4s, \in1\().4s, \in2\().4s
    216 .endm
    217 
    218 // out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
    219 // out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
    220 // out are 2 x .4s registers, in are 4 x .2d registers
// The outputs may alias the inputs: all sums land in the tmps before
// any out register is written.
    221 .macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
    222        add             \tmp1\().2d, \in1\().2d, \in3\().2d
    223        add             \tmp2\().2d, \in2\().2d, \in4\().2d
    224        sub             \tmp3\().2d, \in1\().2d, \in3\().2d
    225        sub             \tmp4\().2d, \in2\().2d, \in4\().2d
    226        rshrn           \out1\().2s, \tmp1\().2d,  #14
    227        rshrn2          \out1\().4s, \tmp2\().2d,  #14
    228        rshrn           \out2\().2s, \tmp3\().2d,  #14
    229        rshrn2          \out2\().4s, \tmp4\().2d,  #14
    230 .endm
    231 
// 4x4 inverse Walsh-Hadamard transform of c0-c3 (.4s registers), used
// for the lossless mode. Only additions/subtractions and a single shift,
// no multiplications. Clobbers v16 and v17.
    232 .macro iwht4_10 c0, c1, c2, c3
    233        add             \c0\().4s, \c0\().4s, \c1\().4s
    234        sub             v17.4s,    \c2\().4s, \c3\().4s
    235        sub             v16.4s,    \c0\().4s, v17.4s
    236        sshr            v16.4s,    v16.4s,    #1
    237        sub             \c2\().4s, v16.4s,    \c1\().4s
    238        sub             \c1\().4s, v16.4s,    \c3\().4s
    239        add             \c3\().4s, v17.4s,    \c2\().4s
    240        sub             \c0\().4s, \c0\().4s, \c1\().4s
    241 .endm
    242 
// The WHT uses no multiplications, so the 12 bpp version is identical
// to the 10 bpp one.
    243 .macro iwht4_12 c0, c1, c2, c3
    244        iwht4_10        \c0, \c1, \c2, \c3
    245 .endm
    246 
// 4-point IDCT of c0-c3 (.4s registers), 10 bpp variant: plain
// (non-widening) 32 bit mul/mla/mls suffice here; the 12 bpp variant
// below widens to 64 bit products instead. Coefficients come from v0
// (itxfm4_coeffs, sign-extended). Clobbers v16-v18, v20, v22, v24.
    247 .macro idct4_10 c0, c1, c2, c3
    248        mul             v22.4s,    \c1\().4s, v0.s[3]
    249        mul             v20.4s,    \c1\().4s, v0.s[2]
    250        add             v16.4s,    \c0\().4s, \c2\().4s
    251        sub             v17.4s,    \c0\().4s, \c2\().4s
    252        mla             v22.4s,    \c3\().4s, v0.s[2]
    253        mul             v18.4s,    v16.4s,    v0.s[0]
    254        mul             v24.4s,    v17.4s,    v0.s[0]
    255        mls             v20.4s,    \c3\().4s, v0.s[3]
    256        srshr           v22.4s,    v22.4s,    #14
    257        srshr           v18.4s,    v18.4s,    #14
    258        srshr           v24.4s,    v24.4s,    #14
    259        srshr           v20.4s,    v20.4s,    #14
       // Final butterflies combining the even (v18/v24) and odd (v22/v20)
       // halves into the four outputs.
    260        add             \c0\().4s, v18.4s,    v22.4s
    261        sub             \c3\().4s, v18.4s,    v22.4s
    262        add             \c1\().4s, v24.4s,    v20.4s
    263        sub             \c2\().4s, v24.4s,    v20.4s
    264 .endm
    265 
// 4-point IDCT of c0-c3 (.4s registers), 12 bpp variant: same dataflow
// as idct4_10, but with widening smull/smlal into 64 bit intermediates
// and rounding-narrowing back to 32 bit. Clobbers v16-v25.
    266 .macro idct4_12 c0, c1, c2, c3
    267        smull           v22.2d,    \c1\().2s, v0.s[3]
    268        smull2          v23.2d,    \c1\().4s, v0.s[3]
    269        smull           v20.2d,    \c1\().2s, v0.s[2]
    270        smull2          v21.2d,    \c1\().4s, v0.s[2]
    271        add             v16.4s,    \c0\().4s, \c2\().4s
    272        sub             v17.4s,    \c0\().4s, \c2\().4s
    273        smlal           v22.2d,    \c3\().2s, v0.s[2]
    274        smlal2          v23.2d,    \c3\().4s, v0.s[2]
    275        smull           v18.2d,    v16.2s,    v0.s[0]
    276        smull2          v19.2d,    v16.4s,    v0.s[0]
    277        smull           v24.2d,    v17.2s,    v0.s[0]
    278        smull2          v25.2d,    v17.4s,    v0.s[0]
    279        smlsl           v20.2d,    \c3\().2s, v0.s[3]
    280        smlsl2          v21.2d,    \c3\().4s, v0.s[3]
    281        rshrn           v22.2s,    v22.2d,    #14
    282        rshrn2          v22.4s,    v23.2d,    #14
    283        rshrn           v18.2s,    v18.2d,    #14
    284        rshrn2          v18.4s,    v19.2d,    #14
    285        rshrn           v24.2s,    v24.2d,    #14
    286        rshrn2          v24.4s,    v25.2d,    #14
    287        rshrn           v20.2s,    v20.2d,    #14
    288        rshrn2          v20.4s,    v21.2d,    #14
       // Final butterflies combining the even (v18/v24) and odd (v22/v20)
       // halves into the four outputs.
    289        add             \c0\().4s, v18.4s,    v22.4s
    290        sub             \c3\().4s, v18.4s,    v22.4s
    291        add             \c1\().4s, v24.4s,    v20.4s
    292        sub             \c2\().4s, v24.4s,    v20.4s
    293 .endm
    294 
// 4-point IADST of c0-c3 (.4s registers), 10 bpp variant (non-widening
// multiplies). Coefficients come from v1 (iadst4_coeffs, sign-extended).
// Clobbers v16, v18, v20, v22, v24, v26. Note: c0 is reused as an
// accumulator (c0 - c2 + c3) before the v1.s[3] product.
    295 .macro iadst4_10 c0, c1, c2, c3
    296        mul             v16.4s,    \c0\().4s, v1.s[0]
    297        mla             v16.4s,    \c2\().4s, v1.s[1]
    298        mla             v16.4s,    \c3\().4s, v1.s[2]
    299        mul             v18.4s,    \c0\().4s, v1.s[2]
    300        mls             v18.4s,    \c2\().4s, v1.s[0]
    301        sub             \c0\().4s, \c0\().4s, \c2\().4s
    302        mls             v18.4s,    \c3\().4s, v1.s[1]
    303        add             \c0\().4s, \c0\().4s, \c3\().4s
    304        mul             v22.4s,    \c1\().4s, v1.s[3]
    305        mul             v20.4s,    \c0\().4s, v1.s[3]
    306        add             v24.4s,    v16.4s,    v22.4s
    307        add             v26.4s,    v18.4s,    v22.4s
    308        srshr           \c0\().4s, v24.4s,    #14
    309        add             v16.4s,    v16.4s,    v18.4s
    310        srshr           \c1\().4s, v26.4s,    #14
    311        sub             v16.4s,    v16.4s,    v22.4s
    312        srshr           \c2\().4s, v20.4s,    #14
    313        srshr           \c3\().4s, v16.4s,    #14
    314 .endm
    315 
// 4-point IADST of c0-c3 (.4s registers), 12 bpp variant: same dataflow
// as iadst4_10, but with widening smull/smlal into 64 bit intermediates
// and rounding-narrowing back to 32 bit. Clobbers v16-v27.
    316 .macro iadst4_12 c0, c1, c2, c3
    317        smull           v16.2d,    \c0\().2s, v1.s[0]
    318        smull2          v17.2d,    \c0\().4s, v1.s[0]
    319        smlal           v16.2d,    \c2\().2s, v1.s[1]
    320        smlal2          v17.2d,    \c2\().4s, v1.s[1]
    321        smlal           v16.2d,    \c3\().2s, v1.s[2]
    322        smlal2          v17.2d,    \c3\().4s, v1.s[2]
    323        smull           v18.2d,    \c0\().2s, v1.s[2]
    324        smull2          v19.2d,    \c0\().4s, v1.s[2]
    325        smlsl           v18.2d,    \c2\().2s, v1.s[0]
    326        smlsl2          v19.2d,    \c2\().4s, v1.s[0]
       // c0 is reused as a 32 bit accumulator (c0 - c2 + c3).
    327        sub             \c0\().4s, \c0\().4s, \c2\().4s
    328        smlsl           v18.2d,    \c3\().2s, v1.s[1]
    329        smlsl2          v19.2d,    \c3\().4s, v1.s[1]
    330        add             \c0\().4s, \c0\().4s, \c3\().4s
    331        smull           v22.2d,    \c1\().2s, v1.s[3]
    332        smull2          v23.2d,    \c1\().4s, v1.s[3]
    333        smull           v20.2d,    \c0\().2s, v1.s[3]
    334        smull2          v21.2d,    \c0\().4s, v1.s[3]
    335        add             v24.2d,    v16.2d,    v22.2d
    336        add             v25.2d,    v17.2d,    v23.2d
    337        add             v26.2d,    v18.2d,    v22.2d
    338        add             v27.2d,    v19.2d,    v23.2d
    339        rshrn           \c0\().2s, v24.2d,    #14
    340        rshrn2          \c0\().4s, v25.2d,    #14
    341        add             v16.2d,    v16.2d,    v18.2d
    342        add             v17.2d,    v17.2d,    v19.2d
    343        rshrn           \c1\().2s, v26.2d,    #14
    344        rshrn2          \c1\().4s, v27.2d,    #14
    345        sub             v16.2d,    v16.2d,    v22.2d
    346        sub             v17.2d,    v17.2d,    v23.2d
    347        rshrn           \c2\().2s, v20.2d,    #14
    348        rshrn2          \c2\().4s, v21.2d,    #14
    349        rshrn           \c3\().2s, v16.2d,    #14
    350        rshrn2          \c3\().4s, v17.2d,    #14
    351 .endm
    352 
    353 // The public functions in this file have got the following signature:
    354 // void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
    355 
// Emit ff_vp9_<txfm1>_<txfm2>_4x4_add_<bpp>_neon:
//   x0 = dst, x1 = stride (bytes), x2 = coefficient block (int32 here,
//   zeroed on the way through), w3 = eob.
// Applies txfm1 to the rows and txfm2 to the columns, then rounds and
// adds into the (16 bit pixel) destination with clamping to bpp.
    356 .macro itxfm_func4x4 txfm1, txfm2, bpp
    357 function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
       // Load only the coefficient sets that this txfm1/txfm2 pair needs.
    358 .ifc \txfm1,\txfm2
    359 .ifc \txfm1,idct
    360        movrel          x4,  itxfm4_coeffs
    361        ld1             {v0.4h}, [x4]
    362        sxtl            v0.4s,  v0.4h
    363 .endif
    364 .ifc \txfm1,iadst
    365        movrel          x4,  iadst4_coeffs
    366        ld1             {v0.d}[1], [x4]
    367        sxtl2           v1.4s,  v0.8h
    368 .endif
    369 .else
       // Mixed pair: load both halves of itxfm4_coeffs (idct into v0,
       // iadst into v1).
    370        movrel          x4,  itxfm4_coeffs
    371        ld1             {v0.8h}, [x4]
    372        sxtl2           v1.4s,  v0.8h
    373        sxtl            v0.4s,  v0.4h
    374 .endif
    375 
       // v30/v31 = zero, for clearing the consumed coefficient buffer.
    376        movi            v30.4s, #0
    377        movi            v31.4s, #0
    378 .ifc \txfm1\()_\txfm2,idct_idct
       // eob == 1 means only the DC coefficient is set; take a shortcut.
    379        cmp             w3,  #1
    380        b.ne            1f
    381        // DC-only for idct/idct
    382        ld1             {v2.s}[0],  [x2]
       // dc = (dc * v0.s[0] + (1 << 13)) >> 14, applied once per pass.
    383        smull           v2.2d,  v2.2s, v0.s[0]
    384        rshrn           v2.2s,  v2.2d, #14
    385        smull           v2.2d,  v2.2s, v0.s[0]
    386        rshrn           v2.2s,  v2.2d, #14
       // Clear the consumed DC coefficient and splat dc into all 4 rows.
    387        st1             {v31.s}[0], [x2]
    388        dup             v4.4s,  v2.s[0]
    389        mov             v5.16b, v4.16b
    390        mov             v6.16b, v4.16b
    391        mov             v7.16b, v4.16b
    392        b               2f
    393 .endif
    394 
    395 1:
       // Load all 16 coefficients (one row per register), then zero the
       // first half of the buffer.
    396        ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x2]
    397        st1             {v30.4s,v31.4s}, [x2], #32
    398 
    399 .ifc \txfm1,iwht
       // The iwht input coefficients are stored scaled up by 4; shift that
       // scaling out before transforming.
    400        sshr            v4.4s,  v4.4s,  #2
    401        sshr            v5.4s,  v5.4s,  #2
    402        sshr            v6.4s,  v6.4s,  #2
    403        sshr            v7.4s,  v7.4s,  #2
    404 .endif
    405 
       // First pass (rows).
    406        \txfm1\()4_\bpp v4,  v5,  v6,  v7
    407 
       // Zero the second half of the coefficient buffer.
    408        st1             {v30.4s,v31.4s}, [x2], #32
    409        // Transpose 4x4 with 32 bit elements
    410        transpose_4x4s  v4,  v5,  v6,  v7,  v16, v17, v18, v19
    411 
       // Second pass (columns).
    412        \txfm2\()4_\bpp v4,  v5,  v6,  v7
    413 2:
       // v31 = max pixel value for this bpp ((1 << bpp) - 1), e.g. the
       // mvni builds 0x03ff for bpp=10 and 0x0fff for bpp=12.
    414        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
    415        ld1             {v0.4h},   [x0], x1
    416        ld1             {v1.4h},   [x0], x1
    417 .ifnc \txfm1,iwht
       // Final rounding of the transform output; the iwht path skips it.
    418        srshr           v4.4s,  v4.4s,  #4
    419        srshr           v5.4s,  v5.4s,  #4
    420        srshr           v6.4s,  v6.4s,  #4
    421        srshr           v7.4s,  v7.4s,  #4
    422 .endif
       // Add the residual to the destination rows, saturate to unsigned
       // 16 bit, then clamp to the bpp maximum in v31.
    423        uaddw           v4.4s,  v4.4s,  v0.4h
    424        uaddw           v5.4s,  v5.4s,  v1.4h
    425        ld1             {v2.4h},   [x0], x1
    426        ld1             {v3.4h},   [x0], x1
    427        sqxtun          v0.4h,  v4.4s
    428        sqxtun2         v0.8h,  v5.4s
       // Rewind x0 back to the first destination row for the stores.
    429        sub             x0,  x0,  x1, lsl #2
    430 
    431        uaddw           v6.4s,  v6.4s,  v2.4h
    432        umin            v0.8h,  v0.8h,  v31.8h
    433        uaddw           v7.4s,  v7.4s,  v3.4h
    434        st1             {v0.4h},   [x0], x1
    435        sqxtun          v2.4h,  v6.4s
    436        sqxtun2         v2.8h,  v7.4s
    437        umin            v2.8h,  v2.8h,  v31.8h
    438 
    439        st1             {v0.d}[1], [x0], x1
    440        st1             {v2.4h},   [x0], x1
    441        st1             {v2.d}[1], [x0], x1
    442 
    443        ret
    444 endfunc
    445 .endm
    446 
// Instantiate every supported 4x4 transform pairing for one bit depth.
// (There is no idct/iwht mixing; iwht only pairs with itself.)
    447 .macro itxfm_funcs4x4 bpp
    448 itxfm_func4x4 idct,  idct,  \bpp
    449 itxfm_func4x4 iadst, idct,  \bpp
    450 itxfm_func4x4 idct,  iadst, \bpp
    451 itxfm_func4x4 iadst, iadst, \bpp
    452 itxfm_func4x4 iwht,  iwht,  \bpp
    453 .endm
    454 
    455 itxfm_funcs4x4 10
    456 itxfm_funcs4x4 12
    457 
// DC-only shortcut for the 8x8 idct/idct case (eob == 1), shared by both
// bit depths. x0 = dst, x1 = stride, x2 = block (DC coefficient is
// cleared), w5 = clamp limit (max pixel value for the bit depth).
    458 function idct8x8_dc_add_neon
    459        movrel          x4,  idct_coeffs
    460        ld1             {v0.4h}, [x4]
    461 
    462        movi            v1.4h,  #0
    463        sxtl            v0.4s,  v0.4h
    464 
       // dc = (dc * v0.s[0] + (1 << 13)) >> 14, applied once per pass.
    465        ld1             {v2.s}[0],  [x2]
    466        smull           v2.2d,  v2.2s,  v0.s[0]
    467        rshrn           v2.2s,  v2.2d,  #14
    468        smull           v2.2d,  v2.2s,  v0.s[0]
    469        rshrn           v2.2s,  v2.2d,  #14
       // Clear the consumed DC coefficient in the block.
    470        st1             {v1.s}[0],  [x2]
    471        dup             v2.4s,  v2.s[0]
    472 
       // Final rounding of the 8x8 transform output.
    473        srshr           v2.4s,  v2.4s,  #5
    474 
    475        mov             x4,  #8
    476        mov             x3,  x0
    477        dup             v31.8h, w5
    478 1:
    479        // Loop to add the constant from v2 into all 8x8 outputs
    480        subs            x4,  x4,  #2
    481        ld1             {v3.8h},  [x0], x1
    482        ld1             {v4.8h},  [x0], x1
    483        uaddw           v16.4s, v2.4s,  v3.4h
    484        uaddw2          v17.4s, v2.4s,  v3.8h
    485        uaddw           v18.4s, v2.4s,  v4.4h
    486        uaddw2          v19.4s, v2.4s,  v4.8h
       // Saturate to unsigned 16 bit, then clamp to the pixel max in v31.
    487        sqxtun          v3.4h,  v16.4s
    488        sqxtun2         v3.8h,  v17.4s
    489        sqxtun          v4.4h,  v18.4s
    490        sqxtun2         v4.8h,  v19.4s
    491        umin            v3.8h,  v3.8h,  v31.8h
    492        umin            v4.8h,  v4.8h,  v31.8h
    493        st1             {v3.8h},  [x3], x1
    494        st1             {v4.8h},  [x3], x1
    495        b.ne            1b
    496 
    497        ret
    498 endfunc
    499 
// In-place 8-point IDCT of r0-r7 (.4s registers, four columns at a
// time); t0-t5 are scratch. Expects idct_coeffs sign-extended into
// v0/v1 (see itxfm_func8x8).
    500 .macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
    501        dmbutterfly0    \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a
    502        dmbutterfly     \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3   // r2 = t2a, r6 = t3a
    503        dmbutterfly     \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3   // r1 = t4a, r7 = t7a
    504        dmbutterfly     \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3   // r5 = t5a, r3 = t6a
    505 
    506        butterfly_4s    \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3
    507        butterfly_4s    \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a
    508        butterfly_4s    \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a
    509        butterfly_4s    \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2
    510 
    511        dmbutterfly0    \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5
    512 
       // Final butterflies producing the outputs in natural order.
    513        butterfly_4s    \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6]
    514        butterfly_4s    \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7]
    515        butterfly_4s    \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5]
    516        butterfly_4s    \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4]
    517 .endm
    518 
// In-place 8-point IADST of r0-r7 (.4s registers, four columns at a
// time); t0-t5 are scratch. Uses iadst8_coeffs in v2/v3 plus the shared
// idct coefficients in v0 (see itxfm_func8x8). Several stages produce
// negated outputs that are flipped with an explicit neg afterwards.
    519 .macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
    520        dmbutterfly_l   \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0]   // t2,t3 = t1a, t0,t1 = t0a
    521        dmbutterfly_l   \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0]   // r0,r7 = t5a, t4,t5 = t4a
    522 
    523        dbutterfly_n    \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4
    524        dbutterfly_n    \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5
    525 
    526        dmbutterfly_l   \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2]   // t4,t5 = t3a, t2,t3 = t2a
    527        dmbutterfly_l   \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2]   // r2,r5 = t7a, r0,r7 = t6a
    528 
    529        dbutterfly_n    \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6
    530        dbutterfly_n    \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7
    531 
    532        butterfly_4s    \r7, \r4, \r4, \r0   // r7 = -out[7], r4 = t3
    533        neg             \r7\().4s, \r7\().4s // r7 = out[7]
    534        butterfly_4s    \r0, \r1, \r3, \r1   // r0 = out[0],  r1 = t2
    535 
    536        dmbutterfly_l   \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3]   // r2,r3 = t5a, t3,t5 = t4a
    537        dmbutterfly_l   \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2]   // t0,t1 = t6a, r5,r6 = t7a
    538 
    539        dbutterfly_n    \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6],  t2 = t7
    540 
    541        dmbutterfly0    \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2           // r3 = -out[3], r4 = out[4]
    542        neg             \r3\().4s, \r3\().4s  // r3 = out[3]
    543 
    544        dbutterfly_n    \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6
    545        neg             \r1\().4s, \r1\().4s  // r1 = out[1]
    546 
    547        dmbutterfly0    \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5           // r2 = out[2],  r5 = -out[5]
    548        neg             \r5\().4s, \r5\().4s  // r5 = out[5]
    549 .endm
    550 
    551 
    552 .macro itxfm_func8x8 txfm1, txfm2
    553 function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
    554 .ifc \txfm1\()_\txfm2,idct_idct
    555        cmp             w3,  #1
    556        b.eq            idct8x8_dc_add_neon
    557 .endif
    558        // The iadst also uses a few coefficients from
    559        // idct, so those always need to be loaded.
    560 .ifc \txfm1\()_\txfm2,idct_idct
    561        movrel          x4,  idct_coeffs
    562 .else
    563        movrel          x4,  iadst8_coeffs
    564        ld1             {v1.8h}, [x4], #16
    565        stp             d8,  d9,  [sp, #-0x10]!
    566        sxtl2           v3.4s,  v1.8h
    567        sxtl            v2.4s,  v1.4h
    568 .endif
    569        ld1             {v0.8h}, [x4]
    570        sxtl2           v1.4s,  v0.8h
    571        sxtl            v0.4s,  v0.4h
    572 
    573        movi            v4.4s, #0
    574        movi            v5.4s, #0
    575        movi            v6.4s, #0
    576        movi            v7.4s, #0
    577 
    578 1:
    579        ld1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x2], #64
    580        ld1             {v20.4s,v21.4s,v22.4s,v23.4s},  [x2], #64
    581        ld1             {v24.4s,v25.4s,v26.4s,v27.4s},  [x2], #64
    582        ld1             {v28.4s,v29.4s,v30.4s,v31.4s},  [x2], #64
    583        sub             x2,  x2,  #256
    584        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
    585        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
    586        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
    587        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
    588 
    589 .ifc \txfm1\()_\txfm2,idct_idct
    590        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
    591        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
    592 .else
    593        \txfm1\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
    594        \txfm1\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
    595 .endif
    596 
    597        // Transpose 8x8 with 16 bit elements
    598        transpose_8x8s  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7
    599 
    600 .ifc \txfm1\()_\txfm2,idct_idct
    601        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
    602        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
    603 .else
    604        \txfm2\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
    605        \txfm2\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
    606 .endif
    607 2:
    608        mov             x3,  x0
    609        // Add into the destination
    610        ld1             {v0.8h},  [x0], x1
    611        srshr           v16.4s, v16.4s, #5
    612        srshr           v17.4s, v17.4s, #5
    613        ld1             {v1.8h},  [x0], x1
    614        srshr           v18.4s, v18.4s, #5
    615        srshr           v19.4s, v19.4s, #5
    616        ld1             {v2.8h},  [x0], x1
    617        srshr           v20.4s, v20.4s, #5
    618        srshr           v21.4s, v21.4s, #5
    619        uaddw           v16.4s, v16.4s, v0.4h
    620        uaddw2          v17.4s, v17.4s, v0.8h
    621        ld1             {v3.8h},  [x0], x1
    622        srshr           v22.4s, v22.4s, #5
    623        srshr           v23.4s, v23.4s, #5
    624        uaddw           v18.4s, v18.4s, v1.4h
    625        uaddw2          v19.4s, v19.4s, v1.8h
    626        ld1             {v4.8h},  [x0], x1
    627        srshr           v24.4s, v24.4s, #5
    628        srshr           v25.4s, v25.4s, #5
    629        uaddw           v20.4s, v20.4s, v2.4h
    630        uaddw2          v21.4s, v21.4s, v2.8h
    631        sqxtun          v0.4h,  v16.4s
    632        sqxtun2         v0.8h,  v17.4s
    633        dup             v16.8h, w5
    634        ld1             {v5.8h},  [x0], x1
    635        srshr           v26.4s, v26.4s, #5
    636        srshr           v27.4s, v27.4s, #5
    637        uaddw           v22.4s, v22.4s, v3.4h
    638        uaddw2          v23.4s, v23.4s, v3.8h
    639        sqxtun          v1.4h,  v18.4s
    640        sqxtun2         v1.8h,  v19.4s
    641        umin            v0.8h,  v0.8h,  v16.8h
    642        ld1             {v6.8h},  [x0], x1
    643        srshr           v28.4s, v28.4s, #5
    644        srshr           v29.4s, v29.4s, #5
    645        uaddw           v24.4s, v24.4s, v4.4h
    646        uaddw2          v25.4s, v25.4s, v4.8h
    647        sqxtun          v2.4h,  v20.4s
    648        sqxtun2         v2.8h,  v21.4s
    649        umin            v1.8h,  v1.8h,  v16.8h
    650        ld1             {v7.8h},  [x0], x1
    651        srshr           v30.4s, v30.4s, #5
    652        srshr           v31.4s, v31.4s, #5
    653        uaddw           v26.4s, v26.4s, v5.4h
    654        uaddw2          v27.4s, v27.4s, v5.8h
    655        sqxtun          v3.4h,  v22.4s
    656        sqxtun2         v3.8h,  v23.4s
    657        umin            v2.8h,  v2.8h,  v16.8h
    658 
    659        st1             {v0.8h},  [x3], x1
    660        uaddw           v28.4s, v28.4s, v6.4h
    661        uaddw2          v29.4s, v29.4s, v6.8h
    662        st1             {v1.8h},  [x3], x1
    663        sqxtun          v4.4h,  v24.4s
    664        sqxtun2         v4.8h,  v25.4s
    665        umin            v3.8h,  v3.8h,  v16.8h
    666        st1             {v2.8h},  [x3], x1
    667        uaddw           v30.4s, v30.4s, v7.4h
    668        uaddw2          v31.4s, v31.4s, v7.8h
    669        st1             {v3.8h},  [x3], x1
    670        sqxtun          v5.4h,  v26.4s
    671        sqxtun2         v5.8h,  v27.4s
    672        umin            v4.8h,  v4.8h,  v16.8h
    673        st1             {v4.8h},  [x3], x1
    674        sqxtun          v6.4h,  v28.4s
    675        sqxtun2         v6.8h,  v29.4s
    676        umin            v5.8h,  v5.8h,  v16.8h
    677        st1             {v5.8h},  [x3], x1
    678        sqxtun          v7.4h,  v30.4s
    679        sqxtun2         v7.8h,  v31.4s
    680        umin            v6.8h,  v6.8h,  v16.8h
    681 
    682        st1             {v6.8h},  [x3], x1
    683        umin            v7.8h,  v7.8h,  v16.8h
    684        st1             {v7.8h},  [x3], x1
    685 
    686 .ifnc \txfm1\()_\txfm2,idct_idct
    687        ldp             d8,  d9,  [sp], 0x10
    688 .endif
    689        ret
    690 endfunc
    691 
    692 function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
    693        mov             x5,  #0x03ff
    694        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
    695 endfunc
    696 
    697 function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
    698        mov             x5,  #0x0fff
    699        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
    700 endfunc
    701 .endm
    702 
    703 itxfm_func8x8 idct,  idct        // instantiate all four first/second
    704 itxfm_func8x8 iadst, idct        // transform combinations of the
    705 itxfm_func8x8 idct,  iadst       // 8x8 add functions (txfm1 = first
    706 itxfm_func8x8 iadst, iadst       // pass, txfm2 = second pass)
    707 
    708 
    709 function idct16x16_dc_add_neon
        // Special case for a DC-only 16x16 idct: compute the single output
        // value and add it to every pixel of the 16x16 destination block.
        // x0 = dst, x1 = dst stride (bytes), x2 = coefficients (only
        // coef[0] is read, and it is cleared to zero afterwards),
        // w13 = pixel max value (0x3ff for 10 bpp, 0xfff for 12 bpp).
    710        movrel          x4,  idct_coeffs
    711        ld1             {v0.4h}, [x4]
    712        sxtl            v0.4s,  v0.4h
    713 
    714        movi            v1.4h,  #0
    715 
    716        ld1             {v2.s}[0],  [x2]
    717        smull           v2.2d,  v2.2s,  v0.s[0]          // dc * 11585
    718        rshrn           v2.2s,  v2.2d,  #14              // round once
    719        smull           v2.2d,  v2.2s,  v0.s[0]          // * 11585 again (row + column pass)
    720        rshrn           v2.2s,  v2.2d,  #14
    721        st1             {v1.s}[0],  [x2]                 // clear the input coefficient
    722        dup             v2.4s,  v2.s[0]
    723 
    724        srshr           v0.4s,  v2.4s,  #6               // final rounding shift: (dc + 32) >> 6
    725 
    726        mov             x3, x0
    727        mov             x4, #16
    728        dup             v31.8h, w13                      // v31 = pixel max, for clamping
    729 1:
    730        // Loop to add the constant from v2 into all 16x16 outputs
    731        subs            x4,  x4,  #2
    732        ld1             {v1.8h,v2.8h},  [x0], x1
    733        uaddw           v16.4s, v0.4s,  v1.4h
    734        uaddw2          v17.4s, v0.4s,  v1.8h
    735        ld1             {v3.8h,v4.8h},  [x0], x1
    736        uaddw           v18.4s, v0.4s,  v2.4h
    737        uaddw2          v19.4s, v0.4s,  v2.8h
    738        uaddw           v20.4s, v0.4s,  v3.4h
    739        uaddw2          v21.4s, v0.4s,  v3.8h
    740        uaddw           v22.4s, v0.4s,  v4.4h
    741        uaddw2          v23.4s, v0.4s,  v4.8h
    742        sqxtun          v1.4h,  v16.4s                   // narrow back, clamping at 0 from below
    743        sqxtun2         v1.8h,  v17.4s
    744        sqxtun          v2.4h,  v18.4s
    745        sqxtun2         v2.8h,  v19.4s
    746        sqxtun          v3.4h,  v20.4s
    747        sqxtun2         v3.8h,  v21.4s
    748        sqxtun          v4.4h,  v22.4s
    749        sqxtun2         v4.8h,  v23.4s
    750        umin            v1.8h,  v1.8h,  v31.8h           // clamp to the bitdepth's max pixel value
    751        umin            v2.8h,  v2.8h,  v31.8h
    752        st1             {v1.8h,v2.8h},  [x3], x1
    753        umin            v3.8h,  v3.8h,  v31.8h
    754        umin            v4.8h,  v4.8h,  v31.8h
    755        st1             {v3.8h,v4.8h},  [x3], x1
    756        b.ne            1b
    757 
    758        ret
    759 endfunc
    760 
    761 .macro idct16_end
        // Shared tail of the idct16 variants: the final butterfly stages
        // that turn the intermediate t-values into out[0]..out[15] in
        // v16-v31. Expands to a ret, so it must be the last thing in
        // each function that uses it.
    762        butterfly_4s    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
    763        butterfly_4s    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
    764        butterfly_4s    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
    765        butterfly_4s    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
    766        butterfly_4s    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
    767        butterfly_4s    v24, v21, v23, v21               // v24 = t9,   v21 = t10
    768        butterfly_4s    v23, v27, v25, v27               // v23 = t14,  v27 = t13
    769        butterfly_4s    v25, v29, v29, v17               // v25 = t15a, v29 = t12a
    770 
    771        dmbutterfly0    v8,  v9,  v27, v21, v8,  v9,  v16, v17, v30, v31 // v8  = t13a, v9  = t10a
    772        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11
    773 
    774        butterfly_4s    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
    775        butterfly_4s    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
    776        butterfly_4s_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
    777        butterfly_4s    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
    778        butterfly_4s    v18, v29, v4,  v8                // v18 = out[2], v29 = out[13]
    779        butterfly_4s    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
    780        butterfly_4s    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
    781        butterfly_4s    v21, v26, v26, v9                // v21 = out[5], v26 = out[10]
    782        ret
    783 .endm
    784 
    785 function idct16
        // Full 16-point 1-D idct on a 4x16 column slice.
        // In/out: v16-v31 (.4s, one register per row). Coefficients must
        // be preloaded by the caller into v0-v3 (.4s, widened from
        // idct_coeffs). Clobbers v4-v9.
    786        dmbutterfly0    v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
    787        dmbutterfly     v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
    788        dmbutterfly     v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
    789        dmbutterfly     v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
    790        dmbutterfly     v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
    791        dmbutterfly     v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
    792        dmbutterfly     v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
    793        dmbutterfly     v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
    794 
    795        butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
    796        butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
    797        butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
    798        butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
    799        butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
    800        butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
    801        butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
    802        butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
    803 
    804        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
    805        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
    806        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
    807        idct16_end
    808 endfunc
    809 
    810 function idct16_half
        // idct16 variant for small eob cases: the pass1_half loader only
        // fills the first 8 input rows (v16-v23), so the first stage uses
        // the dmbutterfly_h1/_h2 helpers instead of full butterflies.
        // In/out: v16-v31 (.4s), coefficients in v0-v3. Clobbers v4-v9.
    811        dmbutterfly0_h  v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
    812        dmbutterfly_h1  v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
    813        dmbutterfly_h1  v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
    814        dmbutterfly_h2  v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
    815        dmbutterfly_h1  v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
    816        dmbutterfly_h2  v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
    817        dmbutterfly_h1  v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
    818        dmbutterfly_h2  v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
    819 
    820        butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
    821        butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
    822        butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
    823        butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
    824        butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
    825        butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
    826        butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
    827        butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
    828 
    829        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
    830        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
    831        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
    832        idct16_end
    833 endfunc
    834 
    835 function idct16_quarter
        // idct16 variant for very small eob cases: only the first 4 input
        // rows (v16-v19) are read; with the remaining inputs known to be
        // zero, the first butterfly stages collapse into plain widening
        // multiplications (dsmull_h) with rounding narrows.
        // In/out: v16-v31 (.4s), coefficients in v0-v3. Clobbers v4-v9.
    836        dsmull_h        v24, v25, v19, v3.s[3]
    837        dsmull_h        v4,  v5,  v17, v2.s[0]
    838        dsmull_h        v7,  v6,  v18, v1.s[1]
    839        dsmull_h        v30, v31, v18, v1.s[0]
    840        neg             v24.2d,  v24.2d
    841        neg             v25.2d,  v25.2d
    842        dsmull_h        v29, v28, v17, v2.s[1]
    843        dsmull_h        v26, v27, v19, v3.s[2]
    844        dsmull_h        v22, v23, v16, v0.s[0]
    845        drshrn_h        v24, v24, v25, #14
    846        drshrn_h        v16, v4,  v5,  #14
    847        drshrn_h        v7,  v7,  v6,  #14
    848        drshrn_h        v6,  v30, v31, #14
    849        drshrn_h        v29, v29, v28, #14
    850        drshrn_h        v17, v26, v27, #14
    851        drshrn_h        v28, v22, v23, #14
    852 
    853        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3]
    854        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3]
    855        neg             v22.2d,  v22.2d
    856        neg             v23.2d,  v23.2d
    857        drshrn_h        v27, v20, v21, #14
    858        drshrn_h        v21, v22, v23, #14
    859        drshrn_h        v23, v18, v19, #14
    860        drshrn_h        v25, v30, v31, #14
    861        mov             v4.16b,  v28.16b
    862        mov             v5.16b,  v28.16b
    863        dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
    864        mov             v20.16b, v28.16b
    865        idct16_end
    866 endfunc
    867 
    868 function iadst16
        // 16-point 1-D iadst on a 4x16 column slice.
        // In/out: v16-v31 (.4s). Expects x11 -> iadst16_coeffs and
        // x10 -> idct_coeffs; both coefficient sets are (re)loaded here,
        // overwriting v0-v3. Clobbers v2-v15 — this is why the iadst
        // entry points save d8-d15 while idct-only paths save d8-d9.
    869        ld1             {v0.8h,v1.8h}, [x11]
    870        sxtl            v2.4s,  v1.4h
    871        sxtl2           v3.4s,  v1.8h
    872        sxtl2           v1.4s,  v0.8h
    873        sxtl            v0.4s,  v0.4h
    874 
    875        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.s[1], v0.s[0]   // v6,v7   = t1,   v4,v5   = t0
    876        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v1.s[1], v1.s[0]   // v10,v11 = t9,   v8,v9   = t8
    877        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
    878        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2]   // v14,v15 = t3,   v12,v13 = t2
    879        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a
    880 
    881        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v1.s[3], v1.s[2]   // v6,v7   = t11,  v4,v5   = t10
    882        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
    883        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v2.s[1], v2.s[0]   // v10,v11 = t5,   v8,v9   = t4
    884        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a
    885 
    886        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0]   // v14,v15 = t13,  v12,v13 = t12
    887        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
    888        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v2.s[3], v2.s[2]   // v6,v7   = t7,   v4,v5   = t6
    889        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a
    890 
    891        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v3.s[3], v3.s[2]   // v10,v11 = t15,  v8,v9   = t14
    892        ld1             {v0.8h}, [x10]                   // switch to idct_coeffs for the later stages
    893        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
    894        sxtl2           v1.4s,  v0.8h
    895        sxtl            v0.4s,  v0.4h
    896        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1]   // v14,v15 = t9,   v12,v13 = t8
    897        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a
    898 
    899        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v1.s[1], v1.s[0]   // v4,v5   = t12,  v6,v7   = t13
    900        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
    901        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v1.s[2], v1.s[3]   // v10,v11 = t11,  v8,v9   = t10
    902        butterfly_4s_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
    903        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a
    904 
    905        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2]   // v12,v13 = t14,  v14,v15 = t15
    906        butterfly_4s_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
    907        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
    908        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a
    909 
    910        butterfly_4s_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
    911        butterfly_4s_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3
    912 
    913        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.s[2], v0.s[3]   // v10,v11 = t13,  v8,v9   = t12
    914        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2]   // v12,v13 = t14,  v14,v15 = t15
    915 
    916        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
    917        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
    918        neg             v29.4s, v29.4s                   // v29 = out[13]
    919 
    920        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.s[2], v0.s[3]   // v10,v11 = t5a,  v8,v9   = t4a
    921        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.s[3], v0.s[2]   // v12,v13 = t6a,  v14,v15 = t7a
    922 
    923        butterfly_4s    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
    924        butterfly_4s    v3,  v7,  v23, v21               // v3 =-out[1], v7 = t10
    925 
    926        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
    927        neg             v19.4s, v19.4s                   // v19 = out[3]
    928        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7
    929 
    930        butterfly_4s    v5,  v8,  v20, v22               // v5 =-out[15],v8 = t3a
    931        butterfly_4s    v4,  v9,  v24, v26               // v4 = out[14],v9 = t11
    932 
    933        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
    934        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
    935        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
    936        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]
    937 
    938        neg             v31.4s,  v5.4s                    // v31 = out[15]
    939        neg             v17.4s,  v3.4s                    // v17 = out[1]
    940 
    941        mov             v16.16b, v2.16b                   // move out[0]/out[14] into the
    942        mov             v30.16b, v4.16b                   // v16-v31 output register set
    943        ret
    944 endfunc
    945 
    946 // Helper macros; we can't use these expressions directly within
    947 // e.g. .irp due to the extra concatenation \(). Therefore wrap
    948 // them in macros to allow using .irp below.
    949 .macro load reg, addr, step
        // Read one .4s row into v\reg from \addr, post-incrementing the
        // pointer by \step.
    950        ld1             {v\reg\().4s},  [\addr], \step
    951 .endm
    952 .macro store reg, addr, step
        // Write the .4s row in v\reg to \addr, post-incrementing the
        // pointer by \step.
    953        st1             {v\reg\().4s},  [\addr], \step
    954 .endm
    955 .macro movi_v reg, arrangement, val
        // movi with a macro-expanded register number: v\reg\arrangement = \val.
    956        movi            v\reg\()\arrangement,  \val
    957 .endm
    958 .macro load_clear reg, addr, step
        // Load v\reg from \addr, then zero the memory just read by storing
        // v4 over it (the caller must have zeroed v4 beforehand), and
        // finally advance \addr by \step.
    959        ld1             {v\reg\().4s}, [\addr]
    960        st1             {v4.4s},  [\addr], \step
    961 .endm
    962 
    963 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7
        // Round eight .4s coefficient registers (srshr #6), add them into
        // four destination rows loaded alternately via x0 and x3 (each
        // advancing by x1), clamp to the pixel max in v8.8h and store the
        // results back. Loads, arithmetic and stores are deliberately
        // interleaved for scheduling. Clobbers v4-v7.
    964        srshr           \coef0, \coef0, #6
    965        ld1             {v4.4h},   [x0], x1
    966        srshr           \coef1, \coef1, #6
    967        ld1             {v4.d}[1], [x3], x1
    968        srshr           \coef2, \coef2, #6
    969        ld1             {v5.4h},   [x0], x1
    970        srshr           \coef3, \coef3, #6
    971        uaddw           \coef0, \coef0, v4.4h
    972        ld1             {v5.d}[1], [x3], x1
    973        srshr           \coef4, \coef4, #6
    974        uaddw2          \coef1, \coef1, v4.8h
    975        ld1             {v6.4h},   [x0], x1
    976        srshr           \coef5, \coef5, #6
    977        uaddw           \coef2, \coef2, v5.4h
    978        ld1             {v6.d}[1], [x3], x1
    979        sqxtun          v4.4h,  \coef0
    980        srshr           \coef6, \coef6, #6
    981        uaddw2          \coef3, \coef3, v5.8h
    982        ld1             {v7.4h},   [x0], x1
    983        sqxtun2         v4.8h,  \coef1
    984        srshr           \coef7, \coef7, #6
    985        uaddw           \coef4, \coef4, v6.4h
    986        ld1             {v7.d}[1], [x3], x1
    987        umin            v4.8h,  v4.8h,  v8.8h
    988        sub             x0,  x0,  x1, lsl #2             // rewind x0/x3 to the rows just loaded
    989        sub             x3,  x3,  x1, lsl #2
    990        sqxtun          v5.4h,  \coef2
    991        uaddw2          \coef5, \coef5, v6.8h
    992        st1             {v4.4h},   [x0], x1
    993        sqxtun2         v5.8h,  \coef3
    994        uaddw           \coef6, \coef6, v7.4h
    995        st1             {v4.d}[1], [x3], x1
    996        umin            v5.8h,  v5.8h,  v8.8h
    997        sqxtun          v6.4h,  \coef4
    998        uaddw2          \coef7, \coef7, v7.8h
    999        st1             {v5.4h},   [x0], x1
   1000        sqxtun2         v6.8h,  \coef5
   1001        st1             {v5.d}[1], [x3], x1
   1002        umin            v6.8h,  v6.8h,  v8.8h
   1003        sqxtun          v7.4h,  \coef6
   1004        st1             {v6.4h},   [x0], x1
   1005        sqxtun2         v7.8h,  \coef7
   1006        st1             {v6.d}[1], [x3], x1
   1007        umin            v7.8h,  v7.8h,  v8.8h
   1008        st1             {v7.4h},   [x0], x1
   1009        st1             {v7.d}[1], [x3], x1
   1010 .endm
   1011 
   1012 // Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
   1013 // transpose into a horizontal 16x4 slice and store.
   1014 // x0 = dst (temp buffer)
   1015 // x1 = slice offset
   1016 // x2 = src
   1017 // x9 = input stride
   1018 .macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_4x16_pass1_neon
   1019 function \txfm\()16_1d_4x16_pass1_neon
   1020        mov             x14, x30
   1021 
   1022        movi            v4.4s, #0                        // zero register for load_clear
   1023 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   1024        load_clear      \i,  x2,  x9
   1025 .endr
   1026 
   1027        bl              \txfm\()16
   1028 
   1029        // Do four 4x4 transposes. Originally, v16-v31 contain the
   1030        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
   1031        // contain the four transposed 4x4 blocks.
   1032        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
   1033        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
   1034        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
   1035        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
   1036 
   1037        // Store the transposed 4x4 blocks horizontally.
   1038        cmp             x1,  #12
   1039        b.eq            1f
   1040 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
   1041        store           \i,  x0,  #16
   1042 .endr
   1043        ret             x14
   1044 1:
   1045        // Special case: For the last input column (x1 == 12),
   1046        // which would be stored as the last row in the temp buffer,
   1047        // don't store the first 4x4 block, but keep it in registers
   1048        // for the first slice of the second pass (where it is the
   1049        // last 4x4 block).
   1050        add             x0,  x0,  #16
   1051        st1             {v20.4s},  [x0], #16
   1052        st1             {v24.4s},  [x0], #16
   1053        st1             {v28.4s},  [x0], #16
   1054        add             x0,  x0,  #16
   1055        st1             {v21.4s},  [x0], #16
   1056        st1             {v25.4s},  [x0], #16
   1057        st1             {v29.4s},  [x0], #16
   1058        add             x0,  x0,  #16
   1059        st1             {v22.4s},  [x0], #16
   1060        st1             {v26.4s},  [x0], #16
   1061        st1             {v30.4s},  [x0], #16
   1062        add             x0,  x0,  #16
   1063        st1             {v23.4s},  [x0], #16
   1064        st1             {v27.4s},  [x0], #16
   1065        st1             {v31.4s},  [x0], #16
   1066 
   1067        mov             v28.16b, v16.16b
   1068        mov             v29.16b, v17.16b
   1069        mov             v30.16b, v18.16b
   1070        mov             v31.16b, v19.16b
   1071        ret             x14
   1072 endfunc
   1073 
   1074 // Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
   1075 // load the destination pixels (from a similar 4x16 slice), add and store back.
   1076 // x0 = dst
   1077 // x1 = dst stride
   1078 // x2 = src (temp buffer)
   1079 // x3 = slice offset
   1080 // x9 = temp buffer stride
   1081 function \txfm\()16_1d_4x16_pass2_neon
   1082        mov             x14, x30
   1083 
   1084 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
   1085        load            \i,  x2,  x9
   1086 .endr
   1087        cbz             x3,  1f                          // slice 0: v28-v31 were kept in registers by pass 1
   1088 .irp i, 28, 29, 30, 31
   1089        load            \i,  x2,  x9
   1090 .endr
   1091 1:
   1092 
   1093        add             x3,  x0,  x1                     // x0/x3 walk even/odd dst rows,
   1094        lsl             x1,  x1,  #1                     // so double the stride
   1095        bl              \txfm\()16
   1096 
   1097        dup             v8.8h, w13                       // v8 = pixel max for load_add_store's clamp
   1098        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
   1099        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
   1100 
   1101        ret             x14
   1102 endfunc
   1103 .endm
   1104 
   1105 itxfm16_1d_funcs idct           // idct16_1d_4x16_pass1/2_neon
   1106 itxfm16_1d_funcs iadst          // iadst16_1d_4x16_pass1/2_neon
   1107 
   1108 // This is the minimum eob value for each subpartition, in increments of 4
// (read as unsigned halfwords via ldrh and compared against w3 in the
// 16x16 functions below).
   1109 const min_eob_idct_idct_16, align=4
   1110        .short  0, 10, 38, 89
   1111 endconst
   1112 
   1113 .macro itxfm_func16x16 txfm1, txfm2
   1114 function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
        // Generic 16x16 inverse transform + add, 16-bit pixels:
        // x0 = dst, x1 = dst stride, x2 = coefficients, w3 = eob,
        // w13 = pixel max value (set by the exported 10/12 bpp wrappers
        // below). Works in two passes of four 4x16 column slices each,
        // with a 16x16 32-bit temp buffer on the stack in between.
   1115 .ifc \txfm1\()_\txfm2,idct_idct
   1116        cmp             w3,  #1
   1117        b.eq            idct16x16_dc_add_neon
   1118 .endif
   1119        mov             x15, x30
   1120        // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
   1121 .ifnc \txfm1\()_\txfm2,idct_idct
   1122        stp             d14, d15, [sp, #-0x10]!
   1123        stp             d12, d13, [sp, #-0x10]!
   1124        stp             d10, d11, [sp, #-0x10]!
   1125 .endif
   1126        stp             d8,  d9,  [sp, #-0x10]!
   1127 
   1128        sub             sp,  sp,  #1024                  // 16x16 * 4-byte temp buffer
   1129 
   1130        mov             x4,  x0                          // stash dst/stride/coeffs across the passes
   1131        mov             x5,  x1
   1132        mov             x6,  x2
   1133 
   1134        movrel          x10, idct_coeffs
   1135 .ifnc \txfm1\()_\txfm2,idct_idct
   1136        movrel          x11, iadst16_coeffs
   1137 .endif
   1138 .ifc \txfm1,idct
   1139        ld1             {v0.8h,v1.8h}, [x10]
   1140        sxtl            v2.4s,  v1.4h
   1141        sxtl2           v3.4s,  v1.8h
   1142        sxtl2           v1.4s,  v0.8h
   1143        sxtl            v0.4s,  v0.4h
   1144 .endif
   1145        mov             x9,  #64                         // temp buffer row stride: 16 coefs * 4 bytes
   1146 
   1147 .ifc \txfm1\()_\txfm2,idct_idct
   1148        cmp             w3,  #10
   1149        b.le            idct16x16_quarter_add_16_neon
   1150        cmp             w3,  #38
   1151        b.le            idct16x16_half_add_16_neon
   1152 
   1153        movrel          x12, min_eob_idct_idct_16, 2
   1154 .endif
   1155 
        // First pass: transform each 4-column input slice into a
        // horizontal 4-row slice of the temp buffer.
   1156 .irp i, 0, 4, 8, 12
   1157        add             x0,  sp,  #(\i*64)
   1158 .ifc \txfm1\()_\txfm2,idct_idct
   1159 .if \i > 0
        // Skip slices that lie entirely above the eob; their temp
        // buffer rows are filled with zeros at label 1 instead.
   1160        ldrh            w1,  [x12], #2
   1161        cmp             w3,  w1
   1162        mov             x1,  #(16 - \i)/4
   1163        b.le            1f
   1164 .endif
   1165 .endif
   1166        mov             x1,  #\i
   1167        add             x2,  x6,  #(\i*4)
   1168        bl              \txfm1\()16_1d_4x16_pass1_neon
   1169 .endr
   1170 .ifc \txfm1\()_\txfm2,iadst_idct
   1171        ld1             {v0.8h,v1.8h}, [x10]             // reload idct coeffs clobbered by iadst16
   1172        sxtl            v2.4s,  v1.4h
   1173        sxtl2           v3.4s,  v1.8h
   1174        sxtl2           v1.4s,  v0.8h
   1175        sxtl            v0.4s,  v0.4h
   1176 .endif
   1177 
   1178 .ifc \txfm1\()_\txfm2,idct_idct
   1179        b               3f
   1180 1:
   1181        // Set v28-v31 to zero, for the in-register passthrough of
   1182        // coefficients to pass 2.
   1183        movi            v28.4s,  #0
   1184        movi            v29.4s,  #0
   1185        movi            v30.4s,  #0
   1186        movi            v31.4s,  #0
   1187 2:
   1188        subs            x1,  x1,  #1
   1189 .rept 4
   1190        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
   1191 .endr
   1192        b.ne            2b
   1193 3:
   1194 .endif
   1195 
        // Second pass: transform each 4-column slice of the temp buffer
        // and add it into the destination.
   1196 .irp i, 0, 4, 8, 12
   1197        add             x0,  x4,  #(\i*2)
   1198        mov             x1,  x5
   1199        add             x2,  sp,  #(\i*4)
   1200        mov             x3,  #\i
   1201        bl              \txfm2\()16_1d_4x16_pass2_neon
   1202 .endr
   1203 
   1204        add             sp,  sp,  #1024
   1205        ldp             d8,  d9,  [sp], 0x10
   1206 .ifnc \txfm1\()_\txfm2,idct_idct
   1207        ldp             d10, d11, [sp], 0x10
   1208        ldp             d12, d13, [sp], 0x10
   1209        ldp             d14, d15, [sp], 0x10
   1210 .endif
   1211        ret             x15
   1212 endfunc
   1213 
   1214 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
   1215        mov             x13, #0x03ff                     // 10 bpp pixel max
   1216        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
   1217 endfunc
   1218 
   1219 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
   1220        mov             x13, #0x0fff                     // 12 bpp pixel max
   1221        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
   1222 endfunc
   1223 .endm
   1224 
   1225 itxfm_func16x16 idct,  idct      // instantiate all four first/second
   1226 itxfm_func16x16 iadst, idct      // transform combinations of the
   1227 itxfm_func16x16 idct,  iadst     // 16x16 add functions
   1228 itxfm_func16x16 iadst, iadst
   1229 
   1230 function idct16_1d_4x16_pass1_quarter_neon
        // Quarter-eob variant of idct16_1d_4x16_pass1_neon: only the
        // first 4 input rows are read (and cleared).
        // x0 = dst (temp buffer), x2 = src, x9 = input stride.
   1231        mov             x14, x30
   1232 
   1233        movi            v4.4s, #0                        // zero register for load_clear
   1234 .irp i, 16, 17, 18, 19
   1235        load_clear      \i,  x2,  x9
   1236 .endr
   1237 
   1238        bl              idct16_quarter
   1239 
   1240        // Do four 4x4 transposes. Originally, v16-v31 contain the
   1241        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
   1242        // contain the four transposed 4x4 blocks.
   1243        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
   1244        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
   1245        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
   1246        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
   1247 
   1248        // Store the transposed 4x4 blocks horizontally.
   1249        // The first 4x4 block is kept in registers for the second pass,
   1250        // store the rest in the temp buffer.
   1251        add             x0,  x0,  #16
   1252        st1             {v20.4s},  [x0], #16
   1253        st1             {v24.4s},  [x0], #16
   1254        st1             {v28.4s},  [x0], #16
   1255        add             x0,  x0,  #16
   1256        st1             {v21.4s},  [x0], #16
   1257        st1             {v25.4s},  [x0], #16
   1258        st1             {v29.4s},  [x0], #16
   1259        add             x0,  x0,  #16
   1260        st1             {v22.4s},  [x0], #16
   1261        st1             {v26.4s},  [x0], #16
   1262        st1             {v30.4s},  [x0], #16
   1263        add             x0,  x0,  #16
   1264        st1             {v23.4s},  [x0], #16
   1265        st1             {v27.4s},  [x0], #16
   1266        st1             {v31.4s},  [x0], #16
   1267        ret             x14
   1268 endfunc
   1269 
   1270 function idct16_1d_4x16_pass2_quarter_neon
        // Quarter-eob variant of idct16_1d_4x16_pass2_neon:
        // x0 = dst, x1 = dst stride, x2 = src (temp buffer),
        // x3 = slice offset, x9 = temp buffer stride, w13 = pixel max.
   1271        mov             x14, x30
   1272 
   1273        // Only load the top 4 lines, and only do it for the later slices.
   1274        // For the first slice, v16-v19 are kept in registers from the first pass.
   1275        cbz             x3,  1f
   1276 .irp i, 16, 17, 18, 19
   1277        load            \i,  x2,  x9
   1278 .endr
   1279 1:
   1280 
   1281        add             x3,  x0,  x1                     // x0/x3 walk even/odd dst rows,
   1282        lsl             x1,  x1,  #1                     // so double the stride
   1283        bl              idct16_quarter
   1284 
   1285        dup             v8.8h, w13                       // v8 = pixel max for load_add_store's clamp
   1286        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
   1287        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
   1288 
   1289        ret             x14
   1290 endfunc
   1291 
// First pass of a 4x16 slice of a "half" IDCT16 (only the first 8 input
// rows are nonzero): load and clear 8 rows of coefficients, run
// idct16_half, transpose the outputs into 4x4 blocks and store them
// horizontally into the temp buffer.
// x0 = dst (temp buffer), x1 = input column offset (0 or 4),
// x2 = src, x9 = input stride
function idct16_1d_4x16_pass1_half_neon
        mov             x14, x30

        // Zero register for load_clear (clears the source as it is read)
        movi            v4.4s, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i,  x2,  x9
.endr

        bl              idct16_half

        // Do four 4x4 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
        // contain the four transposed 4x4 blocks.
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the transposed 4x4 blocks horizontally.
        cmp             x1,  #4
        b.eq            1f
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
        store           \i,  x0,  #16
.endr
        ret             x14
1:
        // Special case: For the second input column (r1 == 4),
        // which would be stored as the second row in the temp buffer,
        // don't store the first 4x4 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // second 4x4 block).
        add             x0,  x0,  #16
        st1             {v20.4s},  [x0], #16
        st1             {v24.4s},  [x0], #16
        st1             {v28.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v21.4s},  [x0], #16
        st1             {v25.4s},  [x0], #16
        st1             {v29.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v22.4s},  [x0], #16
        st1             {v26.4s},  [x0], #16
        st1             {v30.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v23.4s},  [x0], #16
        st1             {v27.4s},  [x0], #16
        st1             {v31.4s},  [x0], #16

        // Move the unstored first 4x4 block into v20-v23, where the
        // first slice of the second pass expects it.
        mov             v20.16b, v16.16b
        mov             v21.16b, v17.16b
        mov             v22.16b, v18.16b
        mov             v23.16b, v19.16b
        ret             x14
endfunc
   1346 
// Second pass of a 4x16 "half" IDCT16 slice: read the transposed
// coefficients from the temp buffer, run idct16_half and add the
// rounded, clamped result into the destination.
// x0  = dst, x1 = dst stride, x2 = src (temp buffer),
// x3  = slice index (0 for the first slice, where v20-v23 are already
//       kept in registers from the first pass), x9 = temp buffer stride,
// w13 = pixel max
function idct16_1d_4x16_pass2_half_neon
        mov             x14, x30

.irp i, 16, 17, 18, 19
        load            \i,  x2,  x9
.endr
        // For the first slice, v20-v23 were kept in registers by pass 1;
        // only load them for the later slices.
        cbz             x3,  1f
.irp i, 20, 21, 22, 23
        load            \i,  x2,  x9
.endr
1:

        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              idct16_half

        // v8 = pixel max, used for clamping inside load_add_store
        dup             v8.8h, w13
        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s

        ret             x14
endfunc
   1369 
// Generate idct16x16_\size\()_add_16_neon, a 16x16 idct where only a
// quarter (4) or half (8) of the input rows are nonzero.
// x4 = dst, x5 = dst stride, x6 = src, x15 = return address,
// sp = 1024 byte temp buffer (freed on exit; presumably allocated by
// the common 16x16 entry point before branching here - TODO confirm,
// that code is outside this view).
.macro idct16_partial size
function idct16x16_\size\()_add_16_neon
        // First pass: one (quarter) or two (half) 4-column input slices.
        add             x0,  sp,  #(0*64)
        mov             x1,  #0
        add             x2,  x6,  #(0*4)
        bl              idct16_1d_4x16_pass1_\size\()_neon
.ifc \size,half
        add             x0,  sp,  #(4*64)
        mov             x1,  #4
        add             x2,  x6,  #(4*4)
        bl              idct16_1d_4x16_pass1_\size\()_neon
.endif

        // Second pass: all four 4-column output slices.
.irp i, 0, 4, 8, 12
        add             x0,  x4,  #(\i*2)
        mov             x1,  x5
        add             x2,  sp,  #(\i*4)
        mov             x3,  #\i
        bl              idct16_1d_4x16_pass2_\size\()_neon
.endr

        // Free the temp buffer and restore v8/v9.
        add             sp,  sp,  #1024
        ldp             d8,  d9,  [sp], 0x10
        ret             x15
endfunc
.endm
   1396 
// Instantiate the eob-limited 16x16 idct variants.
idct16_partial quarter
idct16_partial half
   1399 
// Special case for a 32x32 idct with only the DC coefficient nonzero:
// compute the single output value and add it into all 32x32 pixels.
// x0 = dst, x1 = dst stride, x2 = src, w13 = pixel max
function idct32x32_dc_add_neon
        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]
        sxtl            v0.4s,  v0.4h

        movi            v1.4h,  #0

        // dc = round(round(in * 11585 >> 14) * 11585 >> 14),
        // i.e. the first idct_coeffs entry applied once per pass.
        ld1             {v2.s}[0],  [x2]
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        // Clear the consumed input coefficient
        st1             {v1.s}[0],  [x2]
        dup             v2.4s,  v2.s[0]

        // Final rounding shift of the transform
        srshr           v0.4s,  v2.4s,  #6

        mov             x3,  x0
        mov             x4,  #32
        sub             x1,  x1,  #32
        // v31 = pixel max, for clamping the output
        dup             v31.8h, w13
1:
        // Loop to add the constant v0 into all 32x32 outputs
        subs            x4,  x4,  #1
        ld1             {v1.8h,v2.8h},  [x0], #32
        uaddw           v16.4s, v0.4s,  v1.4h
        uaddw2          v17.4s, v0.4s,  v1.8h
        ld1             {v3.8h,v4.8h},  [x0], x1
        uaddw           v18.4s, v0.4s,  v2.4h
        uaddw2          v19.4s, v0.4s,  v2.8h
        uaddw           v20.4s, v0.4s,  v3.4h
        uaddw2          v21.4s, v0.4s,  v3.8h
        uaddw           v22.4s, v0.4s,  v4.4h
        uaddw2          v23.4s, v0.4s,  v4.8h
        // Narrow with unsigned saturation (clamps negative to 0),
        // then clamp to the bitdepth maximum
        sqxtun          v1.4h,  v16.4s
        sqxtun2         v1.8h,  v17.4s
        sqxtun          v2.4h,  v18.4s
        sqxtun2         v2.8h,  v19.4s
        sqxtun          v3.4h,  v20.4s
        sqxtun2         v3.8h,  v21.4s
        sqxtun          v4.4h,  v22.4s
        sqxtun2         v4.8h,  v23.4s
        umin            v1.8h,  v1.8h,  v31.8h
        umin            v2.8h,  v2.8h,  v31.8h
        st1             {v1.8h,v2.8h},  [x3], #32
        umin            v3.8h,  v3.8h,  v31.8h
        umin            v4.8h,  v4.8h,  v31.8h
        st1             {v3.8h,v4.8h},  [x3], x1
        b.ne            1b

        ret
endfunc
   1452 
// Shared tail of the idct32 odd-input transforms: the final butterfly
// stages of the odd half. Leaves the 16 odd outputs in v16-v31;
// v4-v9 are used as scratch. Note: the comments that referenced v3 in
// the 8 bpp version now refer to v8 here.
.macro idct32_end
        butterfly_4s    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
        butterfly_4s    v17, v20, v23, v20 // v17 = t17,  v20 = t18
        butterfly_4s    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
        butterfly_4s    v19, v21, v22, v21 // v19 = t22,  v21 = t21
        butterfly_4s    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
        butterfly_4s    v23, v26, v25, v26 // v23 = t25,  v26 = t26
        butterfly_4s    v7,  v8,  v29, v31 // v7  = t31a, v8  = t28a
        butterfly_4s    v22, v27, v24, v27 // v22 = t30,  v27 = t29

        dmbutterfly     v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
        dmbutterfly     v8,  v5,  v0.s[2], v0.s[3], v24, v25, v30, v31        // v8  = t19,  v5  = t28
        dmbutterfly     v28, v6,  v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
        dmbutterfly     v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a

        butterfly_4s    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
        butterfly_4s    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
        butterfly_4s_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
        butterfly_4s_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
        butterfly_4s    v18, v21, v27, v21 // v18 = t18,  v21 = t21
        butterfly_4s_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
        butterfly_4s    v29, v26, v20, v26 // v29 = t29,  v26 = t26
        butterfly_4s    v19, v20, v8,  v6  // v19 = t19a, v20 = t20

        dmbutterfly0    v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27,  v20 = t20
        dmbutterfly0    v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
        dmbutterfly0    v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25,  v22 = t22
        dmbutterfly0    v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
        ret
.endm
   1483 
// 16-point transform of the odd inputs of an idct32 (full version).
// Inputs in v16-v31 (IN(1), IN(3), ..., IN(31)); odd coefficients in
// v10-v13, shared ones in v0-v1; v4-v9 used as scratch.
// Outputs t16..t31 in v16-v31 via idct32_end.
function idct32_odd
        dmbutterfly     v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly     v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly     v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly     v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly     v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly     v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly     v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly     v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
        idct32_end
endfunc
   1509 
// As idct32_odd, but with only the first 8 odd inputs (v16-v23) valid;
// the first stage uses the dmbutterfly_h1/_h2 variants (defined earlier
// in the file) instead of the full dmbutterfly.
function idct32_odd_half
        dmbutterfly_h1  v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly_h2  v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly_h1  v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly_h2  v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly_h1  v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly_h2  v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly_h1  v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly_h2  v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
        idct32_end
endfunc
   1535 
// As idct32_odd, but with only the first 4 odd inputs (v16-v19) valid;
// the butterflies of the first stages collapse into single
// multiplications (dsmull_h) since their other operand is zero.
function idct32_odd_quarter
        dsmull_h        v4,  v5,  v16, v10.s[0]
        dsmull_h        v28, v29, v19, v11.s[3]
        dsmull_h        v30, v31, v16, v10.s[1]
        dsmull_h        v22, v23, v17, v13.s[2]
        dsmull_h        v7,  v6,  v17, v13.s[3]
        dsmull_h        v26, v27, v19, v11.s[2]
        dsmull_h        v20, v21, v18, v12.s[0]
        dsmull_h        v24, v25, v18, v12.s[1]

        // Negate the products that correspond to the subtracting side
        // of the collapsed butterflies
        neg             v28.2d, v28.2d
        neg             v29.2d, v29.2d
        neg             v7.2d,  v7.2d
        neg             v6.2d,  v6.2d

        // Round and narrow the 64 bit products back to 32 bit
        drshrn_h        v4,  v4,  v5,  #14
        drshrn_h        v5,  v28, v29, #14
        drshrn_h        v29, v30, v31, #14
        drshrn_h        v28, v22, v23, #14
        drshrn_h        v7,  v7,  v6,  #14
        drshrn_h        v31, v26, v27, #14
        drshrn_h        v6,  v20, v21, #14
        drshrn_h        v30, v24, v25, #14

        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v1.s[0], v1.s[1]
        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v1.s[0], v1.s[1]
        drshrn_h        v23, v16, v17, #14
        drshrn_h        v24, v18, v19, #14
        neg             v20.2d, v20.2d
        neg             v21.2d, v21.2d
        drshrn_h        v27, v27, v26, #14
        drshrn_h        v20, v20, v21, #14
        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v1.s[2], v1.s[3]
        drshrn_h        v21, v16, v17, #14
        drshrn_h        v26, v18, v19, #14
        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v1.s[2], v1.s[3]
        drshrn_h        v25, v16, v17, #14
        neg             v18.2d, v18.2d
        neg             v19.2d, v19.2d
        drshrn_h        v22, v18, v19, #14

        idct32_end
endfunc
   1579 
// Generate the two 4x32 idct passes for one eob-based variant:
// empty suffix = all 32 input rows valid, _quarter = first 8 rows,
// _half = first 16 rows.
.macro idct32_funcs suffix
// Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
// The 32-point IDCT can be decomposed into two 16-point IDCTs;
// a normal IDCT16 with every other input component (the even ones, with
// each output written twice), followed by a separate 16-point IDCT
// of the odd inputs, added/subtracted onto the outputs of the first idct16.
// x0 = dst (temp buffer)
// x1 = unused
// x2 = src
// x9 = double input stride
function idct32_1d_4x32_pass1\suffix\()_neon
        mov             x14, x30

        // Zero register for load_clear (clears the source as it is read)
        movi            v4.4s,  #0

        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i, x2, x9
.endr
.endif

        bl              idct16\suffix

        // Do four 4x4 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
        // contain the four transposed 4x4 blocks.
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the registers a, b, c, d horizontally, followed by the
        // same registers d, c, b, a mirrored.
.macro store_rev a, b, c, d
        // There's no rev128 instruction, but we reverse each 64 bit
        // half, and then flip them using an ext with 8 bytes offset.
        rev64           v7.4s, \d
        st1             {\a},  [x0], #16
        ext             v7.16b, v7.16b, v7.16b, #8
        st1             {\b},  [x0], #16
        rev64           v6.4s, \c
        st1             {\c},  [x0], #16
        ext             v6.16b, v6.16b, v6.16b, #8
        st1             {\d},  [x0], #16
        rev64           v5.4s, \b
        st1             {v7.4s},  [x0], #16
        ext             v5.16b, v5.16b, v5.16b, #8
        st1             {v6.4s},  [x0], #16
        rev64           v4.4s, \a
        st1             {v5.4s},  [x0], #16
        ext             v4.16b, v4.16b, v4.16b, #8
        st1             {v4.4s},  [x0], #16
.endm
        store_rev       v16.4s, v20.4s, v24.4s, v28.4s
        store_rev       v17.4s, v21.4s, v25.4s, v29.4s
        store_rev       v18.4s, v22.4s, v26.4s, v30.4s
        store_rev       v19.4s, v23.4s, v27.4s, v31.4s
        sub             x0,  x0,  #512
.purgem store_rev

        // Move x2 back to the start of the input, and move
        // to the first odd row
.ifb \suffix
        sub             x2,  x2,  x9, lsl #4
.endif
.ifc \suffix,_quarter
        sub             x2,  x2,  x9, lsl #2
.endif
.ifc \suffix,_half
        sub             x2,  x2,  x9, lsl #3
.endif
        add             x2,  x2,  #128

        movi            v4.4s,  #0
        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i, x2, x9
.endr
.endif

        bl              idct32_odd\suffix

        // Transpose the odd outputs (note the reversed register order)
        transpose_4x4s  v31, v30, v29, v28, v4, v5, v6, v7
        transpose_4x4s  v27, v26, v25, v24, v4, v5, v6, v7
        transpose_4x4s  v23, v22, v21, v20, v4, v5, v6, v7
        transpose_4x4s  v19, v18, v17, v16, v4, v5, v6, v7

        // Store the registers a, b, c, d horizontally,
        // adding into the output first, and the mirrored,
        // subtracted from the output.
.macro store_rev a, b, c, d, a16b, b16b
        ld1             {v4.4s},  [x0]
        rev64           v9.4s, \d
        add             v4.4s, v4.4s, \a
        st1             {v4.4s},  [x0], #16
        rev64           v8.4s, \c
        ld1             {v4.4s},  [x0]
        ext             v9.16b, v9.16b, v9.16b, #8
        add             v4.4s, v4.4s, \b
        st1             {v4.4s},  [x0], #16
        ext             v8.16b, v8.16b, v8.16b, #8
        ld1             {v4.4s},  [x0]
        rev64           \b, \b
        add             v4.4s, v4.4s, \c
        st1             {v4.4s},  [x0], #16
        rev64           \a, \a
        ld1             {v4.4s},  [x0]
        ext             \b16b, \b16b, \b16b, #8
        add             v4.4s, v4.4s, \d
        st1             {v4.4s},  [x0], #16
        ext             \a16b, \a16b, \a16b, #8
        ld1             {v4.4s},  [x0]
        sub             v4.4s, v4.4s, v9.4s
        st1             {v4.4s},  [x0], #16
        ld1             {v4.4s},  [x0]
        sub             v4.4s, v4.4s, v8.4s
        st1             {v4.4s},  [x0], #16
        ld1             {v4.4s},  [x0]
        sub             v4.4s, v4.4s, \b
        st1             {v4.4s},  [x0], #16
        ld1             {v4.4s},  [x0]
        sub             v4.4s, v4.4s, \a
        st1             {v4.4s},  [x0], #16
.endm

        store_rev       v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
        store_rev       v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
        store_rev       v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
        store_rev       v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
.purgem store_rev
        ret             x14
endfunc

// This is mostly the same as 4x32_pass1, but without the transpose,
// and use the source as temp buffer between the two idct passes, and
// add into the destination.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x7 = negative double temp buffer stride
// x9 = double temp buffer stride
function idct32_1d_4x32_pass2\suffix\()_neon
        mov             x14, x30

        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #3
.endif

        bl              idct16\suffix

        // Store the even-idct outputs back into the temp buffer; they
        // are reloaded and combined with the odd outputs further down.
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        store           \i, x2, x9
.endr

        sub             x2,  x2,  x9, lsl #4
        add             x2,  x2,  #128

        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #3
.endif
        sub             x2,  x2,  #128

        bl              idct32_odd\suffix

        // Combine the even outputs (reloaded from the temp buffer) with
        // the odd outputs a-d (in registers): added for the first half,
        // subtracted (walking the buffer backwards via x7) for the
        // mirrored second half. Round, add into dst and clamp to the
        // pixel max in v15.
.macro load_acc_store a, b, c, d, neg=0
.if \neg == 0
        ld1             {v4.4s},  [x2], x9
        ld1             {v5.4s},  [x2], x9
        add             v4.4s, v4.4s, \a
        ld1             {v6.4s},  [x2], x9
        add             v5.4s, v5.4s, \b
        ld1             {v7.4s},  [x2], x9
        add             v6.4s, v6.4s, \c
        add             v7.4s, v7.4s, \d
.else
        ld1             {v4.4s},  [x2], x7
        ld1             {v5.4s},  [x2], x7
        sub             v4.4s, v4.4s, \a
        ld1             {v6.4s},  [x2], x7
        sub             v5.4s, v5.4s, \b
        ld1             {v7.4s},  [x2], x7
        sub             v6.4s, v6.4s, \c
        sub             v7.4s, v7.4s, \d
.endif
        ld1             {v8.4h},   [x0], x1
        ld1             {v8.d}[1], [x0], x1
        srshr           v4.4s, v4.4s, #6
        ld1             {v9.4h},   [x0], x1
        srshr           v5.4s, v5.4s, #6
        uaddw           v4.4s, v4.4s, v8.4h
        ld1             {v9.d}[1], [x0], x1
        srshr           v6.4s, v6.4s, #6
        uaddw2          v5.4s, v5.4s, v8.8h
        srshr           v7.4s, v7.4s, #6
        sub             x0,  x0,  x1, lsl #2
        uaddw           v6.4s, v6.4s, v9.4h
        sqxtun          v4.4h, v4.4s
        uaddw2          v7.4s, v7.4s, v9.8h
        sqxtun2         v4.8h, v5.4s
        umin            v4.8h, v4.8h, v15.8h
        st1             {v4.4h},   [x0], x1
        sqxtun          v5.4h, v6.4s
        st1             {v4.d}[1], [x0], x1
        sqxtun2         v5.8h, v7.4s
        umin            v5.8h, v5.8h, v15.8h
        st1             {v5.4h},   [x0], x1
        st1             {v5.d}[1], [x0], x1
.endm
        load_acc_store  v31.4s, v30.4s, v29.4s, v28.4s
        load_acc_store  v27.4s, v26.4s, v25.4s, v24.4s
        load_acc_store  v23.4s, v22.4s, v21.4s, v20.4s
        load_acc_store  v19.4s, v18.4s, v17.4s, v16.4s
        sub             x2,  x2,  x9
        load_acc_store  v16.4s, v17.4s, v18.4s, v19.4s, 1
        load_acc_store  v20.4s, v21.4s, v22.4s, v23.4s, 1
        load_acc_store  v24.4s, v25.4s, v26.4s, v27.4s, 1
        load_acc_store  v28.4s, v29.4s, v30.4s, v31.4s, 1
.purgem load_acc_store
        ret             x14
endfunc
.endm
   1855 
// Instantiate the full, quarter and half variants of the 4x32 passes.
idct32_funcs
idct32_funcs _quarter
idct32_funcs _half
   1859 
// Per-slice eob thresholds: in pass 1, if the eob is <= the entry for a
// 4-column slice, that slice (and all later ones) contain only zero
// coefficients and the first pass is replaced by zero-filling.
const min_eob_idct_idct_32, align=4
        .short  0, 9, 34, 70, 135, 240, 336, 448
endconst
   1863 
// Common 16 bpp core for the 10/12 bpp 32x32 idct entry points.
// x0 = dst, x1 = dst stride, x2 = src, w3 = eob,
// w13 = pixel max (set by the bitdepth-specific wrappers)
function vp9_idct_idct_32x32_add_16_neon
        // eob == 1: only the DC coefficient is set
        cmp             w3,  #1
        b.eq            idct32x32_dc_add_neon

        movrel          x10, idct_coeffs

        mov             x15, x30
        // Save the callee-saved SIMD registers clobbered below
        stp             d8,  d9,  [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d14, d15, [sp, #-0x10]!

        // 32 * 32 * 4 byte temp buffer for the intermediate coefficients
        sub             sp,  sp,  #4096

        // Stash dst/stride/src; x0-x2 are used as scratch by the passes
        mov             x4,  x0
        mov             x5,  x1
        mov             x6,  x2

        // Double stride of the input, since we only read every other line
        mov             x9,  #256
        neg             x7,  x9

        // Load all idct coefficients, widened to 32 bit: v0-v3 for the
        // even half, v10-v13 for the odd half
        ld1             {v0.8h,v1.8h},   [x10], #32
        sxtl            v2.4s,  v1.4h
        sxtl2           v3.4s,  v1.8h
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h
        ld1             {v10.8h,v11.8h}, [x10]
        sxtl            v12.4s, v11.4h
        sxtl2           v13.4s, v11.8h
        sxtl2           v11.4s, v10.8h
        sxtl            v10.4s, v10.4h

        // v15 = pixel max, used for clamping in pass 2
        dup             v15.8h, w13

        // Dispatch to the reduced variants if the eob shows that most
        // coefficients are zero
        cmp             w3,  #34
        b.le            idct32x32_quarter_add_16_neon
        cmp             w3,  #135
        b.le            idct32x32_half_add_16_neon

        // Skip the first threshold entry (offset 2 bytes); slice 0 is
        // always transformed
        movrel          x12, min_eob_idct_idct_32, 2

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x0,  sp,  #(\i*128)
.if \i > 0
        // If the eob is small enough, this and all remaining slices are
        // all-zero; jump to the zero-filling loop with x1 = slices left
        ldrh            w1,  [x12], #2
        cmp             w3,  w1
        mov             x1,  #(32 - \i)/4
        b.le            1f
.endif
        add             x2,  x6,  #(\i*4)
        bl              idct32_1d_4x32_pass1_neon
.endr
        b               3f

1:
        // Write zeros to the temp buffer for pass 2
        movi            v16.4s,  #0
        movi            v17.4s,  #0
        movi            v18.4s,  #0
        movi            v19.4s,  #0
2:
        subs            x1,  x1,  #1
.rept 4
        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
.endr
        b.ne            2b
3:
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x0,  x4,  #(\i*2)
        mov             x1,  x5
        add             x2,  sp,  #(\i*4)
        bl              idct32_1d_4x32_pass2_neon
.endr

        // Free the temp buffer and restore the callee-saved SIMD regs
        add             sp,  sp,  #4096
        ldp             d14, d15, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d8,  d9,  [sp], 0x10

        ret             x15
endfunc
   1948 
   1949 function ff_vp9_idct_idct_32x32_add_10_neon, export=1
   1950        mov             x13, #0x03ff
   1951        b               vp9_idct_idct_32x32_add_16_neon
   1952 endfunc
   1953 
   1954 function ff_vp9_idct_idct_32x32_add_12_neon, export=1
   1955        mov             x13, #0x0fff
   1956        b               vp9_idct_idct_32x32_add_16_neon
   1957 endfunc
   1958 
// Generate idct32x32_\size\()_add_16_neon, a 32x32 idct where only a
// quarter (8) or half (16) of the input rows are nonzero. Entered by a
// branch from vp9_idct_idct_32x32_add_16_neon, with all registers
// (x4-x6, w3, x7, x9, v0-v3, v10-v13, v15, sp) set up there.
.macro idct32_partial size
function idct32x32_\size\()_add_16_neon
.irp i, 0, 4
        add             x0,  sp,  #(\i*128)
.ifc \size,quarter
.if \i == 4
        // The second quarter-case slice is all-zero for eob <= 9;
        // zero-fill it instead of transforming
        cmp             w3,  #9
        b.le            1f
.endif
.endif
        add             x2,  x6,  #(\i*4)
        bl              idct32_1d_4x32_pass1_\size\()_neon
.endr

.ifc \size,half
.irp i, 8, 12
        add             x0,  sp,  #(\i*128)
.if \i == 12
        // The last half-case slice is all-zero for eob <= 70
        cmp             w3,  #70
        b.le            1f
.endif
        add             x2,  x6,  #(\i*4)
        bl              idct32_1d_4x32_pass1_\size\()_neon
.endr
.endif
        b               3f

1:
        // Write zeros to the temp buffer for pass 2
        movi            v16.4s,  #0
        movi            v17.4s,  #0
        movi            v18.4s,  #0
        movi            v19.4s,  #0

.rept 4
        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
.endr

3:
        // Second pass: all eight 4-column output slices
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x0,  x4,  #(\i*2)
        mov             x1,  x5
        add             x2,  sp,  #(\i*4)
        bl              idct32_1d_4x32_pass2_\size\()_neon
.endr

        // Free the temp buffer and restore the callee-saved SIMD regs
        add             sp,  sp,  #4096
        ldp             d14, d15, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d8,  d9,  [sp], 0x10

        ret             x15
endfunc
.endm
   2015 
// Instantiate the eob-limited 32x32 idct variants.
idct32_partial quarter
idct32_partial half