tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

vp9lpf_16bpp_neon.S (34702B)


      1 /*
      2 * Copyright (c) 2017 Google Inc.
      3 *
      4 * This file is part of FFmpeg.
      5 *
      6 * FFmpeg is free software; you can redistribute it and/or
      7 * modify it under the terms of the GNU Lesser General Public
      8 * License as published by the Free Software Foundation; either
      9 * version 2.1 of the License, or (at your option) any later version.
     10 *
     11 * FFmpeg is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14 * Lesser General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU Lesser General Public
     17 * License along with FFmpeg; if not, write to the Free Software
     18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     19 */
     20 
     21 #include "libavutil/aarch64/asm.S"
     22 #include "neon.S"
     23 
     24 
     25 // The input to and output from this macro is in the registers v16-v31,
     26 // and v0-v7 are used as scratch registers.
     27 // p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
     28 // Depending on the width of the loop filter, we either use v16-v19
     29 // and v28-v31 as temp registers, or v8-v15.
// loop_filter: shared filter body for the wd=4/8/16 VP9 loop filters
// at 16 bpp (pixels held as .8h lanes, 8 pixels per invocation).
//   \wd          - filter width (4, 8 or 16); selects which flatness
//                  checks and filter stages are assembled.
//   \tmp1-\tmp8  - caller-chosen scratch vector registers.
// Scalar inputs: w2 = E, w3 = I, w4 = H thresholds; w5 = flat
// threshold (read only for wd >= 8); w6 = left shift for saturation;
// w7 = max pixel value. x10 holds the caller's saved return address;
// x13 (wd=8) and x14/x15 (wd=16) hold alternative early-out return
// targets installed by the loop_filter_8/loop_filter_16 macros.
     30 .macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
     31        dup             v0.8h,  w2                   // E
     32        dup             v2.8h,  w3                   // I
     33        dup             v3.8h,  w4                   // H
     34 
     35        uabd            v4.8h,  v20.8h, v21.8h       // abs(p3 - p2)
     36        uabd            v5.8h,  v21.8h, v22.8h       // abs(p2 - p1)
     37        uabd            v6.8h,  v22.8h, v23.8h       // abs(p1 - p0)
     38        uabd            v7.8h,  v24.8h, v25.8h       // abs(q0 - q1)
     39        uabd            \tmp1\().8h,  v25.8h, v26.8h // abs(q1 - q2)
     40        uabd            \tmp2\().8h,  v26.8h, v27.8h // abs(q2 - q3)
     41        umax            v4.8h,  v4.8h,  v5.8h
     42        umax            v5.8h,  v6.8h,  v7.8h
     43        umax            \tmp1\().8h,  \tmp1\().8h, \tmp2\().8h
     44        uabd            v6.8h,  v23.8h, v24.8h       // abs(p0 - q0)
     45        umax            v4.8h,  v4.8h,  v5.8h
     46        add             v6.8h,  v6.8h,  v6.8h        // abs(p0 - q0) * 2
     47        uabd            v5.8h,  v22.8h, v25.8h       // abs(p1 - q1)
     48        umax            v4.8h,  v4.8h,  \tmp1\().8h  // max(abs(p3 - p2), ..., abs(q2 - q3))
     49        ushr            v5.8h,  v5.8h,  #1
     50        cmhs            v4.8h,  v2.8h,  v4.8h        // max(abs()) <= I
     51        add             v6.8h,  v6.8h,  v5.8h        // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
     52        cmhs            v6.8h,  v0.8h,  v6.8h
     53        and             v4.16b, v4.16b, v6.16b       // fm
     54 
     55        // If no pixels need filtering, just exit as soon as possible
// (fm is all-zero: collapse the mask to a scalar and test it; ret x10
// returns past the writeback in the calling function entirely).
     56        mov             x11, v4.d[0]
     57        mov             x12, v4.d[1]
     58        adds            x11, x11, x12
     59        b.ne            1f
     60        ret             x10
     61 1:
     62 
     63 .if \wd >= 8
     64        dup             v0.8h,  w5
     65 
     66        uabd            v6.8h,  v20.8h, v23.8h       // abs(p3 - p0)
     67        uabd            v2.8h,  v21.8h, v23.8h       // abs(p2 - p0)
     68        uabd            v1.8h,  v22.8h, v23.8h       // abs(p1 - p0)
     69        uabd            \tmp1\().8h,  v25.8h, v24.8h // abs(q1 - q0)
     70        uabd            \tmp2\().8h,  v26.8h, v24.8h // abs(q2 - q0)
     71        uabd            \tmp3\().8h,  v27.8h, v24.8h // abs(q3 - q0)
     72        umax            v6.8h,  v6.8h,  v2.8h
     73        umax            v1.8h,  v1.8h,  \tmp1\().8h
     74        umax            \tmp2\().8h,  \tmp2\().8h,  \tmp3\().8h
     75 .if \wd == 16
     76        uabd            v7.8h,  v16.8h, v23.8h       // abs(p7 - p0)
     77        umax            v6.8h,  v6.8h,  v1.8h
     78        uabd            v2.8h,  v17.8h, v23.8h       // abs(p6 - p0)
     79        umax            v6.8h,  v6.8h,  \tmp2\().8h
     80        uabd            v1.8h,  v18.8h, v23.8h       // abs(p5 - p0)
     81        cmhs            v6.8h,  v0.8h,  v6.8h        // flat8in
     82        uabd            v8.8h,  v19.8h, v23.8h       // abs(p4 - p0)
     83        and             v6.16b, v6.16b, v4.16b       // flat8in && fm
     84        uabd            v9.8h,  v28.8h, v24.8h       // abs(q4 - q0)
     85        bic             v4.16b, v4.16b, v6.16b       // fm && !flat8in
     86        uabd            v10.8h, v29.8h, v24.8h       // abs(q5 - q0)
     87        uabd            v11.8h, v30.8h, v24.8h       // abs(q6 - q0)
     88        uabd            v12.8h, v31.8h, v24.8h       // abs(q7 - q0)
     89 
     90        umax            v7.8h,  v7.8h,  v2.8h
     91        umax            v1.8h,  v1.8h,  v8.8h
     92        umax            v9.8h,  v9.8h,  v10.8h
     93        umax            v11.8h, v11.8h, v12.8h
     94        // The rest of the calculation of flat8out is interleaved below
     95 .else
     96        // The rest of the calculation of flat8in is interleaved below
     97 .endif
     98 .endif
     99 
    100        // Calculate the normal inner loop filter for 2 or 4 pixels
    101        uabd            v5.8h,  v22.8h, v23.8h                  // abs(p1 - p0)
    102 .if \wd == 16
    103        umax            v7.8h,  v7.8h,  v1.8h
    104        umax            v9.8h,  v9.8h,  v11.8h
    105 .elseif \wd == 8
    106        umax            v6.8h,  v6.8h,  v1.8h
    107 .endif
    108        uabd            v1.8h,  v25.8h, v24.8h                  // abs(q1 - q0)
    109 .if \wd == 16
    110        umax            v7.8h,  v7.8h,  v9.8h
    111 .elseif \wd == 8
    112        umax            v6.8h,  v6.8h,  \tmp2\().8h
    113 .endif
    114        dup             \tmp2\().8h,  w6                        // left shift for saturation
    115        sub             \tmp1\().8h,  v22.8h,  v25.8h           // p1 - q1
    116        neg             \tmp6\().8h,  \tmp2\().8h               // negative left shift after saturation
    117        umax            v5.8h,  v5.8h,  v1.8h                   // max(abs(p1 - p0), abs(q1 - q0))
    118        sub             \tmp3\().8h,  v24.8h,  v23.8h           // q0 - p0
    119        movi            \tmp5\().8h,  #3
    120 .if \wd == 8
    121        cmhs            v6.8h,  v0.8h,  v6.8h                   // flat8in
    122 .endif
    123        cmhs            v5.8h,  v3.8h,  v5.8h                   // !hev
    124 .if \wd == 8
    125        and             v6.16b, v6.16b, v4.16b                  // flat8in && fm
    126 .endif
    127        sqshl           \tmp1\().8h,  \tmp1\().8h,  \tmp2\().8h
    128 .if \wd == 16
    129        cmhs            v7.8h,  v0.8h,  v7.8h                   // flat8out
    130 .elseif \wd == 8
    131        bic             v4.16b, v4.16b, v6.16b                  // fm && !flat8in
    132 .endif
    133        and             v5.16b,  v5.16b,  v4.16b                // !hev && fm && !flat8in
    134 .if \wd == 16
    135        and             v7.16b, v7.16b, v6.16b                  // flat8out && flat8in && fm
    136 .endif
    137        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
    138 
    139        mul             \tmp3\().8h,  \tmp3\().8h,  \tmp5\().8h // 3 * (q0 - p0)
    140        bic             \tmp1\().16b, \tmp1\().16b, v5.16b      // if (!hev) av_clip_int8 = 0
    141        movi            v2.8h,  #4
    142        add             \tmp3\().8h,  \tmp3\().8h,  \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
    143        movi            v3.8h,  #3
    144        sqshl           \tmp1\().8h,  \tmp3\().8h,  \tmp2\().8h
    145        movi            \tmp5\().8h,  #0
    146        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
    147        dup             \tmp6\().8h,  w7                        // max pixel value
    148 .if \wd == 16
    149        bic             v6.16b, v6.16b, v7.16b                  // fm && flat8in && !flat8out
    150 .endif
    151 
    152        ushr            \tmp2\().8h,  \tmp6\().8h,  #1          // (1 << (BIT_DEPTH - 1)) - 1
    153 
    154        add             \tmp3\().8h,  \tmp1\().8h,  v2.8h       // f + 4
    155        add             \tmp4\().8h,  \tmp1\().8h,  v3.8h       // f + 3
    156        smin            \tmp3\().8h,  \tmp3\().8h,  \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
    157        smin            \tmp4\().8h,  \tmp4\().8h,  \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
    158        sshr            \tmp3\().8h,  \tmp3\().8h,  #3          // f1
    159        sshr            \tmp4\().8h,  \tmp4\().8h,  #3          // f2
    160 
// Apply f1/f2 to p0/q0, clamping the results to [0, max pixel value].
    161        add             v0.8h,   v23.8h,  \tmp4\().8h           // p0 + f2
    162        sub             v2.8h,   v24.8h,  \tmp3\().8h           // q0 - f1
    163        smin            v0.8h,   v0.8h,   \tmp6\().8h
    164        smin            v2.8h,   v2.8h,   \tmp6\().8h
    165        srshr           \tmp3\().8h, \tmp3\().8h, #1            // f = (f1 + 1) >> 1
    166        smax            v0.8h,   v0.8h,   \tmp5\().8h           // out p0
    167        smax            v2.8h,   v2.8h,   \tmp5\().8h           // out q0
    168        bit             v23.16b, v0.16b,  v4.16b                // if (fm && !flat8in)
    169        bit             v24.16b, v2.16b,  v4.16b
    170 
    171        add             v0.8h,  v22.8h,  \tmp3\().8h            // p1 + f
    172        sub             v2.8h,  v25.8h,  \tmp3\().8h            // q1 - f
    173 .if \wd >= 8
    174        mov             x11, v6.d[0]
    175 .endif
    176        smin            v0.8h,  v0.8h,  \tmp6\().8h
    177        smin            v2.8h,  v2.8h,  \tmp6\().8h
    178 .if \wd >= 8
    179        mov             x12, v6.d[1]
    180 .endif
    181        smax            v0.8h,  v0.8h,  \tmp5\().8h             // out p1
    182        smax            v2.8h,  v2.8h,  \tmp5\().8h             // out q1
    183 .if \wd >= 8
    184        adds            x11, x11, x12
    185 .endif
    186        bit             v22.16b, v0.16b,  v5.16b                // if (!hev && fm && !flat8in)
    187        bit             v25.16b, v2.16b,  v5.16b
    188 
    189        // If no pixels need flat8in, jump to flat8out
    190        // (or to a writeout of the inner 4 pixels, for wd=8)
    191 .if \wd >= 8
    192 .if \wd == 16
    193        b.eq            6f
    194 .else
    195        b.ne            1f
    196        ret             x13
    197 1:
    198 .endif
    199 
    200        // flat8in
// 7-tap smoothing of p2..q2; v0 keeps a running sum that is updated
// incrementally between the urshr (rounded >> 3) extractions.
    201        add             \tmp1\().8h, v20.8h, v21.8h
    202        add             \tmp3\().8h, v22.8h, v25.8h
    203        add             \tmp5\().8h, v20.8h, v22.8h
    204        add             \tmp7\().8h, v23.8h, v26.8h
    205        add             v0.8h,  \tmp1\().8h, \tmp1\().8h
    206        add             v0.8h,  v0.8h,  v23.8h
    207        add             v0.8h,  v0.8h,  v24.8h
    208        add             v0.8h,  v0.8h,  \tmp5\().8h
    209        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
    210        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
    211        urshr           v2.8h,  v0.8h,  #3                      // out p2
    212 
    213        add             v0.8h,  v0.8h,  \tmp3\().8h
    214        add             \tmp1\().8h, v20.8h,  v23.8h
    215        add             \tmp3\().8h, v24.8h,  v27.8h
    216        urshr           v3.8h,  v0.8h,  #3                      // out p1
    217 
    218        add             v0.8h,  v0.8h,  \tmp7\().8h
    219        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
    220        add             \tmp5\().8h, v21.8h,  v24.8h
    221        add             \tmp7\().8h, v25.8h,  v27.8h
    222        urshr           v4.8h,  v0.8h,  #3                      // out p0
    223 
    224        add             v0.8h,  v0.8h,  \tmp3\().8h
    225        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
    226        add             \tmp1\().8h, v22.8h,  v25.8h
    227        add             \tmp3\().8h, v26.8h,  v27.8h
    228        urshr           v5.8h,  v0.8h,  #3                      // out q0
    229 
    230        add             v0.8h,  v0.8h,  \tmp7\().8h
    231        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
    232        urshr           \tmp5\().8h, v0.8h,  #3                 // out q1
    233 
    234        add             v0.8h,  v0.8h,  \tmp3\().8h
    235        // The output here is written back into the input registers. This doesn't
    236        // matter for the flat8part below, since we only update those pixels
    237        // which won't be touched below.
    238        bit             v21.16b, v2.16b,  v6.16b
    239        bit             v22.16b, v3.16b,  v6.16b
    240        bit             v23.16b, v4.16b,  v6.16b
    241        urshr           \tmp6\().8h,  v0.8h,  #3                // out q2
    242        bit             v24.16b, v5.16b,  v6.16b
    243        bit             v25.16b, \tmp5\().16b,  v6.16b
    244        bit             v26.16b, \tmp6\().16b,  v6.16b
    245 .endif
    246 .if \wd == 16
    247 6:
    248        orr             v2.16b,  v6.16b,  v7.16b
    249        mov             x11, v2.d[0]
    250        mov             x12, v2.d[1]
    251        adds            x11, x11, x12
    252        b.ne            1f
    253        // If no pixels needed flat8in nor flat8out, jump to a
    254        // writeout of the inner 4 pixels
    255        ret             x14
    256 1:
    257 
    258        mov             x11, v7.d[0]
    259        mov             x12, v7.d[1]
    260        adds            x11, x11, x12
    261        b.ne            1f
    262        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
    263        ret             x15
    264 
    265 1:
    266        // flat8out
    267        // This writes all outputs into v2-v17 (skipping v6 and v16).
    268        // If this part is skipped, the output is read from v21-v26 (which is the input
    269        // to this section).
// 15-tap smoothing over p7..q7, again keeping a running sum in v0 and
// extracting each output with a rounded >> 4. bif keeps the unfiltered
// value wherever the flat8out mask (v7) is clear.
    270        shl             v0.8h,   v16.8h,  #3     // 8 * v16
    271        sub             v0.8h,   v0.8h,   v16.8h // 7 * v16
    272        add             v0.8h,   v0.8h,   v17.8h
    273        add             v8.8h,   v17.8h,  v18.8h
    274        add             v10.8h,  v19.8h,  v20.8h
    275        add             v0.8h,   v0.8h,   v8.8h
    276        add             v8.8h,   v16.8h,  v17.8h
    277        add             v12.8h,  v21.8h,  v22.8h
    278        add             v0.8h,   v0.8h,   v10.8h
    279        add             v10.8h,  v18.8h,  v25.8h
    280        add             v14.8h,  v23.8h,  v24.8h
    281        sub             v10.8h,  v10.8h,  v8.8h
    282        add             v0.8h,   v0.8h,   v12.8h
    283        add             v0.8h,   v0.8h,   v14.8h
    284        add             v12.8h,  v16.8h,  v18.8h
    285        add             v14.8h,  v19.8h,  v26.8h
    286        urshr           v2.8h,   v0.8h,   #4
    287 
    288        add             v0.8h,   v0.8h,   v10.8h
    289        add             v8.8h,   v16.8h,  v19.8h
    290        add             v10.8h,  v20.8h,  v27.8h
    291        sub             v14.8h,  v14.8h,  v12.8h
    292        bif             v2.16b,  v17.16b, v7.16b
    293        urshr           v3.8h ,  v0.8h,   #4
    294 
    295        add             v0.8h,   v0.8h,   v14.8h
    296        add             v12.8h,  v16.8h,  v20.8h
    297        add             v14.8h,  v21.8h,  v28.8h
    298        sub             v10.8h,  v10.8h,  v8.8h
    299        bif             v3.16b,  v18.16b, v7.16b
    300        urshr           v4.8h,   v0.8h,   #4
    301 
    302        add             v0.8h,   v0.8h,   v10.8h
    303        add             v8.8h,   v16.8h,  v21.8h
    304        add             v10.8h,  v22.8h,  v29.8h
    305        sub             v14.8h,  v14.8h,  v12.8h
    306        bif             v4.16b,  v19.16b, v7.16b
    307        urshr           v5.8h,   v0.8h,   #4
    308 
    309        add             v0.8h,   v0.8h,   v14.8h
    310        add             v12.8h,  v16.8h,  v22.8h
    311        add             v14.8h,  v23.8h,  v30.8h
    312        sub             v10.8h,  v10.8h,  v8.8h
    313        bif             v5.16b,  v20.16b, v7.16b
    314        urshr           v6.8h,   v0.8h,   #4
    315 
    316        add             v0.8h,   v0.8h,   v10.8h
    317        add             v10.8h,  v16.8h,  v23.8h
    318        sub             v14.8h,  v14.8h,  v12.8h
    319        add             v12.8h,  v24.8h,  v31.8h
    320        bif             v6.16b,  v21.16b, v7.16b
    321        urshr           v8.8h,   v0.8h,   #4
    322 
    323        add             v0.8h,   v0.8h,   v14.8h
    324        sub             v10.8h,  v12.8h,  v10.8h
    325        add             v12.8h,  v17.8h,  v24.8h
    326        add             v14.8h,  v25.8h,  v31.8h
    327        bif             v8.16b,  v22.16b, v7.16b
    328        urshr           v9.8h,   v0.8h,   #4
    329 
    330        add             v0.8h,   v0.8h,   v10.8h
    331        sub             v14.8h,  v14.8h,  v12.8h
    332        add             v12.8h,  v26.8h,  v31.8h
    333        bif             v9.16b,  v23.16b, v7.16b
    334        urshr           v10.8h,  v0.8h,   #4
    335 
    336        add             v0.8h,   v0.8h,   v14.8h
    337        add             v14.8h,  v18.8h,  v25.8h
    338        add             v18.8h,  v19.8h,  v26.8h
    339        sub             v12.8h,  v12.8h,  v14.8h
    340        add             v14.8h,  v27.8h,  v31.8h
    341        bif             v10.16b, v24.16b, v7.16b
    342        urshr           v11.8h,  v0.8h,   #4
    343 
    344        add             v0.8h,   v0.8h,   v12.8h
    345        add             v12.8h,  v20.8h,  v27.8h
    346        sub             v14.8h,  v14.8h,  v18.8h
    347        add             v18.8h,  v28.8h,  v31.8h
    348        bif             v11.16b, v25.16b, v7.16b
    349        sub             v18.8h,  v18.8h,  v12.8h
    350        urshr           v12.8h,  v0.8h,   #4
    351 
    352        add             v0.8h,   v0.8h,   v14.8h
    353        add             v14.8h,  v21.8h,  v28.8h
    354        add             v20.8h,  v29.8h,  v31.8h
    355        bif             v12.16b, v26.16b, v7.16b
    356        urshr           v13.8h,  v0.8h,   #4
    357 
    358        add             v0.8h,   v0.8h,   v18.8h
    359        sub             v20.8h,  v20.8h,  v14.8h
    360        add             v18.8h,  v22.8h,  v29.8h
    361        add             v22.8h,  v30.8h,  v31.8h
    362        bif             v13.16b, v27.16b, v7.16b
    363        urshr           v14.8h,  v0.8h,   #4
    364 
    365        add             v0.8h,   v0.8h,   v20.8h
    366        sub             v22.8h,  v22.8h,  v18.8h
    367        bif             v14.16b, v28.16b, v7.16b
    368        urshr           v15.8h,  v0.8h,   #4
    369 
    370        add             v0.8h,   v0.8h,   v22.8h
    371        bif             v15.16b, v29.16b, v7.16b
    372        urshr           v17.8h,  v0.8h,   #4
    373        bif             v17.16b, v30.16b, v7.16b
    374 .endif
    375 .endm
    376 
    377 // For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
    378 // while we need those for inputs/outputs in wd=16 and use v8-v15
    379 // for temp registers there instead.
// wd=4 filter core, using v16-v19/v28-v31 as scratch. Entered via bl
// with x10 = the outer function's saved x30; the macro's "nothing to
// filter" early exit does ret x10, returning straight to the outer
// caller and skipping the writeback. Otherwise falls through to ret.
    380 function vp9_loop_filter_4
    381        loop_filter     4,  v16, v17, v18, v19, v28, v29, v30, v31
    382        ret
    383 endfunc
    384 
// wd=8 filter core. Besides the x10 early exit, the macro may do
// ret x13 (set by the loop_filter_8 macro) when flat8in is not needed,
// landing on the caller's "inner 4 pixels only" writeout at label 6:.
    385 function vp9_loop_filter_8
    386        loop_filter     8,  v16, v17, v18, v19, v28, v29, v30, v31
    387        ret
    388 endfunc
    389 
// wd=16 filter core. All of v16-v31 carry pixel data here, so v8-v15
// are the scratch registers (their callee-saved d halves are spilled by
// the bpp frontends with push=1). x14/x15 are the flat8in/flat8out
// early-out targets set by the loop_filter_16 macro.
    390 function vp9_loop_filter_16
    391        loop_filter     16, v8,  v9,  v10, v11, v12, v13, v14, v15
    392        ret
    393 endfunc
    394 
// Invoke the wd=4 filter core; it needs no alternative return targets
// beyond the x10 early exit set up by the calling function.
    395 .macro loop_filter_4
    396        bl              vp9_loop_filter_4
    397 .endm
    398 
// Invoke the wd=8 filter core. x13 is pointed at local label 6: in the
// calling function (writeout of only the inner 4 pixels, used when
// flat8in turned out not to be needed anywhere).
    399 .macro loop_filter_8
    400        // calculate alternative 'return' targets
    401        adr             x13, 6f
    402        bl              vp9_loop_filter_8
    403 .endm
    404 
// Invoke the wd=16 filter core. x14 -> label 7: (inner 4 pixel
// writeout, neither flat8in nor flat8out needed) and x15 -> label 8:
// (inner 6 pixel writeout, flat8out not needed) in the calling
// function.
    405 .macro loop_filter_16
    406        // calculate alternative 'return' targets
    407        adr             x14, 7f
    408        adr             x15, 8f
    409        bl              vp9_loop_filter_16
    410 .endm
    411 
    412 
    413 // The public functions in this file have got the following signature:
    414 // void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
    415 
// Emit a public 10/12 bpp entry point ff_<func>_<bpp>_neon that scales
// the 8-bit E/I/H thresholds in w2-w4 up to the target bit depth and
// forwards to the shared 16 bpp core <func>_16_neon, passing:
//   x5 = flat threshold, 1 << (bpp - 8)
//   x6 = left shift used for saturation, 16 - bpp
//   x7 = max pixel value, (1 << bpp) - 1
// \push != 0 spills/restores d8-d15 (callee-saved halves of v8-v15,
// which the wd=16 core clobbers) and calls; otherwise it tail-jumps.
    416 .macro bpp_frontend func, bpp, push
    417 function ff_\func\()_\bpp\()_neon, export=1
    418 .if \push
    419        mov             x16, x30
    420        stp             d8,  d9,  [sp, #-0x40]!
    421        stp             d14, d15, [sp, #0x30]
    422        stp             d12, d13, [sp, #0x20]
    423        stp             d10, d11, [sp, #0x10]
    424 .endif
    425        lsl             w2,  w2,  #\bpp - 8
    426        lsl             w3,  w3,  #\bpp - 8
    427        lsl             w4,  w4,  #\bpp - 8
    428        mov             x5,  #1 << (\bpp - 8)
    429        mov             x6,  #16 - \bpp
    430        mov             x7,  #((1 << \bpp) - 1)
    431 .if \push
    432        bl              \func\()_16_neon
    433        ldp             d10, d11, [sp, #0x10]
    434        ldp             d12, d13, [sp, #0x20]
    435        ldp             d14, d15, [sp, #0x30]
    436        ldp             d8,  d9,  [sp], #0x40
    437        ret             x16
    438 .else
    439        b               \func\()_16_neon
    440 .endif
    441 endfunc
    442 .endm
    443 
// Instantiate the 10 and 12 bpp frontends for one core function.
    444 .macro bpp_frontends func, push=0
    445        bpp_frontend    \func, 10, \push
    446        bpp_frontend    \func, 12, \push
    447 .endm
    448 
    449 .macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
    450 function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
    451        mov             x16, x30
    452 .if \push
    453        stp             d8,  d9,  [sp, #-0x40]!
    454        stp             d14, d15, [sp, #0x30]
    455        stp             d12, d13, [sp, #0x20]
    456        stp             d10, d11, [sp, #0x10]
    457 .endif
    458        lsl             w2,  w2,  #\bpp - 8
    459        lsl             w3,  w3,  #\bpp - 8
    460        lsl             w4,  w4,  #\bpp - 8
    461        mov             x5,  #1 << (\bpp - 8)
    462        mov             x6,  #16 - \bpp
    463        mov             x7,  #((1 << \bpp) - 1)
    464        bl              \func\()_\int_suffix\()_16_neon
    465 .ifc \dir,h
    466        add             x0,  x0,  x1, lsl #3
    467 .else
    468        add             x0,  x0,  #16
    469 .endif
    470        bl              \func\()_\int_suffix\()_16_neon
    471 .if \push
    472        ldp             d10, d11, [sp, #0x10]
    473        ldp             d12, d13, [sp, #0x20]
    474        ldp             d14, d15, [sp, #0x30]
    475        ldp             d8,  d9,  [sp], 0x40
    476 .endif
    477        ret             x16
    478 endfunc
    479 .endm
    480 
// Instantiate the 10 and 12 bpp "call the core twice" frontends.
    481 .macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
    482        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
    483        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
    484 .endm
    485 
// Emit ff_vp9_loop_filter_<dir>_<wd1><wd2>_16_<bpp>_neon: filter two
// adjacent 8-pixel edges with (possibly) different widths. w2/w3/w4
// each pack two 8-bit threshold values: the low byte is used for the
// first half, the next byte (extracted into w8/w14/w15) for the
// second half. Each byte is scaled up to the target bit depth before
// the corresponding core call.
    486 .macro bpp_frontend_mix2 wd1, wd2, dir, bpp
    487 function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
    488        mov             x16, x30
    489        lsr             w8,  w2,  #8
    490        lsr             w14, w3,  #8
    491        lsr             w15, w4,  #8
    492        and             w2,  w2,  #0xff
    493        and             w3,  w3,  #0xff
    494        and             w4,  w4,  #0xff
    495        lsl             w2,  w2,  #\bpp - 8
    496        lsl             w3,  w3,  #\bpp - 8
    497        lsl             w4,  w4,  #\bpp - 8
    498        mov             x5,  #1 << (\bpp - 8)
    499        mov             x6,  #16 - \bpp
    500        mov             x7,  #((1 << \bpp) - 1)
    501        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
    502 .ifc \dir,h
    503        add             x0,  x0,  x1, lsl #3
    504 .else
    505        add             x0,  x0,  #16
    506 .endif
    507        lsl             w2,  w8,  #\bpp - 8
    508        lsl             w3,  w14, #\bpp - 8
    509        lsl             w4,  w15, #\bpp - 8
    510        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
    511        ret             x16
    512 endfunc
    513 .endm
    514 
// Instantiate the mixed-width frontends for both directions at 10 and
// 12 bpp.
    515 .macro bpp_frontends_mix2 wd1, wd2
    516        bpp_frontend_mix2 \wd1, \wd2, v, 10
    517        bpp_frontend_mix2 \wd1, \wd2, v, 12
    518        bpp_frontend_mix2 \wd1, \wd2, h, 10
    519        bpp_frontend_mix2 \wd1, \wd2, h, 12
    520 .endm
    521 
// Vertical (horizontal-edge) wd=4 filter for 8 pixels at 16 bpp.
// On entry x0 points at the first row below the edge (q0), x1 is the
// stride in bytes; p rows are loaded upwards from x0 - 4*stride.
    522 function vp9_loop_filter_v_4_8_16_neon
    523        mov             x10, x30
    524        sub             x9,  x0,  x1, lsl #2
    525        ld1             {v20.8h}, [x9], x1 // p3
    526        ld1             {v24.8h}, [x0], x1 // q0
    527        ld1             {v21.8h}, [x9], x1 // p2
    528        ld1             {v25.8h}, [x0], x1 // q1
    529        ld1             {v22.8h}, [x9], x1 // p1
    530        ld1             {v26.8h}, [x0], x1 // q2
    531        ld1             {v23.8h}, [x9], x1 // p0
    532        ld1             {v27.8h}, [x0], x1 // q3
// Rewind: x0 back to the q0 row, x9 to the p1 row, for the writeback.
    533        sub             x0,  x0,  x1, lsl #2
    534        sub             x9,  x9,  x1, lsl #1
    535 
    536        loop_filter_4
    537 
// Write back the 4 changed rows: p1, p0 via x9 and q0, q1 via x0.
    538        st1             {v22.8h}, [x9], x1
    539        st1             {v24.8h}, [x0], x1
    540        st1             {v23.8h}, [x9], x1
    541        st1             {v25.8h}, [x0], x1
// Leave x0 back at the q0 row for callers that reposition it.
    542        sub             x0,  x0,  x1, lsl #1
    543 
    544        ret             x10
    545 endfunc
    546 
// Public ff_vp9_loop_filter_v_4_8_{10,12}_neon wrappers.
    547 bpp_frontends vp9_loop_filter_v_4_8
    548 
// Horizontal (vertical-edge) wd=4 filter for 8 rows at 16 bpp: load
// 8x8 pixels straddling the edge (x0 - 8 bytes = 4 pixels left of it),
// transpose so p3..q3 columns become registers, filter, then transpose
// the changed middle 4 columns back and store them.
    549 function vp9_loop_filter_h_4_8_16_neon
    550        mov             x10, x30
    551        sub             x9,  x0,  #8
    552        add             x0,  x9,  x1, lsl #2
    553        ld1             {v20.8h}, [x9], x1
    554        ld1             {v24.8h}, [x0], x1
    555        ld1             {v21.8h}, [x9], x1
    556        ld1             {v25.8h}, [x0], x1
    557        ld1             {v22.8h}, [x9], x1
    558        ld1             {v26.8h}, [x0], x1
    559        ld1             {v23.8h}, [x9], x1
    560        ld1             {v27.8h}, [x0], x1
    561 
// Rewind both pointers to the first loaded row / restore x0.
    562        sub             x9,  x9,  x1, lsl #2
    563        sub             x0,  x0,  x1, lsl #3
    564        add             x0,  x0,  #8
    565 
    566        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    567 
    568        loop_filter_4
    569 
    570        // Move x9 forward by 2 pixels; we don't need to rewrite the
    571        // outermost 2 pixels since they aren't changed.
    572        add             x9,  x9,  #4
    573        add             x0,  x9,  x1, lsl #2
    574 
    575        // We only will write the mid 4 pixels back; after the loop filter,
    576        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
    577        // We need to transpose them to columns, done with a 4x8 transpose
    578        // (which in practice is two 4x4 transposes of the two 4x4 halves
    579        // of the 8x4 pixels; into 4x8 pixels).
    580        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
    581        st1             {v22.d}[0], [x9], x1
    582        st1             {v22.d}[1], [x0], x1
    583        st1             {v23.d}[0], [x9], x1
    584        st1             {v23.d}[1], [x0], x1
    585        st1             {v24.d}[0], [x9], x1
    586        st1             {v24.d}[1], [x0], x1
    587        st1             {v25.d}[0], [x9], x1
    588        st1             {v25.d}[1], [x0], x1
    589        sub             x0,  x0,  x1, lsl #3
    590        add             x0,  x0,  #4
    591 
    592        ret             x10
    593 endfunc
    594 
// Public ff_vp9_loop_filter_h_4_8_{10,12}_neon wrappers.
    595 bpp_frontends vp9_loop_filter_h_4_8
    596 
// Vertical wd=8 filter for 8 pixels at 16 bpp. The normal path writes
// back the 6 changed rows p2..q2. Label 6: is the alternate return
// target installed in x13 by loop_filter_8: when flat8in was not
// needed, only the inner 4 rows p1..q1 are written back.
    597 function vp9_loop_filter_v_8_8_16_neon
    598        mov             x10, x30
    599        sub             x9,  x0,  x1, lsl #2
    600        ld1             {v20.8h}, [x9], x1 // p3
    601        ld1             {v24.8h}, [x0], x1 // q0
    602        ld1             {v21.8h}, [x9], x1 // p2
    603        ld1             {v25.8h}, [x0], x1 // q1
    604        ld1             {v22.8h}, [x9], x1 // p1
    605        ld1             {v26.8h}, [x0], x1 // q2
    606        ld1             {v23.8h}, [x9], x1 // p0
    607        ld1             {v27.8h}, [x0], x1 // q3
// Rewind: x0 back to the q0 row, x9 to the p2 row.
    608        sub             x9,  x9,  x1, lsl #2
    609        sub             x0,  x0,  x1, lsl #2
    610        add             x9,  x9,  x1
    611 
    612        loop_filter_8
    613 
    614        st1             {v21.8h}, [x9], x1
    615        st1             {v24.8h}, [x0], x1
    616        st1             {v22.8h}, [x9], x1
    617        st1             {v25.8h}, [x0], x1
    618        st1             {v23.8h}, [x9], x1
    619        st1             {v26.8h}, [x0], x1
    620        sub             x0,  x0,  x1, lsl #1
    621        sub             x0,  x0,  x1
    622 
    623        ret             x10
    624 6:
// Early-out writeback (via ret x13): inner 4 rows only.
    625        sub             x9,  x0,  x1, lsl #1
    626        st1             {v22.8h}, [x9], x1
    627        st1             {v24.8h}, [x0], x1
    628        st1             {v23.8h}, [x9], x1
    629        st1             {v25.8h}, [x0], x1
    630        sub             x0,  x0,  x1, lsl #1
    631        ret             x10
    632 endfunc
    633 
// Public ff_vp9_loop_filter_v_8_8_{10,12}_neon wrappers.
    634 bpp_frontends vp9_loop_filter_v_8_8
    635 
// Horizontal wd=8 filter: load 8x8 pixels around the edge, transpose,
// filter; the normal path transposes all 8 columns back and stores
// full rows. Label 6: is the x13 early-out target: when flat8in was
// not needed, reuse the same inner-4-pixel writeback as the wd=4
// horizontal filter.
    636 function vp9_loop_filter_h_8_8_16_neon
    637        mov             x10, x30
    638        sub             x9,  x0,  #8
    639        add             x0,  x9,  x1, lsl #2
    640        ld1             {v20.8h}, [x9], x1
    641        ld1             {v24.8h}, [x0], x1
    642        ld1             {v21.8h}, [x9], x1
    643        ld1             {v25.8h}, [x0], x1
    644        ld1             {v22.8h}, [x9], x1
    645        ld1             {v26.8h}, [x0], x1
    646        ld1             {v23.8h}, [x9], x1
    647        ld1             {v27.8h}, [x0], x1
    648 
    649        sub             x9,  x9,  x1, lsl #2
    650        sub             x0,  x0,  x1, lsl #3
    651        add             x0,  x0,  #8
    652 
    653        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    654 
    655        loop_filter_8
    656 
    657        add             x0,  x9,  x1, lsl #2
    658 
    659        // Even though only 6 pixels per row have been changed, we write the
    660        // full 8 pixel registers.
    661        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    662 
    663        st1             {v20.8h}, [x9], x1
    664        st1             {v24.8h}, [x0], x1
    665        st1             {v21.8h}, [x9], x1
    666        st1             {v25.8h}, [x0], x1
    667        st1             {v22.8h}, [x9], x1
    668        st1             {v26.8h}, [x0], x1
    669        st1             {v23.8h}, [x9], x1
    670        st1             {v27.8h}, [x0], x1
    671        sub             x0,  x0,  x1, lsl #3
    672        add             x0,  x0,  #8
    673 
    674        ret             x10
    675 6:
    676        // If we didn't need to do the flat8in part, we use the same writeback
    677        // as in loop_filter_h_4_8.
    678        add             x9,  x9,  #4
    679        add             x0,  x9,  x1, lsl #2
    680        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
    681        st1             {v22.d}[0], [x9], x1
    682        st1             {v22.d}[1], [x0], x1
    683        st1             {v23.d}[0], [x9], x1
    684        st1             {v23.d}[1], [x0], x1
    685        st1             {v24.d}[0], [x9], x1
    686        st1             {v24.d}[1], [x0], x1
    687        st1             {v25.d}[0], [x9], x1
    688        st1             {v25.d}[1], [x0], x1
    689        sub             x0,  x0,  x1, lsl #3
    690        add             x0,  x0,  #4
    691        ret             x10
    693 
               // Instantiate the per-bit-depth frontends for the h_8_8 filter, and the
               // mixed-width (4/8 in each direction) frontends via bpp_frontends_mix2.
               // Both macros are defined earlier in this file, outside this excerpt;
               // verify the exact expansion there.
    694 bpp_frontends vp9_loop_filter_h_8_8
    695 
    696 bpp_frontends_mix2 4, 4
    697 bpp_frontends_mix2 4, 8
    698 bpp_frontends_mix2 8, 4
    699 bpp_frontends_mix2 8, 8
    700 
    701 function vp9_loop_filter_v_16_8_16_neon
               // Loop filter across a horizontal 16-pixel edge ("v" variant), 8 columns
               // of 16-bit (high bit depth) pixels per call.
               // In:  x0 = pointer to the first row below the edge (q0)
               //      x1 = stride in bytes
               //      w2/w3/w4 = E/I/H thresholds (consumed by loop_filter_16)
               // Save the return address: loop_filter_16 branches to the local
               // labels 8:/7: below on its early-out paths, so we return via x10.
    702        mov             x10, x30
               // Load 16 rows centered on the edge: p7..p0 into v16-v23 (x9 walks
               // down from x0 - 8*stride), q0..q7 into v24-v31 (x0 walks down).
    703        sub             x9,  x0,  x1, lsl #3
    704        ld1             {v16.8h}, [x9], x1 // p7
    705        ld1             {v24.8h}, [x0], x1 // q0
    706        ld1             {v17.8h}, [x9], x1 // p6
    707        ld1             {v25.8h}, [x0], x1 // q1
    708        ld1             {v18.8h}, [x9], x1 // p5
    709        ld1             {v26.8h}, [x0], x1 // q2
    710        ld1             {v19.8h}, [x9], x1 // p4
    711        ld1             {v27.8h}, [x0], x1 // q3
    712        ld1             {v20.8h}, [x9], x1 // p3
    713        ld1             {v28.8h}, [x0], x1 // q4
    714        ld1             {v21.8h}, [x9], x1 // p2
    715        ld1             {v29.8h}, [x0], x1 // q5
    716        ld1             {v22.8h}, [x9], x1 // p1
    717        ld1             {v30.8h}, [x0], x1 // q6
    718        ld1             {v23.8h}, [x9], x1 // p0
    719        ld1             {v31.8h}, [x0], x1 // q7
               // Rewind: x0 back to q0's row, x9 to x0 - 7*stride (the p6 row,
               // the first row the full-width writeback below touches).
    720        sub             x9,  x9,  x1, lsl #3
    721        sub             x0,  x0,  x1, lsl #3
    722        add             x9,  x9,  x1
    723 
    724        loop_filter_16
    725 
    726        // If we did the flat8out part, we get the output in
    727        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
    728        // store v2-v9 there, and v10-v17 into x0.
               // 7 rows each side: p6..p0 from x9, q0..q6 from x0 (p7/q7 unchanged).
    729        st1             {v2.8h},  [x9], x1
    730        st1             {v10.8h}, [x0], x1
    731        st1             {v3.8h},  [x9], x1
    732        st1             {v11.8h}, [x0], x1
    733        st1             {v4.8h},  [x9], x1
    734        st1             {v12.8h}, [x0], x1
    735        st1             {v5.8h},  [x9], x1
    736        st1             {v13.8h}, [x0], x1
    737        st1             {v6.8h},  [x9], x1
    738        st1             {v14.8h}, [x0], x1
    739        st1             {v8.8h},  [x9], x1
    740        st1             {v15.8h}, [x0], x1
    741        st1             {v9.8h},  [x9], x1
    742        st1             {v17.8h}, [x0], x1
               // Restore x0 to its entry value (q0's row).
    743        sub             x0,  x0,  x1, lsl #3
    744        add             x0,  x0,  x1
    745 
    746        ret             x10
               // Early-out, entered from inside loop_filter_16: flat8out was skipped,
               // only p2..q2 changed; write 3 rows each side.
    747 8:
    748        add             x9,  x9,  x1, lsl #2
    749        // If we didn't do the flat8out part, the output is left in the
    750        // input registers.
    751        st1             {v21.8h}, [x9], x1
    752        st1             {v24.8h}, [x0], x1
    753        st1             {v22.8h}, [x9], x1
    754        st1             {v25.8h}, [x0], x1
    755        st1             {v23.8h}, [x9], x1
    756        st1             {v26.8h}, [x0], x1
    757        sub             x0,  x0,  x1, lsl #1
    758        sub             x0,  x0,  x1
    759        ret             x10
               // Narrowest early-out: only p1..q1 changed; write 2 rows each side.
    760 7:
    761        sub             x9,  x0,  x1, lsl #1
    762        st1             {v22.8h}, [x9], x1
    763        st1             {v24.8h}, [x0], x1
    764        st1             {v23.8h}, [x9], x1
    765        st1             {v25.8h}, [x0], x1
    766        sub             x0,  x0,  x1, lsl #1
    767        ret             x10
    768 endfunc
    769 
               // Per-bit-depth frontends for the 8-wide vertical 16-tap filter, plus a
               // repeated (16-wide) variant via bpp_frontends_rep. push=1 presumably
               // makes the wrappers save v8-v15, which loop_filter_16's output uses —
               // the macros are defined earlier in this file; verify there.
    770 bpp_frontends vp9_loop_filter_v_16_8, push=1
    771 bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1
    772 
    773 function vp9_loop_filter_h_16_8_16_neon
               // Loop filter across a vertical 16-pixel edge ("h" variant), 8 rows of
               // 16-bit (high bit depth) pixels.
               // In:  x0 = pointer to the first pixel right of the edge
               //      x1 = stride in bytes
               //      w2/w3/w4 = E/I/H thresholds (consumed by loop_filter_16)
               // Save the return address: loop_filter_16 branches to the local
               // labels 8:/7: below on its early-out paths, so we return via x10.
    774        mov             x10, x30
               // Read a 16x8 block of halfwords straddling the edge: x9 starts
               // 16 bytes (8 pixels) left of the edge. Left 8x8 half (p7..p0 per
               // row) lands in v16-v23, right half (q0..q7) in v24-v31.
    775        sub             x9,  x0,  #16
    776        ld1             {v16.8h}, [x9], x1
    777        ld1             {v24.8h}, [x0], x1
    778        ld1             {v17.8h}, [x9], x1
    779        ld1             {v25.8h}, [x0], x1
    780        ld1             {v18.8h}, [x9], x1
    781        ld1             {v26.8h}, [x0], x1
    782        ld1             {v19.8h}, [x9], x1
    783        ld1             {v27.8h}, [x0], x1
    784        ld1             {v20.8h}, [x9], x1
    785        ld1             {v28.8h}, [x0], x1
    786        ld1             {v21.8h}, [x9], x1
    787        ld1             {v29.8h}, [x0], x1
    788        ld1             {v22.8h}, [x9], x1
    789        ld1             {v30.8h}, [x0], x1
    790        ld1             {v23.8h}, [x9], x1
    791        ld1             {v31.8h}, [x0], x1
               // Rewind both pointers to the top of the block.
    792        sub             x0,  x0,  x1, lsl #3
    793        sub             x9,  x9,  x1, lsl #3
    794 
    795        // The 16x8 pixels read above is in two 8x8 blocks; the left
    796        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
    797        // of this, to get one column per register.
    798        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
    799        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
    800 
    801        loop_filter_16
    802 
               // Full flat8out output is in v2-v17 (skipping v7/v16); transpose back
               // to row order together with the unchanged p7 (v16) and q7 (v31)
               // columns, then store the full 16x8 block.
    803        transpose_8x8H  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
    804        transpose_8x8H  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1
    805 
    806        st1             {v16.8h}, [x9], x1
    807        st1             {v10.8h}, [x0], x1
    808        st1             {v2.8h},  [x9], x1
    809        st1             {v11.8h}, [x0], x1
    810        st1             {v3.8h},  [x9], x1
    811        st1             {v12.8h}, [x0], x1
    812        st1             {v4.8h},  [x9], x1
    813        st1             {v13.8h}, [x0], x1
    814        st1             {v5.8h},  [x9], x1
    815        st1             {v14.8h}, [x0], x1
    816        st1             {v6.8h},  [x9], x1
    817        st1             {v15.8h}, [x0], x1
    818        st1             {v8.8h},  [x9], x1
    819        st1             {v17.8h}, [x0], x1
    820        st1             {v9.8h},  [x9], x1
    821        st1             {v31.8h}, [x0], x1
               // Restore x0 to its entry value.
    822        sub             x0,  x0,  x1, lsl #3
    823 
    824        ret             x10
               // Early-out, entered from inside loop_filter_16: flat8out skipped,
               // results still in v20-v27; 8-pixel-wide writeback.
    825 8:
    826        // The same writeback as in loop_filter_h_8_8
    827        sub             x9,  x0,  #8
    828        add             x0,  x9,  x1, lsl #2
    829        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    830 
    831        st1             {v20.8h}, [x9], x1
    832        st1             {v24.8h}, [x0], x1
    833        st1             {v21.8h}, [x9], x1
    834        st1             {v25.8h}, [x0], x1
    835        st1             {v22.8h}, [x9], x1
    836        st1             {v26.8h}, [x0], x1
    837        st1             {v23.8h}, [x9], x1
    838        st1             {v27.8h}, [x0], x1
    839        sub             x0,  x0,  x1, lsl #3
    840        add             x0,  x0,  #8
    841        ret             x10
               // Narrowest early-out: only p1..q1 (v22-v25) changed; 4-pixel-wide
               // writeback.
    842 7:
    843        // The same writeback as in loop_filter_h_4_8
    844        sub             x9,  x0,  #4
    845        add             x0,  x9,  x1, lsl #2
    846        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
    847        st1             {v22.d}[0], [x9], x1
    848        st1             {v22.d}[1], [x0], x1
    849        st1             {v23.d}[0], [x9], x1
    850        st1             {v23.d}[1], [x0], x1
    851        st1             {v24.d}[0], [x9], x1
    852        st1             {v24.d}[1], [x0], x1
    853        st1             {v25.d}[0], [x9], x1
    854        st1             {v25.d}[1], [x0], x1
    855        sub             x0,  x0,  x1, lsl #3
    856        add             x0,  x0,  #4
    857        ret             x10
    858 endfunc
    859 
               // Per-bit-depth frontends for the 8-row horizontal 16-tap filter, plus a
               // repeated (16-row) variant via bpp_frontends_rep — macros defined
               // earlier in this file, outside this excerpt; verify the expansion there.
    860 bpp_frontends vp9_loop_filter_h_16_8, push=1
    861 bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1