tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

vp9lpf_16bpp_neon.S (39254B)


/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

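@ This file implements the VP9 loop filters for 10 and 12 bpp, operating
@ on 16 bit intermediates. Vertical (v) and horizontal (h) edges are
@ handled at filter widths 4, 8 and 16, plus "mix2" variants that filter
@ two adjacent 8 pixel edges with independently packed parameters.
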
.macro transpose16_q_8x8 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
        vswp             \r1,  \r8  @ vtrn.64 \rq0, \rq4
        vswp             \r3,  \r10 @ vtrn.64 \rq1, \rq5
        vswp             \r5,  \r12 @ vtrn.64 \rq2, \rq6
        vswp             \r7,  \r14 @ vtrn.64 \rq3, \rq7
        vtrn.32          \rq0, \rq2
        vtrn.32          \rq1, \rq3
        vtrn.32          \rq4, \rq6
        vtrn.32          \rq5, \rq7
        vtrn.16          \rq0, \rq1
        vtrn.16          \rq2, \rq3
        vtrn.16          \rq4, \rq5
        vtrn.16          \rq6, \rq7
.endm

.macro transpose16_4x4 r0, r1, r2, r3
        vtrn.32          \r0, \r2
        vtrn.32          \r1, \r3
        vtrn.16          \r0, \r1
        vtrn.16          \r2, \r3
.endm
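
@ In the 4x4 transposes above, the vtrn.32 pass swaps the off-diagonal
@ 2x2 blocks and the vtrn.16 pass then transposes within each 2x2 block,
@ leaving column i of the input in row i.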

@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1
.macro transpose16_q_4x4 rq0, rq1, r0, r1, r2, r3
        vtrn.32         \rq0, \rq1
        vtrn.16         \r0,  \r1
        vtrn.16         \r2,  \r3
.endm

@ The input to and output from this macro are in the registers q8-q15,
@ and q0-q7 are used as scratch registers.
@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
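@
@ As a rough sketch of the decisions made below (see the inline comments
@ for the exact instruction flow):
@   fm      = max(|p3-p2|, |p2-p1|, |p1-p0|, |q0-q1|, |q1-q2|, |q2-q3|) <= I
@             && |p0-q0| * 2 + (|p1-q1| >> 1) <= E
@   hev     = max(|p1-p0|, |q1-q0|) > H
@   flat8in = max(|p3-p0|, |p2-p0|, |p1-p0|, |q1-q0|, |q2-q0|, |q3-q0|) <= F
@ where F denotes the flatness threshold passed in r5; only pixels with
@ fm set are filtered at all.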
.macro loop_filter_q wd
        vdup.u16        q0,  r2          @ E
        vdup.u16        q1,  r3          @ I

        vabd.u16        q2,  q8,  q9     @ abs(p3 - p2)
        vabd.u16        q3,  q9,  q10    @ abs(p2 - p1)
        vabd.u16        q4,  q10, q11    @ abs(p1 - p0)
        vabd.u16        q5,  q12, q13    @ abs(q0 - q1)
        vabd.u16        q6,  q13, q14    @ abs(q1 - q2)
        vabd.u16        q7,  q14, q15    @ abs(q2 - q3)
        vmax.u16        q2,  q2,  q3
        vmax.u16        q3,  q4,  q5
        vmax.u16        q4,  q6,  q7
        vabd.u16        q5,  q11, q12    @ abs(p0 - q0)
        vmax.u16        q2,  q2,  q3
        vadd.u16        q5,  q5,  q5     @ abs(p0 - q0) * 2
        vabd.u16        q6,  q10, q13    @ abs(p1 - q1)
        vmax.u16        q2,  q2,  q4     @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u16        q6,  q6,  #1
        vcle.u16        q2,  q2,  q1     @ max(abs()) <= I
        vadd.u16        q5,  q5,  q6     @ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
        vcle.u16        q5,  q5,  q0
        vand            q2,  q2,  q5     @ fm

        vmovn.u16       d10, q2
        vmov            r8,  r9,  d10
        orrs            r8,  r8,  r9
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

.if \wd >= 8
        vdup.u16        q0,  r5

        vabd.u16        q1,  q8,  q11    @ abs(p3 - p0)
        vabd.u16        q3,  q9,  q11    @ abs(p2 - p0)
        vabd.u16        q4,  q10, q11    @ abs(p1 - p0)
        vabd.u16        q5,  q13, q12    @ abs(q1 - q0)
        vabd.u16        q6,  q14, q12    @ abs(q2 - q0)
        vabd.u16        q7,  q15, q12    @ abs(q3 - q0)
        vmax.u16        q1,  q1,  q3
        vmax.u16        q4,  q4,  q5
        vmax.u16        q6,  q6,  q7
        @ The rest of the calculation of flat8in is interleaved below
.endif

        @ Calculate the normal inner loop filter for 2 or 4 pixels
        vabd.u16        q3,  q10, q11    @ abs(p1 - p0)
.if \wd == 8
        vmax.u16        q1,  q1,  q4
.endif
        vabd.u16        q4,  q13, q12    @ abs(q1 - q0)
.if \wd == 8
        vmax.u16        q1,  q1,  q6
.endif

        vsub.u16        q5,  q10, q13    @ p1 - q1
        vmax.u16        q3,  q3,  q4     @ max(abs(p1 - p0), abs(q1 - q0))
        vdup.u16        q4,  r4          @ H
        vsub.u16        q6,  q12, q11    @ q0 - p0
.if \wd == 8
        vcle.u16        q1,  q1,  q0     @ flat8in
.endif
        vdup.u16        q0,  r6          @ left shift for saturation
        vcle.u16        q3,  q3,  q4     @ !hev
.if \wd == 8
        vand            q1,  q1,  q2     @ flat8in && fm
.endif
        vneg.s16        q4,  q0          @ negative left shift after saturation
        vqshl.s16       q5,  q5,  q0
.if \wd == 8
        vbic            q2,  q2,  q1     @ fm && !flat8in
.endif
        vmov.s16        q7,  #3
        vand            q3,  q3,  q2     @ !hev && fm && !flat8in
        vshl.s16        q5,  q5,  q4     @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1)

        vmul.s16        q6,  q6,  q7     @ 3 * (q0 - p0)
        vbic            q5,  q5,  q3     @ if (!hev) av_clip_int2p = 0
        vadd.s16        q6,  q6,  q5     @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
        vmov.s16        q5,  #4
        vqshl.s16       q6,  q6,  q0
        vmov.s16        q0,  #3
        vshl.s16        q6,  q6,  q4     @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        vdup.u16        q4,  r7          @ max pixel value

        vshr.u16        q4,  q4,  #1     @ (1 << (BIT_DEPTH - 1)) - 1

        vadd.s16        q5,  q6,  q5     @ f + 4
        vadd.s16        q0,  q6,  q0     @ f + 3
        vmov.s16        q6,  #0
        vmin.s16        q5,  q5,  q4     @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        vmin.s16        q0,  q0,  q4     @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        vdup.u16        q4,  r7          @ max pixel value
        vshr.s16        q5,  q5,  #3     @ f1
        vshr.s16        q0,  q0,  #3     @ f2

        vadd.s16        q0,  q11, q0     @ p0 + f2
        vsub.s16        q7,  q12, q5     @ q0 - f1
        vmin.s16        q0,  q0,  q4
        vmin.s16        q7,  q7,  q4
        vrshr.s16       q5,  q5,  #1     @ f = (f1 + 1) >> 1
        vmax.s16        q0,  q0,  q6     @ out p0
        vmax.s16        q7,  q7,  q6     @ out q0
        vbit            q11, q0,  q2     @ if (fm && !flat8in)
        vbit            q12, q7,  q2
.if \wd >= 8
        vmovn.u16       d4,  q1
.endif

        vadd.s16        q0,  q10, q5     @ p1 + f
        vsub.s16        q7,  q13, q5     @ q1 - f
.if \wd >= 8
        vmov            r8,  r9,  d4
.endif
        vmin.s16        q0,  q0,  q4
        vmin.s16        q7,  q7,  q4
.if \wd >= 8
        orrs            r8,  r8,  r9
.endif
        vmax.s16        q0,  q0,  q6     @ out p1
        vmax.s16        q7,  q7,  q6     @ out q1
        vbit            q10, q0,  q3     @ if (!hev && fm && !flat8in)
        vbit            q13, q7,  q3

.if \wd >= 8
        @ If no pixels need flat8in, jump to a writeout of the inner 4 pixels
        beq             6f

        @ flat8in
        vadd.u16        q2,  q8,  q9
        vadd.u16        q3,  q10, q13
        vadd.u16        q4,  q8,  q10
        vadd.u16        q5,  q11, q14
        vadd.u16        q0,  q2,  q2
        vadd.u16        q0,  q0,  q11
        vadd.u16        q0,  q0,  q12
        vadd.u16        q0,  q0,  q4
        vsub.s16        q3,  q3,  q2
        vsub.s16        q5,  q5,  q4
        vrshr.u16       q6,  q0,  #3     @ out p2

        vadd.u16        q0,  q0,  q3
        vadd.u16        q2,  q8,  q11
        vadd.u16        q3,  q12, q15
        vrshr.u16       q7,  q0,  #3     @ out p1

        vadd.u16        q0,  q0,  q5
        vsub.s16        q3,  q3,  q2
        vadd.u16        q4,  q9,  q12
        vbit            q9,  q6,  q1
        vadd.u16        q5,  q13, q15
        vrshr.u16       q6,  q0,  #3     @ out p0

        vadd.u16        q0,  q0,  q3
        vsub.s16        q5,  q5,  q4
        vadd.u16        q2,  q10, q13
        vbit            q10, q7,  q1
        vadd.u16        q3,  q14, q15
        vrshr.u16       q7,  q0,  #3     @ out q0

        vadd.u16        q0,  q0,  q5
        vsub.s16        q3,  q3,  q2
        vbit            q11, q6,  q1
        vrshr.u16       q6,  q0,  #3     @ out q1

        vadd.u16        q0,  q0,  q3
        vbit            q12, q7,  q1
        vrshr.u16       q7,  q0,  #3     @ out q2
        vbit            q13, q6,  q1
        vbit            q14, q7,  q1
.endif
.endm

@ The input to and output from this macro are in the registers d16-d31,
@ and d0-d7 are used as scratch registers.
@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
@ Depending on the width of the loop filter, we either use d16-d19
@ and d28-d31 as temp registers, or d8-d15.
@ In practice, this is only ever instantiated once, so the macro parameters
@ could be hardcoded, but they are kept as is, for similarity with the
@ 8 bpp and aarch64 versions.
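@
@ For wd == 16, the macro additionally computes (again as a sketch)
@   flat8out = max(|p7-p0|, ..., |p4-p0|, |q4-q0|, ..., |q7-q0|) <= F
@ and uses the fm, flat8in and flat8out masks to choose, per pixel,
@ between the 4, 8 and 16 wide filtered outputs.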
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        vdup.u16        d0,  r2          @ E
        vdup.u16        d2,  r3          @ I

        vabd.u16        d4,  d20, d21    @ abs(p3 - p2)
        vabd.u16        d5,  d21, d22    @ abs(p2 - p1)
        vabd.u16        d6,  d22, d23    @ abs(p1 - p0)
        vabd.u16        d7,  d24, d25    @ abs(q0 - q1)
        vabd.u16        \tmp1,  d25, d26 @ abs(q1 - q2)
        vabd.u16        \tmp2,  d26, d27 @ abs(q2 - q3)
        vmax.u16        d4,  d4,  d5
        vmax.u16        d5,  d6,  d7
        vmax.u16        \tmp1,  \tmp1,  \tmp2
        vabd.u16        d6,  d23, d24    @ abs(p0 - q0)
        vmax.u16        d4,  d4,  d5
        vadd.u16        d6,  d6,  d6     @ abs(p0 - q0) * 2
        vabd.u16        d5,  d22, d25    @ abs(p1 - q1)
        vmax.u16        d4,  d4,  \tmp1  @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u16        d5,  d5,  #1
        vcle.u16        d4,  d4,  d2     @ max(abs()) <= I
        vadd.u16        d6,  d6,  d5     @ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
        vcle.u16        d6,  d6,  d0
        vand            d4,  d4,  d6     @ fm

        vdup.u16        d3,  r4          @ H
        vmov            r8,  r9,  d4
        orrs            r8,  r8,  r9
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

.if \wd >= 8
        vdup.u16        d0,  r5

        vabd.u16        d6,  d20, d23    @ abs(p3 - p0)
        vabd.u16        d2,  d21, d23    @ abs(p2 - p0)
        vabd.u16        d1,  d22, d23    @ abs(p1 - p0)
        vabd.u16        \tmp1,  d25, d24 @ abs(q1 - q0)
        vabd.u16        \tmp2,  d26, d24 @ abs(q2 - q0)
        vabd.u16        \tmp3,  d27, d24 @ abs(q3 - q0)
        vmax.u16        d6,  d6,  d2
        vmax.u16        d1,  d1,  \tmp1
        vmax.u16        \tmp2,  \tmp2,  \tmp3
.if \wd == 16
        vabd.u16        d7,  d16, d23    @ abs(p7 - p0)
        vmax.u16        d6,  d6,  d1
        vabd.u16        d2,  d17, d23    @ abs(p6 - p0)
        vmax.u16        d6,  d6,  \tmp2
        vabd.u16        d1,  d18, d23    @ abs(p5 - p0)
        vcle.u16        d6,  d6,  d0     @ flat8in
        vabd.u16        d8,  d19, d23    @ abs(p4 - p0)
        vand            d6,  d6,  d4     @ flat8in && fm
        vabd.u16        d9,  d28, d24    @ abs(q4 - q0)
        vbic            d4,  d4,  d6     @ fm && !flat8in
        vabd.u16        d10, d29, d24    @ abs(q5 - q0)
        vabd.u16        d11, d30, d24    @ abs(q6 - q0)
        vabd.u16        d12, d31, d24    @ abs(q7 - q0)

        vmax.u16        d7,  d7,  d2
        vmax.u16        d1,  d1,  d8
        vmax.u16        d9,  d9,  d10
        vmax.u16        d11, d11, d12
        @ The rest of the calculation of flat8out is interleaved below
.else
        @ The rest of the calculation of flat8in is interleaved below
.endif
.endif

        @ Calculate the normal inner loop filter for 2 or 4 pixels
        vabd.u16        d5,  d22, d23           @ abs(p1 - p0)
.if \wd == 16
        vmax.u16        d7,  d7,  d1
        vmax.u16        d9,  d9,  d11
.elseif \wd == 8
        vmax.u16        d6,  d6,  d1
.endif
        vabd.u16        d1,  d25, d24           @ abs(q1 - q0)
.if \wd == 16
        vmax.u16        d7,  d7,  d9
.elseif \wd == 8
        vmax.u16        d6,  d6,  \tmp2
.endif
        vdup.u16        \tmp2,  r6              @ left shift for saturation
        vsub.u16        \tmp1,  d22, d25        @ p1 - q1
        vneg.s16        \tmp6,  \tmp2           @ negative left shift after saturation
        vmax.u16        d5,  d5,  d1            @ max(abs(p1 - p0), abs(q1 - q0))
        vsub.u16        \tmp3,   d24, d23       @ q0 - p0
        vmov.s16        \tmp5,  #3
.if \wd == 8
        vcle.u16        d6,  d6,  d0            @ flat8in
.endif
        vcle.u16        d5,  d5,  d3            @ !hev
.if \wd == 8
        vand            d6,  d6,  d4            @ flat8in && fm
.endif
        vqshl.s16       \tmp1,  \tmp1,  \tmp2
.if \wd == 16
        vcle.u16        d7,  d7,  d0            @ flat8out
.elseif \wd == 8
        vbic            d4,  d4,  d6            @ fm && !flat8in
.endif
        vand            d5,  d5,  d4            @ !hev && fm && !flat8in
.if \wd == 16
        vand            d7,  d7,  d6            @ flat8out && flat8in && fm
.endif
        vshl.s16        \tmp1,  \tmp1,  \tmp6   @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1)

        vmul.s16        \tmp3,  \tmp3,  \tmp5   @ 3 * (q0 - p0)
        vbic            \tmp1,  \tmp1,   d5     @ if (!hev) av_clip_int2p = 0
        vmov.s16        d2,  #4
        vadd.s16        \tmp3,  \tmp3,  \tmp1   @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
        vmov.s16        d3,  #3
        vqshl.s16       \tmp1,  \tmp3,  \tmp2
        vmov.s16        \tmp5,  #0
        vshl.s16        \tmp1,  \tmp1,  \tmp6   @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        vdup.u16        \tmp6,  r7              @ max pixel value
.if \wd == 16
        vbic            d6,  d6,  d7            @ fm && flat8in && !flat8out
.endif

        vshr.u16        \tmp2,  \tmp6,  #1      @ (1 << (BIT_DEPTH - 1)) - 1

        vadd.s16        \tmp3,  \tmp1,  d2      @ f + 4
        vadd.s16        \tmp4,  \tmp1,  d3      @ f + 3
        vmin.s16        \tmp3,  \tmp3,  \tmp2   @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        vmin.s16        \tmp4,  \tmp4,  \tmp2   @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        vshr.s16        \tmp3,  \tmp3,  #3      @ f1
        vshr.s16        \tmp4,  \tmp4,  #3      @ f2

        vadd.s16        d0,  d23, \tmp4         @ p0 + f2
        vsub.s16        d2,  d24, \tmp3         @ q0 - f1
        vmin.s16        d0,  d0,  \tmp6
        vmin.s16        d2,  d2,  \tmp6
        vrshr.s16       \tmp3,  \tmp3,  #1      @ f = (f1 + 1) >> 1
        vmax.s16        d0,  d0,  \tmp5         @ out p0
        vmax.s16        d2,  d2,  \tmp5         @ out q0
        vbit            d23, d0,  d4            @ if (fm && !flat8in)
        vbit            d24, d2,  d4

        vadd.s16        d0,  d22, \tmp3         @ p1 + f
        vsub.s16        d2,  d25, \tmp3         @ q1 - f
.if \wd >= 8
        vmov            r8,  r9,  d6
.endif
        vmin.s16        d0,  d0,  \tmp6
        vmin.s16        d2,  d2,  \tmp6
.if \wd >= 8
        orrs            r8,  r8,  r9
.endif
        vmax.s16        d0,  d0,  \tmp5         @ out p1
        vmax.s16        d2,  d2,  \tmp5         @ out q1
        vbit            d22, d0,  d5            @ if (!hev && fm && !flat8in)
        vbit            d25, d2,  d5

.if \wd >= 8
        @ If no pixels need flat8in, jump to flat8out
        @ (or to a writeout of the inner 4 pixels, for wd=8)
        beq             6f

        @ flat8in
        vadd.u16        \tmp1,  d20, d21
        vadd.u16        \tmp3,  d22, d25
        vadd.u16        \tmp5,  d20, d22
        vadd.u16        \tmp7,  d23, d26
        vadd.u16        d0,  \tmp1,  \tmp1
        vadd.u16        d0,  d0,  d23
        vadd.u16        d0,  d0,  d24
        vadd.u16        d0,  d0,  \tmp5
        vsub.s16        \tmp3,  \tmp3,  \tmp1
        vsub.s16        \tmp7,  \tmp7,  \tmp5
        vrshr.u16       d2,  d0,  #3            @ out p2

        vadd.u16        d0,  d0,  \tmp3
        vadd.u16        \tmp1,  d20, d23
        vadd.u16        \tmp3,  d24, d27
        vrshr.u16       d3,  d0,  #3            @ out p1

        vadd.u16        d0,  d0,  \tmp7
        vsub.s16        \tmp3,  \tmp3,  \tmp1
        vadd.u16        \tmp5,  d21, d24
        vadd.u16        \tmp7,  d25, d27
        vrshr.u16       d4,  d0,  #3            @ out p0

        vadd.u16        d0,  d0,  \tmp3
        vsub.s16        \tmp7,  \tmp7,  \tmp5
        vadd.u16        \tmp1,  d22, d25
        vadd.u16        \tmp3,  d26, d27
        vrshr.u16       d5,  d0,  #3            @ out q0

        vadd.u16        d0,  d0,  \tmp7
        vsub.s16        \tmp3,  \tmp3,  \tmp1
        vrshr.u16       \tmp5,  d0,  #3         @ out q1

        vadd.u16        d0,  d0,  \tmp3
        @ The output here is written back into the input registers. This doesn't
        @ matter for the flat8out part below, since we only update those pixels
        @ which won't be touched below.
        vbit            d21, d2,  d6
        vbit            d22, d3,  d6
        vbit            d23, d4,  d6
        vrshr.u16       \tmp6,  d0,  #3         @ out q2
        vbit            d24, d5,  d6
        vbit            d25, \tmp5,  d6
        vbit            d26, \tmp6,  d6
.endif
.if \wd == 16
6:
        vorr            d2,  d6,  d7
        vmov            r8,  r9,  d2
        orrs            r8,  r8,  r9
        @ If no pixels needed flat8in or flat8out, jump to a
        @ writeout of the inner 4 pixels
        beq             7f
        vmov            r8,  r9,  d7
        orrs            r8,  r8,  r9
        @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        beq             8f

        @ flat8out
        @ This writes all outputs into d2-d17 (skipping d7 and d16).
        @ If this part is skipped, the output is read from d21-d26 (which is the input
        @ to this section).
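        @ Each output below is a (sum + 8) >> 4 average over a sliding
        @ window; the first sum is 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 +
        @ p0 + q0, and every later output updates the running sum in d0 by
        @ adding the pixels that enter the window and subtracting those
        @ that drop out.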
        vshl.u16        d0,  d16, #3  @ 8 * d16
        vsub.u16        d0,  d0,  d16 @ 7 * d16
        vadd.u16        d0,  d0,  d17
        vadd.u16        d8,  d17, d18
        vadd.u16        d10, d19, d20
        vadd.s16        d0,  d0,  d8
        vadd.u16        d8,  d16, d17
        vadd.u16        d12, d21, d22
        vadd.s16        d0,  d0,  d10
        vadd.u16        d10, d18, d25
        vadd.u16        d14, d23, d24
        vsub.s16        d10, d10, d8
        vadd.s16        d0,  d0,  d12
        vadd.s16        d0,  d0,  d14
        vadd.u16        d12, d16, d18
        vadd.u16        d14, d19, d26
        vrshr.u16       d2,  d0,  #4

        vadd.s16        d0,  d0,  d10
        vadd.u16        d8,  d16, d19
        vadd.u16        d10, d20, d27
        vsub.s16        d14, d14, d12
        vbif            d2,  d17, d7
        vrshr.u16       d3,  d0,  #4

        vadd.s16        d0,  d0,  d14
        vadd.u16        d12, d16, d20
        vadd.u16        d14, d21, d28
        vsub.s16        d10, d10, d8
        vbif            d3,  d18, d7
        vrshr.u16       d4,  d0,  #4

        vadd.s16        d0,  d0,  d10
        vadd.u16        d8,  d16, d21
        vadd.u16        d10, d22, d29
        vsub.s16        d14, d14, d12
        vbif            d4,  d19, d7
        vrshr.u16       d5,  d0,  #4

        vadd.s16        d0,  d0,  d14
        vadd.u16        d12, d16, d22
        vadd.u16        d14, d23, d30
        vsub.s16        d10, d10, d8
        vbif            d5,  d20, d7
        vrshr.u16       d6,  d0,  #4

        vadd.s16        d0,  d0,  d10
        vadd.u16        d10, d16, d23
        vsub.s16        d14, d14, d12
        vadd.u16        d12, d24, d31
        vbif            d6,  d21, d7
        vrshr.u16       d8,  d0,  #4

        vadd.s16        d0,  d0,  d14
        vsub.s16        d10, d12, d10
        vadd.u16        d12, d17, d24
        vadd.u16        d14, d25, d31
        vbif            d8,  d22, d7
        vrshr.u16       d9,  d0,  #4

        vadd.s16        d0,  d0,  d10
        vsub.s16        d14, d14, d12
        vadd.u16        d12, d26, d31
        vbif            d9,  d23, d7
        vrshr.u16       d10, d0,  #4

        vadd.s16        d0,  d0,  d14
        vadd.u16        d14, d18, d25
        vadd.u16        d18, d19, d26
        vsub.s16        d12, d12, d14
        vadd.u16        d14, d27, d31
        vbif            d10, d24, d7
        vrshr.u16       d11, d0,  #4

        vadd.s16        d0,  d0,  d12
        vadd.u16        d12, d20, d27
        vsub.s16        d14, d14, d18
        vadd.u16        d18, d28, d31
        vbif            d11, d25, d7
        vsub.s16        d18, d18, d12
        vrshr.u16       d12, d0,  #4

        vadd.s16        d0,  d0,  d14
        vadd.u16        d14, d21, d28
        vadd.u16        d20, d29, d31
        vbif            d12, d26, d7
        vrshr.u16       d13, d0,  #4

        vadd.s16        d0,  d0,  d18
        vsub.s16        d20, d20, d14
        vadd.u16        d18, d22, d29
        vadd.u16        d22, d30, d31
        vbif            d13, d27, d7
        vrshr.u16       d14, d0,  #4

        vadd.s16        d0,  d0,  d20
        vsub.s16        d22, d22, d18
        vbif            d14, d28, d7
        vrshr.u16       d15, d0,  #4

        vadd.s16        d0,  d0,  d22
        vbif            d15, d29, d7
        vrshr.u16       d17, d0,  #4
        vbif            d17, d30, d7
.endif
.endm

.macro loop_filter_q_4
        loop_filter_q   4
.endm

.macro loop_filter_q_8
        loop_filter_q   8
.endm

.macro loop_filter_16
        loop_filter     16, d8,  d9,  d10, d11, d12, d13, d14, d15
.endm


@ The public functions in this file have the following signature:
@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);

.macro bpp_frontend func, bpp
function ff_\func\()_\bpp\()_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]
        vpush           {q4-q7}
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        mov             r6,  #16 - \bpp
        movw            r7,  #((1 << \bpp) - 1)
        bl              \func\()_16_neon
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm
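
@ All the internal *_16_neon functions below take parameters already
@ scaled by these frontends: r2 = E, r3 = I, r4 = H (shifted up to the
@ target bit depth), r5 = the flatness threshold (1 << (bpp - 8)),
@ r6 = the left shift amount used for saturation (16 - bpp) and
@ r7 = the maximum pixel value ((1 << bpp) - 1).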

.macro bpp_frontends func
        bpp_frontend    \func, 10
        bpp_frontend    \func, 12
.endm

.macro bpp_frontend_rep func, suffix, int_suffix, rep, dir, bpp
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]
        vpush           {q4-q7}
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        mov             r6,  #16 - \bpp
        movw            r7,  #((1 << \bpp) - 1)
        bl              \func\()_\int_suffix\()_16_neon
.ifc \dir,h
        add             r0,  r0,  r1, lsl #2
.else
        add             r0,  r0,  #8
.endif
        bl              \func\()_\int_suffix\()_16_neon
.if \rep >= 4
.ifc \dir,h
        add             r0,  r0,  r1, lsl #2
        bl              \func\()_\int_suffix\()_16_neon
        add             r0,  r0,  r1, lsl #2
        bl              \func\()_\int_suffix\()_16_neon
.else
        add             r0,  r0,  #8
        bl              \func\()_\int_suffix\()_16_neon
        add             r0,  r0,  #8
        bl              \func\()_\int_suffix\()_16_neon
.endif
.endif
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm

.macro bpp_frontends_rep func, suffix, int_suffix, rep, dir
        bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 10
        bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 12
.endm

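@ For the mix2 variants, each of E, I and H packs two 8-bit values: the
@ low byte parameterizes the first 8 pixel edge and the high byte the
@ second, which is why the frontend masks the values before the first
@ call and shifts them down before the second.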
.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]
        vpush           {q4-q7}
        push            {r2, r3, r4}
        and             r2,  r2,  #0xff
        and             r3,  r3,  #0xff
        and             r4,  r4,  #0xff
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        mov             r6,  #16 - \bpp
        movw            r7,  #((1 << \bpp) - 1)
        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
.ifc \dir,h
        add             r0,  r0,  r1, lsl #3
.else
        add             r0,  r0,  #16
.endif
        pop             {r2, r3, r4}
        lsr             r2,  r2,  #8
        lsr             r3,  r3,  #8
        lsr             r4,  r4,  #8
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm

.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12
.endm

function vp9_loop_filter_v_4_8_16_neon
        sub             r12, r0,  r1, lsl #2
        vld1.16         {q8},  [r12,:128], r1 @ p3
        vld1.16         {q12}, [r0, :128], r1 @ q0
        vld1.16         {q9},  [r12,:128], r1 @ p2
        vld1.16         {q13}, [r0, :128], r1 @ q1
        vld1.16         {q10}, [r12,:128], r1 @ p1
        vld1.16         {q14}, [r0, :128], r1 @ q2
        vld1.16         {q11}, [r12,:128], r1 @ p0
        vld1.16         {q15}, [r0, :128], r1 @ q3
        sub             r0,  r0,  r1, lsl #2
        sub             r12, r12, r1, lsl #1

        loop_filter_q_4

        vst1.16         {q10}, [r12,:128], r1
        vst1.16         {q12}, [r0, :128], r1
        vst1.16         {q11}, [r12,:128], r1
        vst1.16         {q13}, [r0, :128], r1
        sub             r0,  r0,  r1, lsl #1
9:
        bx              lr
endfunc

bpp_frontends vp9_loop_filter_v_4_8


function vp9_loop_filter_h_4_8_16_neon
        sub             r12, r0,  #8
        add             r0,  r12, r1, lsl #2
        vld1.16         {q8},  [r12,:64], r1
        vld1.16         {q12}, [r0, :64], r1
        vld1.16         {q9},  [r12,:64], r1
        vld1.16         {q13}, [r0, :64], r1
        vld1.16         {q10}, [r12,:64], r1
        vld1.16         {q14}, [r0, :64], r1
        vld1.16         {q11}, [r12,:64], r1
        vld1.16         {q15}, [r0, :64], r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        add             r12, r12, #4
        add             r0,  r0,  #4

        transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        loop_filter_q_4

        @ We will only write the mid 4 pixels back; after the loop filter,
        @ these are in q10, q11, q12, q13, ordered as rows (8x4 pixels).
        @ We need to transpose them to columns, done with a 4x4 transpose
        @ (which in practice is two 4x4 transposes of the two 4x4 halves
        @ of the 8x4 pixels, giving 4x8 pixels).
        transpose16_4x4 q10, q11, q12, q13

        vst1.16         {d20}, [r12], r1
        vst1.16         {d21}, [r0],  r1
        vst1.16         {d22}, [r12], r1
        vst1.16         {d23}, [r0],  r1
        vst1.16         {d24}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        vst1.16         {d26}, [r12], r1
        vst1.16         {d27}, [r0],  r1
        sub             r12, r12, r1, lsl #2
9:
        add             r0,  r12, #4
        bx              lr
endfunc

bpp_frontends vp9_loop_filter_h_4_8


function vp9_loop_filter_v_8_8_16_neon
        sub             r12, r0,  r1, lsl #2
        vld1.16         {q8},  [r12,:128], r1 @ p3
        vld1.16         {q12}, [r0, :128], r1 @ q0
        vld1.16         {q9},  [r12,:128], r1 @ p2
        vld1.16         {q13}, [r0, :128], r1 @ q1
        vld1.16         {q10}, [r12,:128], r1 @ p1
        vld1.16         {q14}, [r0, :128], r1 @ q2
        vld1.16         {q11}, [r12,:128], r1 @ p0
        vld1.16         {q15}, [r0, :128], r1 @ q3
        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        add             r12, r12, r1

        loop_filter_q_8

        vst1.16         {q9},  [r12,:128], r1
        vst1.16         {q12}, [r0, :128], r1
        vst1.16         {q10}, [r12,:128], r1
        vst1.16         {q13}, [r0, :128], r1
        vst1.16         {q11}, [r12,:128], r1
        vst1.16         {q14}, [r0, :128], r1
        sub             r0,  r0,  r1, lsl #1
        sub             r0,  r0,  r1
9:
        bx              lr
6:
        sub             r12, r0,  r1, lsl #1
        vst1.16         {q10}, [r12,:128], r1
        vst1.16         {q12}, [r0, :128], r1
        vst1.16         {q11}, [r12,:128], r1
        vst1.16         {q13}, [r0, :128], r1
        sub             r0,  r0,  r1, lsl #1
        bx              lr
endfunc

bpp_frontends vp9_loop_filter_v_8_8


function vp9_loop_filter_h_8_8_16_neon
        sub             r12, r0,  #8
        add             r0,  r12, r1, lsl #2
        vld1.16         {q8},  [r12,:64], r1
        vld1.16         {q12}, [r0, :64], r1
        vld1.16         {q9},  [r12,:64], r1
        vld1.16         {q13}, [r0, :64], r1
        vld1.16         {q10}, [r12,:64], r1
        vld1.16         {q14}, [r0, :64], r1
        vld1.16         {q11}, [r12,:64], r1
        vld1.16         {q15}, [r0, :64], r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2

        transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        loop_filter_q_8

        @ Even though only 6 pixels per row have been changed, we write the
        @ full 8 pixel registers.
        transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        vst1.16         {q8},  [r12,:64], r1
        vst1.16         {q12}, [r0, :64], r1
        vst1.16         {q9},  [r12,:64], r1
        vst1.16         {q13}, [r0, :64], r1
        vst1.16         {q10}, [r12,:64], r1
        vst1.16         {q14}, [r0, :64], r1
        vst1.16         {q11}, [r12,:64], r1
        vst1.16         {q15}, [r0, :64], r1
        sub             r12, r12, r1, lsl #2
9:
        add             r0,  r12, #8
        bx              lr
6:
        @ If we didn't need to do the flat8in part, we use the same writeback
        @ as in loop_filter_h_4_8.
        add             r12, r12, #4
        add             r0,  r0,  #4
        transpose16_4x4 q10, q11, q12, q13

        vst1.16         {d20}, [r12], r1
        vst1.16         {d21}, [r0],  r1
        vst1.16         {d22}, [r12], r1
        vst1.16         {d23}, [r0],  r1
        vst1.16         {d24}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        vst1.16         {d26}, [r12], r1
        vst1.16         {d27}, [r0],  r1
        sub             r12, r12, r1, lsl #2
        add             r0,  r12, #4
        bx              lr
endfunc

bpp_frontends vp9_loop_filter_h_8_8

bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8

function vp9_loop_filter_v_16_4_16_neon
        sub             r12, r0,  r1, lsl #3
        @ Read p7-p0 using r12 and q0-q7 using r0
        vld1.16         {d16}, [r12,:64], r1 @ p7
        vld1.16         {d24}, [r0, :64], r1 @ q0
        vld1.16         {d17}, [r12,:64], r1 @ p6
        vld1.16         {d25}, [r0, :64], r1 @ q1
        vld1.16         {d18}, [r12,:64], r1 @ p5
        vld1.16         {d26}, [r0, :64], r1 @ q2
        vld1.16         {d19}, [r12,:64], r1 @ p4
        vld1.16         {d27}, [r0, :64], r1 @ q3
        vld1.16         {d20}, [r12,:64], r1 @ p3
        vld1.16         {d28}, [r0, :64], r1 @ q4
        vld1.16         {d21}, [r12,:64], r1 @ p2
        vld1.16         {d29}, [r0, :64], r1 @ q5
        vld1.16         {d22}, [r12,:64], r1 @ p1
        vld1.16         {d30}, [r0, :64], r1 @ q6
        vld1.16         {d23}, [r12,:64], r1 @ p0
        vld1.16         {d31}, [r0, :64], r1 @ q7
        sub             r12, r12, r1, lsl #3
        sub             r0,  r0,  r1, lsl #3
        add             r12, r12, r1

        loop_filter_16

        @ If we did the flat8out part, we get the output in
        @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
        @ store d2-d9 there, and d10-d17 into r0.
        vst1.16         {d2},  [r12,:64], r1
        vst1.16         {d10}, [r0, :64], r1
        vst1.16         {d3},  [r12,:64], r1
        vst1.16         {d11}, [r0, :64], r1
        vst1.16         {d4},  [r12,:64], r1
        vst1.16         {d12}, [r0, :64], r1
        vst1.16         {d5},  [r12,:64], r1
        vst1.16         {d13}, [r0, :64], r1
        vst1.16         {d6},  [r12,:64], r1
        vst1.16         {d14}, [r0, :64], r1
        vst1.16         {d8},  [r12,:64], r1
        vst1.16         {d15}, [r0, :64], r1
        vst1.16         {d9},  [r12,:64], r1
        vst1.16         {d17}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #3
        add             r0,  r0,  r1

9:
        bx              lr

8:
        add             r12, r12, r1, lsl #2
        @ If we didn't do the flat8out part, the output is left in the
        @ input registers.
        vst1.16         {d21}, [r12,:64], r1
        vst1.16         {d24}, [r0, :64], r1
        vst1.16         {d22}, [r12,:64], r1
        vst1.16         {d25}, [r0, :64], r1
        vst1.16         {d23}, [r12,:64], r1
        vst1.16         {d26}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #1
        sub             r0,  r0,  r1
        bx              lr
7:
        sub             r12, r0,  r1, lsl #1
        vst1.16         {d22}, [r12,:64], r1
        vst1.16         {d24}, [r0, :64], r1
        vst1.16         {d23}, [r12,:64], r1
        vst1.16         {d25}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #1
        bx              lr
endfunc

bpp_frontends_rep vp9_loop_filter_v_16, 8,  4, 2, v
bpp_frontends_rep vp9_loop_filter_v_16, 16, 4, 4, v

function vp9_loop_filter_h_16_4_16_neon
        sub             r12, r0,  #16
        sub             r0,  r0,  #8
        vld1.16         {d16}, [r12,:64], r1
        vld1.16         {d20}, [r0, :64], r1
        vld1.16         {d17}, [r12,:64], r1
        vld1.16         {d21}, [r0, :64], r1
        vld1.16         {d18}, [r12,:64], r1
        vld1.16         {d22}, [r0, :64], r1
        vld1.16         {d19}, [r12,:64], r1
        vld1.16         {d23}, [r0, :64], r1
        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        add             r12, r12, #16
        add             r0,  r0,  #16
        vld1.16         {d24}, [r12,:64], r1
        vld1.16         {d28}, [r0, :64], r1
        vld1.16         {d25}, [r12,:64], r1
        vld1.16         {d29}, [r0, :64], r1
        vld1.16         {d26}, [r12,:64], r1
        vld1.16         {d30}, [r0, :64], r1
        vld1.16         {d27}, [r12,:64], r1
        vld1.16         {d31}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r12, r12, r1, lsl #2
        sub             r12, r12, #16
        sub             r0,  r0,  #16

        @ The 16x4 pixels read above are in four 4x4 blocks
        transpose16_q_4x4 q8,  q9,  d16, d17, d18, d19
        transpose16_q_4x4 q10, q11, d20, d21, d22, d23
        transpose16_q_4x4 q12, q13, d24, d25, d26, d27
        transpose16_q_4x4 q14, q15, d28, d29, d30, d31

        loop_filter_16

        @ Transpose back; this is the same transpose as above, but
        @ we can't take advantage of q registers for the transpose, since
        @ not all d registers in the transpose are consecutive.
        transpose16_4x4 d16, d2,  d3,  d4
        transpose16_4x4 d5,  d6,  d8,  d9
        transpose16_4x4 d10, d11, d12, d13
        transpose16_4x4 d14, d15, d17, d31

        vst1.16         {d16}, [r12,:64], r1
        vst1.16         {d5},  [r0, :64], r1

        vst1.16         {d2},  [r12,:64], r1
        vst1.16         {d6},  [r0, :64], r1

        vst1.16         {d3},  [r12,:64], r1
        vst1.16         {d8},  [r0, :64], r1

        vst1.16         {d4},  [r12,:64], r1
        vst1.16         {d9},  [r0, :64], r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        add             r12, r12, #16
        add             r0,  r0,  #16

        vst1.16         {d10}, [r12,:64], r1
        vst1.16         {d14}, [r0, :64], r1

        vst1.16         {d11}, [r12,:64], r1
        vst1.16         {d15}, [r0, :64], r1

        vst1.16         {d12}, [r12,:64], r1
        vst1.16         {d17}, [r0, :64], r1

        vst1.16         {d13}, [r12,:64], r1
        vst1.16         {d31}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  #8
        bx              lr
9:
        add             r0,  r0,  #8
        bx              lr
8:
        add             r12, r12, #8
        add             r0,  r0,  #8
        transpose16_q_4x4 q10, q11, d20, d21, d22, d23
        transpose16_q_4x4 q12, q13, d24, d25, d26, d27

        vst1.16         {d20}, [r12,:64], r1
        vst1.16         {d24}, [r0, :64], r1
        vst1.16         {d21}, [r12,:64], r1
        vst1.16         {d25}, [r0, :64], r1
        vst1.16         {d22}, [r12,:64], r1
        vst1.16         {d26}, [r0, :64], r1
        vst1.16         {d23}, [r12,:64], r1
        vst1.16         {d27}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #2
        bx              lr
7:
        add             r12, r12, #12
        add             r0,  r12, r1, lsl #1
        transpose16_q_4x4 q11, q12, d22, d23, d24, d25

        vst1.16         {d22}, [r12], r1
        vst1.16         {d24}, [r0],  r1
        vst1.16         {d23}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        sub             r0,  r0,  r1, lsl #2
        add             r0,  r0,  #4
        bx              lr
endfunc

bpp_frontends_rep vp9_loop_filter_h_16, 8,  4, 2, h
bpp_frontends_rep vp9_loop_filter_h_16, 16, 4, 4, h