tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

vp9lpf_neon.S (37299B)


/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

@ Do an 8x8 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1, etc
.macro transpose_q_8x8 rq0, rq1, rq2, rq3, r0, r1, r2, r3, r4, r5, r6, r7
        vtrn.32         \rq0, \rq2
        vtrn.32         \rq1, \rq3
        vtrn.16         \rq0, \rq1
        vtrn.16         \rq2, \rq3
        vtrn.8          \r0,  \r1
        vtrn.8          \r2,  \r3
        vtrn.8          \r4,  \r5
        vtrn.8          \r6,  \r7
.endm

@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1
.macro transpose_q_4x4 rq0, rq1, r0, r1, r2, r3
        vtrn.16         \rq0, \rq1
        vtrn.8          \r0,  \r1
        vtrn.8          \r2,  \r3
.endm

@ The input to and output from this macro is in the registers q8-q15,
@ and q0-q7 are used as scratch registers.
@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
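@ This macro filters two 8-pixel blocks at once; r2/r3 carry the two
@ per-block E/I thresholds packed one per byte, and the vdup/lsr pairs
@ below unpack them so each half of a q register gets its own threshold.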
.macro loop_filter_q
        vdup.u8         d0,  r2          @ E
        lsr             r2,  r2,  #8
        vdup.u8         d2,  r3          @ I
        lsr             r3,  r3,  #8
        vdup.u8         d1,  r2          @ E
        vdup.u8         d3,  r3          @ I

        vabd.u8         q2,  q8,  q9     @ abs(p3 - p2)
        vabd.u8         q3,  q9,  q10    @ abs(p2 - p1)
        vabd.u8         q4,  q10, q11    @ abs(p1 - p0)
        vabd.u8         q5,  q12, q13    @ abs(q0 - q1)
        vabd.u8         q6,  q13, q14    @ abs(q1 - q2)
        vabd.u8         q7,  q14, q15    @ abs(q2 - q3)
        vmax.u8         q2,  q2,  q3
        vmax.u8         q3,  q4,  q5
        vmax.u8         q4,  q6,  q7
        vabd.u8         q5,  q11, q12    @ abs(p0 - q0)
        vmax.u8         q2,  q2,  q3
        vqadd.u8        q5,  q5,  q5     @ abs(p0 - q0) * 2
        vabd.u8         q7,  q10, q13    @ abs(p1 - q1)
        vmax.u8         q2,  q2,  q4     @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u8         q7,  q7,  #1
        vcle.u8         q2,  q2,  q1     @ max(abs()) <= I
        vqadd.u8        q5,  q5,  q7     @ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
        vcle.u8         q5,  q5,  q0
        vand            q2,  q2,  q5     @ fm

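        @ Condense the 16-byte fm mask into two GPRs so it can be tested in
        @ one go: vshrn leaves a nonzero byte wherever either byte of a mask
        @ pair was set, so the orrs below is zero only if no pixel matched.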
        vshrn.u16       d10, q2,  #4
        vmov            r2,  r3,  d10
        orrs            r2,  r2,  r3
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

        @ Calculate the normal inner loop filter for 2 or 4 pixels
        ldr             r3,  [sp, #64]
        vabd.u8         q3,  q10, q11    @ abs(p1 - p0)
        vabd.u8         q4,  q13, q12    @ abs(q1 - q0)

        vsubl.u8        q5,  d20, d26    @ p1 - q1
        vsubl.u8        q6,  d21, d27    @ p1 - q1
        vmax.u8         q3,  q3,  q4     @ max(abs(p1 - p0), abs(q1 - q0))
        vqmovn.s16      d10, q5          @ av_clip_int8p(p1 - q1)
        vqmovn.s16      d11, q6          @ av_clip_int8p(p1 - q1)
        vdup.u8         d8,  r3          @ H
        lsr             r3,  r3,  #8
        vdup.u8         d9,  r3          @ H
        vsubl.u8        q6,  d24, d22    @ q0 - p0
        vsubl.u8        q7,  d25, d23    @ q0 - p0
        vcle.u8         q3,  q3,  q4     @ !hev
        vmov.s16        q0,  #3
        vand            q3,  q3,  q2     @ !hev && fm && !flat8in

        vmul.s16        q6,  q6,  q0     @ 3 * (q0 - p0)
        vmul.s16        q7,  q7,  q0     @ 3 * (q0 - p0)
        vbic            q5,  q5,  q3     @ if (!hev) av_clip_int8 = 0
        vaddw.s8        q6,  q6,  d10    @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vaddw.s8        q7,  q7,  d11    @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vmov.s8         q5,  #4
        vqmovn.s16      d12, q6
        vqmovn.s16      d13, q7          @ av_clip_int8(3 * (q0 - p0) [+ av_clip_int8(p1 - q1)], BIT_DEPTH - 1) = f
        vmov.s8         q0,  #3

        vqadd.s8        q5,  q6,  q5     @ FFMIN(f + 4, 127)
        vqadd.s8        q0,  q6,  q0     @ FFMIN(f + 3, 127)
        vmovl.u8        q6,  d22         @ p0
        vmovl.u8        q7,  d23         @ p0
        vshr.s8         q5,  q5,  #3     @ f1
        vshr.s8         q0,  q0,  #3     @ f2

        vaddw.s8        q6,  q6,  d0     @ p0 + f2
        vaddw.s8        q7,  q7,  d1     @ p0 + f2
        vqmovun.s16     d0,  q6          @ out p0
        vmovl.u8        q6,  d24         @ q0
        vqmovun.s16     d1,  q7          @ out p0
        vmovl.u8        q7,  d25         @ q0
        vsubw.s8        q6,  q6,  d10    @ q0 - f1
        vsubw.s8        q7,  q7,  d11    @ q0 - f1
        vqmovun.s16     d12, q6          @ out q0
        vqmovun.s16     d13, q7          @ out q0
        vrshr.s8        q5,  q5,  #1     @ f = (f1 + 1) >> 1
        vbit            q11, q0,  q2     @ if (fm && !flat8in)
        vbit            q12, q6,  q2

        vmovl.u8        q0,  d20         @ p1
        vmovl.u8        q2,  d21         @ p1
        vmovl.u8        q6,  d26         @ q1
        vmovl.u8        q7,  d27         @ q1
        vaddw.s8        q0,  q0,  d10    @ p1 + f
        vaddw.s8        q2,  q2,  d11    @ p1 + f
        vsubw.s8        q6,  q6,  d10    @ q1 - f
        vsubw.s8        q7,  q7,  d11    @ q1 - f
        vqmovun.s16     d0,  q0          @ out p1
        vqmovun.s16     d1,  q2          @ out p1
        vqmovun.s16     d12, q6          @ out q1
        vqmovun.s16     d13, q7          @ out q1
        vbit            q10, q0,  q3     @ if (!hev && fm && !flat8in)
        vbit            q13, q6,  q3
.endm

@ The input to and output from this macro is in the registers d16-d31,
@ and d0-d7 are used as scratch registers.
@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
@ Depending on the width of the loop filter, we either use d16-d19
@ and d28-d31 as temp registers, or d8-d15.
@ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
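@ wd selects the filter width: 4 runs only the inner filter, 8 adds the
@ flat8in path, and 16 additionally computes flat8out for the wide filter.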
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
        vdup.u8         d0,  r2 @ E
        vdup.u8         d2,  r3 @ I
        ldr             r3,  [sp]

        vabd.u8         d4,  d20, d21    @ abs(p3 - p2)
        vabd.u8         d5,  d21, d22    @ abs(p2 - p1)
        vabd.u8         d6,  d22, d23    @ abs(p1 - p0)
        vabd.u8         d7,  d24, d25    @ abs(q0 - q1)
        vabd.u8         \tmp1,  d25, d26 @ abs(q1 - q2)
        vabd.u8         \tmp2,  d26, d27 @ abs(q2 - q3)
        vmax.u8         d4,  d4,  d5
        vmax.u8         d5,  d6,  d7
        vmax.u8         \tmp1,  \tmp1,  \tmp2
        vabd.u8         d6,  d23, d24    @ abs(p0 - q0)
        vmax.u8         d4,  d4,  d5
        vqadd.u8        d6,  d6,  d6     @ abs(p0 - q0) * 2
        vabd.u8         d5,  d22, d25    @ abs(p1 - q1)
        vmax.u8         d4,  d4,  \tmp1  @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u8         d5,  d5,  #1
        vcle.u8         d4,  d4,  d2     @ max(abs()) <= I
        vqadd.u8        d6,  d6,  d5     @ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
        vcle.u8         d5,  d6,  d0
        vand            d4,  d4,  d5     @ fm

        vdup.u8         d3,  r3          @ H
        vmov            r2,  r3,  d4
        orrs            r2,  r2,  r3
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

.if \wd >= 8
        vmov.u8         d0,  #1

        vabd.u8         d6,  d20, d23    @ abs(p3 - p0)
        vabd.u8         d2,  d21, d23    @ abs(p2 - p0)
        vabd.u8         d1,  d22, d23    @ abs(p1 - p0)
        vabd.u8         \tmp1,  d25, d24 @ abs(q1 - q0)
        vabd.u8         \tmp2,  d26, d24 @ abs(q2 - q0)
        vabd.u8         \tmp3,  d27, d24 @ abs(q3 - q0)
        vmax.u8         d6,  d6,  d2
        vmax.u8         d1,  d1,  \tmp1
        vmax.u8         \tmp2,  \tmp2,  \tmp3
.if \wd == 16
        vabd.u8         d7,  d16, d23    @ abs(p7 - p0)
        vmax.u8         d6,  d6,  d1
        vabd.u8         d2,  d17, d23    @ abs(p6 - p0)
        vmax.u8         d6,  d6,  \tmp2
        vabd.u8         d1,  d18, d23    @ abs(p5 - p0)
        vcle.u8         d6,  d6,  d0     @ flat8in
        vabd.u8         d8,  d19, d23    @ abs(p4 - p0)
        vand            d6,  d6,  d4     @ flat8in && fm
        vabd.u8         d9,  d28, d24    @ abs(q4 - q0)
        vbic            d4,  d4,  d6     @ fm && !flat8in
        vabd.u8         d10, d29, d24    @ abs(q5 - q0)
        vabd.u8         d11, d30, d24    @ abs(q6 - q0)
        vabd.u8         d12, d31, d24    @ abs(q7 - q0)

        vmax.u8         d7,  d7,  d2
        vmax.u8         d1,  d1,  d8
        vmax.u8         d9,  d9,  d10
        vmax.u8         d11, d11, d12
        @ The rest of the calculation of flat8out is interleaved below
.else
        @ The rest of the calculation of flat8in is interleaved below
.endif
.endif

        @ Calculate the normal inner loop filter for 2 or 4 pixels
        vabd.u8         d5,  d22, d23           @ abs(p1 - p0)
.if \wd == 16
        vmax.u8         d7,  d7,  d1
        vmax.u8         d9,  d9,  d11
.elseif \wd == 8
        vmax.u8         d6,  d6,  d1
.endif
        vabd.u8         d1,  d25, d24           @ abs(q1 - q0)
.if \wd == 16
        vmax.u8         d7,  d7,  d9
.elseif \wd == 8
        vmax.u8         d6,  d6,  \tmp2
.endif
        vsubl.u8        \tmpq1,  d22, d25       @ p1 - q1
        vmax.u8         d5,  d5,  d1            @ max(abs(p1 - p0), abs(q1 - q0))
        vsubl.u8        \tmpq2,  d24, d23       @ q0 - p0
        vmov.s16        \tmpq3,  #3
.if \wd == 8
        vcle.u8         d6,  d6,  d0            @ flat8in
.endif
        vcle.u8         d5,  d5,  d3            @ !hev
.if \wd == 8
        vand            d6,  d6,  d4            @ flat8in && fm
.endif
        vqmovn.s16      \tmp1,   \tmpq1         @ av_clip_int8(p1 - q1)
.if \wd == 16
        vcle.u8         d7,  d7,  d0            @ flat8out
.elseif \wd == 8
        vbic            d4,  d4,  d6            @ fm && !flat8in
.endif
        vand            d5,  d5,  d4            @ !hev && fm && !flat8in
.if \wd == 16
        vand            d7,  d7,  d6            @ flat8out && flat8in && fm
.endif

        vmul.s16        \tmpq2,  \tmpq2, \tmpq3 @ 3 * (q0 - p0)
        vbic            \tmp1,   \tmp1,   d5    @ if (!hev) av_clip_int8 = 0
        vmov.s8         d2,  #4
        vaddw.s8        \tmpq2,  \tmpq2,  \tmp1 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vmov.s8         d3,  #3
        vqmovn.s16      \tmp1,   \tmpq2         @ f
.if \wd == 16
        vbic            d6,  d6,  d7            @ fm && flat8in && !flat8out
.endif

        vqadd.s8        \tmp3, \tmp1,  d2       @ FFMIN(f + 4, 127)
        vqadd.s8        \tmp4, \tmp1,  d3       @ FFMIN(f + 3, 127)
        vmovl.u8        q0,  d23                @ p0
        vshr.s8         \tmp3, \tmp3,  #3       @ f1
        vshr.s8         \tmp4, \tmp4,  #3       @ f2

        vmovl.u8        q1,  d24                @ q0
        vaddw.s8        q0,  q0,  \tmp4         @ p0 + f2
        vsubw.s8        q1,  q1,  \tmp3         @ q0 - f1
        vqmovun.s16     d0,  q0                 @ out p0
        vqmovun.s16     d1,  q1                 @ out q0
        vrshr.s8        \tmp3, \tmp3, #1        @ f = (f1 + 1) >> 1
        vbit            d23, d0,  d4            @ if (fm && !flat8in)
        vbit            d24, d1,  d4

        vmovl.u8        q0,  d22                @ p1
        vmovl.u8        q1,  d25                @ q1
.if \wd >= 8
        vmov            r2,  r3,  d6
.endif
        vaddw.s8        q0,  q0,  \tmp3         @ p1 + f
        vsubw.s8        q1,  q1,  \tmp3         @ q1 - f
.if \wd >= 8
        orrs            r2,  r2,  r3
.endif
        vqmovun.s16     d0,  q0                 @ out p1
        vqmovun.s16     d2,  q1                 @ out q1
        vbit            d22, d0,  d5            @ if (!hev && fm && !flat8in)
        vbit            d25, d2,  d5

.if \wd >= 8
        @ If no pixels need flat8in, jump to flat8out
        @ (or to a writeout of the inner 4 pixels, for wd=8)
        beq             6f

        @ flat8in
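        @ Each flat8in output is a rounded average over a sliding 8-tap
        @ window; rather than re-summing 8 taps per output, a running sum
        @ is kept in q0 and updated with precomputed pair differences
        @ (incoming taps minus expired ones) held in the tmpq registers.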
        vaddl.u8        \tmpq1, d20, d21
        vaddl.u8        \tmpq2, d22, d25
        vaddl.u8        \tmpq3, d20, d22
        vaddl.u8        \tmpq4, d23, d26
        vadd.u16        q0,  \tmpq1, \tmpq1
        vaddw.u8        q0,  q0,  d23
        vaddw.u8        q0,  q0,  d24
        vadd.u16        q0,  q0,  \tmpq3
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vsub.s16        \tmpq4, \tmpq4, \tmpq3
        vrshrn.u16      d2,  q0,  #3            @ out p2

        vadd.u16        q0,  q0,  \tmpq2
        vaddl.u8        \tmpq1, d20, d23
        vaddl.u8        \tmpq2, d24, d27
        vrshrn.u16      d3,  q0,  #3            @ out p1

        vadd.u16        q0,  q0,  \tmpq4
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vaddl.u8        \tmpq3, d21, d24
        vaddl.u8        \tmpq4, d25, d27
        vrshrn.u16      d4,  q0,  #3            @ out p0

        vadd.u16        q0,  q0,  \tmpq2
        vsub.s16        \tmpq4, \tmpq4, \tmpq3
        vaddl.u8        \tmpq1, d22, d25
        vaddl.u8        \tmpq2, d26, d27
        vrshrn.u16      d5,  q0,  #3            @ out q0

        vadd.u16        q0,  q0,  \tmpq4
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vrshrn.u16      \tmp5,  q0,  #3         @ out q1

        vadd.u16        q0,  q0,  \tmpq2
        @ The output here is written back into the input registers. This doesn't
        @ matter for the flat8out part below, since we only update those pixels
        @ which won't be touched below.
        vbit            d21, d2,  d6
        vbit            d22, d3,  d6
        vbit            d23, d4,  d6
        vrshrn.u16      \tmp6,  q0,  #3         @ out q2
        vbit            d24, d5,  d6
        vbit            d25, \tmp5,  d6
        vbit            d26, \tmp6,  d6
.endif
.if \wd == 16
6:
        vorr            d2,  d6,  d7
        vmov            r2,  r3,  d2
        orrs            r2,  r2,  r3
        @ If no pixels need flat8in or flat8out, jump to a
        @ writeout of the inner 4 pixels
        beq             7f
        vmov            r2,  r3,  d7
        orrs            r2,  r2,  r3
        @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        beq             8f

        @ flat8out
        @ This writes all outputs into d2-d17 (skipping d7 and d16).
        @ If this part is skipped, the output is read from d21-d26 (which is the input
        @ to this section).
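        @ Same running-sum approach as flat8in, but over the wider window:
        @ the first sum (weights totalling 16) is seeded with 7 * p7
        @ (computed as 8 * p7 - p7 below), then updated with pairwise
        @ add/subtract terms for each successive output.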
        vshll.u8        q0,  d16, #3  @ 8 * d16
        vsubw.u8        q0,  q0,  d16 @ 7 * d16
        vaddw.u8        q0,  q0,  d17
        vaddl.u8        q4,  d17, d18
        vaddl.u8        q5,  d19, d20
        vadd.s16        q0,  q0,  q4
        vaddl.u8        q4,  d16, d17
        vaddl.u8        q6,  d21, d22
        vadd.s16        q0,  q0,  q5
        vaddl.u8        q5,  d18, d25
        vaddl.u8        q7,  d23, d24
        vsub.s16        q5,  q5,  q4
        vadd.s16        q0,  q0,  q6
        vadd.s16        q0,  q0,  q7
        vaddl.u8        q6,  d16, d18
        vaddl.u8        q7,  d19, d26
        vrshrn.u16      d2,  q0,  #4

        vadd.s16        q0,  q0,  q5
        vaddl.u8        q4,  d16, d19
        vaddl.u8        q5,  d20, d27
        vsub.s16        q7,  q7,  q6
        vbif            d2,  d17, d7
        vrshrn.u16      d3,  q0,  #4

        vadd.s16        q0,  q0,  q7
        vaddl.u8        q6,  d16, d20
        vaddl.u8        q7,  d21, d28
        vsub.s16        q5,  q5,  q4
        vbif            d3,  d18, d7
        vrshrn.u16      d4,  q0,  #4

        vadd.s16        q0,  q0,  q5
        vaddl.u8        q4,  d16, d21
        vaddl.u8        q5,  d22, d29
        vsub.s16        q7,  q7,  q6
        vbif            d4,  d19, d7
        vrshrn.u16      d5,  q0,  #4

        vadd.s16        q0,  q0,  q7
        vaddl.u8        q6,  d16, d22
        vaddl.u8        q7,  d23, d30
        vsub.s16        q5,  q5,  q4
        vbif            d5,  d20, d7
        vrshrn.u16      d6,  q0,  #4

        vadd.s16        q0,  q0,  q5
        vaddl.u8        q5,  d16, d23
        vsub.s16        q7,  q7,  q6
        vaddl.u8        q6,  d24, d31
        vbif            d6,  d21, d7
        vrshrn.u16      d8,  q0,  #4

        vadd.s16        q0,  q0,  q7
        vsub.s16        q5,  q6,  q5
        vaddl.u8        q6,  d17, d24
        vaddl.u8        q7,  d25, d31
        vbif            d8,  d22, d7
        vrshrn.u16      d9,  q0,  #4

        vadd.s16        q0,  q0,  q5
        vsub.s16        q7,  q7,  q6
        vaddl.u8        q6,  d26, d31
        vbif            d9,  d23, d7
        vrshrn.u16      d10, q0,  #4

        vadd.s16        q0,  q0,  q7
        vaddl.u8        q7,  d18, d25
        vaddl.u8        q9,  d19, d26
        vsub.s16        q6,  q6,  q7
        vaddl.u8        q7,  d27, d31
        vbif            d10, d24, d7
        vrshrn.u16      d11, q0,  #4

        vadd.s16        q0,  q0,  q6
        vaddl.u8        q6,  d20, d27
        vsub.s16        q7,  q7,  q9
        vaddl.u8        q9,  d28, d31
        vbif            d11, d25, d7
        vsub.s16        q9,  q9,  q6
        vrshrn.u16      d12, q0,  #4

        vadd.s16        q0,  q0,  q7
        vaddl.u8        q7,  d21, d28
        vaddl.u8        q10, d29, d31
        vbif            d12, d26, d7
        vrshrn.u16      d13, q0,  #4

        vadd.s16        q0,  q0,  q9
        vsub.s16        q10, q10, q7
        vaddl.u8        q9,  d22, d29
        vaddl.u8        q11, d30, d31
        vbif            d13, d27, d7
        vrshrn.u16      d14, q0,  #4

        vadd.s16        q0,  q0,  q10
        vsub.s16        q11, q11, q9
        vbif            d14, d28, d7
        vrshrn.u16      d15, q0,  #4

        vadd.s16        q0,  q0,  q11
        vbif            d15, d29, d7
        vrshrn.u16      d17, q0,  #4
        vbif            d17, d30, d7
.endif
.endm

@ For wd <= 8, we use d16-d19 and d28-d31 as temp registers, while for
@ wd=16 we need those for inputs/outputs and use d8-d15 as temp registers
@ there instead.
.macro loop_filter_4
        loop_filter     4,  d16, d17, d18, d19, d28, d29, d30, d31, q8,  q9,  q14, q15
.endm

.macro loop_filter_8
        loop_filter     8,  d16, d17, d18, d19, d28, d29, d30, d31, q8,  q9,  q14, q15
.endm

.macro loop_filter_16
        loop_filter     16, d8,  d9,  d10, d11, d12, d13, d14, d15, q4,  q5,  q6,  q7
.endm


@ The public functions in this file have the following signature:
@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
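@ On entry, r0 = dst, r1 = stride, r2 = E (mb_lim), r3 = I (lim), and
@ hev_thr (H) is passed on the stack. The _16 variants expect the
@ thresholds for their two 8-pixel halves packed one per byte.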
function ff_vp9_loop_filter_v_4_8_neon, export=1
        sub             r12, r0,  r1, lsl #2
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d27}, [r0, :64], r1 @ q3
        sub             r0,  r0,  r1, lsl #2
        sub             r12, r12, r1, lsl #1

        loop_filter_4

        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
9:
        bx              lr
endfunc

function ff_vp9_loop_filter_h_4_8_neon, export=1
        sub             r12, r0,  #4
        add             r0,  r12, r1, lsl #2
        vld1.8          {d20}, [r12], r1
        vld1.8          {d24}, [r0],  r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d25}, [r0],  r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d26}, [r0],  r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d27}, [r0],  r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        add             r12, r12, #2
        add             r0,  r0,  #2

        @ Transpose the 8x8 pixels, taking advantage of q registers, to get
        @ one register per column.
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        loop_filter_4

        @ We will only write the mid 4 pixels back; after the loop filter,
        @ these are in d22, d23, d24, d25 (q11, q12), ordered as rows
        @ (8x4 pixels). We need to transpose them to columns, done with a
        @ 4x4 transpose (which in practice is two 4x4 transposes of the two
        @ 4x4 halves of the 8x4 pixels; into 4x8 pixels).
        transpose_q_4x4 q11, q12, d22, d23, d24, d25

        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0],  r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0],  r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1
9:
        bx              lr
endfunc

function ff_vp9_loop_filter_v_44_16_neon, export=1
        vpush           {q4-q7}
        sub             r12, r0,  r1, lsl #2
        vld1.8          {q8},  [r12,:128], r1 @ p3
        vld1.8          {q12}, [r0, :128], r1 @ q0
        vld1.8          {q9},  [r12,:128], r1 @ p2
        vld1.8          {q13}, [r0, :128], r1 @ q1
        vld1.8          {q10}, [r12,:128], r1 @ p1
        vld1.8          {q14}, [r0, :128], r1 @ q2
        vld1.8          {q11}, [r12,:128], r1 @ p0
        vld1.8          {q15}, [r0, :128], r1 @ q3
        sub             r0,  r0,  r1, lsl #2
        sub             r12, r12, r1, lsl #1

        loop_filter_q

        vst1.8          {q10}, [r12,:128], r1
        vst1.8          {q12}, [r0, :128], r1
        vst1.8          {q11}, [r12,:128], r1
        vst1.8          {q13}, [r0, :128], r1
9:
        vpop            {q4-q7}
        bx              lr
endfunc

function ff_vp9_loop_filter_h_44_16_neon, export=1
        vpush           {q4-q7}
        sub             r12, r0,  #4
        add             r0,  r12, r1, lsl #2
        vld1.8          {d16}, [r12], r1
        vld1.8          {d24}, [r0],  r1
        vld1.8          {d18}, [r12], r1
        vld1.8          {d26}, [r0],  r1
        vld1.8          {d20}, [r12], r1
        vld1.8          {d28}, [r0],  r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d30}, [r0],  r1
        mov             r12, r0
        add             r0,  r0,  r1, lsl #2
        vld1.8          {d17}, [r12], r1
        vld1.8          {d25}, [r0],  r1
        vld1.8          {d19}, [r12], r1
        vld1.8          {d27}, [r0],  r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d29}, [r0],  r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d31}, [r0],  r1

        @ Transpose the 16x8 pixels, as two 8x8 parts
        transpose_8x8   q8,  q9,  q10, q11, q12, q13, q14, q15

        loop_filter_q

        sub             r12, r0,  r1, lsl #4
        add             r0,  r12, r1, lsl #3
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        add             r12, r12, #2
        add             r0,  r0,  #2

        @ We will only write the mid 4 pixels back; after the loop filter,
        @ these are in q10, q11, q12, q13, ordered as rows (16x4 pixels).
        @ We need to transpose them to columns, done with a 4x4 transpose
        @ (which in practice is four 4x4 transposes of the 4x4 blocks of
        @ the 16x4 pixels; into 4x16 pixels).
        transpose_4x4   q10, q11, q12, q13

        vst1.32         {d20[0]}, [r12], r1
        vst1.32         {d21[0]}, [r0],  r1
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d23[0]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d25[0]}, [r0],  r1
        vst1.32         {d26[0]}, [r12], r1
        vst1.32         {d27[0]}, [r0],  r1
        vst1.32         {d20[1]}, [r12], r1
        vst1.32         {d21[1]}, [r0],  r1
        vst1.32         {d22[1]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[1]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1
        vst1.32         {d26[1]}, [r12], r1
        vst1.32         {d27[1]}, [r0],  r1
9:
        vpop            {q4-q7}
        bx              lr
endfunc

function ff_vp9_loop_filter_v_8_8_neon, export=1
        sub             r12, r0,  r1, lsl #2
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d27}, [r0, :64], r1 @ q3
        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        add             r12, r12, r1

        loop_filter_8

        vst1.8          {d21}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d26}, [r0, :64], r1
9:
        bx              lr
6:
        sub             r12, r0,  r1, lsl #1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        bx              lr
endfunc

function ff_vp9_loop_filter_h_8_8_neon, export=1
        sub             r12, r0,  #4
        add             r0,  r12, r1, lsl #2
        vld1.8          {d20}, [r12], r1
        vld1.8          {d24}, [r0],  r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d25}, [r0],  r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d26}, [r0],  r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d27}, [r0],  r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2

        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        loop_filter_8

        @ Even though only 6 pixels per row have been changed, we write the
        @ full 8 pixel registers.
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        vst1.8          {d20}, [r12], r1
        vst1.8          {d24}, [r0],  r1
        vst1.8          {d21}, [r12], r1
        vst1.8          {d25}, [r0],  r1
        vst1.8          {d22}, [r12], r1
        vst1.8          {d26}, [r0],  r1
        vst1.8          {d23}, [r12], r1
        vst1.8          {d27}, [r0],  r1
9:
        bx              lr
6:
        @ If we didn't need to do the flat8in part, we use the same writeback
        @ as in loop_filter_h_4_8.
        add             r12, r12, #2
        add             r0,  r0,  #2
        transpose_q_4x4 q11, q12, d22, d23, d24, d25
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0],  r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0],  r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1
        bx              lr
endfunc

function vp9_loop_filter_v_16_neon
        sub             r12, r0,  r1, lsl #3
        @ Read p7-p0 using r12 and q0-q7 using r0
        vld1.8          {d16}, [r12,:64], r1 @ p7
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d17}, [r12,:64], r1 @ p6
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d18}, [r12,:64], r1 @ p5
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d19}, [r12,:64], r1 @ p4
        vld1.8          {d27}, [r0, :64], r1 @ q3
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d28}, [r0, :64], r1 @ q4
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d29}, [r0, :64], r1 @ q5
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d30}, [r0, :64], r1 @ q6
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d31}, [r0, :64], r1 @ q7
        sub             r12, r12, r1, lsl #3
        sub             r0,  r0,  r1, lsl #3
        add             r12, r12, r1

        loop_filter_16

        @ If we did the flat8out part, we get the output in
        @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
        @ store d2-d9 there, and d10-d17 into r0.
        vst1.8          {d2},  [r12,:64], r1
        vst1.8          {d10}, [r0, :64], r1
        vst1.8          {d3},  [r12,:64], r1
        vst1.8          {d11}, [r0, :64], r1
        vst1.8          {d4},  [r12,:64], r1
        vst1.8          {d12}, [r0, :64], r1
        vst1.8          {d5},  [r12,:64], r1
        vst1.8          {d13}, [r0, :64], r1
        vst1.8          {d6},  [r12,:64], r1
        vst1.8          {d14}, [r0, :64], r1
        vst1.8          {d8},  [r12,:64], r1
        vst1.8          {d15}, [r0, :64], r1
        vst1.8          {d9},  [r12,:64], r1
        vst1.8          {d17}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #3
        add             r0,  r0,  r1

9:
        bx              lr

8:
        add             r12, r12, r1, lsl #2
        @ If we didn't do the flat8out part, the output is left in the
        @ input registers.
        vst1.8          {d21}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d26}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #1
        sub             r0,  r0,  r1
        bx              lr
7:
        sub             r12, r0,  r1, lsl #1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #1
        bx              lr
endfunc

function ff_vp9_loop_filter_v_16_8_neon, export=1
        ldr             r12, [sp]
        push            {lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_v_16_neon
        add             sp,  sp,  #4
        vpop            {q4-q7}
        pop             {pc}
endfunc

function ff_vp9_loop_filter_v_16_16_neon, export=1
        ldr             r12, [sp]
        @ The filter clobbers r2 and r3, but we need to keep them for the second round
        push            {r2, r3, lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_v_16_neon
        add             r0,  #8
        ldr             r2,  [sp, #68]
        ldr             r3,  [sp, #72]
        bl              vp9_loop_filter_v_16_neon
        add             sp,  sp,  #4
        vpop            {q4-q7}
        pop             {r2, r3, pc}
endfunc

function vp9_loop_filter_h_16_neon
        sub             r12, r0,  #8
        vld1.8          {d16}, [r12,:64], r1
        vld1.8          {d24}, [r0, :64], r1
        vld1.8          {d17}, [r12,:64], r1
        vld1.8          {d25}, [r0, :64], r1
        vld1.8          {d18}, [r12,:64], r1
        vld1.8          {d26}, [r0, :64], r1
        vld1.8          {d19}, [r12,:64], r1
        vld1.8          {d27}, [r0, :64], r1
        vld1.8          {d20}, [r12,:64], r1
        vld1.8          {d28}, [r0, :64], r1
        vld1.8          {d21}, [r12,:64], r1
        vld1.8          {d29}, [r0, :64], r1
        vld1.8          {d22}, [r12,:64], r1
        vld1.8          {d30}, [r0, :64], r1
        vld1.8          {d23}, [r12,:64], r1
        vld1.8          {d31}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #3
        sub             r12, r12, r1, lsl #3

        @ The 16x8 pixels read above are in two 8x8 blocks; the left
        @ half in d16-d23, and the right half in d24-d31. Do two 8x8 transposes
        @ of this, to get one column per register. This could be done with two
        @ transpose_8x8 as below, but this takes advantage of the q registers.
        transpose16_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15
        vtrn.8          d16, d17
        vtrn.8          d18, d19
        vtrn.8          d20, d21
        vtrn.8          d22, d23
        vtrn.8          d24, d25
        vtrn.8          d26, d27
        vtrn.8          d28, d29
        vtrn.8          d30, d31

        loop_filter_16

        @ Transpose back; this is the same transpose as above, but
        @ we can't take advantage of q registers for the transpose, since
        @ the d registers in the transpose aren't all consecutive.
        transpose_8x8   d16, d2,  d3,  d4,  d5,  d6,  d8,  d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d17, d31

        vst1.8          {d16}, [r12,:64], r1
        vst1.8          {d10}, [r0, :64], r1

        vst1.8          {d2},  [r12,:64], r1
        vst1.8          {d11}, [r0, :64], r1

        vst1.8          {d3},  [r12,:64], r1
        vst1.8          {d12}, [r0, :64], r1

        vst1.8          {d4},  [r12,:64], r1
        vst1.8          {d13}, [r0, :64], r1

        vst1.8          {d5},  [r12,:64], r1
        vst1.8          {d14}, [r0, :64], r1

        vst1.8          {d6},  [r12,:64], r1
        vst1.8          {d15}, [r0, :64], r1

        vst1.8          {d8},  [r12,:64], r1
        vst1.8          {d17}, [r0, :64], r1

        vst1.8          {d9},  [r12,:64], r1
        vst1.8          {d31}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #3
9:
        bx              lr
8:
        @ The same writeback as in loop_filter_h_8_8
        sub             r12, r0,  #4
        add             r0,  r12, r1, lsl #2
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        vst1.8          {d20}, [r12], r1
        vst1.8          {d24}, [r0],  r1
        vst1.8          {d21}, [r12], r1
        vst1.8          {d25}, [r0],  r1
        vst1.8          {d22}, [r12], r1
        vst1.8          {d26}, [r0],  r1
        vst1.8          {d23}, [r12], r1
        vst1.8          {d27}, [r0],  r1
        sub             r0,  r0,  r1, lsl #3
        add             r0,  r0,  #4
        bx              lr
7:
        @ The same writeback as in loop_filter_h_4_8
        sub             r12, r0,  #2
        add             r0,  r12, r1, lsl #2
        transpose_q_4x4 q11, q12, d22, d23, d24, d25
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0],  r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0],  r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1
        sub             r0,  r0,  r1, lsl #3
        add             r0,  r0,  #2
        bx              lr
endfunc

function ff_vp9_loop_filter_h_16_8_neon, export=1
        ldr             r12, [sp]
        push            {lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_h_16_neon
        add             sp,  sp,  #4
        vpop            {q4-q7}
        pop             {pc}
endfunc

function ff_vp9_loop_filter_h_16_16_neon, export=1
        ldr             r12, [sp]
        @ The filter clobbers r2 and r3, but we need to keep them for the second round
        push            {r2, r3, lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_h_16_neon
        add             r0,  r0,  r1, lsl #3
        ldr             r2,  [sp, #68]
        ldr             r3,  [sp, #72]
        bl              vp9_loop_filter_h_16_neon
        add             sp,  sp,  #4
        vpop            {q4-q7}
        pop             {r2, r3, pc}
endfunc