tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

vp8dsp_armv6.S (64296B)


      1 /*
      2 * VP8 ARMv6 optimisations
      3 *
      4 * Copyright (c) 2010 Google Inc.
      5 * Copyright (c) 2010 Rob Clark <rob@ti.com>
      6 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
      7 *
      8 * This file is part of FFmpeg.
      9 *
     10 * FFmpeg is free software; you can redistribute it and/or
     11 * modify it under the terms of the GNU Lesser General Public
     12 * License as published by the Free Software Foundation; either
     13 * version 2.1 of the License, or (at your option) any later version.
     14 *
     15 * FFmpeg is distributed in the hope that it will be useful,
     16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     18 * Lesser General Public License for more details.
     19 *
     20 * You should have received a copy of the GNU Lesser General Public
     21 * License along with FFmpeg; if not, write to the Free Software
     22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     23 *
     24 * This code was partially ported from libvpx, which uses this license:
     25 *
     26 * Redistribution and use in source and binary forms, with or without
     27 * modification, are permitted provided that the following conditions are
     28 * met:
     29 *
     30 *   * Redistributions of source code must retain the above copyright
     31 *     notice, this list of conditions and the following disclaimer.
     32 *
     33 *   * Redistributions in binary form must reproduce the above copyright
     34 *     notice, this list of conditions and the following disclaimer in
     35 *     the documentation and/or other materials provided with the
     36 *     distribution.
     37 *
     38 *   * Neither the name of Google nor the names of its contributors may
     39 *     be used to endorse or promote products derived from this software
     40 *     without specific prior written permission.
     41 *
     42 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     43 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     44 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     45 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     46 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     47 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     48 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     49 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     50 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     51 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     52 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     53 */
     54 
     55 #include "libavutil/arm/asm.S"
     56 
     57 @ idct
     58 
      59 @ void vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
         @ Inverse 4x4 Walsh-Hadamard transform of the 16 luma DC coefficients.
         @ In:  r0 = block, r1 = dc.
         @ Reads dc[0..15], zeroes all of dc[] as it goes (the stm r1!, {r10, lr}
         @ stores), and writes each result to block[i][j][0] -- the DC slot of
         @ every 16-short sub-block, hence the "#32"-byte post-increments on strh.
         @ Two 16-bit lanes are processed per register (uadd16/usub16 SIMD).
      60 function ff_vp8_luma_dc_wht_armv6, export=1
      61        push            {r4-r10, lr}
      62 
      63        ldm             r1,  {r2-r9}
      64        mov             r10, #0
      65        mov             lr,  #0
      66        uadd16          r12, r2,  r8            @ t0[0,1]
      67        usub16          r2,  r2,  r8            @ t3[0,1]
      68        stm             r1!, {r10, lr}
      69        uadd16          r8,  r4,  r6            @ t1[0,1]
      70        usub16          r4,  r4,  r6            @ t2[0,1]
      71        stm             r1!, {r10, lr}
      72        uadd16          r6,  r12, r8            @ dc0[0,1]
      73        usub16          r12, r12, r8            @ dc2[0,1]
      74        stm             r1!, {r10, lr}
      75        uadd16          r8,  r2,  r4            @ dc1[0,1]
      76        usub16          r2,  r2,  r4            @ dc3[0,1]
      77        stm             r1!, {r10, lr}
      78 
      79        uadd16          lr,  r3,  r9            @ t0[2,3]
      80        usub16          r3,  r3,  r9            @ t3[2,3]
      81        uadd16          r9,  r5,  r7            @ t1[2,3]
      82        usub16          r5,  r5,  r7            @ t2[2,3]
      83 
      84        uadd16          r7,  lr,  r9            @ dc0[2,3]
      85        usub16          lr,  lr,  r9            @ dc2[2,3]
      86        uadd16          r9,  r3,  r5            @ dc1[2,3]
      87        usub16          r3,  r3,  r5            @ dc3[2,3]
      88 
         @ Rounding bias: 3 in both halfwords, added before the final >>3.
      89        mov             r1,  #3
      90        orr             r1,  r1,  #0x30000      @ 3 | 3 (round)
      91 
         @ Repack column pairs so the second (horizontal) pass can again work
         @ on two lanes per register.
      92        pkhbt           r4,  r6,  r8,  lsl #16  @ dc{0,1}[0]
      93        pkhtb           r6,  r8,  r6,  asr #16  @ dc{0,1}[1]
      94        pkhbt           r5,  r12, r2,  lsl #16  @ dc{2,3}[0]
      95        pkhtb           r12, r2,  r12, asr #16  @ dc{2,3}[1]
      96        pkhbt           r8,  r7,  r9,  lsl #16  @ dc{0,1}[2]
      97        uadd16          r4,  r4,  r1
      98        uadd16          r5,  r5,  r1
      99        pkhtb           r7,  r9,  r7,  asr #16  @ dc{0,1}[3]
     100        pkhbt           r2,  lr,  r3,  lsl #16  @ dc{2,3}[2]
     101        pkhtb           lr,  r3,  lr,  asr #16  @ dc{2,3}[3]
     102 
     103        uadd16          r9,  r4,  r7            @ t0[0,1]
     104        uadd16          r3,  r5,  lr            @ t0[2,3]
     105        usub16          r4,  r4,  r7            @ t3[0,1]
     106        usub16          r5,  r5,  lr            @ t3[2,3]
     107        uadd16          r7,  r6,  r8            @ t1[0,1]
     108        uadd16          lr,  r12, r2            @ t1[2,3]
     109        usub16          r6,  r6,  r8            @ t2[0,1]
     110        usub16          r12, r12, r2            @ t2[2,3]
     111 
     112        uadd16          r8,  r9,  r7            @ block[0,1][0]
     113        uadd16          r2,  r3,  lr            @ block[2,3][0]
     114        usub16          r9,  r9,  r7            @ block[0,1][2]
     115        usub16          r3,  r3,  lr            @ block[2,3][2]
     116        uadd16          r7,  r4,  r6            @ block[0,1][1]
     117        uadd16          lr,  r5,  r12           @ block[2,3][1]
     118        usub16          r4,  r4,  r6            @ block[0,1][3]
     119        usub16          r5,  r5,  r12           @ block[2,3][3]
     120 
         @ Final >>3 with sign extension on the low halfword of each pair;
         @ sbfx extracts bits [3..15] (sign-extended) in a single instruction.
     121 #if HAVE_ARMV6T2_EXTERNAL
     122        sbfx            r6,  r8,  #3,  #13
     123        sbfx            r12, r7,  #3,  #13
     124        sbfx            r1,  r9,  #3,  #13
     125        sbfx            r10, r4,  #3,  #13
     126 #else
     127        sxth            r6,  r8
     128        sxth            r12, r7
     129        sxth            r1,  r9
     130        sxth            r10, r4
     131        asr             r6,  #3                 @ block[0][0]
     132        asr             r12, #3                 @ block[0][1]
     133        asr             r1,  #3                 @ block[0][2]
     134        asr             r10, #3                 @ block[0][3]
     135 #endif
     136 
         @ "asr #19" = (16 to reach the high halfword) + (3 for the >>3 round).
     137        strh            r6,  [r0], #32
     138        asr             r8,  r8,  #19           @ block[1][0]
     139        strh            r12, [r0], #32
     140        asr             r7,  r7,  #19           @ block[1][1]
     141        strh            r1,  [r0], #32
     142        asr             r9,  r9,  #19           @ block[1][2]
     143        strh            r10, [r0], #32
     144        asr             r4,  r4,  #19           @ block[1][3]
     145        strh            r8,  [r0], #32
     146        asr             r6,  r2,  #19           @ block[3][0]
     147        strh            r7,  [r0], #32
     148        asr             r12, lr,  #19           @ block[3][1]
     149        strh            r9,  [r0], #32
     150        asr             r1,  r3,  #19           @ block[3][2]
     151        strh            r4,  [r0], #32
     152        asr             r10, r5,  #19           @ block[3][3]
     153 
     154 #if HAVE_ARMV6T2_EXTERNAL
     155        sbfx            r2,  r2,  #3,  #13
     156        sbfx            lr,  lr,  #3,  #13
     157        sbfx            r3,  r3,  #3,  #13
     158        sbfx            r5,  r5,  #3,  #13
     159 #else
     160        sxth            r2,  r2
     161        sxth            lr,  lr
     162        sxth            r3,  r3
     163        sxth            r5,  r5
     164        asr             r2,  #3                 @ block[2][0]
     165        asr             lr,  #3                 @ block[2][1]
     166        asr             r3,  #3                 @ block[2][2]
     167        asr             r5,  #3                 @ block[2][3]
     168 #endif
     169 
     170        strh            r2,  [r0], #32
     171        strh            lr,  [r0], #32
     172        strh            r3,  [r0], #32
     173        strh            r5,  [r0], #32
     174        strh            r6,  [r0], #32
     175        strh            r12, [r0], #32
     176        strh            r1,  [r0], #32
     177        strh            r10, [r0], #32
     178 
     179        pop             {r4-r10, pc}
     180 endfunc
    181 
     182 @ void vp8_luma_dc_wht_dc(int16_t block[4][4][16], int16_t dc[16])
         @ Fast path for the WHT when only dc[0] is non-zero: every output DC
         @ equals (dc[0] + 3) >> 3.  Clears dc[0] and stores that value to the
         @ DC slot of all 16 sub-blocks (32-byte stride between slots).
     183 function ff_vp8_luma_dc_wht_dc_armv6, export=1
     184        ldrsh           r2,  [r1]               @ r2 = dc[0]
     185        mov             r3,  #0
     186        add             r2,  r2,  #3            @ rounding bias
     187        strh            r3,  [r1]               @ dc[0] = 0
     188        asr             r2,  r2,  #3            @ (dc[0] + 3) >> 3
     189    .rept 16
     190        strh            r2,  [r0], #32
     191    .endr
     192        bx              lr
     193 endfunc
    194 
     195 @ void vp8_idct_add(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
         @ Full 4x4 inverse DCT; the result is added to dst with unsigned
         @ saturation (usat16).  In: r0 = dst, r1 = block (cleared on return),
         @ r2 = stride.
         @ Pass 1 (label 1) transforms columns, two per iteration, into a
         @ 32-byte scratch buffer allocated at sp, zeroing each input
         @ coefficient word as it is consumed.
         @ Pass 2 (label 2) transforms rows; its "pop" instructions read the
         @ scratch buffer back off the stack, so sp is restored by the time of
         @ the final register pop.
     196 function ff_vp8_idct_add_armv6, export=1
     197        push            {r4-r12, lr}
     198        sub             sp,  sp,  #32           @ 16 int16 of row scratch
     199 
     200        movw            r3,  #20091             @ cospi8sqrt2minus1
     201        movw            r4,  #35468             @ sinpi8sqrt2
     202        mov             r5,  sp                 @ r5 = scratch write cursor
     203 1:
     204        ldr             r6,  [r1, #8]       @  i5 | i4  = block1[1] | block1[0]
     205        ldr             lr,  [r1, #16]      @  i9 | i8  = block2[1] | block2[0]
     206        ldr             r12, [r1, #24]      @ i13 | i12 = block3[1] | block3[0]
     207 
     208        smulwt          r9,  r3,  r6            @ ip[5] * cospi8sqrt2minus1
     209        smulwb          r7,  r3,  r6            @ ip[4] * cospi8sqrt2minus1
     210        smulwt          r10, r4,  r6            @ ip[5] * sinpi8sqrt2
     211        smulwb          r8,  r4,  r6            @ ip[4] * sinpi8sqrt2
     212        pkhbt           r7,  r7,  r9,  lsl #16  @ 5c | 4c
     213        smulwt          r11, r3,  r12           @ ip[13] * cospi8sqrt2minus1
     214        pkhbt           r8,  r8,  r10, lsl #16  @ 5s   | 4s   = t2 first half
     215        uadd16          r6,  r6,  r7            @ 5c+5 | 4c+4 = t3 first half
     216        smulwb          r9,  r3,  r12           @ ip[12] * cospi8sqrt2minus1
     217        smulwt          r7,  r4,  r12           @ ip[13] * sinpi8sqrt2
     218        smulwb          r10, r4,  r12           @ ip[12] * sinpi8sqrt2
     219 
     220        pkhbt           r9,  r9,  r11, lsl #16  @ 13c | 12c
     221        ldr             r11, [r1]               @  i1 | i0
     222        pkhbt           r10, r10,  r7, lsl #16  @ 13s | 12s    = t3 second half
     223        uadd16          r7,  r12, r9            @ 13c+13  | 12c+12 = t2 2nd half
     224        uadd16          r6,  r6,  r10           @ d = t3
     225        uadd16          r10, r11, lr            @ a = t0
     226        usub16          r7,  r8,  r7            @ c = t2
     227        usub16          r8,  r11, lr            @ b = t1
     228        uadd16          r9,  r10, r6            @ a+d = tmp{0,1}[0]
     229        usub16          r10, r10, r6            @ a-d = tmp{0,1}[3]
     230        uadd16          r6,  r8,  r7            @ b+c = tmp{0,1}[1]
     231        usub16          r7,  r8,  r7            @ b-c = tmp{0,1}[2]
     232        mov             r8,  #0
     233        cmp             sp,  r5                 @ EQ only on the first iteration
     234        str             r6,  [r5, #8]           @  o5 | o4
     235        str             r7,  [r5, #16]          @  o9 | o8
     236        str             r10, [r5, #24]          @ o13 | o12
     237        str             r9,  [r5], #4           @  o1 | o0
     238        str             r8,  [r1, #8]           @ clear consumed coefficients
     239        str             r8,  [r1, #16]
     240        str             r8,  [r1, #24]
     241        str             r8,  [r1], #4
     242        beq             1b                      @ pass 1 runs exactly twice
     243 
     244        mov             r5,  #2                 @ two row-pair iterations
     245 2:
         @ Each pop consumes one 16-byte row pair of the scratch buffer.
     246        pop             {r1, r6, r12, lr}
     247        smulwt          r9,  r3,  r12           @ ip[5] * cospi8sqrt2minus1
     248        smulwt          r7,  r3,  r1            @ ip[1] * cospi8sqrt2minus1
     249        smulwt          r10, r4,  r12           @ ip[5] * sinpi8sqrt2
     250        smulwt          r8,  r4,  r1            @ ip[1] * sinpi8sqrt2
     251        pkhbt           r11, r1,  r12, lsl #16  @ i4 | i0 = t0/t1 first half
     252        pkhtb           r1,  r12, r1,  asr #16  @ i5 | i1
     253        pkhbt           r7,  r7,  r9,  lsl #16  @ 5c | 1c
     254        pkhbt           r8,  r8,  r10, lsl #16  @ 5s | 1s = t2 first half
     255        pkhbt           r9,  r6,  lr,  lsl #16  @ i6 | i2 = t0/t1 second half
     256        pkhtb           r12, lr,  r6,  asr #16  @ i7 | i3
     257        uadd16          r1,  r7,  r1            @ 5c+5 | 1c+1 = t3 first half
     258        uadd16          r10, r11, r9            @ a = t0
     259        usub16          r9,  r11, r9            @ b = t1
     260        smulwt          r7,  r3,  r12           @ ip[7] * cospi8sqrt2minus1
     261        smulwb          lr,  r3,  r12           @ ip[3] * cospi8sqrt2minus1
     262        smulwt          r11, r4,  r12           @ ip[7] * sinpi8sqrt2
     263        smulwb          r6,  r4,  r12           @ ip[3] * sinpi8sqrt2
     264        subs            r5,  r5,  #1
     265        pkhbt           r7,  lr,  r7,  lsl #16  @ 7c | 3c
     266        pkhbt           r11, r6,  r11, lsl #16  @ 7s | 3s = t3 second half
     267        mov             r6,  #0x4
     268        orr             r6,  r6,  #0x40000      @ 4 | 4 (round before >>3)
     269        uadd16          r12, r7,  r12           @ 7c+7 | 3c+3 = t2 second half
     270        uadd16          r10, r10, r6            @ t0 + 4
     271        uadd16          r9,  r9,  r6            @ t1 + 4
     272        usub16          lr,  r8,  r12           @ c (o5 | o1) = t2
     273        uadd16          r12, r11, r1            @ d (o7 | o3) = t3
     274        usub16          r1,  r9,  lr            @ b-c = dst{0,1}[2]
     275        uadd16          r7,  r10, r12           @ a+d = dst{0,1}[0]
     276        usub16          r12, r10, r12           @ a-d = dst{0,1}[3]
     277        uadd16          r10, r9,  lr            @ b+c = dst{0,1}[1]
     278 
     279        asr             lr,  r1,  #3            @ o[1][2]
     280        asr             r9,  r12, #3            @ o[1][3]
     281        pkhtb           r8,  lr,  r7,  asr #19  @ o[1][0,2]
     282        pkhtb           r11, r9,  r10, asr #19  @ o[1][1,3]
     283        ldr             lr,  [r0]
     284        sxth            r12, r12
     285        ldr             r9,  [r0, r2]
     286        sxth            r1,  r1
     287 #if HAVE_ARMV6T2_EXTERNAL
     288        sbfx            r7,  r7,  #3,  #13
     289        sbfx            r10, r10, #3,  #13
     290 #else
     291        sxth            r7,  r7
     292        sxth            r10, r10
     293        asr             r7,  #3                 @ o[0][0]
     294        asr             r10, #3                 @ o[0][1]
     295 #endif
         @ lsl #13 then taking the high halfword (pkhbt) == >>3 into that lane.
     296        pkhbt           r7,  r7,  r1,  lsl #13  @ o[0][0,2]
     297        pkhbt           r10, r10, r12, lsl #13  @ o[0][1,3]
     298 
         @ Add to dst bytes (uxtab16 on even/odd byte lanes), clamp to 0..255,
         @ then re-interleave the lanes and store two rows.
     299        uxtab16         r7,  r7,  lr
     300        uxtab16         r10, r10, lr,  ror #8
     301        uxtab16         r8,  r8,  r9
     302        uxtab16         r11, r11, r9,  ror #8
     303        usat16          r7,  #8,  r7
     304        usat16          r10, #8,  r10
     305        usat16          r8,  #8,  r8
     306        usat16          r11, #8,  r11
     307        orr             r7,  r7,  r10, lsl #8
     308        orr             r8,  r8,  r11, lsl #8
     309        str             r8,  [r0, r2]
     310        str_post        r7,  r0,  r2,  lsl #1   @ advance dst two rows
     311 
     312        bne             2b
     313 
     314        pop             {r4-r12, pc}
     315 endfunc
    316 
     317 @ void vp8_idct_dc_add(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
         @ DC-only IDCT: adds (block[0] + 4) >> 3 to every pixel of a 4x4 block
         @ of dst, with unsigned saturation.  Clears block[0] and advances r1 by
         @ 32 bytes so consecutive calls walk block[0..3] (see the add4y/add4uv
         @ wrappers below).  In: r0 = dst, r1 = block, r2 = stride.
         @ uxtab16 adds the DC to byte lanes 0/2; the "ror #8" variant handles
         @ lanes 1/3; usat16 clamps each lane to 0..255.
     318 function ff_vp8_idct_dc_add_armv6, export=1
     319        push            {r4-r6, lr}
     320        add             r6,  r0,  r2,  lsl #1   @ r6 = dst + 2*stride
     321        ldrsh           r3,  [r1]
     322        mov             r4,  #0
     323        add             r3,  r3,  #4            @ rounding bias
     324        strh            r4,  [r1], #32          @ clear block[0], step to next block
     325        asr             r3,  #3
     326        ldr             r5,  [r0]
     327        ldr             r4,  [r0, r2]
     328        pkhbt           r3,  r3,  r3,  lsl #16  @ DC replicated in both halfwords
     329        uxtab16         lr,  r3,  r5            @ a1+2 | a1+0
     330        uxtab16         r5,  r3,  r5,  ror #8   @ a1+3 | a1+1
     331        uxtab16         r12, r3,  r4
     332        uxtab16         r4,  r3,  r4,  ror #8
     333        usat16          lr,  #8,  lr
     334        usat16          r5,  #8,  r5
     335        usat16          r12, #8,  r12
     336        usat16          r4,  #8,  r4
     337        orr             lr,  lr,  r5,  lsl #8
     338        ldr             r5,  [r6]               @ rows 2 and 3
     339        orr             r12, r12, r4,  lsl #8
     340        ldr             r4,  [r6, r2]
     341        str             lr,  [r0]
     342        uxtab16         lr,  r3,  r5
     343        str             r12, [r0, r2]
     344        uxtab16         r5,  r3,  r5,  ror #8
     345        uxtab16         r12, r3,  r4
     346        uxtab16         r4,  r3,  r4,  ror #8
     347        usat16          lr,  #8,  lr
     348        usat16          r5,  #8,  r5
     349        usat16          r12, #8,  r12
     350        usat16          r4,  #8,  r4
     351        orr             lr,  lr,  r5,  lsl #8
     352        orr             r12, r12, r4,  lsl #8
     353        str             lr,  [r6]
     354        str             r12, [r6, r2]
     355        pop             {r4-r6, pc}
     356 endfunc
    357 
     358 @ void vp8_idct_dc_add4uv(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
         @ DC-only IDCT for four chroma blocks arranged 2x2 (an 8x8 area):
         @ left, right, then down 4 rows (stride*4) for the bottom pair.
         @ r1 advances 32 bytes per call inside ff_vp8_idct_dc_add_armv6.
         @ r4 is pushed only to keep the stack 8-byte aligned across the calls.
     359 function ff_vp8_idct_dc_add4uv_armv6, export=1
     360        push            {r4, lr}
     361 
     362        bl              X(ff_vp8_idct_dc_add_armv6)
     363        add             r0,  r0,  #4            @ top-right block
     364        bl              X(ff_vp8_idct_dc_add_armv6)
     365        add             r0,  r0,  r2,  lsl #2   @ down one block row...
     366        sub             r0,  r0,  #4            @ ...back to the left column
     367        bl              X(ff_vp8_idct_dc_add_armv6)
     368        add             r0,  r0,  #4            @ bottom-right block
     369        bl              X(ff_vp8_idct_dc_add_armv6)
     370 
     371        pop             {r4, pc}
     372 endfunc
    373 
     374 @ void vp8_idct_dc_add4y(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
         @ DC-only IDCT for four luma blocks side by side (a 16x4 strip);
         @ each call handles one 4x4 block and r1 advances 32 bytes inside
         @ ff_vp8_idct_dc_add_armv6.  r4 is pushed for stack alignment only.
     375 function ff_vp8_idct_dc_add4y_armv6, export=1
     376        push            {r4, lr}
     377 
     378        bl              X(ff_vp8_idct_dc_add_armv6)
     379        add             r0,  r0,  #4
     380        bl              X(ff_vp8_idct_dc_add_armv6)
     381        add             r0,  r0,  #4
     382        bl              X(ff_vp8_idct_dc_add_armv6)
     383        add             r0,  r0,  #4
     384        bl              X(ff_vp8_idct_dc_add_armv6)
     385 
     386        pop             {r4, pc}
     387 endfunc
    388 
    389 @ loopfilter
    390 
     391 .macro  transpose       o3,  o2,  o1,  o0,  i0,  i1,  i2,  i3
         @ Transpose a 4x4 byte matrix: \i0-\i3 hold the four input rows
         @ (each register = 4 pixels, little-endian byte order) and are
         @ clobbered; \o0-\o3 receive the four columns.  The "NN" digits in
         @ the lane comments are row/column indices (e.g. "12" = row 1, col 2).
     392        uxtb16          \o1, \i1                @ xx 12 xx 10
     393        uxtb16          \o0, \i0                @ xx 02 xx 00
     394        uxtb16          \o3, \i3                @ xx 32 xx 30
     395        uxtb16          \o2, \i2                @ xx 22 xx 20
     396        orr             \o1, \o0, \o1, lsl #8   @ 12 02 10 00
     397        orr             \o3, \o2, \o3, lsl #8   @ 32 22 30 20
     398 
     399        uxtb16          \i1, \i1, ror #8        @ xx 13 xx 11
     400        uxtb16          \i3, \i3, ror #8        @ xx 33 xx 31
     401        uxtb16          \i0, \i0, ror #8        @ xx 03 xx 01
     402        uxtb16          \i2, \i2, ror #8        @ xx 23 xx 21
     403        orr             \i0, \i0, \i1, lsl #8   @ 13 03 11 01
     404        orr             \i2, \i2, \i3, lsl #8   @ 33 23 31 21
     405 
     406        pkhtb           \o2, \o3, \o1, asr #16  @ 32 22 12 02
     407        pkhbt           \o0, \o1, \o3, lsl #16  @ 30 20 10 00
     408        pkhtb           \o3, \i2, \i0, asr #16  @ 33 23 13 03
     409        pkhbt           \o1, \i0, \i2, lsl #16  @ 31 21 11 01
     410 .endm
    411 
     412 .macro  simple_filter
         @ VP8 "simple" loop filter on 4 pixels at once (one byte per lane).
         @ In:  r3 = p1, r4 = p0, r5 = q0, r6 = q1 (4 packed pixels each),
         @      r12 = flimit replicated in all 4 bytes, r2 = 0x80808080,
         @      lr = 0 (used as the zero operand of uhadd8/shadd8).
         @ Out: r4 = filtered p0, r5 = filtered q0 (both re-biased with ^0x80);
         @      branches to the caller-supplied local label "2:" when no lane
         @      passes the filter-strength test.
     413        uqsub8          r7,  r3,  r6            @ p1 - q1
     414        uqsub8          r8,  r6,  r3            @ q1 - p1
     415        uqsub8          r10, r4,  r5            @ p0 - q0
     416        uqsub8          r9,  r5,  r4            @ q0 - p0
     417        orr             r7,  r7,  r8            @ abs(p1 - q1)
     418        orr             r9,  r9,  r10           @ abs(p0 - q0)
     419        uhadd8          r7,  r7,  lr            @ abs(p1 - q1) >> 1
     420        uqadd8          r9,  r9,  r9            @ abs(p0 - q0) * 2
     421        uqadd8          r7,  r7,  r9            @ abs(p0 - q0)*2 + abs(p1-q1)/2
     422        mvn             r8,  #0
     423        usub8           r10, r12, r7            @ compare to flimit
     424        sel             r10, r8,  lr            @ filter mask: F or 0
     425        cmp             r10, #0
     426        beq             2f                      @ nothing to filter
     427 
         @ Convert to signed domain (bias by 0x80) before the arithmetic.
     428        eor             r3,  r3,  r2            @ ps1
     429        eor             r6,  r6,  r2            @ qs1
     430        eor             r4,  r4,  r2            @ ps0
     431        eor             r5,  r5,  r2            @ qs0
     432 
     433        qsub8           r3,  r3,  r6            @ vp8_filter = p1 - q1
     434        qsub8           r6,  r5,  r4            @ q0 - p0
     435        qadd8           r3,  r3,  r6            @ += q0 - p0
     436        lsr             r7,  r2,  #5            @ 0x04040404
     437        qadd8           r3,  r3,  r6            @ += q0 - p0
     438        sub             r9,  r7,  r2,  lsr #7   @ 0x03030303
     439        qadd8           r3,  r3,  r6            @ vp8_filter = p1-q1 + 3*(q0-p0)
     440        and             r3,  r3,  r10           @ vp8_filter &= mask
     441 
     442        qadd8           r9,  r3,  r9            @ Filter2 = vp8_filter + 3
     443        qadd8           r3,  r3,  r7            @ Filter1 = vp8_filter + 4
     444 
         @ Three signed halving-adds with zero (lr) == per-byte arithmetic >>3.
     445        shadd8          r9,  r9,  lr
     446        shadd8          r3,  r3,  lr
     447        shadd8          r9,  r9,  lr
     448        shadd8          r3,  r3,  lr
     449        shadd8          r9,  r9,  lr            @ Filter2 >>= 3
     450        shadd8          r3,  r3,  lr            @ Filter1 >>= 3
     451 
     452        qadd8           r4,  r4,  r9            @ u = p0 + Filter2
     453        qsub8           r5,  r5,  r3            @ u = q0 - Filter1
     454        eor             r4,  r4,  r2            @ *op0 = u ^ 0x80
     455        eor             r5,  r5,  r2            @ *oq0 = u ^ 0x80
     456 .endm
    457 
     458 @ void vp8_v_loop_filter16_simple(uint8_t *dst, ptrdiff_t stride, int flim)
         @ Simple loop filter on a horizontal edge, 16 pixels wide, processed
         @ as 4 groups of 4 pixels (r11 = group counter).  r0 points at the q0
         @ row; p1/p0 are read above it with negative-offset loads (ldr_nreg).
         @ Setup establishes the simple_filter register contract:
         @ r12 = flim in all 4 bytes, r2 = 0x80808080, lr = 0.
     459 function ff_vp8_v_loop_filter16_simple_armv6, export=1
     460        push            {r4-r11, lr}
     461 
     462        orr             r2,  r2,  r2,  lsl #16
     463        mov             r11, #4
     464        mov             lr,  #0
     465        orr             r12, r2,  r2,  lsl #8   @ flim replicated per byte
     466        mov32           r2,  0x80808080
     467 1:
     468        ldr_nreg        r3,  r0,  r1,  lsl #1   @ p1
     469        ldr_nreg        r4,  r0,  r1            @ p0
     470        ldr             r5,  [r0]               @ q0
     471        ldr             r6,  [r0, r1]           @ q1
     472        simple_filter
         @ A/T lines: ARM vs Thumb encodings of the same p0 store-back.
     473 T       sub             r7,  r0,  r1
     474        str             r5,  [r0]               @ oq0
     475 A       str             r4,  [r0, -r1]          @ op0
     476 T       str             r4,  [r7]
     477 2:
     478        subs            r11, r11, #1
     479        add             r0,  r0,  #4            @ next 4-pixel group
     480        bne             1b
     481 
     482        pop             {r4-r11, pc}
     483 endfunc
    484 
     485 .macro  filter_mask_p
         @ P-side half of the normal-filter mask.
         @ In:  r2 = limit (replicated per byte), r3 = thresh,
         @      r9 = p3, r10 = p2, r11 = p1, r12 = p0 (4 packed pixels each).
         @ Out: lr = accumulated limit violations (non-zero byte => reject),
         @      r6 = abs(p1-p0) (reused for the hev test),
         @      r8 = abs(p1-p0) compared to thresh.  Clobbers r7, r10.
     486        uqsub8          r6,  r9,  r10           @ p3 - p2
     487        uqsub8          r7,  r10, r9            @ p2 - p3
     488        uqsub8          r8,  r10, r11           @ p2 - p1
     489        uqsub8          r10, r11, r10           @ p1 - p2
     490        orr             r6,  r6,  r7            @ abs(p3-p2)
     491        orr             r8,  r8,  r10           @ abs(p2-p1)
     492        uqsub8          lr,  r6,  r2            @ compare to limit
     493        uqsub8          r8,  r8,  r2            @ compare to limit
     494        uqsub8          r6,  r11, r12           @ p1 - p0
     495        orr             lr,  lr,  r8
     496        uqsub8          r7,  r12, r11           @ p0 - p1
     497        orr             r6,  r6,  r7            @ abs(p1-p0)
     498        uqsub8          r7,  r6,  r2            @ compare to limit
     499        uqsub8          r8,  r6,  r3            @ compare to thresh
     500        orr             lr,  lr,  r7
     501 .endm
    502 
     503 .macro filter_mask_pq
         @ Cross-edge term of the mask: abs(p0-q0)*2 + abs(p1-q1)/2.
         @ In:  r9 = q0, r10 = q1, r11 = p1, r12 = p0.
         @ Out: r12 = the combined value (caller compares it to flimit).
         @ Clobbers r6, r7, r11.  The 0x7f mask clears bits that the lsr #1
         @ would otherwise leak between byte lanes.
     504        uqsub8          r6,  r11, r10           @ p1 - q1
     505        uqsub8          r7,  r10, r11           @ q1 - p1
     506        uqsub8          r11, r12, r9            @ p0 - q0
     507        uqsub8          r12, r9,  r12           @ q0 - p0
     508        orr             r6,  r6,  r7            @ abs(p1-q1)
     509        orr             r12, r11, r12           @ abs(p0-q0)
     510        mov32           r7,  0x7f7f7f7f
     511        uqadd8          r12, r12, r12           @ abs(p0-q0) * 2
     512        and             r6,  r7,  r6,  lsr #1   @ abs(p1-q1) / 2
     513        uqadd8          r12, r12, r6            @ abs(p0-q0) * 2 + abs(p1-q1)/2
     514 .endm
    515 
     516 .macro  filter_mask_v
         @ Full normal-filter mask for a vertical (row-ordered) edge.
         @ In:  r2 = limit, r3 = thresh, r4 = flimit, r9-r12 = p3..p0 as for
         @      filter_mask_p; r0 points at the q0 row on entry.
         @ Out: lr = per-byte mask (0xff where all tests pass, else 0),
         @      r12 = 0, r11 = ~0; r0 is restored to the q0 row at the end
         @      (the ldr_post advanced it by 2*stride to reach q2/q3).
     517        filter_mask_p
     518 
     519        ldr             r10, [r0, r1]           @ q1
     520        ldr_post        r9,  r0,  r1,  lsl #1   @ q0
     521 
     522        filter_mask_pq
     523 
     524        ldr             r11, [r0]               @ q2
     525 
     526        uqsub8          r7,  r9,  r10           @ q0 - q1
     527        uqsub8          r6,  r10, r9            @ q1 - q0
     528        uqsub8          r12, r12, r4            @ compare to flimit
     529        uqsub8          r9,  r11, r10           @ q2 - q1
     530        uqsub8          r10, r10, r11           @ q1 - q2
     531        orr             lr,  lr,  r12
     532        ldr             r12, [r0, r1]           @ q3
     533        orr             r6,  r7,  r6            @ abs(q1-q0)
     534        orr             r10, r9,  r10           @ abs(q2-q1)
     535        uqsub8          r9,  r12, r11           @ q3 - q2
     536        uqsub8          r11, r11, r12           @ q2 - q3
     537        uqsub8          r7,  r6,  r2            @ compare to limit
     538        uqsub8          r10, r10, r2            @ compare to limit
     539        uqsub8          r6,  r6,  r3            @ compare to thresh
     540        orr             r9,  r9,  r11           @ abs(q3-q2)
     541        orr             lr,  lr,  r7
     542        orr             lr,  lr,  r10
     543        uqsub8          r9,  r9,  r2            @ compare to limit
     544        orr             lr,  lr,  r9
     545 
         @ Collapse the accumulated violations into an all-or-nothing byte mask.
     546        mov             r12, #0
     547        usub8           lr,  r12, lr
     548        mvn             r11, #0
     549        sel             lr,  r11, r12           @ filter mask
     550        sub             r0,  r0,  r1,  lsl #1   @ back to the q0 row
     551 .endm
    552 
     553 .macro  filter_mask_h
         @ Full normal-filter mask for a horizontal (column-ordered) edge:
         @ same tests as filter_mask_v but the pixels arrive as rows and are
         @ transposed first.  Expects the first four source words in
         @ r6, r7, r8, lr (presumably rows left of the edge -- see callers);
         @ needs at least 16 bytes of scratch at sp (stm sp, {...}).
         @ In:  r2 = limit, r3 = thresh, r4 = flimit; r0/r1 = dst/stride.
         @ Out: lr = per-byte filter mask (0xff or 0), clobbers r6-r12.
     554        transpose       r12, r11, r10, r9,  r6,  r7,  r8,  lr
     555 
     556        filter_mask_p
     557 
         @ Spill p1/p0 and the violation accumulator while the q side is loaded.
     558        stm             sp,  {r8, r11, r12, lr}
     559        sub             r0,  r0,  r1,  lsl #2
     560        add             r0,  r0,  #4
     561 
     562        ldr             r7,  [r0, r1]
     563        ldr_post        r6,  r0,  r1,  lsl #1
     564        ldr             lr,  [r0, r1]
     565        ldr             r8,  [r0]
     566 
     567        transpose       r12, r11, r10, r9,  r6,  r7,  r8,  lr
     568 
     569        uqsub8          r8,  r12, r11           @ q3 - q2
     570        uqsub8          lr,  r11, r12           @ q2 - q3
     571        uqsub8          r7,  r9,  r10           @ q0 - q1
     572        uqsub8          r6,  r10, r9            @ q1 - q0
     573        uqsub8          r12, r11, r10           @ q2 - q1
     574        uqsub8          r11, r10, r11           @ q1 - q2
     575        orr             r8,  r8,  lr            @ abs(q3-q2)
     576        orr             r6,  r7,  r6            @ abs(q1-q0)
     577        orr             r11, r12, r11           @ abs(q2-q1)
     578        ldr             lr,  [sp, #12]          @ load back (f)limit accumulator
     579        uqsub8          r8,  r8,  r2            @ compare to limit
     580        uqsub8          r7,  r6,  r2            @ compare to limit
     581        uqsub8          r11, r11, r2            @ compare to limit
     582        orr             lr,  lr,  r8
     583        uqsub8          r8,  r6,  r3            @ compare to thresh
     584        orr             lr,  lr,  r7
     585        ldr             r12, [sp, #8]           @ p1
     586        orr             lr,  lr,  r11
     587 
     588        ldr             r11, [sp, #4]           @ p0
     589 
     590        filter_mask_pq
     591 
     592        mov             r10, #0
     593        uqsub8          r12, r12, r4            @ compare to flimit
     594        mvn             r11, #0
     595        orr             lr,  lr,  r12
     596        usub8           lr,  r10, lr
     597        sel             lr,  r11, r10           @ filter mask
     598 .endm
    599 
     600 .macro  filter          inner
         @ Common VP8 filter arithmetic shared by the inner and edge filters.
         @ In:  r7 = p1, r8 = p0, r9 = q0, r10 = q1 (4 packed pixels each),
         @      r6 = hev mask, lr = filter mask.  \inner selects where the hev
         @      mask is applied (before clamping for the inner filter, after
         @      for the mb-edge filter, which also keeps the unmasked value
         @      in r12 as "Filter2" for the later taps).
         @ Out: r7 = Filter1 >> 3, lr = Filter2 >> 3, r9 = 0; the sign-flipped
         @      ps1/ps0/qs0/qs1 values are spilled to [sp, #0..12].
     601        mov32           r12, 0x80808080
     602        eor             r11, r7,  r12           @ ps1
     603        eor             r8,  r8,  r12           @ ps0
     604        eor             r9,  r9,  r12           @ qs0
     605        eor             r10, r10, r12           @ qs1
     606 
     607        stm             sp,  {r8-r11}
     608 
     609        qsub8           r7,  r11, r10           @ vp8_signed_char_clamp(ps1-qs1)
     610        qsub8           r8,  r9,  r8            @ vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
     611    .if \inner
     612        and             r7,  r7,  r6            @ vp8_filter &= hev
     613    .endif
     614        qadd8           r7,  r7,  r8
     615        lsr             r10, r12, #5            @ 0x04040404
     616        qadd8           r7,  r7,  r8
     617        sub             r9,  r10, r12, lsr #7   @ 0x03030303
     618        qadd8           r7,  r7,  r8
     619 
     620        and             r7,  r7,  lr            @ vp8_filter &= mask
     621    .if !\inner
     622        mov             r12, r7                 @ Filter2
     623        and             r7,  r7,  r6            @ Filter2 &= hev
     624    .endif
     625        qadd8           lr,  r7,  r9            @ Filter2 = vp8_signed_char_clamp(vp8_filter+3)
     626        qadd8           r7,  r7,  r10           @ Filter1 = vp8_signed_char_clamp(vp8_filter+4)
     627 
         @ Three signed halving-adds with zero == per-byte arithmetic >>3.
     628        mov             r9,  #0
     629        shadd8          lr,  lr,  r9            @ Filter2 >>= 3
     630        shadd8          r7,  r7,  r9            @ Filter1 >>= 3
     631        shadd8          lr,  lr,  r9
     632        shadd8          r7,  r7,  r9
     633        shadd8          lr,  lr,  r9            @ Filter2
     634        shadd8          r7,  r7,  r9            @ Filter1
     635 .endm
    636 
    637 .macro  filter_v        inner
@ Vertical-edge wrapper: finish the hev mask, load p1/p0/q0/q1 from rows
@ around r0 (stride r1), then run the core filter.  ldr_nreg loads from
@ a negative register offset (rows above r0).
    638        orr             r10, r6,  r8            @ calculate vp8_hevmask
    639        ldr_nreg        r7,  r0,  r1,  lsl #1   @ p1
    640        usub8           r10, r12, r10
    641        ldr_nreg        r8,  r0,  r1            @ p0
    642        sel             r6,  r12, r11           @ obtain vp8_hevmask
    643        ldr             r9,  [r0]               @ q0
    644        ldr             r10, [r0, r1]           @ q1
    645        filter          \inner
    646 .endm
    647 
    648 .macro  filter_h        inner
@ Horizontal-edge wrapper: finish the hev mask, reload the four source
@ words, transpose the 4x4 byte tile so columns become packed rows
@ (r10=p1, r9=p0, r8=q0, r7=q1), then run the core filter.  r6/lr are
@ spilled around the loads because all registers are needed.
    649        orr             r9,  r6,  r8
    650        usub8           r9,  r12, r9
    651        sel             r6,  r12, r11           @ hev mask
    652
    653        stm             sp,  {r6, lr}
    654
    655        ldr_nreg        r12, r0,  r1,  lsl #1
    656        ldr_nreg        r11, r0,  r1
    657        ldr             r6,  [r0]
    658        ldr             lr,  [r0, r1]
    659
    660        transpose       r10, r9,  r8,  r7,  r12, r11, r6,  lr
    661
    662        ldm             sp,  {r6, lr}
    663        filter          \inner
    664 .endm
    665 
    666 .macro  filter_inner
@ Inner-edge output stage: apply Filter1/Filter2 (from the filter macro)
@ to the sign-biased pixels spilled on the stack, producing the four
@ filtered, unbiased output words r11=op1, r8=op0, r9=oq0, r10=oq1.
@ On entry lr=Filter2, r7=Filter1, r10=0x04040404, r12=0x80808080,
@ r6=hev mask, r11=ps1, [sp]=ps0, [sp,#4]=qs0, [sp,#8]=qs1.
    667        ldm             sp,  {r8, r9}
    668        lsr             r10, r10, #2            @ 0x01010101
    669        qadd8           r8,  r8,  lr            @ u = vp8_signed_char_clamp(ps0 + Filter2)
    670        mov             lr,  #0
    671        qsub8           r9,  r9,  r7            @ u = vp8_signed_char_clamp(qs0 - Filter1)
    672        sadd8           r7,  r7,  r10           @ vp8_filter += 1
    673        ldr             r10, [sp, #8]           @ qs1
    674        shadd8          r7,  r7,  lr            @ vp8_filter >>= 1
    675        eor             r8,  r8,  r12           @ *op0 = u ^ 0x80
    676        bic             r7,  r7,  r6            @ vp8_filter &= ~hev
    677        qadd8           r11, r11, r7            @ u = vp8_signed_char_clamp(ps1 + vp8_filter)
    678        eor             r9,  r9,  r12           @ *oq0 = u ^ 0x80
    679        qsub8           r10, r10, r7            @ u = vp8_signed_char_clamp(qs1 - vp8_filter)
    680        eor             r11, r11, r12           @ *op1 = u ^ 0x80
    681        eor             r10, r10, r12           @ *oq1 = u ^ 0x80
    682 .endm
    683 
    684 .macro  filter_x        c0
@ MB-edge output helper: u = clamp((63 + Filter2 * \c0) >> 7) per byte,
@ then oq = clamp(qs - u), op = clamp(ps + u), unbiased.
@ In:  r12 = packed Filter2 bytes, r9 = qs pixels, r11 = ps pixels.
@ Out: r8 = *oq ^ unbias, r10 = *op ^ unbias; clobbers r6, r7, lr.
@ The four bytes are sign-extended to halfwords (sxtb16), multiplied
@ and rounded via smla*/ssat, then repacked with pkhbt/uxtb16.
    685        mov             lr,  \c0
    686        mov             r7,  #63
    687
    688        sxtb16          r6,  r12                @ bytes 0,2 -> halfwords
    689        sxtb16          r10, r12, ror #8        @ bytes 1,3 -> halfwords
    690        smlabb          r8,  r6,  lr,  r7
    691        smlatb          r6,  r6,  lr,  r7
    692        smlabb          r7,  r10, lr,  r7
    693        smultb          r10, r10, lr
    694        ssat            r8,  #8,  r8,  asr #7
    695        ssat            r6,  #8,  r6,  asr #7
    696        add             r10, r10, #63           @ rounding bias added late (r7 was consumed)
    697        ssat            r7,  #8,  r7,  asr #7
    698        ssat            r10, #8,  r10, asr #7
    699
    700        pkhbt           r6,  r8,  r6,  lsl #16
    701        pkhbt           r10, r7,  r10, lsl #16
    702        uxtb16          r6,  r6
    703        uxtb16          r10, r10
    704
    705        mov32           lr,  0x80808080
    706
    707        orr             r10, r6,  r10, lsl #8   @ u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
    708        qsub8           r8,  r9,  r10           @ s = vp8_signed_char_clamp(qs0 - u)
    709        qadd8           r10, r11, r10           @ s = vp8_signed_char_clamp(ps0 + u)
    710        eor             r8,  r8,  lr            @ *oq0 = s ^ 0x80
    711        eor             r10, r10, lr            @ *op0 = s ^ 0x80
    712 .endm
    713 
    714 .macro  filter_1
@ MB-edge pass 1: op0/oq0 with coefficient 27 (u = (63 + 27*F)>>7).
@ Loads ps0/qs0 from the stack spill and masks Filter2 with ~hev first.
    715        ldm             sp,  {r8, r9}
    716        qadd8           r11, r8,  lr
    717        qsub8           r9,  r9,  r7
    718        bic             r12, r12, r6            @ vp8_filter &= ~hev
    719        filter_x        #27
    720 .endm
    721 
    722 .macro  filter_2
@ MB-edge pass 2: op1/oq1 with coefficient 18, using the spilled
@ sign-biased qs1/ps1 values.
    723        ldr             r9,   [sp, #8]          @ qs1
    724        ldr             r11,  [sp, #12]         @ ps1
    725        filter_x        #18
    726 .endm
    727 
    728 .macro  filter_3
@ MB-edge pass 3: op2/oq2 with coefficient 9.  r9/r11 hold freshly
@ loaded q2/p2; bias them to signed (lr = 0x80808080 from filter_x).
    729        eor             r9,  r9,  lr
    730        eor             r11, r11, lr
    731        filter_x        #9
    732 .endm
    733 
    734 function vp8_v_loop_filter_inner_armv6
@ Shared inner-edge vertical loop filter body (reached by branch from
@ the 16/8uv entry points below).  r0 = dst, r1 = stride, r2/r3/r6 =
@ E/I/thresh limits (duplicated across all 4 bytes below), r5 = column
@ group counter (4 passes of 4 pixels), [sp..sp+12] = scratch for the
@ filter macros, [sp,#16] = second-half dst pushed by the entry point.
    735        mov             r5,  #4
    736        sub             sp,  sp,  #16
    737
@ Replicate each limit byte into all 4 lanes of a word
    738        orr             r2,  r2,  r2,  lsl #16
    739        orr             r3,  r3,  r3,  lsl #16
    740        orr             r6,  r6,  r6,  lsl #16
    741        orr             r4,  r2,  r2,  lsl #8   @ flimE
    742        orr             r2,  r3,  r3,  lsl #8   @ flimI
    743        orr             r3,  r6,  r6,  lsl #8   @ thresh
    744 1:
    745        sub             r0,  r0,  r1,  lsl #2   @ back up 4 rows above the edge
    746        ldr             r10, [r0, r1]           @ p2
    747        ldr_post        r9,  r0,  r1,  lsl #1   @ p3
    748        ldr             r12, [r0, r1]           @ p0
    749        ldr_post        r11, r0,  r1,  lsl #1   @ p1
    750
    751        filter_mask_v
    752        cmp             lr,  #0
    753        beq             2f                      @ all-zero mask: nothing to filter
    754        filter_v        inner=1
    755        filter_inner
    756
@ A/T prefixed lines: ARM vs Thumb encodings of the same stores
    757 A       str             r11, [r0, -r1, lsl #1]  @ op1
    758 A       str             r8,  [r0, -r1]          @ op0
    759 T       sub             r0,  r0,  r1,  lsl #1
    760 T       str             r8,  [r0, r1]
    761 T       str_post        r11, r0,  r1,  lsl #1
    762        str             r9,  [r0]               @ oq0
    763        str             r10, [r0, r1]           @ oq1
    764 2:
    765        add             r0,  r0,  #4            @ next 4-pixel column group
    766        cmp             r5,  #3
    767        it              eq
    768        ldreq           r0,  [sp, #16]          @ after 2 groups switch to second-half dst
    769        subs            r5,  r5,  #1
    770        bne             1b
    771
    772        add             sp,  sp,  #16
    773        pop             {r0, r4-r11, pc}
    774 endfunc
    775 
    776 function ff_vp8_v_loop_filter16_inner_armv6, export=1
@ 16-pixel inner vertical filter entry: pushes dst+8 as the second-half
@ pointer, fetches thresh from the stack arg, and tail-branches into
@ the shared body above.
    777        push            {r4-r11, lr}
    778        add             r12, r0,  #8
    779        push            {r12}
    780        ldr             r6,  [sp, #40]          @ thresh argument
    781        orr             r2,  r2,  r2,  lsl #16
    782        b               vp8_v_loop_filter_inner_armv6
    783 endfunc
    784 
    785 function ff_vp8_v_loop_filter8uv_inner_armv6, export=1
@ 8-pixel U+V inner vertical filter entry: pushes the V-plane dst (r1)
@ as the second-half pointer and shuffles args into the shared body's
@ register layout.
    786        push            {r1, r4-r11, lr}
    787        mov             r1,  r2                 @ stride
    788        orr             r2,  r3,  r3,  lsl #16
    789        ldr             r3,  [sp, #40]
    790        ldr             r6,  [sp, #44]
    791        b               vp8_v_loop_filter_inner_armv6
    792 endfunc
    793 
    794 function vp8_v_loop_filter_armv6
@ Shared macroblock-edge vertical loop filter body: like the inner
@ variant above but applies the stronger 3-pass (27/18/9) filter via
@ filter_1/filter_2/filter_3, updating p2..q2.  Register/stack layout
@ matches vp8_v_loop_filter_inner_armv6.
    795        mov             r5,  #4
    796        sub             sp,  sp,  #16
    797
    798        orr             r3,  r3,  r3,  lsl #16
    799        orr             r6,  r6,  r6,  lsl #16
    800        orr             r4,  r2,  r2,  lsl #8   @ flimE
    801        orr             r2,  r3,  r3,  lsl #8   @ flimI
    802        orr             r3,  r6,  r6,  lsl #8   @ thresh
    803 1:
    804        sub             r0,  r0,  r1,  lsl #2
    805        ldr             r10, [r0, r1]           @ p2
    806        ldr_post        r9,  r0,  r1,  lsl #1   @ p3
    807        ldr             r12, [r0, r1]           @ p0
    808        ldr_post        r11, r0,  r1,  lsl #1   @ p1
    809
    810        filter_mask_v
    811        cmp             lr,  #0
    812        beq             2f                      @ mask all zero: skip this group
    813
    814        filter_v        inner=0
    815        filter_1
    816
    817        str             r8,  [r0]               @ *oq0
    818 A       str             r10, [r0, -r1]          @ *op0
    819 T       sub             r0,  r0,  r1,  lsl #1
    820 T       str             r10, [r0, r1]
    821
    822        filter_2
    823
    824 A       str             r10, [r0, -r1, lsl #1]  @ *op1
    825 T       str_post        r10, r0,  r1,  lsl #1
    826        str             r8,  [r0, r1]           @ *oq1
    827
    828        ldr             r9,  [r0, r1,  lsl #1]  @ q2
    829        add             r0,  r0,  r1
    830 A       ldr             r11, [r0, -r1, lsl #2]  @ p2
    831 T       ldr_dpre        r11, r0,  r1,  lsl #2
    832
    833        filter_3
    834
    835 A       str             r10, [r0, -r1, lsl #2]  @ *op2
    836 T       str_post        r10, r0,  r1,  lsl #2
    837        str             r8,  [r0, r1]           @ *oq2
    838        sub             r0,  r0,  r1
    839 2:
    840        add             r0,  r0,  #4            @ next 4-pixel column group
    841        cmp             r5,  #3
    842        it              eq
    843        ldreq           r0,  [sp, #16]          @ switch to second-half dst
    844        subs            r5,  r5,  #1
    845        bne             1b
    846
    847        add             sp,  sp,  #16
    848        pop             {r0, r4-r11, pc}
    849 endfunc
    850 
    851 function ff_vp8_v_loop_filter16_armv6, export=1
@ 16-pixel macroblock-edge vertical filter entry; see the inner-edge
@ entry above for the argument marshalling pattern.
    852        push            {r4-r11, lr}
    853        add             r12, r0,  #8
    854        push            {r12}
    855        ldr             r6,  [sp, #40]
    856        orr             r2,  r2,  r2,  lsl #16
    857        b               vp8_v_loop_filter_armv6
    858 endfunc
    859 
    860 function ff_vp8_v_loop_filter8uv_armv6, export=1
@ 8-pixel U+V macroblock-edge vertical filter entry: V-plane dst (r1)
@ is pushed as the second-half pointer.
    861        push            {r1, r4-r11, lr}
    862        mov             r1,  r2
    863        orr             r2,  r3,  r3,  lsl #16
    864        ldr             r3,  [sp, #40]
    865        ldr             r6,  [sp, #44]
    866        b               vp8_v_loop_filter_armv6
    867 endfunc
    868 
    869 @ void vp8_h_loop_filter16_simple(uint8_t *dst, ptrdiff_t stride, int flim)
    870 function ff_vp8_h_loop_filter16_simple_armv6, export=1
@ Simple horizontal (column-edge) filter over 16 rows, 4 rows/pass.
@ r0 = dst, r1 = stride, r2 = flim.  Loads a 4x4 byte tile straddling
@ the edge, transposes it so each register holds one pixel column,
@ runs simple_filter, then transposes back via uxtb16/orr and stores
@ halfwords.
    871        push            {r4-r11, lr}
    872        orr             r12, r2,  r2,  lsl #16
    873        mov32           r2,  0x80808080
    874        orr             r12, r12, r12, lsl #8   @ flim in all 4 byte lanes
    875
    876        mov             lr,  #0
    877        mov             r11, #4                 @ 4 groups of 4 rows
    878 1:
    879        sub             r0,  r0,  #2            @ 2 pixels left of the edge
    880        ldr             r8,  [r0, r1]
    881        ldr_post        r7,  r0,  r1,  lsl #1
    882        ldr             r10, [r0, r1]
    883        ldr_post        r9,  r0,  r1,  lsl #1
    884        add             r0,  r0,  #2
    885        transpose       r6,  r5,  r4,  r3,  r7,  r8,  r9,  r10
    886        simple_filter
    887        sub             r0,  r0,  r1,  lsl #2   @ back to first of the 4 rows
    888        sub             r0,  r0,  #1
    889
@ Re-interleave the filtered columns (r4, r5) into per-row halfwords
    890        uxtb16          r6,  r4
    891        uxtb16          r8,  r5
    892        uxtb16          r7,  r4,  ror #8
    893        uxtb16          r9,  r5,  ror #8
    894        orr             r6,  r6,  r8,  lsl #8
    895        orr             r7,  r7,  r9,  lsl #8
    896        lsr             r4,  r6,  #16
    897        lsr             r5,  r7,  #16
    898
    899        strh_post       r6,  r0,  r1
    900        strh_post       r7,  r0,  r1
    901        strh_post       r4,  r0,  r1
    902        strh_post       r5,  r0,  r1
    903        add             r0,  r0,  #1
    904 2:
    905        subs            r11, r11, #1
    906        bne             1b
    907
    908        pop             {r4-r11, pc}
    909 endfunc
    910 
    911 function vp8_h_loop_filter_inner_armv6
@ Shared inner-edge horizontal loop filter body (reached by branch
@ from the 16/8uv entry points below).  Works on 4 rows per pass,
@ transposing a 4x4 tile to reuse the packed vertical filter macros
@ and transposing back before the stores.  [sp,#16] holds the
@ second-half dst pushed by the entry point.
    912        mov             r5,  #4
    913        sub             sp,  sp,  #16
    914
    915        orr             r3,  r3,  r3,  lsl #16
    916        orr             r9,  r9,  r9,  lsl #16
    917        orr             r4,  r2,  r2,  lsl #8   @ flimE
    918        orr             r2,  r3,  r3,  lsl #8   @ flimI
    919        orr             r3,  r9,  r9,  lsl #8   @ thresh
    920        sub             r0,  r0,  #4            @ 4 pixels left of the edge
    921 1:
    922        ldr             r7,  [r0, r1]
    923        ldr_post        r6,  r0,  r1,  lsl #1
    924        ldr             lr,  [r0, r1]
    925        ldr_post        r8,  r0,  r1,  lsl #1
    926
    927        filter_mask_h
    928
    929        cmp             lr,  #0
    930        sub             r0,  r0,  #2
    931        beq             2f                      @ mask all zero: skip
    932
    933        ldr             r6,  [sp]
    934
    935        filter_h        inner=1
    936        filter_inner
    937
@ Transpose filtered columns back into row words before storing
    938        transpose       lr,  r12, r7,  r6,  r11, r8,  r9,  r10
    939
    940 A       str             r6,  [r0, -r1, lsl #1]
    941 A       str             r7,  [r0, -r1]
    942 T       sub             r0,  r0,  r1,  lsl #1
    943 T       str             r7,  [r0, r1]
    944 T       str_post        r6,  r0,  r1,  lsl #1
    945        str             r12, [r0]
    946        str             lr,  [r0, r1]
    947 2:
    948        sub             r0,  r0,  #2
    949        add             r0,  r0,  r1,  lsl #1   @ advance 2 rows (4 total with loads)
    950        cmp             r5,  #3
    951        it              eq
    952        ldreq           r0,  [sp, #16]          @ switch to second-half dst
    953        subs            r5,  r5,  #1
    954        bne             1b
    955
    956        add             sp, sp, #16
    957        pop             {r0, r4-r11, pc}
    958 endfunc
    959 
    960 function ff_vp8_h_loop_filter16_inner_armv6, export=1
@ 16-row inner horizontal filter entry: second-half pointer is
@ dst + 8*stride - 4 (lower 8 rows, pre-offset left of the edge).
    961        push            {r4-r11, lr}
    962        add             r12, r0,  r1,  lsl #3
    963        sub             r12, r12, #4
    964        push            {r12}
    965        ldr             r9,  [sp, #40]          @ thresh argument
    966        orr             r2,  r2,  r2,  lsl #16
    967        b               vp8_h_loop_filter_inner_armv6
    968 endfunc
    969 
    970 function ff_vp8_h_loop_filter8uv_inner_armv6, export=1
@ 8-row U+V inner horizontal filter entry: V-plane dst (pre-offset by
@ -4 to match the shared body) is pushed as the second-half pointer.
    971        sub             r1,  r1,  #4
    972        push            {r1, r4-r11, lr}
    973        mov             r1,  r2
    974        orr             r2,  r3,  r3,  lsl #16
    975        ldr             r3,  [sp, #40]
    976        ldr             r9,  [sp, #44]
    977        b               vp8_h_loop_filter_inner_armv6
    978 endfunc
    979 
    980 function vp8_h_loop_filter_armv6
@ Shared macroblock-edge horizontal loop filter body: strong 3-pass
@ (27/18/9) filter across a vertical edge, 4 rows per iteration.
@ Because results are per-column after filtering, pass 1 stores via
@ repacked halfwords and passes 2/3 store byte-by-byte while gathering
@ p2/q2 bytes for the final pass.  [sp,#16] = second-half dst.
    981        mov             r5,  #4
    982        sub             sp,  sp,  #16
    983
    984        orr             r3,  r3,  r3,  lsl #16
    985        orr             r9,  r9,  r9,  lsl #16
    986        orr             r4,  r2,  r2,  lsl #8   @ flimE
    987        orr             r2,  r3,  r3,  lsl #8   @ flimI
    988        orr             r3,  r9,  r9,  lsl #8   @ thresh
    989 1:
    990        sub             r0,  r0,  #4
    991        ldr             r7,  [r0, r1]
    992        ldr_post        r6,  r0,  r1,  lsl #1
    993        ldr             lr,  [r0, r1]
    994        ldr_post        r8,  r0,  r1,  lsl #1
    995
    996        filter_mask_h
    997        cmp             lr,  #0
    998        it              eq
    999        addeq           r0,  r0,  r1,  lsl #1   @ skip: advance past the 4 rows
   1000        beq             2f
   1001
   1002        ldr             r6,  [sp]
   1003        sub             r0,  r0,  #2
   1004
   1005        filter_h        inner=0
   1006        filter_1
   1007
@ Pass 1 results (op0 in r10, oq0 in r8) are column-packed; interleave
@ into per-row halfwords and store across the edge
   1008        sub             r0,  r0,  r1,  lsl #1
   1009        uxtb16          r6,  r10
   1010        uxtb16          r7,  r8
   1011        uxtb16          r10, r10, ror #8
   1012        uxtb16          r8,  r8,  ror #8
   1013        orr             r6,  r6,  r7,  lsl #8
   1014        orr             r10, r10, r8,  lsl #8
   1015        lsr             r7,  r6,  #16
   1016        lsr             r8,  r10, #16
   1017
   1018        add             r0,  r0,  #1
   1019        strh_post       r6,  r0,  r1
   1020        strh_post       r10, r0,  r1
   1021        strh_post       r7,  r0,  r1
   1022        strh_post       r8,  r0,  r1
   1023
   1024        filter_2
   1025
   1026        sub             r0,  r0,  r1,  lsl #2
   1027        add             r0,  r0,  #3
   1028
@ Store op1/oq1 one byte per row while collecting the p2/q2 column
@ bytes (into r11/r9) needed by the final 1/7th-weight pass
   1029        ldrb            r11, [r0, #-5]          @ p2 for 1/7th difference
   1030        strb            r10, [r0, #-4]          @ op1
   1031        strb            r8,  [r0, #-1]          @ oq1
   1032        ldrb_post       r9,  r0,  r1            @ q2 for 1/7th difference
   1033
   1034        lsr             r10, r10, #8
   1035        lsr             r8,  r8,  #8
   1036
   1037        ldrb            r6,  [r0, #-5]
   1038        strb            r10, [r0, #-4]
   1039        strb            r8,  [r0, #-1]
   1040        ldrb_post       r7,  r0,  r1
   1041
   1042        lsr             r10, r10, #8
   1043        lsr             r8,  r8,  #8
   1044        orr             r11, r11, r6,  lsl #8
   1045        orr             r9,  r9,  r7,  lsl #8
   1046
   1047        ldrb            r6,  [r0, #-5]
   1048        strb            r10, [r0, #-4]
   1049        strb            r8,  [r0, #-1]
   1050        ldrb_post       r7,  r0,  r1
   1051
   1052        lsr             r10, r10, #8
   1053        lsr             r8,  r8,  #8
   1054        orr             r11, r11, r6,  lsl #16
   1055        orr             r9,  r9,  r7,  lsl #16
   1056
   1057        ldrb            r6,  [r0, #-5]
   1058        strb            r10, [r0, #-4]
   1059        strb            r8,  [r0, #-1]
   1060        ldrb_post       r7,  r0,  r1
   1061        orr             r11, r11, r6,  lsl #24
   1062        orr             r9,  r9,  r7,  lsl #24
   1063
   1064        filter_3
   1065
@ Store op2/oq2, again one byte per row
   1066        sub             r0,  r0,  r1,  lsl #2
   1067        strb            r10, [r0, #-5]
   1068        strb_post       r8,  r0,  r1
   1069        lsr             r10, r10, #8
   1070        lsr             r8,  r8,  #8
   1071        strb            r10, [r0, #-5]
   1072        strb_post       r8,  r0,  r1
   1073        lsr             r10, r10, #8
   1074        lsr             r8,  r8,  #8
   1075        strb            r10, [r0, #-5]
   1076        strb_post       r8,  r0,  r1
   1077        lsr             r10, r10, #8
   1078        lsr             r8,  r8,  #8
   1079        strb            r10, [r0, #-5]
   1080        strb_post       r8,  r0,  r1
   1081
   1082        sub             r0,  r0,  #2
   1083 2:
   1084        cmp             r5,  #3
   1085        it              eq
   1086        ldreq           r0,  [sp, #16]          @ switch to second-half dst
   1087        subs            r5,  r5,  #1
   1088        bne             1b
   1089
   1090        add             sp,  sp,  #16
   1091        pop             {r0, r4-r11, pc}
   1092 endfunc
   1093 
   1094 function ff_vp8_h_loop_filter16_armv6, export=1
@ 16-row macroblock-edge horizontal filter entry: second-half pointer
@ is dst + 8*stride (lower 8 rows).
   1095        push            {r4-r11, lr}
   1096        add             r12, r0,  r1,  lsl #3
   1097        push            {r12}
   1098        ldr             r9,  [sp, #40]
   1099        orr             r2,  r2,  r2,  lsl #16
   1100        b               vp8_h_loop_filter_armv6
   1101 endfunc
   1102 
   1103 function ff_vp8_h_loop_filter8uv_armv6, export=1
@ 8-row U+V macroblock-edge horizontal filter entry: V-plane dst (r1)
@ pushed as the second-half pointer.
   1104        push            {r1, r4-r11, lr}
   1105        mov             r1,  r2
   1106        orr             r2,  r3,  r3,  lsl #16
   1107        ldr             r3,  [sp, #40]
   1108        ldr             r9,  [sp, #44]
   1109        b               vp8_h_loop_filter_armv6
   1110 endfunc
   1111 
   1112 .ltorg
   1113 
   1114 @ MC
   1115 
   1116 @ void put_vp8_pixels16(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src,
   1117 @                       ptrdiff_t srcstride, int h, int mx, int my)
   1118 function ff_put_vp8_pixels16_armv6, export=1
@ Straight 16xH copy, 2 rows per iteration, 4 word loads + 2 strd
@ pairs per row.  r0/r1 = dst/dststride, r2/r3 = src/srcstride,
@ [sp,#32] = h (after the push).  mx/my args are unused (full-pel).
   1119        push            {r4-r11}
   1120        ldr             r12, [sp, #32]          @ h
   1121 1:
   1122        subs            r12, r12, #2
   1123        ldr             r5,  [r2, #4]
   1124        ldr             r6,  [r2, #8]
   1125        ldr             r7,  [r2, #12]
   1126        ldr_post        r4,  r2,  r3
   1127        ldr             r9,  [r2, #4]
   1128        ldr             r10, [r2, #8]
   1129        ldr             r11, [r2, #12]
   1130        ldr_post        r8,  r2,  r3
   1131        strd            r6,  r7,  [r0, #8]
   1132        strd_post       r4,  r5,  r0,  r1
   1133        strd            r10, r11, [r0, #8]
   1134        strd_post       r8,  r9,  r0,  r1
   1135        bgt             1b
   1136        pop             {r4-r11}
   1137        bx              lr
   1138 endfunc
   1139 
   1140 @ void put_vp8_pixels8(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src,
   1141 @                      ptrdiff_t srcstride, int h, int mx, int my)
   1142 function ff_put_vp8_pixels8_armv6, export=1
@ Straight 8xH copy, 4 rows per iteration, one word pair per row.
@ Same argument layout as ff_put_vp8_pixels16_armv6.
   1143        push            {r4-r11}
   1144        ldr             r12, [sp, #32]          @ h
   1145 1:
   1146        subs            r12, r12, #4
   1147        ldr             r5,  [r2, #4]
   1148        ldr_post        r4,  r2,  r3
   1149        ldr             r7,  [r2, #4]
   1150        ldr_post        r6,  r2,  r3
   1151        ldr             r9,  [r2, #4]
   1152        ldr_post        r8,  r2,  r3
   1153        ldr             r11, [r2, #4]
   1154        ldr_post        r10, r2,  r3
   1155        strd_post       r4,  r5,  r0,  r1
   1156        strd_post       r6,  r7,  r0,  r1
   1157        strd_post       r8,  r9,  r0,  r1
   1158        strd_post       r10, r11, r0,  r1
   1159        bgt             1b
   1160        pop             {r4-r11}
   1161        bx              lr
   1162 endfunc
   1163 
   1164 @ void put_vp8_pixels4(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src,
   1165 @                      ptrdiff_t srcstride, int h, int mx, int my)
   1166 function ff_put_vp8_pixels4_armv6, export=1
@ Straight 4xH copy, 4 rows per iteration, one word per row.
@ h is read from the stack BEFORE the push, hence [sp, #0].
   1167        ldr             r12, [sp, #0]           @ h
   1168        push            {r4-r6,lr}
   1169 1:
   1170        subs            r12, r12, #4
   1171        ldr_post        r4,  r2,  r3
   1172        ldr_post        r5,  r2,  r3
   1173        ldr_post        r6,  r2,  r3
   1174        ldr_post        lr,  r2,  r3
   1175        str_post        r4,  r0,  r1
   1176        str_post        r5,  r0,  r1
   1177        str_post        r6,  r0,  r1
   1178        str_post        lr,  r0,  r1
   1179        bgt             1b
   1180        pop             {r4-r6,pc}
   1181 endfunc
   1182 
   1183 @ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
   1184 @ arithmetic can be used to apply filters
@ VP8 6-tap subpel filter coefficients, 8 halfwords per phase
@ (presumably reordered for paired smlad use — the name suggests
@ tap order 1,3,2,4,5,6 plus two zero pads; confirm against callers).
   1185 const   sixtap_filters_13245600, align=4
   1186        .short     2, 108, -11,  36,  -8, 1, 0, 0
   1187        .short     3,  77, -16,  77, -16, 3, 0, 0
   1188        .short     1,  36,  -8, 108, -11, 2, 0, 0
   1189 endconst
   1190 
@ VP8 4-tap subpel filter coefficients, 4 halfwords per phase
@ (name suggests taps stored in 1,3,2,4 order for paired multiplies).
   1192 const   fourtap_filters_1324, align=4
   1193        .short     -6,  12, 123, -1
   1194        .short     -9,  50,  93, -6
   1195        .short     -6,  93,  50, -9
   1196        .short     -1, 123,  12, -6
   1197 endconst
   1197 
   1198 .macro  vp8_mc_1        name, size, hv
@ Generate a single-direction MC entry point (h-only or v-only):
@ marshals the stack args (h, mx, my) into the inner-function calling
@ convention described below ([sp]=block_width, [sp+4]=height,
@ r12 = filter index from mx|my) and tail-branches to the shared
@ bl_put_* body.
   1199 function ff_put_vp8_\name\size\()_\hv\()_armv6, export=1
   1200        sub             r1,  r1,  #\size        @ r1 = dst_stride - block_width
   1201        mov             r12, sp
   1202        push            {r1, r4-r11, lr}
   1203        ldm             r12, {r5-r7}            @ h, mx, my
   1204        mov             r4,  #\size
   1205        stm             r12, {r4, r5}           @ [sp]=block_width, [sp+4]=height
   1206        orr             r12, r6,  r7            @ filter index (mx | my)
   1207        b               bl_put_\name\()_\hv\()_armv6
   1208 endfunc
   1209 .endm
   1210 
@ Instantiate all single-direction MC entry points (6-tap/4-tap epel
@ and bilinear, for 16/8/4-wide blocks)
   1211 vp8_mc_1                epel,  16, h6
   1212 vp8_mc_1                epel,  16, v6
   1213 vp8_mc_1                epel,   8, h6
   1214 vp8_mc_1                epel,   8, v6
   1215 vp8_mc_1                epel,   8, h4
   1216 vp8_mc_1                epel,   8, v4
   1217 vp8_mc_1                epel,   4, h6
   1218 vp8_mc_1                epel,   4, v6
   1219 vp8_mc_1                epel,   4, h4
   1220 vp8_mc_1                epel,   4, v4
   1221
   1222 vp8_mc_1                bilin, 16, h
   1223 vp8_mc_1                bilin, 16, v
   1224 vp8_mc_1                bilin,  8, h
   1225 vp8_mc_1                bilin,  8, v
   1226 vp8_mc_1                bilin,  4, h
   1227 vp8_mc_1                bilin,  4, v
   1228 
   1229 @ 4 and 8 pixel wide mc blocks might have height of 8 or 16 lines
   1230 #define TMPSIZE \size * (16 / ((16 / \size + 1) / 2) + \ytaps - 1)
   1231 
   1232 .macro  vp8_mc_hv       name, size, h, v, ytaps
@ Generate a combined h+v MC entry point: run the horizontal filter
@ into a TMPSIZE stack buffer (with \ytaps-1 extra rows), then the
@ vertical filter from that buffer into dst.  \ytaps is the vertical
@ tap count (2, 4 or 6) and determines how far above the block the
@ source read starts.
   1233 function ff_put_vp8_\name\size\()_\h\v\()_armv6, export=1
   1234        push            {r0, r1, r4, lr}
   1235        add             r0,  sp,  #16
   1236        sub             sp,  sp,  #TMPSIZE+16
   1237        ldm             r0,  {r0, r12}          @ h (rows), mx
   1238        mov             r4,  #\size
   1239        add             lr,  r0,  #\ytaps-1     @ rows to h-filter incl. margin
   1240    .if \ytaps > 2
   1241        sub             r2,  r2,  r3,  lsl #\ytaps >> 1 & 1   @ start 1 or 2 rows above
   1242    .endif
   1243        stm             sp,  {r4, lr}
   1244        add             r0,  sp,  #16           @ dst = temp buffer
   1245        mov             r1,  #0                 @ packed rows: dst_stride - width = 0
   1246        bl              vp8_put_\name\()_\h\()_armv6
@ Second pass: vertical filter from the temp buffer to the real dst
   1247        add             r0,  sp,  #TMPSIZE+16
   1248        ldr             lr,  [sp, #TMPSIZE+16+16]       @ h (rows)
   1249        ldm             r0,  {r0, r1}           @ saved dst, dst_stride
   1250        mov             r3,  #\size             @ src stride = block width
   1251        ldr             r12, [sp, #TMPSIZE+16+16+8]     @ my (filter index)
   1252        str             lr,  [sp, #4]
   1253        add             r2,  sp,  #16 + \size * (\ytaps / 2 - 1)
   1254        sub             r1,  r1,  #\size
   1255        bl              vp8_put_\name\()_\v\()_armv6
   1256        add             sp,  sp,  #TMPSIZE+16+8
   1257        pop             {r4, pc}
   1258 endfunc
   1259 .endm
   1260 
@ Instantiate all two-pass (h then v) MC entry points
   1261 vp8_mc_hv               epel,  16, h6, v6, 6
   1262 vp8_mc_hv               epel,   8, h6, v6, 6
   1263 vp8_mc_hv               epel,   8, h4, v6, 6
   1264 vp8_mc_hv               epel,   8, h6, v4, 4
   1265 vp8_mc_hv               epel,   8, h4, v4, 4
   1266 vp8_mc_hv               epel,   4, h6, v6, 6
   1267 vp8_mc_hv               epel,   4, h4, v6, 6
   1268 vp8_mc_hv               epel,   4, h6, v4, 4
   1269 vp8_mc_hv               epel,   4, h4, v4, 4
   1270
   1271 vp8_mc_hv               bilin, 16, h,  v,  2
   1272 vp8_mc_hv               bilin,  8, h,  v,  2
   1273 vp8_mc_hv               bilin,  4, h,  v,  2
   1274 
   1275 .macro  sat4            r0,  r1,  r2,  r3
@ Round/saturate four 16.7 fixed-point filter accumulators to unsigned
@ bytes and pack them into \r0 (low->high: r0,r1,r2,r3 lanes).
@ Each value is >>7 (pkhbt's lsl #9 on the high halves folds in the
@ shift), clamped to [0,255] with usat16, then byte-interleaved.
   1276        asr             \r0, \r0, #7
   1277        asr             \r1, \r1, #7
   1278        pkhbt           \r0, \r0, \r2, lsl #9
   1279        pkhbt           \r1, \r1, \r3, lsl #9
   1280        usat16          \r0, #8,  \r0
   1281        usat16          \r1, #8,  \r1
   1282        orr             \r0, \r0, \r1, lsl #8
   1283 .endm
   1284 
   1285 @ Calling convention for the inner MC functions:
   1286 @       r0      dst
   1287 @       r1      dst_stride - block_width
   1288 @       r2      src
   1289 @       r3      src_stride
   1290 @       r4      block_width
   1291 @       r12     filter_index
   1292 @       [sp]    block_width
   1293 @       [sp+4]  height
   1294 @       [sp+8]  scratch
   1295 
   1296 function vp8_put_epel_h6_armv6
@ Horizontal 6-tap edge-pel filter, 4 output pixels per inner
@ iteration.  Args per the inner-MC convention above (r0 dst,
@ r1 dst_stride-width, r2 src, r3 src_stride, r4 width, r12 filter
@ index, [sp+4] height after push).  bl_put_* label is the no-push
@ entry used by the vp8_mc_1/vp8_mc_hv trampolines.
   1297        push            {r1, r4-r11, lr}
   1298 bl_put_epel_h6_armv6:
   1299        sub             r2,  r2,  #2            @ filter reads 2 pixels left of center
   1300        movrel          lr,  sixtap_filters_13245600 - 16
   1301        add             lr,  lr,  r12, lsl #3   @ select phase by filter index
   1302        sub             r3,  r3,  r4            @ src line advance = stride - width
   1303        str             r3,  [sp, #48]
   1304        ldm             lr,  {r1, r3, lr}       @ 3 packed coefficient pairs
   1305 1:
   1306        ldr             r7,  [r2, #5]           @ src[5-8]
   1307        ldr             r6,  [r2, #2]           @ src[2-5]
   1308        ldr             r5,  [r2], #4           @ src[0-3]
   1309
@ Unpack bytes into halfword pairs feeding the smlad dual multiplies
   1310        pkhtb           r7,  r7,  r7,  asr #8   @ src[8,7,7,6]
   1311        uxtb16          r9,  r6,  ror #8        @ src[5] | src[3]
   1312        uxtb16          r6,  r6                 @ src[4] | src[2]
   1313        uxtb16          r8,  r5,  ror #8        @ src[3] | src[1]
   1314        uxtb16          r11, r7,  ror #8        @ src[8] | src[7]
   1315        uxtb16          r7,  r7                 @ src[7] | src[6]
   1316        uxtb16          r5,  r5                 @ src[2] | src[0]
   1317
   1318        mov             r10, #0x40              @ rounding constant (1 << 6)
   1319        smlad           r5,  r5,  r1,  r10      @ filter[0][0]
   1320        smlad           r11, r11, lr,  r10      @ filter[3][2]
   1321        smlad           r12, r7,  lr,  r10      @ filter[2][2]
   1322        smlad           r10, r8,  r1,  r10      @ filter[1][0]
   1323        smlad           r5,  r8,  r3,  r5       @ filter[0][1]
   1324        smlad           r11, r9,  r1,  r11      @ filter[3][0]
   1325        smlad           r12, r9,  r3,  r12      @ filter[2][1]
   1326        pkhtb           r9,  r9,  r6,  asr #16  @ src[5] | src[4]
   1327        smlad           r10, r6,  r3,  r10      @ filter[1][1]
   1328        pkhbt           r7,  r9,  r7,  lsl #16  @ src[6] | src[4]
   1329        smlad           r5,  r9,  lr,  r5       @ filter[0][2]
   1330        pkhtb           r8,  r7,  r9,  asr #16  @ src[6] | src[5]
   1331        smlad           r11, r7,  r3,  r11      @ filter[3][1]
   1332        smlad           r9,  r8,  lr,  r10      @ filter[1][2]
   1333        smlad           r7,  r6,  r1,  r12      @ filter[2][0]
   1334
   1335        subs            r4,  r4,  #4            @ 4 pixels done
   1336
   1337        sat4            r5,  r9,  r7,  r11
   1338        str             r5,  [r0], #4
   1339
   1340        bne             1b
   1341
@ End of row: reload width/height/src-advance, step to the next line
   1342        add             r4,  sp,  #40
   1343        ldm             r4,  {r4, r5, r12}      @ width, height, src advance
   1344        ldr             r6,  [sp]               @ dst advance
   1345        subs            r5,  r5,  #1
   1346        add             r2,  r2,  r12
   1347        str             r5,  [sp, #44]
   1348        add             r0,  r0,  r6
   1349
   1350        bne             1b
   1351
   1352        pop             {r1, r4-r11, pc}
   1353 endfunc
   1354 
   1355 function vp8_put_epel_v6_armv6
        @ Vertical 6-tap sub-pel interpolation filter, 4 output pixels per
        @ inner-loop pass, built on dual 16-bit multiply-accumulates (smlad)
        @ over lane-packed pixel pairs (uxtb16 + pkhbt/pkhtb).
        @ Register/stack usage as visible in this body (confirm against the
        @ C caller for argument meaning):
        @   r0  = dst pointer          r2  = src pointer (mid-window row)
        @   r3  = src stride           r4  = width countdown (step 4)
        @   r12 = sub-pel filter index into sixtap_filters_13245600
        @   [sp]     = pushed r1, added to r0 after each row
        @   [sp,#40] = block width (reloaded to rewind src per row)
        @   [sp,#44] = remaining row count
        @   [sp,#48] = saved src stride (r3 is reused as scratch below)
   1356        push            {r1, r4-r11, lr}
bl_put_epel_v6_armv6:
        @ lr -> three packed coefficient-pair words for this filter index
        @ (8 bytes per entry; the -16 bias matches the index range in r12 —
        @ table is defined elsewhere in this file).
   1358        movrel          lr,  sixtap_filters_13245600 - 16
   1359        add             lr,  lr,  r12, lsl #3
   1360        str             r3,  [sp, #48]          @ keep stride; r3 becomes scratch in the loop
1:
        @ Load 4 source pixels from four of the six tap rows; the remaining
        @ two rows (s*0 and s*2) are loaded later to overlap with the MACs.
   1362        add             r1,  r3,  r3,  lsl #1   @ stride * 3
   1363        ldr_nreg        r5,  r2,  r3            @ src[0,1,2,3 + stride * 1]
   1364        ldr             r6,  [r2, r3]           @ src[0,1,2,3 + stride * 3]
   1365        ldr             r7,  [r2, r3,  lsl #1]  @ src[0,1,2,3 + stride * 4]
   1366        ldr             r8,  [r2, r1]           @ src[0,1,2,3 + stride * 5]
   1367 
        @ Unpack bytes to 16-bit lanes, then repack so each register holds
        @ the same column from two source rows — the operand shape smlad needs.
   1368        uxtb16          r9,  r5,  ror #8        @ src[3 + s*1] | src[1 + s*1]
   1369        uxtb16          r10, r6,  ror #8        @ src[3 + s*3] | src[1 + s*3]
   1370        uxtb16          r11, r7,  ror #8        @ src[3 + s*4] | src[1 + s*4]
   1371        uxtb16          r12, r8,  ror #8        @ src[3 + s*5] | src[1 + s*5]
   1372        uxtb16          r5,  r5                 @ src[2 + s*1] | src[0 + s*1]
   1373        uxtb16          r6,  r6                 @ src[2 + s*3] | src[0 + s*3]
   1374        uxtb16          r7,  r7                 @ src[2 + s*4] | src[0 + s*4]
   1375        uxtb16          r8,  r8                 @ src[2 + s*5] | src[0 + s*5]
   1376        pkhbt           r1,  r9,  r10, lsl #16  @ src[1 + s*3] | src[1 + s*1]
   1377        pkhtb           r9,  r10, r9,  asr #16  @ src[3 + s*3] | src[3 + s*1]
   1378        pkhbt           r10, r11, r12, lsl #16  @ src[1 + s*5] | src[1 + s*4]
   1379        pkhtb           r11, r12, r11, asr #16  @ src[3 + s*5] | src[3 + s*4]
   1380        pkhbt           r12, r5,  r6,  lsl #16  @ src[0 + s*3] | src[0 + s*1]
   1381        pkhtb           r5,  r6,  r5,  asr #16  @ src[2 + s*3] | src[2 + s*1]
   1382        pkhbt           r6,  r7,  r8,  lsl #16  @ src[0 + s*5] | src[0 + s*4]
   1383        pkhtb           r7,  r8,  r7,  asr #16  @ src[2 + s*5] | src[2 + s*4]
   1384 
   1385        ldr             r8,  [lr, #4]           @ packed coefficient pair 1
   1386        mov             r3,  #0x40              @ rounding bias consumed by sat4
   1387        smlad           r12, r12, r8,  r3       @ filter[0][1]
   1388        smlad           r1,  r1,  r8,  r3       @ filter[1][1]
   1389        smlad           r5,  r5,  r8,  r3       @ filter[2][1]
   1390        smlad           r9,  r9,  r8,  r3       @ filter[3][1]
   1391        ldr             r8,  [lr, #8]           @ packed coefficient pair 2
   1392        ldr             r3,  [sp, #48]          @ restore src stride
   1393        smlad           r12, r6,  r8,  r12      @ filter[0][2]
   1394        smlad           r1,  r10, r8,  r1       @ filter[1][2]
   1395        ldr_nreg        r6,  r2,  r3,  lsl #1   @ src[0,1,2,3 + stride * 0]
   1396        ldr             r10, [r2], #4           @ src[0,1,2,3 + stride * 2]
   1397        smlad           r5,  r7,  r8,  r5       @ filter[2][2]
   1398        smlad           r9,  r11, r8,  r9       @ filter[3][2]
   1399 
        @ Same unpack/repack dance for the two remaining tap rows.
   1400        uxtb16          r7,  r6,  ror #8        @ src[3 + s*0] | src[1 + s*0]
   1401        uxtb16          r11, r10, ror #8        @ src[3 + s*2] | src[1 + s*2]
   1402        uxtb16          r6,  r6                 @ src[2 + s*0] | src[0 + s*0]
   1403        uxtb16          r10, r10                @ src[2 + s*2] | src[0 + s*2]
   1404 
   1405        pkhbt           r8,  r7,  r11, lsl #16  @ src[1 + s*2] | src[1 + s*0]
   1406        pkhtb           r7,  r11, r7,  asr #16  @ src[3 + s*2] | src[3 + s*0]
   1407        pkhbt           r11, r6,  r10, lsl #16  @ src[0 + s*2] | src[0 + s*0]
   1408        pkhtb           r6,  r10, r6,  asr #16  @ src[2 + s*2] | src[2 + s*0]
   1409 
   1410        ldr             r10, [lr]               @ packed coefficient pair 0
   1411        subs            r4,  r4,  #4            @ 4 more columns finished this row
   1412        smlad           r12, r11, r10, r12      @ filter[0][0]
   1413        smlad           r1,  r8,  r10, r1       @ filter[1][0]
   1414        smlad           r5,  r6,  r10, r5       @ filter[2][0]
   1415        smlad           r9,  r7,  r10, r9       @ filter[3][0]
   1416 
        @ sat4 (macro defined earlier in this file, not visible here) —
        @ presumably scales/saturates the four accumulators and packs them
        @ into one word of 4 output bytes; see the macro definition.
   1417        sat4            r12, r1,  r5,  r9
   1418        str             r12, [r0], #4
   1419 
   1420        bne             1b
   1421 
        @ End of row: rewind src by the block width, step both pointers to
        @ the next line, loop until the row counter at [sp,#44] reaches 0.
   1422        ldrd            r4,  r5,  [sp, #40]     @ r4 = width, r5 = rows left
   1423        ldr             r6,  [sp]               @ dst line step (pushed r1)
   1424        subs            r5,  r5,  #1
   1425        sub             r2,  r2,  r4
   1426        str             r5,  [sp, #44]
   1427        add             r0,  r0,  r6
   1428        add             r2,  r2,  r3
   1429 
   1430        bne             1b
   1431 
   1432        pop             {r1, r4-r11, pc}
   1433 endfunc
   1434 
   1435 function vp8_put_epel_h4_armv6
        @ Horizontal 4-tap sub-pel interpolation filter, 4 output pixels per
        @ inner-loop pass. Unlike the 6-tap routines this one keeps the row
        @ counter in lr and the dst line step in r1 across the whole loop.
        @ Register/stack usage visible in this body (confirm meanings at the
        @ call site):
        @   r0 = dst, r1 = dst line step, r2 = src, r3 = src stride,
        @   r4 = width countdown (step 4), r12 = sub-pel filter index,
        @   [sp,#40] = width reload, [sp,#44] = row count.
   1436        push            {r1, r4-r11, lr}
bl_put_epel_h4_armv6:
   1438        subs            r2,  r2,  #1            @ back up one byte so the taps span src[-1..2]
   1439        movrel          lr,  fourtap_filters_1324 - 4
   1440        add             lr,  lr,  r12, lsl #2   @ 2 words of packed coefficients per index
   1441        sub             r3,  r3,  r4            @ line advance = stride - width (src moves by width in the loop)
   1442        ldm             lr,  {r5, r6}           @ r5/r6 = packed 4-tap coefficient pairs
   1443        ldr             lr,  [sp, #44]          @ lr = remaining rows
1:
        @ Three overlapping word loads give the 7 consecutive bytes needed
        @ for 4 outputs of a 4-tap filter.
   1445        ldr             r9,  [r2, #3]
   1446        ldr             r8,  [r2, #2]
   1447        ldr             r7,  [r2], #4
   1448 
   1449        uxtb16          r9,  r9,  ror #8        @ src[6] | src[4]
   1450        uxtb16          r10, r8,  ror #8        @ src[5] | src[3]
   1451        uxtb16          r8,  r8                 @ src[4] | src[2]
   1452        uxtb16          r11, r7,  ror #8        @ src[3] | src[1]
   1453        uxtb16          r7,  r7                 @ src[2] | src[0]
   1454 
   1455        mov             r12, #0x40              @ rounding bias consumed by sat4
   1456        smlad           r9,  r9,  r6,  r12      @ filter[3][1]
   1457        smlad           r7,  r7,  r5,  r12      @ filter[0][0]
   1458        smlad           r9,  r10, r5,  r9       @ filter[3][0]
   1459        smlad           r10, r10, r6,  r12      @ filter[2][1]
   1460        smlad           r12, r11, r5,  r12      @ filter[1][0]
   1461        smlad           r7,  r11, r6,  r7       @ filter[0][1]
   1462        smlad           r10, r8,  r5,  r10      @ filter[2][0]
   1463        smlad           r12, r8,  r6,  r12      @ filter[1][1]
   1464 
   1465        subs            r4,  r4,  #4            @ 4 more columns finished this row
   1466 
        @ sat4 (macro defined earlier in this file) packs the four
        @ accumulators into one word of output bytes.
   1467        sat4            r7,  r12, r10, r9
   1468        str             r7,  [r0], #4
   1469 
   1470        bne             1b
   1471 
        @ Next row: src already sits width bytes into the line, so adding
        @ r3 (= stride - width) lands on the next line's start.
   1472        subs            lr,  lr,  #1
   1473        ldr             r4,  [sp, #40]          @ reload width countdown
   1474        add             r2,  r2,  r3
   1475        add             r0,  r0,  r1
   1476 
   1477        bne             1b
   1478 
   1479        pop             {r1, r4-r11, pc}
   1480 endfunc
   1481 
   1482 function vp8_put_epel_v4_armv6
        @ Vertical 4-tap sub-pel interpolation filter, 4 output pixels per
        @ inner-loop pass. Both r1 and r3 are consumed as lane scratch inside
        @ the loop, so the stride lives at [sp,#48] and the dst step at [sp].
        @ Register/stack usage visible in this body (confirm at call site):
        @   r0 = dst, r2 = src (second of the four tap rows),
        @   r3 = src stride, r4 = width countdown (step 4),
        @   r12 = sub-pel filter index,
        @   [sp] = pushed r1 (dst line step), [sp,#40] = width reload,
        @   [sp,#44] = row count, [sp,#48] = saved stride.
   1483        push            {r1, r4-r11, lr}
bl_put_epel_v4_armv6:
   1485        movrel          lr,  fourtap_filters_1324 - 4
   1486        add             lr,  lr,  r12, lsl #2   @ 2 words of packed coefficients per index
   1487        ldm             lr,  {r5, r6}           @ r5/r6 = packed 4-tap coefficient pairs
   1488        str             r3,  [sp, #48]          @ keep stride; r3 becomes scratch below
1:
        @ Load 4 pixels from each of the four tap rows (s*0..s*3).
   1490        ldr             lr,  [r2, r3, lsl #1]   @ row s*3
   1491        ldr             r12, [r2, r3]           @ row s*2
   1492        ldr_nreg        r7,  r2,  r3            @ row s*0
   1493        ldr             r11, [r2], #4           @ row s*1; advance src 4 columns
   1494 
        @ Unpack to 16-bit lanes and repack as same-column/adjacent-row
        @ pairs for smlad.
   1495        uxtb16          r8,  lr,  ror #8        @ src[3 + s*3] | src[1 + s*3]
   1496        uxtb16          r9,  r12, ror #8        @ src[3 + s*2] | src[1 + s*2]
   1497        uxtb16          r3,  r7,  ror #8        @ src[3 + s*0] | src[1 + s*0]
   1498        uxtb16          r1,  r11, ror #8        @ src[3 + s*1] | src[1 + s*1]
   1499        uxtb16          lr,  lr                 @ src[2 + s*3] | src[0 + s*3]
   1500        uxtb16          r12, r12                @ src[2 + s*2] | src[0 + s*2]
   1501        uxtb16          r7,  r7                 @ src[2 + s*0] | src[0 + s*0]
   1502        uxtb16          r11, r11                @ src[2 + s*1] | src[0 + s*1]
   1503        pkhbt           r10, r1,  r8,  lsl #16  @ src[1 + s*3] | src[1 + s*1]
   1504        pkhtb           r1,  r8,  r1,  asr #16  @ src[3 + s*3] | src[3 + s*1]
   1505        pkhbt           r8,  r3,  r9,  lsl #16  @ src[1 + s*2] | src[1 + s*0]
   1506        pkhtb           r3,  r9,  r3,  asr #16  @ src[3 + s*2] | src[3 + s*0]
   1507        pkhbt           r9,  r11, lr,  lsl #16  @ src[0 + s*3] | src[0 + s*1]
   1508        pkhtb           r11, lr,  r11, asr #16  @ src[2 + s*3] | src[2 + s*1]
   1509        pkhbt           lr,  r7,  r12, lsl #16  @ src[0 + s*2] | src[0 + s*0]
   1510        pkhtb           r7,  r12, r7,  asr #16  @ src[2 + s*2] | src[2 + s*0]
   1511 
   1512        mov             r12, #0x40              @ rounding bias consumed by sat4
   1513        smlad           r9,  r9,  r6,  r12      @ filter[0][1]
   1514        smlad           r10, r10, r6,  r12      @ filter[1][1]
   1515        smlad           r11, r11, r6,  r12      @ filter[2][1]
   1516        smlad           r1,  r1,  r6,  r12      @ filter[3][1]
   1517        smlad           r9,  lr,  r5,  r9       @ filter[0][0]
   1518        smlad           r10, r8,  r5,  r10      @ filter[1][0]
   1519        smlad           r11, r7,  r5,  r11      @ filter[2][0]
   1520        smlad           r1,  r3,  r5,  r1       @ filter[3][0]
   1521 
   1522        subs            r4,  r4,  #4            @ 4 more columns finished this row
   1523        ldr             r3,  [sp, #48]          @ restore src stride
   1524 
        @ sat4 (macro defined earlier in this file) packs the four
        @ accumulators into one word of output bytes.
   1525        sat4            r9,  r10, r11, r1
   1526        str             r9,  [r0], #4
   1527 
   1528        bne             1b
   1529 
        @ End of row: rewind src by the width, advance both pointers one
        @ line, loop until the row counter at [sp,#44] reaches zero.
   1530        ldr             r4,  [sp, #40]          @ width
   1531        ldr             r12, [sp, #44]          @ rows left
   1532        add             r2,  r2,  r3
   1533        ldr             r9,  [sp, #0]           @ dst line step (pushed r1)
   1534        subs            r12, r12, #1
   1535        sub             r2,  r2,  r4
   1536        str             r12, [sp, #44]
   1537        add             r0,  r0,  r9
   1538 
   1539        bne             1b
   1540 
   1541        pop             {r1, r4-r11, pc}
   1542 endfunc
   1543 
   1544 function vp8_put_bilin_h_armv6
        @ Horizontal bilinear filter: dst[x] = (src[x]*(8-mx) + src[x+1]*mx
        @ + 4) >> 3, 4 output pixels per inner-loop pass.
        @ Register/stack usage visible in this body (confirm at call site):
        @   r0 = dst, r1 = dst line step, r2 = src, r3 = src stride,
        @   r4 = width countdown (step 4), r12 = horizontal fraction mx,
        @   [sp,#40] = width reload, [sp,#44] = row count.
   1545        push            {r1, r4-r11, lr}
bl_put_bilin_h_armv6:
        @ r5 = (r12 << 16) - r12 + 8 = (mx << 16) | (8 - mx): both bilinear
        @ weights packed as halfwords for smlad (assumes 0 <= mx <= 8).
   1547        rsb             r5,  r12, r12, lsl #16
   1548        ldr             r12, [sp, #44]          @ r12 = remaining rows
   1549        sub             r3,  r3,  r4            @ line advance = stride - width
   1550        add             r5,  r5,  #8
1:
        @ Five consecutive bytes give the 4 overlapping (src[x], src[x+1])
        @ pairs.
   1552        ldrb            r6,  [r2], #1
   1553        ldrb            r7,  [r2], #1
   1554        ldrb            r8,  [r2], #1
   1555        ldrb            r9,  [r2], #1
   1556        ldrb            lr,  [r2]
   1557 
   1558        pkhbt           r6,  r6,  r7,  lsl #16  @ src[1] | src[0]
   1559        pkhbt           r7,  r7,  r8,  lsl #16  @ src[2] | src[1]
   1560        pkhbt           r8,  r8,  r9,  lsl #16  @ src[3] | src[2]
   1561        pkhbt           r9,  r9,  lr,  lsl #16  @ src[4] | src[3]
   1562 
   1563        mov             r10, #4                 @ rounding bias for the >> 3 below
   1564        smlad           r6,  r6,  r5,  r10      @ pixel 0
   1565        smlad           r7,  r7,  r5,  r10      @ pixel 1
   1566        smlad           r8,  r8,  r5,  r10      @ pixel 2
   1567        smlad           r9,  r9,  r5,  r10      @ pixel 3
   1568 
   1569        subs            r4,  r4,  #4            @ 4 more columns finished this row
   1570 
        @ Pack the four results into one word of bytes; "lsl #13" folds the
        @ >> 3 into the pkhbt top-half selection ((x << 13) >> 16 == x >> 3).
        @ The weights sum to 8, so each result fits in 8 bits — no
        @ saturation required.
   1571        asr             r6,  #3
   1572        asr             r7,  #3
   1573        pkhbt           r6,  r6,  r8,  lsl #13  @ pixel2 | pixel0
   1574        pkhbt           r7,  r7,  r9,  lsl #13  @ pixel3 | pixel1
   1575        orr             r6,  r6,  r7,  lsl #8   @ pixels 3..0 as bytes
   1576        str             r6,  [r0], #4
   1577 
   1578        bne             1b
   1579 
        @ Next row: src already advanced by width, so += (stride - width).
   1580        ldr             r4,  [sp, #40]          @ reload width countdown
   1581        subs            r12, r12, #1
   1582        add             r2,  r2,  r3
   1583        add             r0,  r0,  r1
   1584 
   1585        bne             1b
   1586 
   1587        pop             {r1, r4-r11, pc}
   1588 endfunc
   1589 
   1590 function vp8_put_bilin_v_armv6
        @ Vertical bilinear filter: dst[x] = (src[x]*(8-my) +
        @ src[x+stride]*my + 4) >> 3, 4 output pixels per inner-loop pass.
        @ Register/stack usage visible in this body (confirm at call site):
        @   r0 = dst, r1 = dst line step, r2 = src, r3 = src stride,
        @   r4 = width countdown (step 4), r12 = vertical fraction my,
        @   [sp,#40] = width reload, [sp,#44] = row count.
   1591        push            {r1, r4-r11, lr}
bl_put_bilin_v_armv6:
        @ r5 = (my << 16) | (8 - my): packed bilinear weights for smlad
        @ (assumes 0 <= my <= 8).
   1593        rsb             r5,  r12, r12, lsl #16
   1594        ldr             r12, [sp, #44]          @ r12 = remaining rows
   1595        add             r5,  r5,  #8
1:
        @ For each of 4 columns load the current-row and next-row bytes and
        @ pack them as (below << 16) | current; the last pair is built with
        @ swapped pkhbt operands because r9 (below) was loaded before r10
        @ (current).
   1597        ldrb            r10, [r2, r3]
   1598        ldrb            r6,  [r2], #1
   1599        ldrb            r11, [r2, r3]
   1600        ldrb            r7,  [r2], #1
   1601        ldrb            lr,  [r2, r3]
   1602        ldrb            r8,  [r2], #1
   1603        ldrb            r9,  [r2, r3]
   1604        pkhbt           r6,  r6,  r10, lsl #16
   1605        ldrb            r10, [r2], #1
   1606        pkhbt           r7,  r7,  r11, lsl #16
   1607        pkhbt           r8,  r8,  lr,  lsl #16
   1608        pkhbt           r9,  r10, r9,  lsl #16
   1609 
   1610        mov             r10, #4                 @ rounding bias for the >> 3 below
   1611        smlad           r6,  r6,  r5,  r10      @ pixel 0
   1612        smlad           r7,  r7,  r5,  r10      @ pixel 1
   1613        smlad           r8,  r8,  r5,  r10      @ pixel 2
   1614        smlad           r9,  r9,  r5,  r10      @ pixel 3
   1615 
   1616        subs            r4,  r4,  #4            @ 4 more columns finished this row
   1617 
        @ Pack the four results into one word of bytes; "lsl #13" folds the
        @ >> 3 into the pkhbt top-half selection ((x << 13) >> 16 == x >> 3).
        @ Weights sum to 8, so results fit in 8 bits — no saturation needed.
   1618        asr             r6,  #3
   1619        asr             r7,  #3
   1620        pkhbt           r6,  r6,  r8,  lsl #13  @ pixel2 | pixel0
   1621        pkhbt           r7,  r7,  r9,  lsl #13  @ pixel3 | pixel1
   1622        orr             r6,  r6,  r7,  lsl #8   @ pixels 3..0 as bytes
   1623        str             r6,  [r0], #4
   1624 
   1625        bne             1b
   1626 
        @ Next row: stride was NOT pre-reduced here (unlike bilin_h), so
        @ rewind the width explicitly after adding the stride.
   1627        ldr             r4,  [sp, #40]          @ reload width countdown
   1628        subs            r12, r12, #1
   1629        add             r2,  r2,  r3
   1630        add             r0,  r0,  r1
   1631        sub             r2,  r2,  r4
   1632 
   1633        bne             1b
   1634        pop             {r1, r4-r11, pc}
   1635 endfunc