tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

pixman-arm-neon-asm-bilinear.S (45613B)


      1 /*
      2 * Copyright © 2011 SCore Corporation
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     21 * DEALINGS IN THE SOFTWARE.
     22 *
     23 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
     24 * Author:  Taekyun Kim (tkq.kim@samsung.com)
     25 */
     26 
     27 /*
     28 * This file contains scaled bilinear scanline functions implemented
     29 * using Siarhei's older bilinear macro template.
     30 *
     31 * << General scanline function procedures >>
     32 *  1. bilinearly interpolate source pixels
     33 *  2. load mask pixels
     34 *  3. load destination pixels
     35 *  4. duplicate mask to fill whole register
     36 *  5. interleave source & destination pixels
     37 *  6. apply mask to source pixels
     38 *  7. combine source & destination pixels
     39 *  8. deinterleave final result
     40 *  9. store destination pixels
     41 *
     42 * All registers with a single number (e.g. src0, tmp0) are 64-bit registers.
     43 * Registers with double numbers (src01, dst01) are 128-bit registers.
     44 * All temp registers can be used freely outside the code block.
     45 * Assume that the symbols (register .req) OUT and MASK are defined by the caller of these macro blocks.
     46 *
     47 * Remarks
     48 *  There can be lots of pipeline stalls inside code blocks and between code blocks.
     49 *  Further optimizations will be done by new macro templates using the head/tail_head/tail scheme.
     50 */
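
        /*
         * As a rough C sketch of the procedure above, for one a8r8g8b8 pixel
         * with an a8 mask and the 'over' operator (helper names here are
         * hypothetical and for orientation only, not part of the build):
         *
         *     uint32_t s = bilinear_interpolate (top, bottom, wt, wb, x);  // 1
         *     uint8_t  m = *mask++;                                        // 2, 4
         *     uint32_t d = *out;                                           // 3, 5
         *     s = mul_8888_by_a8 (s, m);                                   // 6
         *     d = combine_over (s, d);                                     // 7, 8
         *     *out++ = d;                                                  // 9
         *     x += ux;
         */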
     51 
     52 /* Prevent the stack from becoming executable for no reason... */
     53 #if defined(__linux__) && defined (__ELF__)
     54 .section .note.GNU-stack,"",%progbits
     55 #endif
     56 
     57 .text
     58 .arch armv7a
     59 .object_arch armv4
     60 .fpu neon
     61 .eabi_attribute 10, 0
     62 .eabi_attribute 12, 0
     63 .arm
     64 .altmacro
     65 .p2align 2
     66 
     67 #include "pixman-private.h"
     68 #include "pixman-arm-asm.h"
     69 #include "pixman-arm-neon-asm.h"
     70 
     71 pixman_syntax_unified
     72 
     73 /*
     74 * Bilinear macros from pixman-arm-neon-asm.S
     75 */
     76 
     77 /*
     78 * Bilinear scaling support code which tries to provide pixel fetching, color
     79 * format conversion, and interpolation as separate macros which can be used
     80 * as the basic building blocks for constructing bilinear scanline functions.
     81 */
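
        /*
         * The load macros below step a 16.16 fixed-point x coordinate: the
         * high 16 bits select a horizontal pixel pair, the low bits carry the
         * interpolation weight. A minimal C model of bilinear_load_8888
         * (illustrative names, not part of the build):
         *
         *     int i = x >> 16;                    // mov  TMP1, X, asr #16
         *     x += ux;                            // add  X, X, UX
         *     memcpy (&top_pair, top + i, 8);     // vld1.32 {reg1}, [TMP1], STRIDE
         *     memcpy (&bot_pair, bottom + i, 8);  // vld1.32 {reg2}, [TMP1]
         *
         * where STRIDE is precomputed by the main template macro as
         * (uint8_t *)bottom - (uint8_t *)top.
         */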
     82 
     83 .macro bilinear_load_8888 reg1, reg2, tmp
     84    mov       TMP1, X, asr #16
     85    add       X, X, UX
     86    add       TMP1, TOP, TMP1, asl #2
     87    vld1.32   {\reg1}, [TMP1], STRIDE
     88    vld1.32   {\reg2}, [TMP1]
     89 .endm
     90 
     91 .macro bilinear_load_0565 reg1, reg2, tmp
     92    mov       TMP1, X, asr #16
     93    add       X, X, UX
     94    add       TMP1, TOP, TMP1, asl #1
     95    vld1.32   {\reg2[0]}, [TMP1], STRIDE
     96    vld1.32   {\reg2[1]}, [TMP1]
     97    convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
     98 .endm
     99 
    100 .macro bilinear_load_and_vertical_interpolate_two_8888 \
    101                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
    102 
    103    bilinear_load_8888 \reg1, \reg2, \tmp1
    104    vmull.u8  \acc1, \reg1, d28
    105    vmlal.u8  \acc1, \reg2, d29
    106    bilinear_load_8888 \reg3, \reg4, \tmp2
    107    vmull.u8  \acc2, \reg3, d28
    108    vmlal.u8  \acc2, \reg4, d29
    109 .endm
    110 
    111 .macro bilinear_load_and_vertical_interpolate_four_8888 \
    112                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
    113                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
    114 
    115    bilinear_load_and_vertical_interpolate_two_8888 \
    116                \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
    117    bilinear_load_and_vertical_interpolate_two_8888 \
    118                \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
    119 .endm
    120 
    121 .macro bilinear_load_and_vertical_interpolate_two_0565 \
    122                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
    123 
    124    mov       TMP1, X, asr #16
    125    add       X, X, UX
    126    add       TMP1, TOP, TMP1, asl #1
    127    mov       TMP2, X, asr #16
    128    add       X, X, UX
    129    add       TMP2, TOP, TMP2, asl #1
    130    vld1.32   {\acc2lo[0]}, [TMP1], STRIDE
    131    vld1.32   {\acc2hi[0]}, [TMP2], STRIDE
    132    vld1.32   {\acc2lo[1]}, [TMP1]
    133    vld1.32   {\acc2hi[1]}, [TMP2]
    134    convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
    135    vzip.u8   \reg1, \reg3
    136    vzip.u8   \reg2, \reg4
    137    vzip.u8   \reg3, \reg4
    138    vzip.u8   \reg1, \reg2
    139    vmull.u8  \acc1, \reg1, d28
    140    vmlal.u8  \acc1, \reg2, d29
    141    vmull.u8  \acc2, \reg3, d28
    142    vmlal.u8  \acc2, \reg4, d29
    143 .endm
    144 
    145 .macro bilinear_load_and_vertical_interpolate_four_0565 \
    146                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
    147                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
    148 
    149    mov       TMP1, X, asr #16
    150    add       X, X, UX
    151    add       TMP1, TOP, TMP1, asl #1
    152    mov       TMP2, X, asr #16
    153    add       X, X, UX
    154    add       TMP2, TOP, TMP2, asl #1
    155    vld1.32   {\xacc2lo[0]}, [TMP1], STRIDE
    156    vld1.32   {\xacc2hi[0]}, [TMP2], STRIDE
    157    vld1.32   {\xacc2lo[1]}, [TMP1]
    158    vld1.32   {\xacc2hi[1]}, [TMP2]
    159    convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
    160    mov       TMP1, X, asr #16
    161    add       X, X, UX
    162    add       TMP1, TOP, TMP1, asl #1
    163    mov       TMP2, X, asr #16
    164    add       X, X, UX
    165    add       TMP2, TOP, TMP2, asl #1
    166    vld1.32   {\yacc2lo[0]}, [TMP1], STRIDE
    167    vzip.u8   \xreg1, \xreg3
    168    vld1.32   {\yacc2hi[0]}, [TMP2], STRIDE
    169    vzip.u8   \xreg2, \xreg4
    170    vld1.32   {\yacc2lo[1]}, [TMP1]
    171    vzip.u8   \xreg3, \xreg4
    172    vld1.32   {\yacc2hi[1]}, [TMP2]
    173    vzip.u8   \xreg1, \xreg2
    174    convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
    175    vmull.u8  \xacc1, \xreg1, d28
    176    vzip.u8   \yreg1, \yreg3
    177    vmlal.u8  \xacc1, \xreg2, d29
    178    vzip.u8   \yreg2, \yreg4
    179    vmull.u8  \xacc2, \xreg3, d28
    180    vzip.u8   \yreg3, \yreg4
    181    vmlal.u8  \xacc2, \xreg4, d29
    182    vzip.u8   \yreg1, \yreg2
    183    vmull.u8  \yacc1, \yreg1, d28
    184    vmlal.u8  \yacc1, \yreg2, d29
    185    vmull.u8  \yacc2, \yreg3, d28
    186    vmlal.u8  \yacc2, \yreg4, d29
    187 .endm
    188 
    189 .macro bilinear_store_8888 numpix, tmp1, tmp2
    190 .if \numpix == 4
    191    vst1.32   {d0, d1}, [OUT]!
    192 .elseif \numpix == 2
    193    vst1.32   {d0}, [OUT]!
    194 .elseif \numpix == 1
    195    vst1.32   {d0[0]}, [OUT, :32]!
    196 .else
    197    .error bilinear_store_8888 numpix is unsupported
    198 .endif
    199 .endm
    200 
    201 .macro bilinear_store_0565 numpix, tmp1, tmp2
    202    vuzp.u8 d0, d1
    203    vuzp.u8 d2, d3
    204    vuzp.u8 d1, d3
    205    vuzp.u8 d0, d2
    206    convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
    207 .if \numpix == 4
    208    vst1.16   {d2}, [OUT]!
    209 .elseif \numpix == 2
    210    vst1.32   {d2[0]}, [OUT]!
    211 .elseif \numpix == 1
    212    vst1.16   {d2[0]}, [OUT]!
    213 .else
    214    .error bilinear_store_0565 numpix is unsupported
    215 .endif
    216 .endm
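
        /*
         * The 8888 -> 0565 repack done by convert_8888_to_0565 above is the
         * standard truncation; per pixel (C sketch, illustration only):
         *
         *     uint16_t p = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
         */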
    217 
    218 
    219 /*
    220 * Macros for loading mask pixels into register 'mask'.
    221 * vdup must be done somewhere else.
    222 */
    223 .macro bilinear_load_mask_x numpix, mask
    224 .endm
    225 
    226 .macro bilinear_load_mask_8 numpix, mask
    227 .if \numpix == 4
    228    vld1.32     {\mask[0]}, [MASK]!
    229 .elseif \numpix == 2
    230    vld1.16     {\mask[0]}, [MASK]!
    231 .elseif \numpix == 1
    232    vld1.8      {\mask[0]}, [MASK]!
    233 .else
    234    .error bilinear_load_mask_8 \numpix is unsupported
    235 .endif
    236    pld         [MASK, #prefetch_offset]
    237 .endm
    238 
    239 .macro bilinear_load_mask mask_fmt, numpix, mask
    240    bilinear_load_mask_\()\mask_fmt \numpix, \mask
    241 .endm
    242 
    243 
    244 /*
    245 * Macros for loading destination pixels into registers 'dst0' and 'dst1'.
    246 * Interleaving should be done somewhere else.
    247 */
    248 .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
    249 .endm
    250 
    251 .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
    252 .endm
    253 
    254 .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
    255 .if \numpix == 4
    256    vld1.32     {\dst0, \dst1}, [OUT]
    257 .elseif \numpix == 2
    258    vld1.32     {\dst0}, [OUT]
    259 .elseif \numpix == 1
    260    vld1.32     {\dst0[0]}, [OUT]
    261 .else
    262    .error bilinear_load_dst_8888 \numpix is unsupported
    263 .endif
    264    pld         [OUT, #(prefetch_offset * 4)]
    265 .endm
    266 
    267 .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
    268    bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
    269 .endm
    270 
    271 .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
    272    bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
    273 .endm
    274 
    275 .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
    276    bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
    277 .endm
    278 
    279 /*
    280 * Macros for duplicating a partially loaded mask to fill the entire register.
    281 * We will apply the mask to interleaved source pixels, that is
    282 *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
    283 *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
    284 * So we need to duplicate the loaded mask across the whole register.
    285 *
    286 * For the two-pixel case:
    287 *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
    288 *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
    289 * We can do some optimizations for this, including the last-pixel case.
    290 */
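
        /*
         * A scalar model of bilinear_duplicate_mask_8 for the two-pixel case
         * (illustration only): vdup.16 replicates the loaded (m0, m1) pair
         * into every 16-bit lane of the register, i.e.
         *
         *     uint8_t m[8];                      // lanes 0..1 hold m0, m1
         *     for (int i = 1; i < 4; i++)
         *         memcpy (m + 2 * i, m, 2);      // vdup.16 mask, mask[0]
         *     // m == { m0, m1, m0, m1, m0, m1, m0, m1 }
         */
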
    291 .macro bilinear_duplicate_mask_x numpix, mask
    292 .endm
    293 
    294 .macro bilinear_duplicate_mask_8 numpix, mask
    295 .if \numpix == 4
    296    vdup.32     \mask, \mask[0]
    297 .elseif \numpix == 2
    298    vdup.16     \mask, \mask[0]
    299 .elseif \numpix == 1
    300    vdup.8      \mask, \mask[0]
    301 .else
    302    .error bilinear_duplicate_mask_8 is unsupported
    303 .endif
    304 .endm
    305 
    306 .macro bilinear_duplicate_mask mask_fmt, numpix, mask
    307    bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
    308 .endm
    309 
    310 /*
    311 * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
    312 * Interleaving should be done when a mask is enabled or the operator is 'over'.
    313 */
    314 .macro bilinear_interleave src0, src1, dst0, dst1
    315    vuzp.8      \src0, \src1
    316    vuzp.8      \dst0, \dst1
    317    vuzp.8      \src0, \src1
    318    vuzp.8      \dst0, \dst1
    319 .endm
    320 
    321 .macro bilinear_interleave_src_dst_x_src \
    322                numpix, src0, src1, src01, dst0, dst1, dst01
    323 .endm
    324 
    325 .macro bilinear_interleave_src_dst_x_over \
    326                numpix, src0, src1, src01, dst0, dst1, dst01
    327 
    328    bilinear_interleave \src0, \src1, \dst0, \dst1
    329 .endm
    330 
    331 .macro bilinear_interleave_src_dst_x_add \
    332                numpix, src0, src1, src01, dst0, dst1, dst01
    333 .endm
    334 
    335 .macro bilinear_interleave_src_dst_8_src \
    336                numpix, src0, src1, src01, dst0, dst1, dst01
    337 
    338    bilinear_interleave \src0, \src1, \dst0, \dst1
    339 .endm
    340 
    341 .macro bilinear_interleave_src_dst_8_over \
    342                numpix, src0, src1, src01, dst0, dst1, dst01
    343 
    344    bilinear_interleave \src0, \src1, \dst0, \dst1
    345 .endm
    346 
    347 .macro bilinear_interleave_src_dst_8_add \
    348                numpix, src0, src1, src01, dst0, dst1, dst01
    349 
    350    bilinear_interleave \src0, \src1, \dst0, \dst1
    351 .endm
    352 
    353 .macro bilinear_interleave_src_dst \
    354                mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
    355 
    356    bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
    357                \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
    358 .endm
    359 
    360 
    361 /*
    362 * Macros for applying masks to src pixels (see the combine_mask_u() function).
    363 * src and dst should be in interleaved form.
    364 * The mask register should be in the form (m0, m1, m2, m3).
    365 */
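
        /*
         * The vmull/vrshr/vraddhn sequence below is the usual NEON idiom for
         * dividing an 8x8-bit product by 255 with rounding; per byte it
         * computes (C sketch, illustration only):
         *
         *     uint16_t x = s * m;                             // vmull.u8
         *     uint8_t  r = (x + ((x + 128) >> 8) + 128) >> 8; // vrshr.u16 + vraddhn.u16
         *
         * which closely approximates s * m / 255 with rounding.
         */
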
    366 .macro bilinear_apply_mask_to_src_x \
    367                numpix, src0, src1, src01, mask, \
    368                tmp01, tmp23, tmp45, tmp67
    369 .endm
    370 
    371 .macro bilinear_apply_mask_to_src_8 \
    372                numpix, src0, src1, src01, mask, \
    373                tmp01, tmp23, tmp45, tmp67
    374 
    375    vmull.u8        \tmp01, \src0, \mask
    376    vmull.u8        \tmp23, \src1, \mask
    377    /* bubbles */
    378    vrshr.u16       \tmp45, \tmp01, #8
    379    vrshr.u16       \tmp67, \tmp23, #8
    380    /* bubbles */
    381    vraddhn.u16     \src0, \tmp45, \tmp01
    382    vraddhn.u16     \src1, \tmp67, \tmp23
    383 .endm
    384 
    385 .macro bilinear_apply_mask_to_src \
    386                mask_fmt, numpix, src0, src1, src01, mask, \
    387                tmp01, tmp23, tmp45, tmp67
    388 
    389    bilinear_apply_mask_to_src_\()\mask_fmt \
    390                \numpix, \src0, \src1, \src01, \mask, \
    391                \tmp01, \tmp23, \tmp45, \tmp67
    392 .endm
    393 
    394 
    395 /*
    396 * Macros for combining src and destination pixels.
    397 * Whether to interleave depends on the operator 'op'.
    398 */
    399 .macro bilinear_combine_src \
    400                numpix, src0, src1, src01, dst0, dst1, dst01, \
    401                tmp01, tmp23, tmp45, tmp67, tmp8
    402 .endm
    403 
    404 .macro bilinear_combine_over \
    405                numpix, src0, src1, src01, dst0, dst1, dst01, \
    406                tmp01, tmp23, tmp45, tmp67, tmp8
    407 
    408    vdup.32     \tmp8, \src1[1]
    409    /* bubbles */
    410    vmvn.8      \tmp8, \tmp8
    411    /* bubbles */
    412    vmull.u8    \tmp01, \dst0, \tmp8
    413    /* bubbles */
    414    vmull.u8    \tmp23, \dst1, \tmp8
    415    /* bubbles */
    416    vrshr.u16   \tmp45, \tmp01, #8
    417    vrshr.u16   \tmp67, \tmp23, #8
    418    /* bubbles */
    419    vraddhn.u16 \dst0, \tmp45, \tmp01
    420    vraddhn.u16 \dst1, \tmp67, \tmp23
    421    /* bubbles */
    422    vqadd.u8    \src01, \dst01, \src01
    423 .endm
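
        /*
         * Scalar model of the 'over' combiner above, per destination byte,
         * where sa is the interpolated source alpha broadcast by vdup.32 and
         * inverted by vmvn.8 (C sketch, illustration only):
         *
         *     uint16_t x = d * (255 - sa);                    // vmull.u8
         *     d = (x + ((x + 128) >> 8) + 128) >> 8;          // vrshr + vraddhn
         *     d = (d + s > 255) ? 255 : d + s;                // vqadd.u8
         */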
    424 
    425 .macro bilinear_combine_add \
    426                numpix, src0, src1, src01, dst0, dst1, dst01, \
    427                tmp01, tmp23, tmp45, tmp67, tmp8
    428 
    429    vqadd.u8    \src01, \dst01, \src01
    430 .endm
    431 
    432 .macro bilinear_combine \
    433                op, numpix, src0, src1, src01, dst0, dst1, dst01, \
    434                tmp01, tmp23, tmp45, tmp67, tmp8
    435 
    436    bilinear_combine_\()\op \
    437                \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
    438                \tmp01, \tmp23, \tmp45, \tmp67, \tmp8
    439 .endm
    440 
    441 /*
    442 * Macros for final deinterleaving of destination pixels if needed.
    443 */
    444 .macro bilinear_deinterleave numpix, dst0, dst1, dst01
    445    vuzp.8      \dst0, \dst1
    446    /* bubbles */
    447    vuzp.8      \dst0, \dst1
    448 .endm
    449 
    450 .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
    451 .endm
    452 
    453 .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
    454    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
    455 .endm
    456 
    457 .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
    458 .endm
    459 
    460 .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
    461    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
    462 .endm
    463 
    464 .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
    465    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
    466 .endm
    467 
    468 .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
    469    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
    470 .endm
    471 
    472 .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
    473    bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
    474 .endm
    475 
    476 
    477 .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
    478    bilinear_load_\()\src_fmt d0, d1, d2
    479    bilinear_load_mask \mask_fmt, 1, d4
    480    bilinear_load_dst \dst_fmt, \op, 1, d18, d19, q9
    481    vmull.u8  q1, d0, d28
    482    vmlal.u8  q1, d1, d29
    483    /* 5 cycles bubble */
    484    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    485    vmlsl.u16 q0, d2, d30
    486    vmlal.u16 q0, d3, d30
    487    /* 5 cycles bubble */
    488    bilinear_duplicate_mask \mask_fmt, 1, d4
    489    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    490    /* 3 cycles bubble */
    491    vmovn.u16 d0, q0
    492    /* 1 cycle bubble */
    493    bilinear_interleave_src_dst \
    494                \mask_fmt, \op, 1, d0, d1, q0, d18, d19, q9
    495    bilinear_apply_mask_to_src \
    496                \mask_fmt, 1, d0, d1, q0, d4, \
    497                q3, q8, q10, q11
    498    bilinear_combine \
    499                \op, 1, d0, d1, q0, d18, d19, q9, \
    500                q3, q8, q10, q11, d5
    501    bilinear_deinterleave_dst \mask_fmt, \op, 1, d0, d1, q0
    502    bilinear_store_\()\dst_fmt 1, q2, q3
    503 .endm
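
        /*
         * The interpolation math above (shared by the two- and four-pixel
         * variants below), with B = BILINEAR_INTERPOLATION_BITS, vertical
         * weights supplied by the caller so that wt + wb == 1 << B, and
         * dx = (x & 0xffff) >> (16 - B), amounts to, per channel
         * (C sketch, illustration only):
         *
         *     uint32_t v0 = tl * wt + bl * wb;   // vmull.u8 + vmlal.u8
         *     uint32_t v1 = tr * wt + br * wb;
         *     uint32_t r  = (v0 * ((1 << B) - dx) + v1 * dx) >> (2 * B);
         *                                        // vshll/vmlsl/vmlal + vshrn
         */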
    504 
    505 .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
    506    bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
    507                q1, q11, d0, d1, d20, d21, d22, d23
    508    bilinear_load_mask \mask_fmt, 2, d4
    509    bilinear_load_dst \dst_fmt, \op, 2, d18, d19, q9
    510    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    511    vmlsl.u16 q0, d2, d30
    512    vmlal.u16 q0, d3, d30
    513    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    514    vmlsl.u16 q10, d22, d31
    515    vmlal.u16 q10, d23, d31
    516    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    517    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    518    bilinear_duplicate_mask \mask_fmt, 2, d4
    519    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    520    vadd.u16  q12, q12, q13
    521    vmovn.u16 d0, q0
    522    bilinear_interleave_src_dst \
    523                \mask_fmt, \op, 2, d0, d1, q0, d18, d19, q9
    524    bilinear_apply_mask_to_src \
    525                \mask_fmt, 2, d0, d1, q0, d4, \
    526                q3, q8, q10, q11
    527    bilinear_combine \
    528                \op, 2, d0, d1, q0, d18, d19, q9, \
    529                q3, q8, q10, q11, d5
    530    bilinear_deinterleave_dst \mask_fmt, \op, 2, d0, d1, q0
    531    bilinear_store_\()\dst_fmt 2, q2, q3
    532 .endm
    533 
    534 .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
    535    bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
    536                q1, q11, d0, d1, d20, d21, d22, d23 \
    537                q3, q9,  d4, d5, d16, d17, d18, d19
    538    pld       [TMP1, PF_OFFS]
    539    sub       TMP1, TMP1, STRIDE
    540    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    541    vmlsl.u16 q0, d2, d30
    542    vmlal.u16 q0, d3, d30
    543    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    544    vmlsl.u16 q10, d22, d31
    545    vmlal.u16 q10, d23, d31
    546    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    547    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
    548    vmlsl.u16 q2, d6, d30
    549    vmlal.u16 q2, d7, d30
    550    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
    551    bilinear_load_mask \mask_fmt, 4, d22
    552    bilinear_load_dst \dst_fmt, \op, 4, d2, d3, q1
    553    pld       [TMP1, PF_OFFS]
    554    vmlsl.u16 q8, d18, d31
    555    vmlal.u16 q8, d19, d31
    556    vadd.u16  q12, q12, q13
    557    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    558    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    559    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    560    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
    561    bilinear_duplicate_mask \mask_fmt, 4, d22
    562    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    563    vmovn.u16 d0, q0
    564    vmovn.u16 d1, q2
    565    vadd.u16  q12, q12, q13
    566    bilinear_interleave_src_dst \
    567                \mask_fmt, \op, 4, d0, d1, q0, d2, d3, q1
    568    bilinear_apply_mask_to_src \
    569                \mask_fmt, 4, d0, d1, q0, d22, \
    570                q3, q8, q9, q10
    571    bilinear_combine \
    572                \op, 4, d0, d1, q0, d2, d3, q1, \
    573                q3, q8, q9, q10, d23
    574    bilinear_deinterleave_dst \mask_fmt, \op, 4, d0, d1, q0
    575    bilinear_store_\()\dst_fmt 4, q2, q3
    576 .endm
    577 
    578 .set BILINEAR_FLAG_USE_MASK,		1
    579 .set BILINEAR_FLAG_USE_ALL_NEON_REGS,	2
    580 
    581 /*
    582 * Main template macro for generating NEON optimized bilinear scanline functions.
    583 *
    584 * The bilinear scanline generator macro takes the following arguments:
    585 *  fname			- name of the function to generate
    586 *  src_fmt			- source color format (8888 or 0565)
    587 *  dst_fmt			- destination color format (8888 or 0565)
    588 *  src/dst_bpp_shift		- (1 << bpp_shift) is the size of a src/dst pixel in bytes
    589 *  process_last_pixel		- code block that interpolates one pixel and does not
    590 *				  update the horizontal weight
    591 *  process_two_pixels		- code block that interpolates two pixels and updates
    592 *				  the horizontal weight
    593 *  process_four_pixels		- code block that interpolates four pixels and updates
    594 *				  the horizontal weight
    595 *  process_pixblock_head	- head part of middle loop
    596 *  process_pixblock_tail	- tail part of middle loop
    597 *  process_pixblock_tail_head	- tail_head of middle loop
    598 *  pixblock_size		- number of pixels processed in a single middle loop
    599 *  prefetch_distance		- prefetch the source image that many pixels ahead
    600 */
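
        /*
         * The generated function body reduces to roughly this control flow
         * (C-like sketch, illustration only):
         *
         *     // align OUT, one/two/(four) pixels at a time
         *     if (width >= 1 && unaligned_1) { process_last_pixel (); width -= 1; }
         *     if (width >= 2 && unaligned_2) { process_two_pixels (); width -= 2; }
         *     // software-pipelined middle loop
         *     if ((width -= pixblock_size) >= 0)
         *     {
         *         process_pixblock_head ();
         *         while ((width -= pixblock_size) >= 0)
         *             process_pixblock_tail_head ();
         *         process_pixblock_tail ();
         *     }
         *     // trailing pixels, testing the low bits of width
         *     if (pixblock_size == 8 && (width & 4)) process_four_pixels ();
         *     if (width & 2) process_two_pixels ();
         *     if (width & 1) process_last_pixel ();
         */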
    601 
    602 .macro generate_bilinear_scanline_func \
    603 fname, \
    604 src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
    605 bilinear_process_last_pixel, \
    606 bilinear_process_two_pixels, \
    607 bilinear_process_four_pixels, \
    608 bilinear_process_pixblock_head, \
    609 bilinear_process_pixblock_tail, \
    610 bilinear_process_pixblock_tail_head, \
    611 pixblock_size, \
    612 prefetch_distance, \
    613 flags
    614 
    615 pixman_asm_function \fname
    616 .if \pixblock_size == 8
    617 .elseif \pixblock_size == 4
    618 .else
    619    .error unsupported pixblock size
    620 .endif
    621 
    622 .if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
    623    OUT       .req    r0
    624    TOP       .req    r1
    625    BOTTOM    .req    r2
    626    WT        .req    r3
    627    WB        .req    r4
    628    X         .req    r5
    629    UX        .req    r6
    630    WIDTH     .req    ip
    631    TMP1      .req    r3
    632    TMP2      .req    r4
    633    PF_OFFS   .req    r7
    634    TMP3      .req    r8
    635    TMP4      .req    r9
    636    STRIDE    .req    r2
    637 
    638    mov		ip, sp
    639    push	{r4, r5, r6, r7, r8, r9}
    640    mov		PF_OFFS, #\prefetch_distance
    641    ldmia	ip, {WB, X, UX, WIDTH}
    642 .else
    643    OUT       .req      r0
    644    MASK      .req      r1
    645    TOP       .req      r2
    646    BOTTOM    .req      r3
    647    WT        .req      r4
    648    WB        .req      r5
    649    X         .req      r6
    650    UX        .req      r7
    651    WIDTH     .req      ip
    652    TMP1      .req      r4
    653    TMP2      .req      r5
    654    PF_OFFS   .req      r8
    655    TMP3      .req      r9
    656    TMP4      .req      r10
    657    STRIDE    .req      r3
    658 
    659    .set prefetch_offset, \prefetch_distance
    660 
    661    mov       ip, sp
    662    push      {r4, r5, r6, r7, r8, r9, r10, ip}
    663    mov       PF_OFFS, #\prefetch_distance
    664    ldmia     ip, {WT, WB, X, UX, WIDTH}
    665 .endif
    666 
    667    mul       PF_OFFS, PF_OFFS, UX
    668 
    669 .if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    670    vpush     {d8-d15}
    671 .endif
    672 
    673    sub	      STRIDE, BOTTOM, TOP
    674    .unreq    BOTTOM
    675 
    676    cmp       WIDTH, #0
    677    ble       3f
    678 
    679    vdup.u16  q12, X
    680    vdup.u16  q13, UX
    681    vdup.u8   d28, WT
    682    vdup.u8   d29, WB
    683    vadd.u16  d25, d25, d26
    684 
    685    /* ensure good destination alignment  */
    686    cmp       WIDTH, #1
    687    blt       0f
    688    tst       OUT, #(1 << \dst_bpp_shift)
    689    beq       0f
    690    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    691    vadd.u16  q12, q12, q13
    692    \bilinear_process_last_pixel
    693    sub       WIDTH, WIDTH, #1
    694 0:
    695    vadd.u16  q13, q13, q13
    696    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    697    vadd.u16  q12, q12, q13
    698 
    699    cmp       WIDTH, #2
    700    blt       0f
    701    tst       OUT, #(1 << (\dst_bpp_shift + 1))
    702    beq       0f
    703    \bilinear_process_two_pixels
    704    sub       WIDTH, WIDTH, #2
    705 0:
    706 .if \pixblock_size == 8
    707    cmp       WIDTH, #4
    708    blt       0f
    709    tst       OUT, #(1 << (\dst_bpp_shift + 2))
    710    beq       0f
    711    \bilinear_process_four_pixels
    712    sub       WIDTH, WIDTH, #4
    713 0:
    714 .endif
    715    subs      WIDTH, WIDTH, #\pixblock_size
    716    blt       1f
    717    mov       PF_OFFS, PF_OFFS, asr #(16 - \src_bpp_shift)
    718    \bilinear_process_pixblock_head
    719    subs      WIDTH, WIDTH, #\pixblock_size
    720    blt       5f
    721 0:
    722    \bilinear_process_pixblock_tail_head
    723    subs      WIDTH, WIDTH, #\pixblock_size
    724    bge       0b
    725 5:
    726    \bilinear_process_pixblock_tail
    727 1:
    728 .if \pixblock_size == 8
    729    tst       WIDTH, #4
    730    beq       2f
    731    \bilinear_process_four_pixels
    732 2:
    733 .endif
    734    /* handle the remaining trailing pixels */
    735    tst       WIDTH, #2
    736    beq       2f
    737    \bilinear_process_two_pixels
    738 2:
    739    tst       WIDTH, #1
    740    beq       3f
    741    \bilinear_process_last_pixel
    742 3:
    743 .if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    744    vpop      {d8-d15}
    745 .endif
    746 
    747 .if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
    748    pop       {r4, r5, r6, r7, r8, r9}
    749 .else
    750    pop       {r4, r5, r6, r7, r8, r9, r10, ip}
    751 .endif
    752    bx        lr
    753 
    754    .unreq    OUT
    755    .unreq    TOP
    756    .unreq    WT
    757    .unreq    WB
    758    .unreq    X
    759    .unreq    UX
    760    .unreq    WIDTH
    761    .unreq    TMP1
    762    .unreq    TMP2
    763    .unreq    PF_OFFS
    764    .unreq    TMP3
    765    .unreq    TMP4
    766    .unreq    STRIDE
    767 .if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
    768    .unreq    MASK
    769 .endif
    770 
    771 pixman_end_asm_function
    772 
    773 .endm
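
        /*
         * From the register assignments above, the generated functions can be
         * called from C with roughly these prototypes (inferred from the
         * code, shown for reference only):
         *
         *     void fname (dst_t *out, src_t *top, src_t *bottom,
         *                 int wt, int wb, pixman_fixed_t x, pixman_fixed_t ux,
         *                 int width);                       // no mask
         *
         *     void fname (dst_t *out, const uint8_t *mask,
         *                 src_t *top, src_t *bottom,
         *                 int wt, int wb, pixman_fixed_t x, pixman_fixed_t ux,
         *                 int width);                       // BILINEAR_FLAG_USE_MASK
         */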
    774 
    775 /* src_8888_8_8888 */
    776 .macro bilinear_src_8888_8_8888_process_last_pixel
    777    bilinear_interpolate_last_pixel 8888, 8, 8888, src
    778 .endm
    779 
    780 .macro bilinear_src_8888_8_8888_process_two_pixels
    781    bilinear_interpolate_two_pixels 8888, 8, 8888, src
    782 .endm
    783 
    784 .macro bilinear_src_8888_8_8888_process_four_pixels
    785    bilinear_interpolate_four_pixels 8888, 8, 8888, src
    786 .endm
    787 
    788 .macro bilinear_src_8888_8_8888_process_pixblock_head
    789    bilinear_src_8888_8_8888_process_four_pixels
    790 .endm
    791 
    792 .macro bilinear_src_8888_8_8888_process_pixblock_tail
    793 .endm
    794 
    795 .macro bilinear_src_8888_8_8888_process_pixblock_tail_head
    796    bilinear_src_8888_8_8888_process_pixblock_tail
    797    bilinear_src_8888_8_8888_process_pixblock_head
    798 .endm
    799 
    800 /* src_8888_8_0565 */
    801 .macro bilinear_src_8888_8_0565_process_last_pixel
    802    bilinear_interpolate_last_pixel 8888, 8, 0565, src
    803 .endm
    804 
    805 .macro bilinear_src_8888_8_0565_process_two_pixels
    806    bilinear_interpolate_two_pixels 8888, 8, 0565, src
    807 .endm
    808 
    809 .macro bilinear_src_8888_8_0565_process_four_pixels
    810    bilinear_interpolate_four_pixels 8888, 8, 0565, src
    811 .endm
    812 
    813 .macro bilinear_src_8888_8_0565_process_pixblock_head
    814    bilinear_src_8888_8_0565_process_four_pixels
    815 .endm
    816 
    817 .macro bilinear_src_8888_8_0565_process_pixblock_tail
    818 .endm
    819 
    820 .macro bilinear_src_8888_8_0565_process_pixblock_tail_head
    821    bilinear_src_8888_8_0565_process_pixblock_tail
    822    bilinear_src_8888_8_0565_process_pixblock_head
    823 .endm
    824 
    825 /* src_0565_8_x888 */
    826 .macro bilinear_src_0565_8_x888_process_last_pixel
    827    bilinear_interpolate_last_pixel 0565, 8, 8888, src
    828 .endm
    829 
    830 .macro bilinear_src_0565_8_x888_process_two_pixels
    831    bilinear_interpolate_two_pixels 0565, 8, 8888, src
    832 .endm
    833 
    834 .macro bilinear_src_0565_8_x888_process_four_pixels
    835    bilinear_interpolate_four_pixels 0565, 8, 8888, src
    836 .endm
    837 
    838 .macro bilinear_src_0565_8_x888_process_pixblock_head
    839    bilinear_src_0565_8_x888_process_four_pixels
    840 .endm
    841 
    842 .macro bilinear_src_0565_8_x888_process_pixblock_tail
    843 .endm
    844 
    845 .macro bilinear_src_0565_8_x888_process_pixblock_tail_head
    846    bilinear_src_0565_8_x888_process_pixblock_tail
    847    bilinear_src_0565_8_x888_process_pixblock_head
    848 .endm
    849 
    850 /* src_0565_8_0565 */
    851 .macro bilinear_src_0565_8_0565_process_last_pixel
    852    bilinear_interpolate_last_pixel 0565, 8, 0565, src
    853 .endm
    854 
    855 .macro bilinear_src_0565_8_0565_process_two_pixels
    856    bilinear_interpolate_two_pixels 0565, 8, 0565, src
    857 .endm
    858 
    859 .macro bilinear_src_0565_8_0565_process_four_pixels
    860    bilinear_interpolate_four_pixels 0565, 8, 0565, src
    861 .endm
    862 
    863 .macro bilinear_src_0565_8_0565_process_pixblock_head
    864    bilinear_src_0565_8_0565_process_four_pixels
    865 .endm
    866 
    867 .macro bilinear_src_0565_8_0565_process_pixblock_tail
    868 .endm
    869 
    870 .macro bilinear_src_0565_8_0565_process_pixblock_tail_head
    871    bilinear_src_0565_8_0565_process_pixblock_tail
    872    bilinear_src_0565_8_0565_process_pixblock_head
    873 .endm
    874 
    875 /* over_8888_8888 */
    876 .macro bilinear_over_8888_8888_process_last_pixel
    877    bilinear_interpolate_last_pixel 8888, x, 8888, over
    878 .endm
    879 
    880 .macro bilinear_over_8888_8888_process_two_pixels
    881    bilinear_interpolate_two_pixels 8888, x, 8888, over
    882 .endm
    883 
    884 .macro bilinear_over_8888_8888_process_four_pixels
    885    bilinear_interpolate_four_pixels 8888, x, 8888, over
    886 .endm
    887 
    888 .macro bilinear_over_8888_8888_process_pixblock_head
    889    mov         TMP1, X, asr #16
    890    add         X, X, UX
    891    add         TMP1, TOP, TMP1, asl #2
    892    mov         TMP2, X, asr #16
    893    add         X, X, UX
    894    add         TMP2, TOP, TMP2, asl #2
    895 
    896    vld1.32     {d22}, [TMP1], STRIDE
    897    vld1.32     {d23}, [TMP1]
    898    mov         TMP3, X, asr #16
    899    add         X, X, UX
    900    add         TMP3, TOP, TMP3, asl #2
    901    vmull.u8    q8, d22, d28
    902    vmlal.u8    q8, d23, d29
    903 
    904    vld1.32     {d22}, [TMP2], STRIDE
    905    vld1.32     {d23}, [TMP2]
    906    mov         TMP4, X, asr #16
    907    add         X, X, UX
    908    add         TMP4, TOP, TMP4, asl #2
    909    vmull.u8    q9, d22, d28
    910    vmlal.u8    q9, d23, d29
    911 
    912    vld1.32     {d22}, [TMP3], STRIDE
    913    vld1.32     {d23}, [TMP3]
    914    vmull.u8    q10, d22, d28
    915    vmlal.u8    q10, d23, d29
    916 
    917    vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS
    918    vmlsl.u16   q0, d16, d30
    919    vmlal.u16   q0, d17, d30
    920 
    921    pld         [TMP4, PF_OFFS]
    922    vld1.32     {d16}, [TMP4], STRIDE
    923    vld1.32     {d17}, [TMP4]
    924    pld         [TMP4, PF_OFFS]
    925    vmull.u8    q11, d16, d28
    926    vmlal.u8    q11, d17, d29
    927 
    928    vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS
    929    vmlsl.u16   q1, d18, d31
    930    vmlal.u16   q1, d19, d31
    931    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    932    vadd.u16    q12, q12, q13
    933 .endm
    934 
    935 .macro bilinear_over_8888_8888_process_pixblock_tail
    936    vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS
    937    vmlsl.u16   q2, d20, d30
    938    vmlal.u16   q2, d21, d30
    939    vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS
    940    vmlsl.u16   q3, d22, d31
    941    vmlal.u16   q3, d23, d31
    942    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    943    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    944    vld1.32     {d2, d3}, [OUT, :128]
    945    pld         [OUT, #(prefetch_offset * 4)]
    946    vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    947    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    948    vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    949    vmovn.u16   d6, q0
    950    vmovn.u16   d7, q2
    951    vuzp.8      d6, d7
    952    vuzp.8      d2, d3
    953    vuzp.8      d6, d7
    954    vuzp.8      d2, d3
    955    vdup.32     d4, d7[1]
    956    vmvn.8      d4, d4
    957    vmull.u8    q11, d2, d4
    958    vmull.u8    q2, d3, d4
    959    vrshr.u16   q1, q11, #8
    960    vrshr.u16   q10, q2, #8
    961    vraddhn.u16 d2, q1, q11
    962    vraddhn.u16 d3, q10, q2
    963    vqadd.u8    q3, q1, q3
    964    vuzp.8      d6, d7
    965    vuzp.8      d6, d7
    966    vadd.u16    q12, q12, q13
    967    vst1.32     {d6, d7}, [OUT, :128]!
    968 .endm
    969 
    970 .macro bilinear_over_8888_8888_process_pixblock_tail_head
    971                                            vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS
    972    mov         TMP1, X, asr #16
    973    add         X, X, UX
    974    add         TMP1, TOP, TMP1, asl #2
    975                                            vmlsl.u16   q2, d20, d30
    976    mov         TMP2, X, asr #16
    977    add         X, X, UX
    978    add         TMP2, TOP, TMP2, asl #2
    979                                            vmlal.u16   q2, d21, d30
    980                                            vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS
    981    vld1.32     {d20}, [TMP1], STRIDE
    982                                            vmlsl.u16   q3, d22, d31
    983                                            vmlal.u16   q3, d23, d31
    984    vld1.32     {d21}, [TMP1]
    985    vmull.u8    q8, d20, d28
    986    vmlal.u8    q8, d21, d29
    987                                            vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    988                                            vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    989                                            vld1.32     {d2, d3}, [OUT, :128]
    990                                            pld         [OUT, PF_OFFS]
    991                                            vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    992                                            vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    993    vld1.32     {d22}, [TMP2], STRIDE
    994                                            vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    995                                            vmovn.u16   d6, q0
    996    vld1.32     {d23}, [TMP2]
    997    vmull.u8    q9, d22, d28
    998    mov         TMP3, X, asr #16
    999    add         X, X, UX
   1000    add         TMP3, TOP, TMP3, asl #2
   1001    mov         TMP4, X, asr #16
   1002    add         X, X, UX
   1003    add         TMP4, TOP, TMP4, asl #2
   1004    vmlal.u8    q9, d23, d29
   1005                                            vmovn.u16   d7, q2
   1006    vld1.32     {d22}, [TMP3], STRIDE
   1007                                            vuzp.8      d6, d7
   1008                                            vuzp.8      d2, d3
   1009                                            vuzp.8      d6, d7
   1010                                            vuzp.8      d2, d3
   1011                                            vdup.32     d4, d7[1]
   1012    vld1.32     {d23}, [TMP3]
   1013                                            vmvn.8      d4, d4
   1014    vmull.u8    q10, d22, d28
   1015    vmlal.u8    q10, d23, d29
   1016                                            vmull.u8    q11, d2, d4
   1017                                            vmull.u8    q2, d3, d4
   1018    vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS
   1019    vmlsl.u16   q0, d16, d30
   1020                                            vrshr.u16   q1, q11, #8
   1021    vmlal.u16   q0, d17, d30
   1022                                            vrshr.u16   q8, q2, #8
   1023                                            vraddhn.u16 d2, q1, q11
   1024                                            vraddhn.u16 d3, q8, q2
   1025    pld         [TMP4, PF_OFFS]
   1026    vld1.32     {d16}, [TMP4], STRIDE
   1027                                            vqadd.u8    q3, q1, q3
   1028    vld1.32     {d17}, [TMP4]
   1029    pld         [TMP4, PF_OFFS]
   1030    vmull.u8    q11, d16, d28
   1031    vmlal.u8    q11, d17, d29
   1032                                            vuzp.8      d6, d7
   1033    vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS
   1034                                            vuzp.8      d6, d7
   1035    vmlsl.u16   q1, d18, d31
   1036                                            vadd.u16    q12, q12, q13
   1037    vmlal.u16   q1, d19, d31
   1038    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   1039    vadd.u16    q12, q12, q13
   1040                                            vst1.32     {d6, d7}, [OUT, :128]!
   1041 .endm
   1042 
   1043 /* over_8888_8_8888 */
   1044 .macro bilinear_over_8888_8_8888_process_last_pixel
   1045    bilinear_interpolate_last_pixel 8888, 8, 8888, over
   1046 .endm
   1047 
   1048 .macro bilinear_over_8888_8_8888_process_two_pixels
   1049    bilinear_interpolate_two_pixels 8888, 8, 8888, over
   1050 .endm
   1051 
   1052 .macro bilinear_over_8888_8_8888_process_four_pixels
   1053    bilinear_interpolate_four_pixels 8888, 8, 8888, over
   1054 .endm
   1055 
   1056 .macro bilinear_over_8888_8_8888_process_pixblock_head
   1057    mov         TMP1, X, asr #16
   1058    add         X, X, UX
   1059    add         TMP1, TOP, TMP1, asl #2
   1060    vld1.32     {d0}, [TMP1], STRIDE
   1061    mov         TMP2, X, asr #16
   1062    add         X, X, UX
   1063    add         TMP2, TOP, TMP2, asl #2
   1064    vld1.32     {d1}, [TMP1]
   1065    mov         TMP3, X, asr #16
   1066    add         X, X, UX
   1067    add         TMP3, TOP, TMP3, asl #2
   1068    vld1.32     {d2}, [TMP2], STRIDE
   1069    mov         TMP4, X, asr #16
   1070    add         X, X, UX
   1071    add         TMP4, TOP, TMP4, asl #2
   1072    vld1.32     {d3}, [TMP2]
   1073    vmull.u8    q2, d0, d28
   1074    vmull.u8    q3, d2, d28
   1075    vmlal.u8    q2, d1, d29
   1076    vmlal.u8    q3, d3, d29
   1077    vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS
   1078    vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS
   1079    vmlsl.u16   q0, d4, d30
   1080    vmlsl.u16   q1, d6, d31
   1081    vmlal.u16   q0, d5, d30
   1082    vmlal.u16   q1, d7, d31
   1083    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   1084    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   1085    vld1.32     {d2}, [TMP3], STRIDE
   1086    vld1.32     {d3}, [TMP3]
   1087    pld         [TMP4, PF_OFFS]
   1088    vld1.32     {d4}, [TMP4], STRIDE
   1089    vld1.32     {d5}, [TMP4]
   1090    pld         [TMP4, PF_OFFS]
   1091    vmull.u8    q3, d2, d28
   1092    vmlal.u8    q3, d3, d29
   1093    vmull.u8    q1, d4, d28
   1094    vmlal.u8    q1, d5, d29
   1095    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   1096    vld1.32     {d22[0]}, [MASK]!
   1097    pld         [MASK, #prefetch_offset]
   1098    vadd.u16    q12, q12, q13
   1099    vmovn.u16   d16, q0
   1100 .endm
   1101 
   1102 .macro bilinear_over_8888_8_8888_process_pixblock_tail
   1103    vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS
   1104    vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS
   1105    vmlsl.u16   q9, d6, d30
   1106    vmlsl.u16   q10, d2, d31
   1107    vmlal.u16   q9, d7, d30
   1108    vmlal.u16   q10, d3, d31
   1109    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   1110    vadd.u16    q12, q12, q13
   1111    vdup.32     d22, d22[0]
   1112    vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
   1113    vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
   1114    vmovn.u16   d17, q9
   1115    vld1.32     {d18, d19}, [OUT, :128]
   1116    pld         [OUT, PF_OFFS]
   1117    vuzp.8      d16, d17
   1118    vuzp.8      d18, d19
   1119    vuzp.8      d16, d17
   1120    vuzp.8      d18, d19
   1121    vmull.u8    q10, d16, d22
   1122    vmull.u8    q11, d17, d22
   1123    vrsra.u16   q10, q10, #8
   1124    vrsra.u16   q11, q11, #8
   1125    vrshrn.u16  d16, q10, #8
   1126    vrshrn.u16  d17, q11, #8
   1127    vdup.32     d22, d17[1]
   1128    vmvn.8      d22, d22
   1129    vmull.u8    q10, d18, d22
   1130    vmull.u8    q11, d19, d22
   1131    vrshr.u16   q9, q10, #8
   1132    vrshr.u16   q0, q11, #8
   1133    vraddhn.u16 d18, q9, q10
   1134    vraddhn.u16 d19, q0, q11
   1135    vqadd.u8    q9, q8, q9
   1136    vuzp.8      d18, d19
   1137    vuzp.8      d18, d19
   1138    vst1.32     {d18, d19}, [OUT, :128]!
   1139 .endm
   1140 
   1141 .macro bilinear_over_8888_8_8888_process_pixblock_tail_head
   1142                                            vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS
   1143    mov         TMP1, X, asr #16
   1144    add         X, X, UX
   1145    add         TMP1, TOP, TMP1, asl #2
   1146                                            vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS
   1147    vld1.32     {d0}, [TMP1], STRIDE
   1148    mov         TMP2, X, asr #16
   1149    add         X, X, UX
   1150    add         TMP2, TOP, TMP2, asl #2
   1151                                            vmlsl.u16   q9, d6, d30
   1152                                            vmlsl.u16   q10, d2, d31
   1153    vld1.32     {d1}, [TMP1]
   1154    mov         TMP3, X, asr #16
   1155    add         X, X, UX
   1156    add         TMP3, TOP, TMP3, asl #2
   1157                                            vmlal.u16   q9, d7, d30
   1158                                            vmlal.u16   q10, d3, d31
   1159    vld1.32     {d2}, [TMP2], STRIDE
   1160    mov         TMP4, X, asr #16
   1161    add         X, X, UX
   1162    add         TMP4, TOP, TMP4, asl #2
   1163                                            vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   1164                                            vadd.u16    q12, q12, q13
   1165    vld1.32     {d3}, [TMP2]
   1166                                            vdup.32     d22, d22[0]
   1167                                            vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
   1168                                            vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
   1169    vmull.u8    q2, d0, d28
   1170    vmull.u8    q3, d2, d28
   1171                                            vmovn.u16   d17, q9
   1172                                            vld1.32     {d18, d19}, [OUT, :128]
   1173                                            pld         [OUT, #(prefetch_offset * 4)]
   1174    vmlal.u8    q2, d1, d29
   1175    vmlal.u8    q3, d3, d29
   1176                                            vuzp.8      d16, d17
   1177                                            vuzp.8      d18, d19
   1178    vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS
   1179    vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS
   1180                                            vuzp.8      d16, d17
   1181                                            vuzp.8      d18, d19
   1182    vmlsl.u16   q0, d4, d30
   1183    vmlsl.u16   q1, d6, d31
   1184                                            vmull.u8    q10, d16, d22
   1185                                            vmull.u8    q11, d17, d22
   1186    vmlal.u16   q0, d5, d30
   1187    vmlal.u16   q1, d7, d31
   1188                                            vrsra.u16   q10, q10, #8
   1189                                            vrsra.u16   q11, q11, #8
   1190    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   1191    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   1192                                            vrshrn.u16  d16, q10, #8
   1193                                            vrshrn.u16  d17, q11, #8
   1194    vld1.32     {d2}, [TMP3], STRIDE
   1195                                            vdup.32     d22, d17[1]
   1196    vld1.32     {d3}, [TMP3]
   1197                                            vmvn.8      d22, d22
   1198    pld         [TMP4, PF_OFFS]
   1199    vld1.32     {d4}, [TMP4], STRIDE
   1200                                            vmull.u8    q10, d18, d22
   1201                                            vmull.u8    q11, d19, d22
   1202    vld1.32     {d5}, [TMP4]
   1203    pld         [TMP4, PF_OFFS]
   1204    vmull.u8    q3, d2, d28
   1205                                            vrshr.u16   q9, q10, #8
   1206                                            vrshr.u16   q15, q11, #8
   1207    vmlal.u8    q3, d3, d29
   1208    vmull.u8    q1, d4, d28
   1209                                            vraddhn.u16 d18, q9, q10
   1210                                            vraddhn.u16 d19, q15, q11
   1211    vmlal.u8    q1, d5, d29
   1212    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   1213                                            vqadd.u8    q9, q8, q9
   1214    vld1.32     {d22[0]}, [MASK]!
   1215                                            vuzp.8      d18, d19
   1216    vadd.u16    q12, q12, q13
   1217                                            vuzp.8      d18, d19
   1218    vmovn.u16   d16, q0
   1219                                            vst1.32     {d18, d19}, [OUT, :128]!
   1220 .endm
   1221 
   1222 /* add_8888_8888 */
   1223 .macro bilinear_add_8888_8888_process_last_pixel
   1224    bilinear_interpolate_last_pixel 8888, x, 8888, add
   1225 .endm
   1226 
   1227 .macro bilinear_add_8888_8888_process_two_pixels
   1228    bilinear_interpolate_two_pixels 8888, x, 8888, add
   1229 .endm
   1230 
   1231 .macro bilinear_add_8888_8888_process_four_pixels
   1232    bilinear_interpolate_four_pixels 8888, x, 8888, add
   1233 .endm
   1234 
   1235 .macro bilinear_add_8888_8888_process_pixblock_head
   1236    bilinear_add_8888_8888_process_four_pixels
   1237 .endm
   1238 
   1239 .macro bilinear_add_8888_8888_process_pixblock_tail
   1240 .endm
   1241 
   1242 .macro bilinear_add_8888_8888_process_pixblock_tail_head
   1243    bilinear_add_8888_8888_process_pixblock_tail
   1244    bilinear_add_8888_8888_process_pixblock_head
   1245 .endm
   1246 
   1247 /* add_8888_8_8888 */
   1248 .macro bilinear_add_8888_8_8888_process_last_pixel
   1249    bilinear_interpolate_last_pixel 8888, 8, 8888, add
   1250 .endm
   1251 
   1252 .macro bilinear_add_8888_8_8888_process_two_pixels
   1253    bilinear_interpolate_two_pixels 8888, 8, 8888, add
   1254 .endm
   1255 
   1256 .macro bilinear_add_8888_8_8888_process_four_pixels
   1257    bilinear_interpolate_four_pixels 8888, 8, 8888, add
   1258 .endm
   1259 
   1260 .macro bilinear_add_8888_8_8888_process_pixblock_head
   1261    bilinear_add_8888_8_8888_process_four_pixels
   1262 .endm
   1263 
   1264 .macro bilinear_add_8888_8_8888_process_pixblock_tail
   1265 .endm
   1266 
   1267 .macro bilinear_add_8888_8_8888_process_pixblock_tail_head
   1268    bilinear_add_8888_8_8888_process_pixblock_tail
   1269    bilinear_add_8888_8_8888_process_pixblock_head
   1270 .endm
   1271 
   1272 
   1273 /* Bilinear scanline functions */
   1274 generate_bilinear_scanline_func \
   1275    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
   1276    8888, 8888, 2, 2, \
   1277    bilinear_src_8888_8_8888_process_last_pixel, \
   1278    bilinear_src_8888_8_8888_process_two_pixels, \
   1279    bilinear_src_8888_8_8888_process_four_pixels, \
   1280    bilinear_src_8888_8_8888_process_pixblock_head, \
   1281    bilinear_src_8888_8_8888_process_pixblock_tail, \
   1282    bilinear_src_8888_8_8888_process_pixblock_tail_head, \
   1283    4, 28, BILINEAR_FLAG_USE_MASK
   1284 
   1285 generate_bilinear_scanline_func \
   1286    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
   1287    8888, 0565, 2, 1, \
   1288    bilinear_src_8888_8_0565_process_last_pixel, \
   1289    bilinear_src_8888_8_0565_process_two_pixels, \
   1290    bilinear_src_8888_8_0565_process_four_pixels, \
   1291    bilinear_src_8888_8_0565_process_pixblock_head, \
   1292    bilinear_src_8888_8_0565_process_pixblock_tail, \
   1293    bilinear_src_8888_8_0565_process_pixblock_tail_head, \
   1294    4, 28, BILINEAR_FLAG_USE_MASK
   1295 
   1296 generate_bilinear_scanline_func \
   1297    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
   1298    0565, 8888, 1, 2, \
   1299    bilinear_src_0565_8_x888_process_last_pixel, \
   1300    bilinear_src_0565_8_x888_process_two_pixels, \
   1301    bilinear_src_0565_8_x888_process_four_pixels, \
   1302    bilinear_src_0565_8_x888_process_pixblock_head, \
   1303    bilinear_src_0565_8_x888_process_pixblock_tail, \
   1304    bilinear_src_0565_8_x888_process_pixblock_tail_head, \
   1305    4, 28, BILINEAR_FLAG_USE_MASK
   1306 
   1307 generate_bilinear_scanline_func \
   1308    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
   1309    0565, 0565, 1, 1, \
   1310    bilinear_src_0565_8_0565_process_last_pixel, \
   1311    bilinear_src_0565_8_0565_process_two_pixels, \
   1312    bilinear_src_0565_8_0565_process_four_pixels, \
   1313    bilinear_src_0565_8_0565_process_pixblock_head, \
   1314    bilinear_src_0565_8_0565_process_pixblock_tail, \
   1315    bilinear_src_0565_8_0565_process_pixblock_tail_head, \
   1316    4, 28, BILINEAR_FLAG_USE_MASK
   1317 
   1318 generate_bilinear_scanline_func \
   1319    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
   1320    8888, 8888, 2, 2, \
   1321    bilinear_over_8888_8888_process_last_pixel, \
   1322    bilinear_over_8888_8888_process_two_pixels, \
   1323    bilinear_over_8888_8888_process_four_pixels, \
   1324    bilinear_over_8888_8888_process_pixblock_head, \
   1325    bilinear_over_8888_8888_process_pixblock_tail, \
   1326    bilinear_over_8888_8888_process_pixblock_tail_head, \
   1327    4, 28, 0
   1328 
   1329 generate_bilinear_scanline_func \
   1330    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
   1331    8888, 8888, 2, 2, \
   1332    bilinear_over_8888_8_8888_process_last_pixel, \
   1333    bilinear_over_8888_8_8888_process_two_pixels, \
   1334    bilinear_over_8888_8_8888_process_four_pixels, \
   1335    bilinear_over_8888_8_8888_process_pixblock_head, \
   1336    bilinear_over_8888_8_8888_process_pixblock_tail, \
   1337    bilinear_over_8888_8_8888_process_pixblock_tail_head, \
   1338    4, 28, BILINEAR_FLAG_USE_MASK
   1339 
   1340 generate_bilinear_scanline_func \
   1341    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
   1342    8888, 8888, 2, 2, \
   1343    bilinear_add_8888_8888_process_last_pixel, \
   1344    bilinear_add_8888_8888_process_two_pixels, \
   1345    bilinear_add_8888_8888_process_four_pixels, \
   1346    bilinear_add_8888_8888_process_pixblock_head, \
   1347    bilinear_add_8888_8888_process_pixblock_tail, \
   1348    bilinear_add_8888_8888_process_pixblock_tail_head, \
   1349    4, 28, 0
   1350 
   1351 generate_bilinear_scanline_func \
   1352    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
   1353    8888, 8888, 2, 2, \
   1354    bilinear_add_8888_8_8888_process_last_pixel, \
   1355    bilinear_add_8888_8_8888_process_two_pixels, \
   1356    bilinear_add_8888_8_8888_process_four_pixels, \
   1357    bilinear_add_8888_8_8888_process_pixblock_head, \
   1358    bilinear_add_8888_8_8888_process_pixblock_tail, \
   1359    bilinear_add_8888_8_8888_process_pixblock_tail_head, \
   1360    4, 28, BILINEAR_FLAG_USE_MASK