tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

pixman-arma64-neon-asm-bilinear.S (44741B)


      1 /*
      2 * Copyright © 2011 SCore Corporation
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     21 * DEALINGS IN THE SOFTWARE.
     22 *
     23 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
     24 * Author:  Taekyun Kim (tkq.kim@samsung.com)
     25 */
     26 
     27 /*
     28 * This file contains scaled bilinear scanline functions implemented
     29 * using Siarhei's older bilinear macro template.
     30 *
     31 * << General scanline function procedures >>
     32 *  1. bilinear interpolate source pixels
     33 *  2. load mask pixels
     34 *  3. load destination pixels
     35 *  4. duplicate mask to fill whole register
     36 *  5. interleave source & destination pixels
     37 *  6. apply mask to source pixels
     38 *  7. combine source & destination pixels
     39 *  8. Deinterleave final result
     40 *  9. store destination pixels
     41 *
     42 * All registers with a single number (i.e. src0, tmp0) are 64-bit registers.
     43 * Registers with double numbers (src01, dst01) are 128-bit registers.
     44 * All temp registers can be used freely outside the code block.
     45 * Assume that the symbols (register .req) OUT and MASK are defined at the caller of these macro blocks.
     46 *
     47 * Remarks
     48 *  There can be lots of pipeline stalls inside code block and between code blocks.
     49 *  Further optimizations will be done by new macro templates using head/tail_head/tail scheme.
     50 */
     51 
     52 /* Prevent the stack from becoming executable for no reason... */
     53 #if defined(__linux__) && defined (__ELF__)
     54 .section .note.GNU-stack,"",%progbits
     55 #endif
     56 
     57 .text
     58 .arch armv8-a
     59 .altmacro
     60 .p2align 2
     61 
     62 #include "pixman-private.h"
     63 #include "pixman-arm-asm.h"
     64 #include "pixman-arma64-neon-asm.h"
     65 
     66 /*
     67 * Bilinear macros from pixman-arm-neon-asm.S
     68 */
     69 
     70 /*
     71 * Bilinear scaling support code which tries to provide pixel fetching, color
     72 * format conversion, and interpolation as separate macros which can be used
     73 * as the basic building blocks for constructing bilinear scanline functions.
     74 */
     75 
      76 .macro bilinear_load_8888 reg1, reg2, tmp
         /* Fetch one bilinear source pixel pair in a8r8g8b8 format:
          * X is a 16.16 fixed-point x coordinate, so 'asr #16' yields the
          * integer pixel index; X is then advanced by UX for the next pixel.
          * reg1 <- two adjacent top-row pixels, reg2 <- two adjacent
          * bottom-row pixels (STRIDE = BOTTOM - TOP, see function setup).
          * 'tmp' is unused here (kept for interface parity with the 0565
          * loader). */
      77    asr       WTMP1, X, #16
      78    add       X, X, UX
      79    add       TMP1, TOP, TMP1, lsl #2
      80    ld1       {\()\reg1\().2s}, [TMP1], STRIDE
      81    ld1       {\()\reg2\().2s}, [TMP1]
      82 .endm
     83 
      84 .macro bilinear_load_0565 reg1, reg2, tmp
         /* Fetch one bilinear source pixel pair in r5g6b5 format.
          * Each 32-bit lane load grabs two adjacent 16-bit pixels:
          * lane 0 <- top-row pair, lane 1 <- bottom-row pair.
          * The four 0565 pixels are then expanded to packed x888 across
          * reg1/reg2 (convert_four_0565_to_x888_packed is defined in the
          * included pixman-arma64-neon-asm.h). */
      85    asr       WTMP1, X, #16
      86    add       X, X, UX
      87    add       TMP1, TOP, TMP1, lsl #1
      88    ld1       {\()\reg2\().s}[0], [TMP1], STRIDE
      89    ld1       {\()\reg2\().s}[1], [TMP1]
      90    convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
      91 .endm
     92 
      93 .macro bilinear_load_and_vertical_interpolate_two_8888 \
      94                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
         /* Load two a8r8g8b8 pixel pairs and vertically interpolate each:
          *   accN.8h = top.8b * v28 + bottom.8b * v29  (widening)
          * where v28/v29 hold the replicated top/bottom row weights
          * (set up from WT/WB in the scanline function prologue).
          * tmp1/tmp2 are forwarded but unused by bilinear_load_8888. */
      96    bilinear_load_8888 \reg1, \reg2, \tmp1
      97    umull     \()\acc1\().8h, \()\reg1\().8b, v28.8b
      98    umlal     \()\acc1\().8h, \()\reg2\().8b, v29.8b
      99    bilinear_load_8888 \reg3, \reg4, \tmp2
     100    umull     \()\acc2\().8h, \()\reg3\().8b, v28.8b
     101    umlal     \()\acc2\().8h, \()\reg4\().8b, v29.8b
     102 .endm
    103 
    104 .macro bilinear_load_and_vertical_interpolate_four_8888 \
    105                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
    106                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
    107 
    108    bilinear_load_and_vertical_interpolate_two_8888 \
    109                \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, xacc2hi
    110    bilinear_load_and_vertical_interpolate_two_8888 \
    111                \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
    112 .endm
    113 
     114 .macro vzip reg1, reg2
         /* Emulate the AArch32 two-register in-place VZIP on AArch64:
          * interleave the elements of reg1/reg2, results back in place.
          * NOTE: clobbers v24 as scratch. */
     115    zip1      v24.8b, \reg1, \reg2
     116    zip2      \reg2,  \reg1, \reg2
     117    mov       \reg1,  v24.8b
     118 .endm
     119 
     120 .macro vuzp reg1, reg2
         /* Emulate the AArch32 two-register in-place VUZP on AArch64:
          * de-interleave reg1/reg2 (even elements -> reg1, odd -> reg2).
          * NOTE: clobbers v24 as scratch. */
     121    uzp1     v24.8b, \reg1, \reg2
     122    uzp2     \reg2,  \reg1, \reg2
     123    mov      \reg1,  v24.8b
     124 .endm
    125 
     126 .macro bilinear_load_and_vertical_interpolate_two_0565 \
     127                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
         /* Load two r5g6b5 pixel pairs and vertically interpolate them.
          * Each 32-bit lane load fetches two adjacent 16-bit pixels:
          * lanes 0/2 <- top-row pairs, lanes 1/3 <- bottom-row pairs.
          * After expansion to x888 the vzip sequence rearranges the bytes
          * so that top-row pixels end up in reg1/reg2 and bottom-row
          * pixels in reg3/reg4 as expected by the umull/umlal weighting
          * with v28 (top weight) and v29 (bottom weight). */
     128    asr       WTMP1, X, #16
     129    add       X, X, UX
     130    add       TMP1, TOP, TMP1, lsl #1
     131    asr       WTMP2, X, #16
     132    add       X, X, UX
     133    add       TMP2, TOP, TMP2, lsl #1
     134    ld1       {\()\acc2\().s}[0], [TMP1], STRIDE
     135    ld1       {\()\acc2\().s}[2], [TMP2], STRIDE
     136    ld1       {\()\acc2\().s}[1], [TMP1]
     137    ld1       {\()\acc2\().s}[3], [TMP2]
     138    convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
     139    vzip      \()\reg1\().8b, \()\reg3\().8b
     140    vzip      \()\reg2\().8b, \()\reg4\().8b
     141    vzip      \()\reg3\().8b, \()\reg4\().8b
     142    vzip      \()\reg1\().8b, \()\reg2\().8b
     143    umull     \()\acc1\().8h, \()\reg1\().8b, v28.8b
     144    umlal     \()\acc1\().8h, \()\reg2\().8b, v29.8b
     145    umull     \()\acc2\().8h, \()\reg3\().8b, v28.8b
     146    umlal     \()\acc2\().8h, \()\reg4\().8b, v29.8b
     147 .endm
    148 
     149 .macro bilinear_load_and_vertical_interpolate_four_0565 \
     150                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
     151                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
         /* Four-pixel-pair version of the 0565 load + vertical interpolate.
          * Logically this is the two-pair macro applied twice (x group,
          * then y group), but the loads/vzips of the second group are
          * hand-interleaved with the multiplies of the first group to
          * hide load and permute latency — do not reorder. */
     153    asr       WTMP1, X, #16
     154    add       X, X, UX
     155    add       TMP1, TOP, TMP1, lsl #1
     156    asr       WTMP2, X, #16
     157    add       X, X, UX
     158    add       TMP2, TOP, TMP2, lsl #1
     159    ld1       {\()\xacc2\().s}[0], [TMP1], STRIDE
     160    ld1       {\()\xacc2\().s}[2], [TMP2], STRIDE
     161    ld1       {\()\xacc2\().s}[1], [TMP1]
     162    ld1       {\()\xacc2\().s}[3], [TMP2]
     163    convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
     164    asr       WTMP1, X, #16
     165    add       X, X, UX
     166    add       TMP1, TOP, TMP1, lsl #1
     167    asr       WTMP2, X, #16
     168    add       X, X, UX
     169    add       TMP2, TOP, TMP2, lsl #1
     170    ld1       {\()\yacc2\().s}[0], [TMP1], STRIDE
     171    vzip      \()\xreg1\().8b, \()\xreg3\().8b
     172    ld1       {\()\yacc2\().s}[2], [TMP2], STRIDE
     173    vzip      \()\xreg2\().8b, \()\xreg4\().8b
     174    ld1       {\()\yacc2\().s}[1], [TMP1]
     175    vzip      \()\xreg3\().8b, \()\xreg4\().8b
     176    ld1       {\()\yacc2\().s}[3], [TMP2]
     177    vzip      \()\xreg1\().8b, \()\xreg2\().8b
     178    convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
     179    umull     \()\xacc1\().8h, \()\xreg1\().8b, v28.8b
     180    vzip      \()\yreg1\().8b, \()\yreg3\().8b
     181    umlal     \()\xacc1\().8h, \()\xreg2\().8b, v29.8b
     182    vzip      \()\yreg2\().8b, \()\yreg4\().8b
     183    umull     \()\xacc2\().8h, \()\xreg3\().8b, v28.8b
     184    vzip      \()\yreg3\().8b, \()\yreg4\().8b
     185    umlal     \()\xacc2\().8h, \()\xreg4\().8b, v29.8b
     186    vzip      \()\yreg1\().8b, \()\yreg2\().8b
     187    umull     \()\yacc1\().8h, \()\yreg1\().8b, v28.8b
     188    umlal     \()\yacc1\().8h, \()\yreg2\().8b, v29.8b
     189    umull     \()\yacc2\().8h, \()\yreg3\().8b, v28.8b
     190    umlal     \()\yacc2\().8h, \()\yreg4\().8b, v29.8b
     191 .endm
    192 
     193 .macro bilinear_store_8888 numpix, tmp1, tmp2
         /* Store numpix (4/2/1) final a8r8g8b8 pixels from v0/v1 and
          * advance OUT accordingly.  tmp1/tmp2 are unused (interface
          * parity with bilinear_store_0565). */
     194 .if \numpix == 4
     195    st1       {v0.2s, v1.2s}, [OUT], #16
     196 .elseif \numpix == 2
     197    st1       {v0.2s}, [OUT], #8
     198 .elseif \numpix == 1
     199    st1       {v0.s}[0], [OUT], #4
     200 .else
     201    .error bilinear_store_8888 \numpix is unsupported
     202 .endif
     203 .endm
    204 
     205 .macro bilinear_store_0565 numpix, tmp1, tmp2
         /* Store numpix (4/2/1) final pixels in r5g6b5 format.
          * The vuzp sequence rearranges the planar channel layout in
          * v0..v3 back into per-pixel order before packing to 0565 in v1
          * (convert_8888_to_0565 uses tmp1/tmp2 as scratch). */
     206    vuzp    v0.8b, v1.8b
     207    vuzp    v2.8b, v3.8b
     208    vuzp    v1.8b, v3.8b
     209    vuzp    v0.8b, v2.8b
     210    convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2
     211 .if \numpix == 4
     212    st1       {v1.4h}, [OUT], #8
     213 .elseif \numpix == 2
     214    st1       {v1.s}[0], [OUT], #4
     215 .elseif \numpix == 1
     216    st1       {v1.h}[0], [OUT], #2
     217 .else
     218    .error bilinear_store_0565 \numpix is unsupported
     219 .endif
     220 .endm
    221 
    222 
    223 /*
    224 * Macros for loading mask pixels into register 'mask'.
    225 * dup must be done in somewhere else.
    226 */
     227 .macro bilinear_load_mask_x numpix, mask
         /* No-op: the 'x' (no-mask) variant has no mask to load. */
     228 .endm
     229 
     230 .macro bilinear_load_mask_8 numpix, mask
         /* Load numpix (4/2/1) a8 mask bytes into the low lanes of 'mask'
          * and advance MASK; duplication across the register happens later
          * in bilinear_duplicate_mask.  Also prefetch ahead in the mask
          * scanline (prefetch_offset is set from prefetch_distance). */
     231 .if \numpix == 4
     232    ld1         {\()\mask\().s}[0], [MASK], #4
     233 .elseif \numpix == 2
     234    ld1         {\()\mask\().h}[0], [MASK], #2
     235 .elseif \numpix == 1
     236    ld1         {\()\mask\().b}[0], [MASK], #1
     237 .else
     238    .error bilinear_load_mask_8 \numpix is unsupported
     239 .endif
     240    prfum       PREFETCH_MODE, [MASK, #(prefetch_offset)]
     241 .endm
     242 
     243 .macro bilinear_load_mask mask_fmt, numpix, mask
         /* Dispatch on mask format: expands to bilinear_load_mask_x or
          * bilinear_load_mask_8. */
     244    bilinear_load_mask_\mask_fmt \numpix, \mask
     245 .endm
    246 
    247 
    248 /*
    249 * Macros for loading destination pixels into register 'dst0' and 'dst1'.
    250 * Interleave should be done somewhere else.
    251 */
     252 .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
         /* No-op: the 'src' operator overwrites destination, no load needed. */
     253 .endm
     254 
     255 .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
         /* No-op: the 'src' operator overwrites destination, no load needed. */
     256 .endm
     257 
     258 .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
         /* Load numpix (4/2/1) a8r8g8b8 destination pixels into dst0/dst1
          * (OUT is not advanced — the store does that), mirror them into
          * the 128-bit dst01, and prefetch ahead in the destination. */
     259 .if \numpix == 4
     260    ld1         {\()\dst0\().2s, \()\dst1\().2s}, [OUT]
     261 .elseif \numpix == 2
     262    ld1         {\()\dst0\().2s}, [OUT]
     263 .elseif \numpix == 1
     264    ld1         {\()\dst0\().s}[0], [OUT]
     265 .else
     266    .error bilinear_load_dst_8888 \numpix is unsupported
     267 .endif
     268    mov         \()\dst01\().d[0], \()\dst0\().d[0]
     269    mov         \()\dst01\().d[1], \()\dst1\().d[0]
     270    prfm        PREFETCH_MODE, [OUT, #(prefetch_offset * 4)]
     271 .endm
     272 
     273 .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
         /* 'over' needs the current destination pixels. */
     274    bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
     275 .endm
     276 
     277 .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
         /* 'add' needs the current destination pixels. */
     278    bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
     279 .endm
     280 
     281 .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
         /* Dispatch on destination format and operator. */
     282    bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
     283 .endm
    284 
    285 /*
    286 * Macros for duplicating partially loaded mask to fill entire register.
    287 * We will apply mask to interleaved source pixels, that is
    288 *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
    289 *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
    290 * So, we need to duplicate loaded mask into whole register.
    291 *
    292 * For two pixel case
    293 *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
    294 *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
    295 * We can do some optimizations for this including last pixel cases.
    296 */
     297 .macro bilinear_duplicate_mask_x numpix, mask
         /* No-op: no mask in the 'x' variant, nothing to duplicate. */
     298 .endm
    299 
    300 .macro bilinear_duplicate_mask_8 numpix, mask
    301 .if \numpix == 4
    302    dup         \()\mask\().2s, \()\mask\().s[0]
    303 .elseif \numpix == 2
    304    dup         \()\mask\().4h, \()\mask\().h[0]
    305 .elseif \numpix == 1
    306    dup         \()\mask\().8b, \()\mask\().b[0]
    307 .else
    308    .error bilinear_duplicate_\mask_8 is unsupported
    309 .endif
    310 .endm
    311 
     312 .macro bilinear_duplicate_mask mask_fmt, numpix, mask
         /* Dispatch on mask format (x or 8). */
     313    bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
     314 .endm
    315 
    316 /*
    317 * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
    318 * Interleave should be done when mask is enabled or operator is 'over'.
    319 */
     320 .macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01
         /* Rearrange src and dst pixels into the planar
          * rrrr gggg bbbb aaaa layout (see header comment) by applying
          * vuzp twice, then mirror the 64-bit halves into the 128-bit
          * src01/dst01 views.  NOTE: vuzp clobbers v24. */
     321    vuzp       \()\src0\().8b, \()\src1\().8b
     322    vuzp       \()\dst0\().8b, \()\dst1\().8b
     323    vuzp       \()\src0\().8b, \()\src1\().8b
     324    vuzp       \()\dst0\().8b, \()\dst1\().8b
     325    mov        \()\src01\().d[1], \()\src1\().d[0]
     326    mov        \()\src01\().d[0], \()\src0\().d[0]
     327    mov        \()\dst01\().d[1], \()\dst1\().d[0]
     328    mov        \()\dst01\().d[0], \()\dst0\().d[0]
     329 .endm
    330 
     331 .macro bilinear_interleave_src_dst_x_src \
     332                numpix, src0, src1, src01, dst0, dst1, dst01
         /* No-op: no mask and 'src' operator — no interleaving needed. */
     333 .endm
     334 
     335 .macro bilinear_interleave_src_dst_x_over \
     336                numpix, src0, src1, src01, dst0, dst1, dst01
         /* 'over' combines with dst, so interleaving is required. */
     337 
     338    bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
     339 .endm
     340 
     341 .macro bilinear_interleave_src_dst_x_add \
     342                numpix, src0, src1, src01, dst0, dst1, dst01
         /* 'add' combines with dst, so interleaving is required. */
     343 
     344    bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
     345 .endm
     346 
     347 .macro bilinear_interleave_src_dst_8_src \
     348                numpix, src0, src1, src01, dst0, dst1, dst01
         /* a8 mask present: interleave so the mask multiply lines up. */
     349 
     350    bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
     351 .endm
     352 
     353 .macro bilinear_interleave_src_dst_8_over \
     354                numpix, src0, src1, src01, dst0, dst1, dst01
         /* a8 mask + 'over': interleave required. */
     355 
     356    bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
     357 .endm
     358 
     359 .macro bilinear_interleave_src_dst_8_add \
     360                numpix, src0, src1, src01, dst0, dst1, dst01
         /* a8 mask + 'add': interleave required. */
     361 
     362    bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
     363 .endm
     364 
     365 .macro bilinear_interleave_src_dst \
     366                mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
         /* Dispatch on (mask format, operator). */
     367 
     368    bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
     369                \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
     370 .endm
    371 
    372 
    373 /*
    374 * Macros for applying masks to src pixels. (see combine_mask_u() function)
    375 * src, dst should be in interleaved form.
    376 * mask register should be in form (m0, m1, m2, m3).
    377 */
     378 .macro bilinear_apply_mask_to_src_x \
     379                numpix, src0, src1, src01, mask, \
     380                tmp01, tmp23, tmp45, tmp67
         /* No-op: no mask to apply in the 'x' variant. */
     381 .endm
     382 
     383 .macro bilinear_apply_mask_to_src_8 \
     384                numpix, src0, src1, src01, mask, \
     385                tmp01, tmp23, tmp45, tmp67
         /* src = src * mask / 255, rounded, on interleaved channel data.
          * Uses the standard NEON rounding divide-by-255 idiom:
          * widening multiply, then urshr #8 + raddhn to fold the high
          * byte back in (x*m/255 ≈ (t + (t >> 8) + 128) >> 8).
          * Finally mirror the halves into the 128-bit src01 view. */
     387    umull           \()\tmp01\().8h, \()\src0\().8b, \()\mask\().8b
     388    umull           \()\tmp23\().8h, \()\src1\().8b, \()\mask\().8b
     389    /* bubbles */
     390    urshr           \()\tmp45\().8h, \()\tmp01\().8h, #8
     391    urshr           \()\tmp67\().8h, \()\tmp23\().8h, #8
     392    /* bubbles */
     393    raddhn          \()\src0\().8b, \()\tmp45\().8h, \()\tmp01\().8h
     394    raddhn          \()\src1\().8b, \()\tmp67\().8h, \()\tmp23\().8h
     395    mov             \()\src01\().d[0], \()\src0\().d[0]
     396    mov             \()\src01\().d[1], \()\src1\().d[0]
     397 .endm
     398 
     399 .macro bilinear_apply_mask_to_src \
     400                mask_fmt, numpix, src0, src1, src01, mask, \
     401                tmp01, tmp23, tmp45, tmp67
         /* Dispatch on mask format (x or 8). */
     403    bilinear_apply_mask_to_src_\()\mask_fmt \
     404                \numpix, \src0, \src1, \src01, \mask, \
     405                \tmp01, \tmp23, \tmp45, \tmp67
     406 .endm
    407 
    408 
    409 /*
    410 * Macros for combining src and destination pixels.
    411 * Interleave or not is depending on operator 'op'.
    412 */
     413 .macro bilinear_combine_src \
     414                numpix, src0, src1, src01, dst0, dst1, dst01, \
     415                tmp01, tmp23, tmp45, tmp67, tmp8
         /* No-op: 'src' operator simply replaces the destination. */
     416 .endm
     417 
     418 .macro bilinear_combine_over \
     419                numpix, src0, src1, src01, dst0, dst1, dst01, \
     420                tmp01, tmp23, tmp45, tmp67, tmp8
         /* Porter-Duff OVER on interleaved (planar) pixel data:
          *   result = src + dst * (255 - src_alpha) / 255   (saturating)
          * src1.s[1] holds the replicated alpha group in the interleaved
          * layout; it is broadcast to tmp8, inverted (mvn = 255 - a), and
          * the dst scaling uses the same urshr/raddhn divide-by-255
          * idiom as the mask multiply above. */
     422    dup         \()\tmp8\().2s, \()\src1\().s[1]
     423    /* bubbles */
     424    mvn         \()\tmp8\().8b, \()\tmp8\().8b
     425    /* bubbles */
     426    umull       \()\tmp01\().8h, \()\dst0\().8b, \()\tmp8\().8b
     427    /* bubbles */
     428    umull       \()\tmp23\().8h, \()\dst1\().8b, \()\tmp8\().8b
     429    /* bubbles */
     430    urshr       \()\tmp45\().8h, \()\tmp01\().8h, #8
     431    urshr       \()\tmp67\().8h, \()\tmp23\().8h, #8
     432    /* bubbles */
     433    raddhn      \()\dst0\().8b, \()\tmp45\().8h, \()\tmp01\().8h
     434    raddhn      \()\dst1\().8b, \()\tmp67\().8h, \()\tmp23\().8h
     435    mov         \()\dst01\().d[0], \()\dst0\().d[0]
     436    mov         \()\dst01\().d[1], \()\dst1\().d[0]
     437    /* bubbles */
     438    uqadd       \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b
     439    uqadd       \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b
     440    mov         \()\src01\().d[0], \()\src0\().d[0]
     441    mov         \()\src01\().d[1], \()\src1\().d[0]
     442 .endm
     443 
     444 .macro bilinear_combine_add \
     445                numpix, src0, src1, src01, dst0, dst1, dst01, \
     446                tmp01, tmp23, tmp45, tmp67, tmp8
         /* 'add' operator: src = saturate(src + dst), per byte. */
     448    uqadd       \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b
     449    uqadd       \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b
     450    mov         \()\src01\().d[0], \()\src0\().d[0]
     451    mov         \()\src01\().d[1], \()\src1\().d[0]
     452 .endm
     453 
     454 .macro bilinear_combine \
     455                op, numpix, src0, src1, src01, dst0, dst1, dst01, \
     456                tmp01, tmp23, tmp45, tmp67, tmp8
         /* Dispatch on operator (src / over / add). */
     458    bilinear_combine_\()\op \
     459                \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
     460                \tmp01, \tmp23, \tmp45, \tmp67, \tmp8
     461 .endm
    462 
    463 /*
    464 * Macros for final deinterleaving of destination pixels if needed.
    465 */
     466 .macro bilinear_deinterleave numpix, dst0, dst1, dst01
         /* Undo the planar rrrr gggg bbbb aaaa layout back to per-pixel
          * byte order (two vuzp passes, as in bilinear_interleave), then
          * mirror into the 128-bit dst01 view.  NOTE: vuzp clobbers v24. */
     467    vuzp       \()\dst0\().8b, \()\dst1\().8b
     468    /* bubbles */
     469    vuzp       \()\dst0\().8b, \()\dst1\().8b
     470    mov        \()\dst01\().d[0], \()\dst0\().d[0]
     471    mov        \()\dst01\().d[1], \()\dst1\().d[0]
     472 .endm
     473 
     474 .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
         /* No-op: nothing was interleaved for (x, src). */
     475 .endm
     476 
     477 .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
     478    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
     479 .endm
     480 
     481 .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
     482    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
     483 .endm
     484 
     485 .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
     486    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
     487 .endm
     488 
     489 .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
     490    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
     491 .endm
     492 
     493 .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
     494    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
     495 .endm
     496 
     497 .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
         /* Dispatch on (mask format, operator); every combination except
          * (x, src) requires the deinterleave. */
     498    bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
     499 .endm
    500 
    501 
     502 .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
         /* Interpolate and composite a single pixel: load the source pixel
          * pair, vertically blend with v28/v29 (top/bottom weights), then
          * horizontally blend using the weight in v15.h[0] via
          * ushll/umlsl/umlal2, narrow back to 8-bit, and run the
          * mask/interleave/combine/store pipeline.  Unlike the two/four
          * pixel variants, this does NOT advance the horizontal weight
          * vector v12. */
     503    bilinear_load_\()\src_fmt v0, v1, v2
     504    bilinear_load_mask \mask_fmt, 1, v4
     505    bilinear_load_dst \dst_fmt, \op, 1, v18, v19, v9
     506    umull     v2.8h, v0.8b, v28.8b
     507    umlal     v2.8h, v1.8b, v29.8b
     508    /* 5 cycles bubble */
     509    ushll     v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
     510    umlsl     v0.4s, v2.4h, v15.h[0]
     511    umlal2    v0.4s, v2.8h, v15.h[0]
     512    /* 5 cycles bubble */
     513    bilinear_duplicate_mask \mask_fmt, 1, v4
     514    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
     515    /* 3 cycles bubble */
     516    xtn       v0.8b, v0.8h
     517    /* 1 cycle bubble */
     518    bilinear_interleave_src_dst \
     519                \mask_fmt, \op, 1, v0, v1, v0, v18, v19, v9
     520    bilinear_apply_mask_to_src \
     521                \mask_fmt, 1, v0, v1, v0, v4, \
     522                v3, v8, v10, v11
     523    bilinear_combine \
     524                \op, 1, v0, v1, v0, v18, v19, v9, \
     525                v3, v8, v10, v11, v5
     526    bilinear_deinterleave_dst \mask_fmt, \op, 1, v0, v1, v0
     527    bilinear_store_\()\dst_fmt 1, v17, v18
     528 .endm
    529 
     530 .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
         /* Interpolate and composite two pixels.  Vertical blending is done
          * by the load-and-interpolate macro; horizontal blending uses the
          * per-pixel weights v15.h[0] and v15.h[4].  The horizontal weight
          * state is then advanced: v15 is refreshed from the accumulator
          * v12, and v12 += v13 (the step vector) for the next pair. */
     531    bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
     532                v1, v11, v18, v19, v20, v21, v22, v23
     533    bilinear_load_mask \mask_fmt, 2, v4
     534    bilinear_load_dst \dst_fmt, \op, 2, v18, v19, v9
     535    ushll     v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
     536    umlsl     v0.4s, v1.4h, v15.h[0]
     537    umlal2    v0.4s, v1.8h, v15.h[0]
     538    ushll     v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
     539    umlsl     v10.4s, v11.4h, v15.h[4]
     540    umlal2    v10.4s, v11.8h, v15.h[4]
     541    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
     542    shrn2     v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
     543    bilinear_duplicate_mask \mask_fmt, 2, v4
     544    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
     545    add       v12.8h, v12.8h, v13.8h
     546    xtn       v0.8b, v0.8h
     547    bilinear_interleave_src_dst \
     548                \mask_fmt, \op, 2, v0, v1, v0, v18, v19, v9
     549    bilinear_apply_mask_to_src \
     550                \mask_fmt, 2, v0, v1, v0, v4, \
     551                v3, v8, v10, v11
     552    bilinear_combine \
     553                \op, 2, v0, v1, v0, v18, v19, v9, \
     554                v3, v8, v10, v11, v5
     555    bilinear_deinterleave_dst \mask_fmt, \op, 2, v0, v1, v0
     556    bilinear_store_\()\dst_fmt 2, v16, v17
     557 .endm
    558 
     559 .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
         /* Interpolate and composite four pixels.  Two pixel pairs are
          * loaded and vertically blended, then each pair is horizontally
          * blended with v15.h[0]/v15.h[4]; the horizontal weight state
          * (v15 from v12, v12 += v13) is advanced twice — once per pair.
          * Source-image prefetching uses TMP1/PF_OFFS left by the loader.
          * Instruction order is tuned to hide latency — do not reorder. */
     560    bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
     561                v1, v11, v4,  v5,  v6,  v7,  v22, v23, \
     562                v3, v9,  v16, v17, v20, v21, v18, v19
     563    prfm      PREFETCH_MODE, [TMP1, PF_OFFS]
     564    sub       TMP1, TMP1, STRIDE
     565    prfm      PREFETCH_MODE, [TMP1, PF_OFFS]
     566    ushll     v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
     567    umlsl     v0.4s, v1.4h, v15.h[0]
     568    umlal2    v0.4s, v1.8h, v15.h[0]
     569    ushll     v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
     570    umlsl     v10.4s, v11.4h, v15.h[4]
     571    umlal2    v10.4s, v11.8h, v15.h[4]
     572    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
     573    ushll     v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS
     574    umlsl     v2.4s, v3.4h, v15.h[0]
     575    umlal2    v2.4s, v3.8h, v15.h[0]
     576    ushll     v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
     577    umlsl     v8.4s, v9.4h, v15.h[4]
     578    umlal2    v8.4s, v9.8h, v15.h[4]
     579    add       v12.8h, v12.8h, v13.8h
     580    shrn      v0.4h,  v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
     581    shrn2     v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
     582    shrn      v2.4h,  v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
     583    shrn2     v2.8h,  v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
     584    bilinear_load_mask \mask_fmt, 4, v4
     585    bilinear_duplicate_mask \mask_fmt, 4, v4
     586    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
     587    xtn       v0.8b, v0.8h
     588    xtn       v1.8b, v2.8h
     589    add       v12.8h, v12.8h, v13.8h
     590    bilinear_load_dst \dst_fmt, \op, 4, v2, v3, v21
     591    bilinear_interleave_src_dst \
     592                \mask_fmt, \op, 4, v0, v1, v0, v2, v3, v11
     593    bilinear_apply_mask_to_src \
     594                \mask_fmt, 4, v0, v1, v0, v4, \
     595                v6, v8, v9, v10
     596    bilinear_combine \
     597                \op, 4, v0, v1, v0, v2, v3, v1, \
     598                v6, v8, v9, v10, v23
     599    bilinear_deinterleave_dst \mask_fmt, \op, 4, v0, v1, v0
     600    bilinear_store_\()\dst_fmt 4, v6, v7
     601 .endm
    602 
         /* Feature flags for generate_bilinear_scanline_func.
          * USE_MASK selects the masked variant (adds the MASK pointer
          * argument and its register assignments in the generated
          * function).  USE_ALL_NEON_REGS is not referenced in this
          * portion of the file — presumably it widens the usable NEON
          * register set; confirm against the rest of the file. */
     603 .set BILINEAR_FLAG_USE_MASK,        1
     604 .set BILINEAR_FLAG_USE_ALL_NEON_REGS,    2
    605 
    606 /*
    607 * Main template macro for generating NEON optimized bilinear scanline functions.
    608 *
    609 * Bilinear scanline generator macro takes the following arguments:
    610 *  fname            - name of the function to generate
    611 *  src_fmt            - source color format (8888 or 0565)
    612 *  dst_fmt            - destination color format (8888 or 0565)
    613 *  src/dst_bpp_shift        - (1 << bpp_shift) is the size of src/dst pixel in bytes
    614 *  process_last_pixel        - code block that interpolate one pixel and does not
    615 *                  update horizontal weight
    616 *  process_two_pixels        - code block that interpolate two pixels and update
    617 *                  horizontal weight
    618 *  process_four_pixels        - code block that interpolate four pixels and update
    619 *                  horizontal weight
    620 *  process_pixblock_head    - head part of middle loop
    621 *  process_pixblock_tail    - tail part of middle loop
    622 *  process_pixblock_tail_head    - tail_head of middle loop
    623 *  pixblock_size        - number of pixels processed in a single middle loop
    624 *  prefetch_distance        - prefetch in the source image by that many pixels ahead
    625 */
    626 
    627 .macro generate_bilinear_scanline_func \
    628    fname, \
    629    src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
    630    bilinear_process_last_pixel, \
    631    bilinear_process_two_pixels, \
    632    bilinear_process_four_pixels, \
    633    bilinear_process_pixblock_head, \
    634    bilinear_process_pixblock_tail, \
    635    bilinear_process_pixblock_tail_head, \
    636    pixblock_size, \
    637    prefetch_distance, \
    638    flags
    639 
    640 pixman_asm_function \fname
    641 .if \pixblock_size == 8
    642 .elseif \pixblock_size == 4
    643 .else
    644    .error unsupported pixblock size
    645 .endif
    646 
    647 .if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
    648    OUT       .req    x0
    649    TOP       .req    x1
    650    BOTTOM    .req    x2
    651    WT        .req    x3
    652    WWT       .req    w3
    653    WB        .req    x4
    654    WWB       .req    w4
    655    X         .req    w5
    656    UX        .req    w6
    657    WIDTH     .req    x7
    658    TMP1      .req    x10
    659    WTMP1     .req    w10
    660    TMP2      .req    x11
    661    WTMP2     .req    w11
    662    PF_OFFS   .req    x12
    663    TMP3      .req    x13
    664    WTMP3     .req    w13
    665    TMP4      .req    x14
    666    WTMP4     .req    w14
    667    STRIDE    .req    x15
    668    DUMMY     .req    x30
    669 
    670    stp       x29, x30, [sp, -16]!
    671    mov       x29, sp
    672    sub       sp, sp, 112
    673    sub       x29, x29, 64
    674    st1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    675    st1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    676    stp       x10, x11, [x29, -80]
    677    stp       x12, x13, [x29, -96]
    678    stp       x14, x15, [x29, -112]
    679 .else
    680    OUT       .req      x0
    681    MASK      .req      x1
    682    TOP       .req      x2
    683    BOTTOM    .req      x3
    684    WT        .req      x4
    685    WWT       .req      w4
    686    WB        .req      x5
    687    WWB       .req      w5
    688    X         .req      w6
    689    UX        .req      w7
    690    WIDTH     .req      x8
    691    TMP1      .req      x10
    692    WTMP1     .req      w10
    693    TMP2      .req      x11
    694    WTMP2     .req      w11
    695    PF_OFFS   .req      x12
    696    TMP3      .req      x13
    697    WTMP3     .req      w13
    698    TMP4      .req      x14
    699    WTMP4     .req      w14
    700    STRIDE    .req      x15
    701    DUMMY     .req      x30
    702 
    703    .set prefetch_offset, \prefetch_distance
    704 
    705    stp      x29, x30, [sp, -16]!
    706    mov      x29, sp
    707    sub      x29, x29, 64
    708    st1      {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    709    st1      {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    710    stp      x10, x11, [x29, -80]
    711    stp      x12, x13, [x29, -96]
    712    stp      x14, x15, [x29, -112]
    713    str      x8, [x29, -120]
    714    ldr      w8, [x29, 16]
    715    sub      sp, sp, 120
    716 .endif
    717 
    718    mov      WTMP1, #\prefetch_distance
    719    umull    PF_OFFS, WTMP1, UX
    720 
    721    sub      STRIDE, BOTTOM, TOP
    722    .unreq   BOTTOM
    723 
    724    cmp      WIDTH, #0
    725    ble      300f
    726 
    727    dup      v12.8h, X
    728    dup      v13.8h, UX
    729    dup      v28.8b, WWT
    730    dup      v29.8b, WWB
    731    mov      v25.d[0], v12.d[1]
    732    mov      v26.d[0], v13.d[0]
    733    add      v25.4h, v25.4h, v26.4h
    734    mov      v12.d[1], v25.d[0]
    735 
    736    /* ensure good destination alignment  */
    737    cmp       WIDTH, #1
    738    blt       100f
    739    tst       OUT, #(1 << \dst_bpp_shift)
    740    beq       100f
    741    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    742    add       v12.8h, v12.8h, v13.8h
    743    \bilinear_process_last_pixel
    744    sub       WIDTH, WIDTH, #1
    745 100:
    746    add       v13.8h, v13.8h, v13.8h
    747    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    748    add       v12.8h, v12.8h, v13.8h
    749 
    750    cmp       WIDTH, #2
    751    blt       100f
    752    tst       OUT, #(1 << (\dst_bpp_shift + 1))
    753    beq       100f
    754    \bilinear_process_two_pixels
    755    sub       WIDTH, WIDTH, #2
    756 100:
    757 .if \pixblock_size == 8
    758    cmp       WIDTH, #4
    759    blt       100f
    760    tst       OUT, #(1 << (\dst_bpp_shift + 2))
    761    beq       100f
    762    \bilinear_process_four_pixels
    763    sub       WIDTH, WIDTH, #4
    764 100:
    765 .endif
    766    subs      WIDTH, WIDTH, #\pixblock_size
    767    blt       100f
    768    asr       PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
    769    \bilinear_process_pixblock_head
    770    subs      WIDTH, WIDTH, #\pixblock_size
    771    blt       500f
    772 0:
    773    \bilinear_process_pixblock_tail_head
    774    subs      WIDTH, WIDTH, #\pixblock_size
    775    bge       0b
    776 500:
    777    \bilinear_process_pixblock_tail
    778 100:
    779 .if \pixblock_size == 8
    780    tst       WIDTH, #4
    781    beq       200f
    782    \bilinear_process_four_pixels
    783 200:
    784 .endif
    785    /* handle the remaining trailing pixels */
    786    tst       WIDTH, #2
    787    beq       200f
    788    \bilinear_process_two_pixels
    789 200:
    790    tst       WIDTH, #1
    791    beq       300f
    792    \bilinear_process_last_pixel
    793 300:
    794 
    795 .if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
    796    sub       x29, x29, 64
    797    ld1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    798    ld1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    799    ldp       x10, x11, [x29, -80]
    800    ldp       x12, x13, [x29, -96]
    801    ldp       x14, x15, [x29, -112]
    802    mov       sp, x29
    803    ldp       x29, x30, [sp], 16
    804 .else
    805    sub       x29, x29, 64
    806    ld1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    807    ld1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    808    ldp       x10, x11, [x29, -80]
    809    ldp       x12, x13, [x29, -96]
    810    ldp       x14, x15, [x29, -112]
    811    ldr       x8, [x29, -120]
    812    mov       sp, x29
    813    ldp       x29, x30, [sp], 16
    814 .endif
    815    VERIFY_LR
    816    ret
    817 
    818    .unreq    OUT
    819    .unreq    TOP
    820    .unreq    WT
    821    .unreq    WWT
    822    .unreq    WB
    823    .unreq    WWB
    824    .unreq    X
    825    .unreq    UX
    826    .unreq    WIDTH
    827    .unreq    TMP1
    828    .unreq    WTMP1
    829    .unreq    TMP2
    830    .unreq    PF_OFFS
    831    .unreq    TMP3
    832    .unreq    TMP4
    833    .unreq    STRIDE
    834 .if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
    835    .unreq    MASK
    836 .endif
    837 
    838 pixman_end_asm_function
    839 
    840 .endm
    841 
    842 /* src_8888_8_8888 */
/*
 * SRC operator, a8r8g8b8 source, 8-bit mask, a8r8g8b8 destination.
 * Thin wrappers around the generic bilinear_interpolate_* template
 * macros; all real work happens there.
 */
.macro bilinear_src_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, src
.endm

/* No software pipelining: the pixblock head is the whole four-pixel
 * step and the tail is empty, so tail_head degenerates to head. */
.macro bilinear_src_8888_8_8888_process_pixblock_head
    bilinear_src_8888_8_8888_process_four_pixels
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_tail
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
    bilinear_src_8888_8_8888_process_pixblock_tail
    bilinear_src_8888_8_8888_process_pixblock_head
.endm
    866 
/* src_8888_8_0565 */
/*
 * SRC operator, a8r8g8b8 source, 8-bit mask, r5g6b5 destination.
 * Delegates to the generic bilinear_interpolate_* template macros;
 * no pipelining (empty tail, head == four-pixel step).
 */
.macro bilinear_src_8888_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_head
    bilinear_src_8888_8_0565_process_four_pixels
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_tail
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
    bilinear_src_8888_8_0565_process_pixblock_tail
    bilinear_src_8888_8_0565_process_pixblock_head
.endm
    891 
/* src_0565_8_x888 */
/*
 * SRC operator, r5g6b5 source, 8-bit mask, x8r8g8b8 destination.
 * Note the dst format tag passed down is 8888 (x888 is written as a
 * full 32-bit pixel).  Delegates to the generic template macros.
 */
.macro bilinear_src_0565_8_x888_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_head
    bilinear_src_0565_8_x888_process_four_pixels
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_tail
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
    bilinear_src_0565_8_x888_process_pixblock_tail
    bilinear_src_0565_8_x888_process_pixblock_head
.endm
    916 
/* src_0565_8_0565 */
/*
 * SRC operator, r5g6b5 source, 8-bit mask, r5g6b5 destination.
 * Delegates to the generic bilinear_interpolate_* template macros;
 * no pipelining (empty tail, head == four-pixel step).
 */
.macro bilinear_src_0565_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_head
    bilinear_src_0565_8_0565_process_four_pixels
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_tail
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
    bilinear_src_0565_8_0565_process_pixblock_tail
    bilinear_src_0565_8_0565_process_pixblock_head
.endm
    941 
/* over_8888_8888 */
/*
 * OVER operator, a8r8g8b8 source, no mask ('x'), a8r8g8b8 destination.
 * The 1/2/4-pixel cleanup paths delegate to the generic template;
 * the main pixblock loop below is hand-scheduled instead.
 */
.macro bilinear_over_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, over
.endm

.macro bilinear_over_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, over
.endm

.macro bilinear_over_8888_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, x, 8888, over
.endm
    954 
/*
 * Head of the software-pipelined 4-pixel loop for OVER 8888 -> 8888.
 *
 * Register roles (established by the entry code of
 * generate_bilinear_scanline_func above):
 *   v12.8h  fixed-point X coordinates; lane 0 = current pixel,
 *           lane 4 = next pixel (the per-step increment v13 covers
 *           two pixels, so v12/v15 advance two pixels per update)
 *   v13.8h  X increment per weight update (2 * UX in every lane)
 *   v15.8h  horizontal weights extracted from the fraction bits of v12
 *   v28/v29 vertical weights WWT/WWB broadcast to all byte lanes
 *   STRIDE  BOTTOM - TOP, so [TMPn] addresses the top-row texel pair
 *           and [TMPn + STRIDE] the bottom-row pair
 *
 * Output: vertically blended rows for pixels 0-3 in v8-v11, and the
 * horizontal blend already done for pixels 0/1 in v0/v1.  The matching
 * _tail finishes pixels 2/3 (using the v15 refreshed at the end here).
 */
.macro bilinear_over_8888_8888_process_pixblock_head
    asr         WTMP1, X, #16                /* integer part of X, pixel 0 */
    add         X, X, UX
    add         TMP1, TOP, TMP1, lsl #2      /* *4: 32bpp source */
    asr         WTMP2, X, #16
    add         X, X, UX
    add         TMP2, TOP, TMP2, lsl #2

    ld1         {v22.2s}, [TMP1], STRIDE     /* top-row texel pair, pixel 0 */
    ld1         {v23.2s}, [TMP1]             /* bottom-row texel pair */
    asr         WTMP3, X, #16
    add         X, X, UX
    add         TMP3, TOP, TMP3, lsl #2
    umull       v8.8h, v22.8b, v28.8b        /* vertical blend: top * WWT */
    umlal       v8.8h, v23.8b, v29.8b        /*              + bottom * WWB */

    ld1         {v22.2s}, [TMP2], STRIDE
    ld1         {v23.2s}, [TMP2]
    asr         WTMP4, X, #16
    add         X, X, UX
    add         TMP4, TOP, TMP4, lsl #2
    umull       v9.8h, v22.8b, v28.8b
    umlal       v9.8h, v23.8b, v29.8b

    ld1         {v22.2s}, [TMP3], STRIDE
    ld1         {v23.2s}, [TMP3]
    umull       v10.8h, v22.8b, v28.8b
    umlal       v10.8h, v23.8b, v29.8b

    /* horizontal blend, pixel 0: left*((1<<BITS) - wx) + right*wx */
    ushll       v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl       v0.4s, v8.4h, v15.h[0]
    umlal2      v0.4s, v8.8h, v15.h[0]

    prfm        PREFETCH_MODE, [TMP4, PF_OFFS]
    ld1         {v16.2s}, [TMP4], STRIDE
    ld1         {v17.2s}, [TMP4]
    prfm        PREFETCH_MODE, [TMP4, PF_OFFS]
    umull       v11.8h, v16.8b, v28.8b
    umlal       v11.8h, v17.8b, v29.8b

    /* horizontal blend, pixel 1, then refresh weights / advance X state */
    ushll       v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl       v1.4s, v9.4h, v15.h[4]
    umlal2      v1.4s, v9.8h, v15.h[4]
    ushr        v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    add         v12.8h, v12.8h, v13.8h
.endm
   1001 
/*
 * Tail of the pipelined 4-pixel loop: finish the horizontal blend for
 * pixels 2/3 (v10/v11, weights refreshed by the head), then composite
 * the four interpolated source pixels over the destination with
 * OVER: dst' = src + dst * (255 - src_alpha), and store.
 * The division by 255 is the usual urshr/raddhn rounding approximation.
 */
.macro bilinear_over_8888_8888_process_pixblock_tail
    ushll       v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl       v2.4s, v10.4h, v15.h[0]
    umlal2      v2.4s, v10.8h, v15.h[0]
    ushll       v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl       v3.4s, v11.4h, v15.h[4]
    umlal2      v3.4s, v11.8h, v15.h[4]
    /* drop the two interpolation scalings, narrow to 8 bit per channel */
    shrn        v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    shrn2       v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    shrn        v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    ushr        v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    shrn2       v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    xtn         v6.8b, v0.8h                 /* v6 = src pixels 0,1 */
    xtn         v7.8b, v2.8h                 /* v7 = src pixels 2,3 */
    ld1         {v2.2s, v3.2s}, [OUT]        /* four destination pixels */
    prfm        PREFETCH_MODE, [OUT, #(prefetch_offset * 4)]
    /* double uzp: interleaved -> planar, so each channel of all four
     * pixels shares one 32-bit lane */
    vuzp        v6.8b, v7.8b
    vuzp        v2.8b, v3.8b
    vuzp        v6.8b, v7.8b
    vuzp        v2.8b, v3.8b
    dup         v4.2s, v7.s[1]               /* broadcast the four src alpha
                                                bytes (v7.s[1] after uzp) */
    mvn         v4.8b, v4.8b                 /* 255 - alpha */
    umull       v11.8h, v2.8b, v4.8b         /* dst * (255 - alpha), 16 bit */
    umull       v2.8h,  v3.8b, v4.8b
    urshr       v1.8h, v11.8h, #8            /* x/255 ~= (x + ((x+128)>>8)
                                                + 128) >> 8 */
    urshr       v10.8h, v2.8h, #8
    raddhn      v3.8b, v10.8h, v2.8h
    raddhn      v2.8b, v1.8h, v11.8h
    uqadd       v6.8b, v2.8b,  v6.8b         /* + src, saturating */
    uqadd       v7.8b, v3.8b,  v7.8b
    /* planar -> interleaved again, advance X state, store 4 pixels */
    vuzp        v6.8b, v7.8b
    vuzp        v6.8b, v7.8b
    add         v12.8h, v12.8h, v13.8h
    st1         {v6.2s, v7.2s}, [OUT], #16
.endm
   1037 
/*
 * Steady-state iteration of the pipelined loop: the tail of the
 * previous 4-pixel block (right-hand column, deeply indented) is
 * interleaved with the head of the next block (left-hand column) to
 * hide load and multiply latencies.  Semantically this is exactly
 * _process_pixblock_tail followed by _process_pixblock_head; the
 * instruction ORDER is the whole point — do not reorder.
 */
.macro bilinear_over_8888_8888_process_pixblock_tail_head
                                           ushll       v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
    asr         WTMP1, X, #16
    add         X, X, UX
    add         TMP1, TOP, TMP1, lsl #2
                                           umlsl       v2.4s, v10.4h, v15.h[0]
    asr         WTMP2, X, #16
    add         X, X, UX
    add         TMP2, TOP, TMP2, lsl #2
                                           umlal2      v2.4s, v10.8h, v15.h[0]
                                           ushll       v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
    ld1         {v20.2s}, [TMP1], STRIDE
                                           umlsl       v3.4s, v11.4h, v15.h[4]
                                           umlal2      v3.4s, v11.8h, v15.h[4]
    ld1         {v21.2s}, [TMP1]
    umull       v8.8h, v20.8b, v28.8b
    umlal       v8.8h, v21.8b, v29.8b
                                           shrn        v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
                                           shrn2       v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
                                           shrn        v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
                                           ushr        v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    ld1         {v22.2s}, [TMP2], STRIDE
                                           shrn2       v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
                                           xtn         v6.8b, v0.8h
    ld1         {v23.2s}, [TMP2]
    umull       v9.8h, v22.8b, v28.8b
    asr         WTMP3, X, #16
    add         X, X, UX
    add         TMP3, TOP, TMP3, lsl #2
    asr         WTMP4, X, #16
    add         X, X, UX
    add         TMP4, TOP, TMP4, lsl #2
    umlal       v9.8h, v23.8b, v29.8b
                                           xtn         v7.8b, v2.8h
                                           ld1         {v2.2s, v3.2s}, [OUT]
                                           prfm        PREFETCH_MODE, [OUT, PF_OFFS]
    ld1         {v22.2s}, [TMP3], STRIDE
                                           /* OVER compositing of the previous block's 4 pixels */
                                           vuzp        v6.8b, v7.8b
                                           vuzp        v2.8b, v3.8b
                                           vuzp        v6.8b, v7.8b
                                           vuzp        v2.8b, v3.8b
                                           dup         v4.2s, v7.s[1]
    ld1         {v23.2s}, [TMP3]
                                           mvn         v4.8b, v4.8b
    umull       v10.8h, v22.8b, v28.8b
    umlal       v10.8h, v23.8b, v29.8b
                                           umull       v11.8h, v2.8b, v4.8b
                                           umull        v2.8h, v3.8b, v4.8b
    ushll       v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl       v0.4s, v8.4h, v15.h[0]
                                           urshr       v1.8h, v11.8h, #8
    umlal2      v0.4s, v8.8h, v15.h[0]
                                           urshr       v8.8h, v2.8h, #8
                                           raddhn      v3.8b, v8.8h, v2.8h
                                           raddhn      v2.8b, v1.8h, v11.8h
    prfm        PREFETCH_MODE, [TMP4, PF_OFFS]
    ld1         {v16.2s}, [TMP4], STRIDE
                                           uqadd       v6.8b, v2.8b, v6.8b
                                           uqadd       v7.8b, v3.8b, v7.8b
    ld1         {v17.2s}, [TMP4]
    prfm        PREFETCH_MODE, [TMP4, PF_OFFS]
    umull       v11.8h, v16.8b, v28.8b
    umlal       v11.8h, v17.8b, v29.8b
                                           vuzp        v6.8b, v7.8b
    ushll       v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
                                           vuzp        v6.8b, v7.8b
    umlsl       v1.4s, v9.4h, v15.h[4]
                                           add         v12.8h, v12.8h, v13.8h
    umlal2      v1.4s, v9.8h, v15.h[4]
    ushr        v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    add         v12.8h, v12.8h, v13.8h
                                           st1         {v6.2s, v7.2s}, [OUT], #16
.endm
   1111 
/* over_8888_8_8888 */
/*
 * OVER operator, a8r8g8b8 source, 8-bit mask, a8r8g8b8 destination.
 * Delegates to the generic template macros.  Note the four-pixel
 * step is built from two two-pixel steps rather than the template's
 * four-pixel macro — keep it that way unless measured otherwise.
 */
.macro bilinear_over_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_four_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, over
    bilinear_interpolate_two_pixels 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_pixblock_head
    bilinear_over_8888_8_8888_process_four_pixels
.endm

.macro bilinear_over_8888_8_8888_process_pixblock_tail
.endm

.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
    bilinear_over_8888_8_8888_process_pixblock_tail
    bilinear_over_8888_8_8888_process_pixblock_head
.endm
   1137 
/* add_8888_8888 */
/*
 * ADD operator, a8r8g8b8 source, no mask ('x'), a8r8g8b8 destination.
 * Delegates to the generic template macros; the four-pixel step is
 * two two-pixel steps, and there is no software pipelining.
 */
.macro bilinear_add_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, add
.endm

.macro bilinear_add_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, add
.endm

.macro bilinear_add_8888_8888_process_four_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, add
    bilinear_interpolate_two_pixels 8888, x, 8888, add
.endm

.macro bilinear_add_8888_8888_process_pixblock_head
    bilinear_add_8888_8888_process_four_pixels
.endm

.macro bilinear_add_8888_8888_process_pixblock_tail
.endm

.macro bilinear_add_8888_8888_process_pixblock_tail_head
    bilinear_add_8888_8888_process_pixblock_tail
    bilinear_add_8888_8888_process_pixblock_head
.endm
   1163 
/* add_8888_8_8888 */
/*
 * ADD operator, a8r8g8b8 source, 8-bit mask, a8r8g8b8 destination.
 * Delegates to the generic template macros; unlike the unmasked ADD
 * above, the four-pixel step uses the template's four-pixel macro.
 */
.macro bilinear_add_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, add
.endm

.macro bilinear_add_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, add
.endm

.macro bilinear_add_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, add
.endm

.macro bilinear_add_8888_8_8888_process_pixblock_head
    bilinear_add_8888_8_8888_process_four_pixels
.endm

.macro bilinear_add_8888_8_8888_process_pixblock_tail
.endm

.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
    bilinear_add_8888_8_8888_process_pixblock_tail
    bilinear_add_8888_8_8888_process_pixblock_head
.endm
   1188 
   1189 
   1190 /* Bilinear scanline functions */
/*
 * Instantiations.  Judging from how the arguments are used in the
 * template above, the order is:
 *   fname, src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift,
 *   last-/two-/four-pixel handlers, pixblock head/tail/tail_head,
 *   pixblock_size, prefetch_distance, flags
 * (confirm against the .macro header earlier in this file).
 */

/* a8r8g8b8 src + a8 mask -> a8r8g8b8 dst, SRC */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_src_8888_8_8888_process_last_pixel, \
    bilinear_src_8888_8_8888_process_two_pixels, \
    bilinear_src_8888_8_8888_process_four_pixels, \
    bilinear_src_8888_8_8888_process_pixblock_head, \
    bilinear_src_8888_8_8888_process_pixblock_tail, \
    bilinear_src_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

/* a8r8g8b8 src + a8 mask -> r5g6b5 dst, SRC */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
    8888, 0565, 2, 1, \
    bilinear_src_8888_8_0565_process_last_pixel, \
    bilinear_src_8888_8_0565_process_two_pixels, \
    bilinear_src_8888_8_0565_process_four_pixels, \
    bilinear_src_8888_8_0565_process_pixblock_head, \
    bilinear_src_8888_8_0565_process_pixblock_tail, \
    bilinear_src_8888_8_0565_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

/* r5g6b5 src + a8 mask -> x8r8g8b8 dst, SRC */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
    0565, 8888, 1, 2, \
    bilinear_src_0565_8_x888_process_last_pixel, \
    bilinear_src_0565_8_x888_process_two_pixels, \
    bilinear_src_0565_8_x888_process_four_pixels, \
    bilinear_src_0565_8_x888_process_pixblock_head, \
    bilinear_src_0565_8_x888_process_pixblock_tail, \
    bilinear_src_0565_8_x888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

/* r5g6b5 src + a8 mask -> r5g6b5 dst, SRC */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
    0565, 0565, 1, 1, \
    bilinear_src_0565_8_0565_process_last_pixel, \
    bilinear_src_0565_8_0565_process_two_pixels, \
    bilinear_src_0565_8_0565_process_four_pixels, \
    bilinear_src_0565_8_0565_process_pixblock_head, \
    bilinear_src_0565_8_0565_process_pixblock_tail, \
    bilinear_src_0565_8_0565_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

/* a8r8g8b8 src, unmasked -> a8r8g8b8 dst, OVER (hand-pipelined pixblock) */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_over_8888_8888_process_last_pixel, \
    bilinear_over_8888_8888_process_two_pixels, \
    bilinear_over_8888_8888_process_four_pixels, \
    bilinear_over_8888_8888_process_pixblock_head, \
    bilinear_over_8888_8888_process_pixblock_tail, \
    bilinear_over_8888_8888_process_pixblock_tail_head, \
    4, 28, 0

/* a8r8g8b8 src + a8 mask -> a8r8g8b8 dst, OVER */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_over_8888_8_8888_process_last_pixel, \
    bilinear_over_8888_8_8888_process_two_pixels, \
    bilinear_over_8888_8_8888_process_four_pixels, \
    bilinear_over_8888_8_8888_process_pixblock_head, \
    bilinear_over_8888_8_8888_process_pixblock_tail, \
    bilinear_over_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

/* a8r8g8b8 src, unmasked -> a8r8g8b8 dst, ADD */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_add_8888_8888_process_last_pixel, \
    bilinear_add_8888_8888_process_two_pixels, \
    bilinear_add_8888_8888_process_four_pixels, \
    bilinear_add_8888_8888_process_pixblock_head, \
    bilinear_add_8888_8888_process_pixblock_tail, \
    bilinear_add_8888_8888_process_pixblock_tail_head, \
    4, 28, 0

/* a8r8g8b8 src + a8 mask -> a8r8g8b8 dst, ADD */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_add_8888_8_8888_process_last_pixel, \
    bilinear_add_8888_8_8888_process_two_pixels, \
    bilinear_add_8888_8_8888_process_four_pixels, \
    bilinear_add_8888_8_8888_process_pixblock_head, \
    bilinear_add_8888_8_8888_process_pixblock_tail, \
    bilinear_add_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK