tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

pixman-arm-neon-asm.S (129297B)


      1 /*
      2 * Copyright © 2009 Nokia Corporation
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     21 * DEALINGS IN THE SOFTWARE.
     22 *
     23 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
     24 */
     25 
     26 /*
     27 * This file contains implementations of NEON optimized pixel processing
     28 * functions. There is no full and detailed tutorial, but some functions
     29 * (those which are exposing some new or interesting features) are
     30 * extensively commented and can be used as examples.
     31 *
      32 * You may want to have a look at the comments for the following functions:
     33 *  - pixman_composite_over_8888_0565_asm_neon
     34 *  - pixman_composite_over_n_8_0565_asm_neon
     35 */
     36 
     37 /* Prevent the stack from becoming executable for no reason... */
     38 #if defined(__linux__) && defined(__ELF__)
     39 .section .note.GNU-stack,"",%progbits
     40 #endif
     41 
     42    .text
     43    .arch armv7a
     44    .object_arch armv4
     45    .fpu neon
     46    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
     47    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
     48    .arm
     49    .altmacro
     50    .p2align 2
     51 
     52 #include "pixman-private.h"
     53 #include "pixman-arm-asm.h"
     54 #include "pixman-arm-neon-asm.h"
     55 
     56    pixman_syntax_unified
     57 
     58 /* Global configuration options and preferences */
     59 
     60 /*
     61 * The code can optionally make use of unaligned memory accesses to improve
     62 * performance of handling leading/trailing pixels for each scanline.
      63 * The configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0, for
      64 * example on Linux, if unaligned memory accesses are not configured to
      65 * generate exceptions.
     66 */
     67 .set RESPECT_STRICT_ALIGNMENT, 1
     68 
     69 /*
     70 * Set default prefetch type. There is a choice between the following options:
     71 *
     72 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
      73 * as a NOP to work around some HW bugs or for whatever other reason)
     74 *
     75 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
      76 * advanced prefetch introduces heavy overhead)
     77 *
     78 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
     79 * which can run ARM and NEON instructions simultaneously so that extra ARM
     80 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
     81 *
      82 * Note: some types of function can't support advanced prefetch and fall back
      83 *       to the simple one (those which handle 24bpp pixels)
     84 */
     85 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
     86 
     87 /* Prefetch distance in pixels for simple prefetch */
     88 .set PREFETCH_DISTANCE_SIMPLE, 64
     89 
     90 /*
     91 * Implementation of pixman_composite_over_8888_0565_asm_neon
     92 *
      93 * This function takes an a8r8g8b8 source buffer and an r5g6b5 destination
      94 * buffer and performs the OVER compositing operation. fast_composite_over_8888_0565
     95 * from pixman-fast-path.c does the same in C and can be used as a reference.
     96 *
     97 * First we need to have some NEON assembly code which can do the actual
     98 * operation on the pixels and provide it to the template macro.
     99 *
     100 * The template macro quite conveniently takes care of emitting all the necessary
    101 * code for memory reading and writing (including quite tricky cases of
    102 * handling unaligned leading/trailing pixels), so we only need to deal with
    103 * the data in NEON registers.
    104 *
     105 * In general, the recommended NEON register allocation is the following:
    106 * d0,  d1,  d2,  d3  - contain loaded source pixel data
    107 * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
     108 * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
    109 * d28, d29, d30, d31 - place for storing the result (destination pixels)
    110 *
    111 * As can be seen above, four 64-bit NEON registers are used for keeping
    112 * intermediate pixel data and up to 8 pixels can be processed in one step
    113 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
    114 *
     115 * This particular function uses the following register allocation:
    116 * d0,  d1,  d2,  d3  - contain loaded source pixel data
    117 * d4,  d5            - contain loaded destination pixels (they are needed)
    118 * d28, d29           - place for storing the result (destination pixels)
    119 */
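
/*
 * To make the arithmetic below easier to follow, here is a rough scalar C
 * sketch of what this operation does for a single pixel. The real C
 * reference is fast_composite_over_8888_0565 in pixman-fast-path.c; the
 * helper below is only an illustration and is not part of pixman:
 *
 *   #include <stdint.h>
 *
 *   #define DIV255(x)  (((x) + 128 + (((x) + 128) >> 8)) >> 8)
 *   #define SAT255(x)  ((x) > 255 ? 255 : (x))
 *
 *   static uint16_t
 *   over_8888_0565_pixel (uint32_t src, uint16_t dst)
 *   {
 *       uint32_t ia = 255 - (src >> 24);       /* inverted source alpha  */
 *       /* expand the r5g6b5 destination to 8 bits per channel */
 *       uint32_t dr = (dst >> 11) & 0x1f;  dr = (dr << 3) | (dr >> 2);
 *       uint32_t dg = (dst >>  5) & 0x3f;  dg = (dg << 2) | (dg >> 4);
 *       uint32_t db =  dst        & 0x1f;  db = (db << 3) | (db >> 2);
 *       /* OVER: result = src + dst * (1 - src.alpha), per channel */
 *       uint32_t r = SAT255 (((src >> 16) & 0xff) + DIV255 (dr * ia));
 *       uint32_t g = SAT255 (((src >>  8) & 0xff) + DIV255 (dg * ia));
 *       uint32_t b = SAT255 (( src        & 0xff) + DIV255 (db * ia));
 *       /* pack the result back to r5g6b5 */
 *       return (uint16_t) (((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
 *   }
 *
 * The NEON code below performs the same math, but 8 pixels at a time and
 * with the source already split into per-channel vectors.
 */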
    120 
    121 /*
     122 * Step one. We need to have some code to do some arithmetic on pixel data.
    123 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
    124 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
    125 * perform all the needed calculations and write the result to {d28, d29}.
    126 * The rationale for having two macros and not just one will be explained
     127 * later. In practice, any single monolithic function which does the work can
    128 * be split into two parts in any arbitrary way without affecting correctness.
    129 *
     130 * There is one special trick here too. The common template macro can optionally
     131 * make our life a bit easier by deinterleaving the R, G, B, A color components
     132 * for 32bpp pixel formats (and this feature is used in the
    133 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
    134 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
    135 * actually use d0 register for blue channel (a vector of eight 8-bit
    136 * values), d1 register for green, d2 for red and d3 for alpha. This
     137 * simple conversion can also be done with a few NEON instructions:
    138 *
    139 * Packed to planar conversion:
    140 *  vuzp.8 d0, d1
    141 *  vuzp.8 d2, d3
    142 *  vuzp.8 d1, d3
    143 *  vuzp.8 d0, d2
    144 *
    145 * Planar to packed conversion:
    146 *  vzip.8 d0, d2
    147 *  vzip.8 d1, d3
    148 *  vzip.8 d2, d3
    149 *  vzip.8 d0, d1
    150 *
     151 * But pixels can be loaded directly in planar format using the VLD4.8 NEON
     152 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
     153 * desirable; that's why deinterleaving is optional.
    154 *
    155 * But anyway, here is the code:
    156 */
    157 .macro pixman_composite_over_8888_0565_process_pixblock_head
    158    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
    159       and put data into d6 - red, d7 - green, d30 - blue */
    160    vshrn.u16   d6, q2, #8
    161    vshrn.u16   d7, q2, #3
    162    vsli.u16    q2, q2, #5
    163    vsri.u8     d6, d6, #5
    164    vmvn.8      d3, d3      /* invert source alpha */
    165    vsri.u8     d7, d7, #6
    166    vshrn.u16   d30, q2, #2
    167    /* now do alpha blending, storing results in 8-bit planar format
    168       into d16 - red, d19 - green, d18 - blue */
    169    vmull.u8    q10, d3, d6
    170    vmull.u8    q11, d3, d7
    171    vmull.u8    q12, d3, d30
    172    vrshr.u16   q13, q10, #8
    173    vrshr.u16   q3, q11, #8
    174    vrshr.u16   q15, q12, #8
    175    vraddhn.u16 d20, q10, q13
    176    vraddhn.u16 d23, q11, q3
    177    vraddhn.u16 d22, q12, q15
    178 .endm
    179 
    180 .macro pixman_composite_over_8888_0565_process_pixblock_tail
    181    /* ... continue alpha blending */
    182    vqadd.u8    d16, d2, d20
    183    vqadd.u8    q9, q0, q11
    184    /* convert the result to r5g6b5 and store it into {d28, d29} */
    185    vshll.u8    q14, d16, #8
    186    vshll.u8    q8, d19, #8
    187    vshll.u8    q9, d18, #8
    188    vsri.u16    q14, q8, #5
    189    vsri.u16    q14, q9, #11
    190 .endm
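
/*
 * To relate the instruction sequences above to scalar code (this is just a
 * reading aid, not part of the original commentary):
 *  - the vshrn/vsli/vsri group at the start of the head macro is the usual
 *    r5g6b5 -> 8 bits per channel expansion:
 *        r8 = (r5 << 3) | (r5 >> 2)
 *        g8 = (g6 << 2) | (g6 >> 4)
 *        b8 = (b5 << 3) | (b5 >> 2)
 *  - the vmull/vrshr/vraddhn group multiplies each destination channel by
 *    the inverted source alpha and divides by 255 with the usual rounding
 *    trick:
 *        t = c * ia;  result = (t + ((t + 128) >> 8) + 128) >> 8
 *  - the vshll/vsri group in the tail packs the 8-bit channels back into
 *    r5g6b5:
 *        pixel = ((r8 & 0xf8) << 8) | ((g8 & 0xfc) << 3) | (b8 >> 3)
 */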
    191 
    192 /*
     193 * OK, now we have almost everything that we need. Using the above two
     194 * macros, the work can be done correctly. But now we want to optimize
     195 * it a bit. The ARM Cortex-A8 is an in-order core, and benefits a lot
     196 * from good code scheduling and software pipelining.
    197 *
     198 * Let's construct some code which will run in the core main loop.
     199 * Pseudo-code of the main loop looks like this:
    200 *   head
    201 *   while (...) {
    202 *     tail
    203 *     head
    204 *   }
    205 *   tail
    206 *
     207 * It may look a bit weird, but this setup allows hiding instruction
     208 * latencies better and also utilizes the dual-issue capability more
     209 * efficiently (by pairing load-store and ALU instructions).
    210 *
    211 * So what we need now is a '*_tail_head' macro, which will be used
    212 * in the core main loop. A trivial straightforward implementation
    213 * of this macro would look like this:
    214 *
    215 *   pixman_composite_over_8888_0565_process_pixblock_tail
    216 *   vst1.16     {d28, d29}, [DST_W, :128]!
    217 *   vld1.16     {d4, d5}, [DST_R, :128]!
    218 *   vld4.32     {d0, d1, d2, d3}, [SRC]!
    219 *   pixman_composite_over_8888_0565_process_pixblock_head
    220 *   cache_preload 8, 8
    221 *
     222 * Now it also has some VLD/VST instructions. We simply can't move from
     223 * processing one block of pixels to the next with just arithmetic.
    224 * The previously processed data needs to be written to memory and new
    225 * data needs to be fetched. Fortunately, this main loop does not deal
    226 * with partial leading/trailing pixels and can load/store a full block
     227 * of pixels in bulk. Additionally, the destination buffer is already
     228 * 16-byte aligned here (which is good for performance).
    229 *
    230 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
    231 * are the aliases for ARM registers which are used as pointers for
    232 * accessing data. We maintain separate pointers for reading and writing
    233 * destination buffer (DST_R and DST_W).
    234 *
    235 * Another new thing is 'cache_preload' macro. It is used for prefetching
     236 * data into the CPU L2 cache and improves performance when dealing with large
     237 * images which are far larger than the cache size. It uses one argument
    238 * (actually two, but they need to be the same here) - number of pixels
    239 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
    240 * details about this macro. Moreover, if good performance is needed
    241 * the code from this macro needs to be copied into '*_tail_head' macro
    242 * and mixed with the rest of code for optimal instructions scheduling.
    243 * We are actually doing it below.
    244 *
    245 * Now after all the explanations, here is the optimized code.
     246 * Different instruction streams (originating from '*_head', '*_tail'
    247 * and 'cache_preload' macro) use different indentation levels for
    248 * better readability. Actually taking the code from one of these
    249 * indentation levels and ignoring a few VLD/VST instructions would
    250 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
    251 * macro!
    252 */
    253 
    254 #if 1
    255 
    256 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    257        vqadd.u8    d16, d2, d20
    258    vld1.16     {d4, d5}, [DST_R, :128]!
    259        vqadd.u8    q9, q0, q11
    260    vshrn.u16   d6, q2, #8
    261    fetch_src_pixblock
    262    vshrn.u16   d7, q2, #3
    263    vsli.u16    q2, q2, #5
    264        vshll.u8    q14, d16, #8
    265                                    PF add, PF_X, PF_X, #8
    266        vshll.u8    q8, d19, #8
    267                                    PF tst, PF_CTL, #0xF
    268    vsri.u8     d6, d6, #5
    269                                    PF addne, PF_X, PF_X, #8
    270    vmvn.8      d3, d3
    271                                    PF subne, PF_CTL, PF_CTL, #1
    272    vsri.u8     d7, d7, #6
    273    vshrn.u16   d30, q2, #2
    274    vmull.u8    q10, d3, d6
    275                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    276    vmull.u8    q11, d3, d7
    277    vmull.u8    q12, d3, d30
    278                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    279        vsri.u16    q14, q8, #5
    280                                    PF cmp, PF_X, ORIG_W
    281        vshll.u8    q9, d18, #8
    282    vrshr.u16   q13, q10, #8
    283                                    PF subge, PF_X, PF_X, ORIG_W
    284    vrshr.u16   q3, q11, #8
    285    vrshr.u16   q15, q12, #8
    286                                    PF subsge, PF_CTL, PF_CTL, #0x10
    287        vsri.u16    q14, q9, #11
    288                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    289    vraddhn.u16 d20, q10, q13
    290    vraddhn.u16 d23, q11, q3
    291                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    292    vraddhn.u16 d22, q12, q15
    293        vst1.16     {d28, d29}, [DST_W, :128]!
    294 .endm
    295 
    296 #else
    297 
    298 /* If we did not care much about the performance, we would just use this... */
    299 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    300    pixman_composite_over_8888_0565_process_pixblock_tail
    301    vst1.16     {d28, d29}, [DST_W, :128]!
    302    vld1.16     {d4, d5}, [DST_R, :128]!
    303    fetch_src_pixblock
    304    pixman_composite_over_8888_0565_process_pixblock_head
    305    cache_preload 8, 8
    306 .endm
    307 
    308 #endif
    309 
    310 /*
    311 * And now the final part. We are using 'generate_composite_function' macro
    312 * to put all the stuff together. We are specifying the name of the function
    313 * which we want to get, number of bits per pixel for the source, mask and
    314 * destination (0 if unused, like mask in this case). Next come some bit
    315 * flags:
    316 *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
     317 *                             and written; for a write-only buffer we would use
     318 *                             the FLAG_DST_WRITEONLY flag instead
    319 *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
    320 *                             and separate color channels for 32bpp format.
    321 * The next things are:
    322 *  - the number of pixels processed per iteration (8 in this case, because
     323 *    that's the maximum that can fit into four 64-bit NEON registers).
     324 *  - prefetch distance, measured in pixel blocks. In this case it is 5 blocks
     325 *    of 8 pixels. That would be 40 pixels, or up to 160 bytes. The optimal
    326 *    prefetch distance can be selected by running some benchmarks.
    327 *
     328 * After that we specify some macros: here these are 'default_init' and
     329 * 'default_cleanup', which are empty (but it is possible to have custom
     330 * init/cleanup macros that save/restore some extra NEON registers
     331 * like d8-d15 or do anything else), followed by
    332 * 'pixman_composite_over_8888_0565_process_pixblock_head',
    333 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
    334 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
    335 * which we got implemented above.
    336 *
     337 * The last part is the NEON register allocation scheme.
    338 */
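
/*
 * For orientation only: on the C side these entry points are bound through
 * the glue in pixman-arm-neon.c / pixman-arm-common.h, and the generated
 * function has a prototype along these lines (treat the exact signature as
 * an assumption here; the authoritative definition lives in those files):
 *
 *   void pixman_composite_over_8888_0565_asm_neon (int32_t   w,
 *                                                   int32_t   h,
 *                                                   uint16_t *dst,
 *                                                   int32_t   dst_stride,
 *                                                   uint32_t *src,
 *                                                   int32_t   src_stride);
 *
 * i.e. width and height in pixels, plus destination and source pointers
 * with their strides.
 */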
    339 generate_composite_function \
    340    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
    341    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    342    8, /* number of pixels, processed in a single block */ \
    343    5, /* prefetch distance */ \
    344    default_init, \
    345    default_cleanup, \
    346    pixman_composite_over_8888_0565_process_pixblock_head, \
    347    pixman_composite_over_8888_0565_process_pixblock_tail, \
    348    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    349    28, /* dst_w_basereg */ \
    350    4,  /* dst_r_basereg */ \
    351    0,  /* src_basereg   */ \
    352    24  /* mask_basereg  */
    353 
    354 /******************************************************************************/
    355 
    356 .macro pixman_composite_over_n_0565_process_pixblock_head
    357    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
    358       and put data into d6 - red, d7 - green, d30 - blue */
    359    vshrn.u16   d6, q2, #8
    360    vshrn.u16   d7, q2, #3
    361    vsli.u16    q2, q2, #5
    362    vsri.u8     d6, d6, #5
    363    vsri.u8     d7, d7, #6
    364    vshrn.u16   d30, q2, #2
    365    /* now do alpha blending, storing results in 8-bit planar format
    366       into d16 - red, d19 - green, d18 - blue */
    367    vmull.u8    q10, d3, d6
    368    vmull.u8    q11, d3, d7
    369    vmull.u8    q12, d3, d30
    370    vrshr.u16   q13, q10, #8
    371    vrshr.u16   q3, q11, #8
    372    vrshr.u16   q15, q12, #8
    373    vraddhn.u16 d20, q10, q13
    374    vraddhn.u16 d23, q11, q3
    375    vraddhn.u16 d22, q12, q15
    376 .endm
    377 
    378 .macro pixman_composite_over_n_0565_process_pixblock_tail
    379    /* ... continue alpha blending */
    380    vqadd.u8    d16, d2, d20
    381    vqadd.u8    q9, q0, q11
    382    /* convert the result to r5g6b5 and store it into {d28, d29} */
    383    vshll.u8    q14, d16, #8
    384    vshll.u8    q8, d19, #8
    385    vshll.u8    q9, d18, #8
    386    vsri.u16    q14, q8, #5
    387    vsri.u16    q14, q9, #11
    388 .endm
    389 
    390 /* TODO: expand macros and do better instructions scheduling */
    391 .macro pixman_composite_over_n_0565_process_pixblock_tail_head
    392    pixman_composite_over_n_0565_process_pixblock_tail
    393    vld1.16     {d4, d5}, [DST_R, :128]!
    394    vst1.16     {d28, d29}, [DST_W, :128]!
    395    pixman_composite_over_n_0565_process_pixblock_head
    396    cache_preload 8, 8
    397 .endm
    398 
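/*
 * The init macro below fetches the solid a8r8g8b8 source from the stack
 * (at ARGS_STACK_OFFSET), splats its b/g/r/a bytes into d0-d3 and inverts
 * the alpha in place, since the pixblock code above only ever needs
 * (255 - alpha).
 */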
    399 .macro pixman_composite_over_n_0565_init
    400    add         DUMMY, sp, #ARGS_STACK_OFFSET
    401    vld1.32     {d3[0]}, [DUMMY]
    402    vdup.8      d0, d3[0]
    403    vdup.8      d1, d3[1]
    404    vdup.8      d2, d3[2]
    405    vdup.8      d3, d3[3]
    406    vmvn.8      d3, d3      /* invert source alpha */
    407 .endm
    408 
    409 generate_composite_function \
    410    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
    411    FLAG_DST_READWRITE, \
    412    8, /* number of pixels, processed in a single block */ \
    413    5, /* prefetch distance */ \
    414    pixman_composite_over_n_0565_init, \
    415    default_cleanup, \
    416    pixman_composite_over_n_0565_process_pixblock_head, \
    417    pixman_composite_over_n_0565_process_pixblock_tail, \
    418    pixman_composite_over_n_0565_process_pixblock_tail_head, \
    419    28, /* dst_w_basereg */ \
    420    4,  /* dst_r_basereg */ \
    421    0,  /* src_basereg   */ \
    422    24  /* mask_basereg  */
    423 
    424 /******************************************************************************/
    425 
    426 .macro pixman_composite_src_8888_0565_process_pixblock_head
    427    vshll.u8    q8, d1, #8
    428    vshll.u8    q14, d2, #8
    429    vshll.u8    q9, d0, #8
    430 .endm
    431 
    432 .macro pixman_composite_src_8888_0565_process_pixblock_tail
    433    vsri.u16    q14, q8, #5
    434    vsri.u16    q14, q9, #11
    435 .endm
    436 
    437 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
    438        vsri.u16    q14, q8, #5
    439                                    PF add, PF_X, PF_X, #8
    440                                    PF tst, PF_CTL, #0xF
    441    fetch_src_pixblock
    442                                    PF addne, PF_X, PF_X, #8
    443                                    PF subne, PF_CTL, PF_CTL, #1
    444        vsri.u16    q14, q9, #11
    445                                    PF cmp, PF_X, ORIG_W
    446                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    447    vshll.u8    q8, d1, #8
    448        vst1.16     {d28, d29}, [DST_W, :128]!
    449                                    PF subge, PF_X, PF_X, ORIG_W
    450                                    PF subsge, PF_CTL, PF_CTL, #0x10
    451    vshll.u8    q14, d2, #8
    452                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    453    vshll.u8    q9, d0, #8
    454 .endm
    455 
    456 generate_composite_function \
    457    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
    458    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    459    8, /* number of pixels, processed in a single block */ \
    460    10, /* prefetch distance */ \
    461    default_init, \
    462    default_cleanup, \
    463    pixman_composite_src_8888_0565_process_pixblock_head, \
    464    pixman_composite_src_8888_0565_process_pixblock_tail, \
    465    pixman_composite_src_8888_0565_process_pixblock_tail_head
    466 
    467 /******************************************************************************/
    468 
    469 .macro pixman_composite_src_0565_8888_process_pixblock_head
    470    vshrn.u16   d30, q0, #8
    471    vshrn.u16   d29, q0, #3
    472    vsli.u16    q0, q0, #5
    473    vmov.u8     d31, #255
    474    vsri.u8     d30, d30, #5
    475    vsri.u8     d29, d29, #6
    476    vshrn.u16   d28, q0, #2
    477 .endm
    478 
    479 .macro pixman_composite_src_0565_8888_process_pixblock_tail
    480 .endm
    481 
    482 /* TODO: expand macros and do better instructions scheduling */
    483 .macro pixman_composite_src_0565_8888_process_pixblock_tail_head
    484    pixman_composite_src_0565_8888_process_pixblock_tail
    485    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
    486    fetch_src_pixblock
    487    pixman_composite_src_0565_8888_process_pixblock_head
    488    cache_preload 8, 8
    489 .endm
    490 
    491 generate_composite_function \
    492    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
    493    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    494    8, /* number of pixels, processed in a single block */ \
    495    10, /* prefetch distance */ \
    496    default_init, \
    497    default_cleanup, \
    498    pixman_composite_src_0565_8888_process_pixblock_head, \
    499    pixman_composite_src_0565_8888_process_pixblock_tail, \
    500    pixman_composite_src_0565_8888_process_pixblock_tail_head
    501 
    502 /******************************************************************************/
    503 
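/*
 * The ADD operator is a per-channel saturating add: dst = MIN (src + dst, 255).
 * With a8 pixels, 32 of them fit into four 64-bit registers, hence the block
 * size of 32 used below.
 */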
    504 .macro pixman_composite_add_8_8_process_pixblock_head
    505    vqadd.u8    q14, q0, q2
    506    vqadd.u8    q15, q1, q3
    507 .endm
    508 
    509 .macro pixman_composite_add_8_8_process_pixblock_tail
    510 .endm
    511 
    512 .macro pixman_composite_add_8_8_process_pixblock_tail_head
    513    fetch_src_pixblock
    514                                    PF add, PF_X, PF_X, #32
    515                                    PF tst, PF_CTL, #0xF
    516    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    517                                    PF addne, PF_X, PF_X, #32
    518                                    PF subne, PF_CTL, PF_CTL, #1
    519        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    520                                    PF cmp, PF_X, ORIG_W
    521                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    522                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    523                                    PF subge, PF_X, PF_X, ORIG_W
    524                                    PF subsge, PF_CTL, PF_CTL, #0x10
    525    vqadd.u8    q14, q0, q2
    526                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    527                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    528    vqadd.u8    q15, q1, q3
    529 .endm
    530 
    531 generate_composite_function \
    532    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
    533    FLAG_DST_READWRITE, \
    534    32, /* number of pixels, processed in a single block */ \
    535    10, /* prefetch distance */ \
    536    default_init, \
    537    default_cleanup, \
    538    pixman_composite_add_8_8_process_pixblock_head, \
    539    pixman_composite_add_8_8_process_pixblock_tail, \
    540    pixman_composite_add_8_8_process_pixblock_tail_head
    541 
    542 /******************************************************************************/
    543 
    544 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
    545    fetch_src_pixblock
    546                                    PF add, PF_X, PF_X, #8
    547                                    PF tst, PF_CTL, #0xF
    548    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
    549                                    PF addne, PF_X, PF_X, #8
    550                                    PF subne, PF_CTL, PF_CTL, #1
    551        vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
    552                                    PF cmp, PF_X, ORIG_W
    553                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    554                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    555                                    PF subge, PF_X, PF_X, ORIG_W
    556                                    PF subsge, PF_CTL, PF_CTL, #0x10
    557    vqadd.u8    q14, q0, q2
    558                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    559                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    560    vqadd.u8    q15, q1, q3
    561 .endm
    562 
    563 generate_composite_function \
    564    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
    565    FLAG_DST_READWRITE, \
    566    8, /* number of pixels, processed in a single block */ \
    567    10, /* prefetch distance */ \
    568    default_init, \
    569    default_cleanup, \
    570    pixman_composite_add_8_8_process_pixblock_head, \
    571    pixman_composite_add_8_8_process_pixblock_tail, \
    572    pixman_composite_add_8888_8888_process_pixblock_tail_head
    573 
    574 generate_composite_function_single_scanline \
    575    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
    576    FLAG_DST_READWRITE, \
    577    8, /* number of pixels, processed in a single block */ \
    578    default_init, \
    579    default_cleanup, \
    580    pixman_composite_add_8_8_process_pixblock_head, \
    581    pixman_composite_add_8_8_process_pixblock_tail, \
    582    pixman_composite_add_8888_8888_process_pixblock_tail_head
    583 
    584 /******************************************************************************/
    585 
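/*
 * OUT_REVERSE computes dst * (1 - src.alpha) per channel: the head macro
 * does the 8x8 -> 16 bit multiplies by the inverted source alpha, and the
 * tail macro performs the /255 reduction (the same vrshr + vraddhn trick
 * as above).
 */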
    586 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
    587    vmvn.8      d24, d3  /* get inverted alpha */
    588    /* do alpha blending */
    589    vmull.u8    q8, d24, d4
    590    vmull.u8    q9, d24, d5
    591    vmull.u8    q10, d24, d6
    592    vmull.u8    q11, d24, d7
    593 .endm
    594 
    595 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    596    vrshr.u16   q14, q8, #8
    597    vrshr.u16   q15, q9, #8
    598    vrshr.u16   q12, q10, #8
    599    vrshr.u16   q13, q11, #8
    600    vraddhn.u16 d28, q14, q8
    601    vraddhn.u16 d29, q15, q9
    602    vraddhn.u16 d30, q12, q10
    603    vraddhn.u16 d31, q13, q11
    604 .endm
    605 
    606 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
    607    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    608        vrshr.u16   q14, q8, #8
    609                                    PF add, PF_X, PF_X, #8
    610                                    PF tst, PF_CTL, #0xF
    611        vrshr.u16   q15, q9, #8
    612        vrshr.u16   q12, q10, #8
    613        vrshr.u16   q13, q11, #8
    614                                    PF addne, PF_X, PF_X, #8
    615                                    PF subne, PF_CTL, PF_CTL, #1
    616        vraddhn.u16 d28, q14, q8
    617        vraddhn.u16 d29, q15, q9
    618                                    PF cmp, PF_X, ORIG_W
    619        vraddhn.u16 d30, q12, q10
    620        vraddhn.u16 d31, q13, q11
    621    fetch_src_pixblock
    622                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    623    vmvn.8      d22, d3
    624                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    625        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    626                                    PF subge, PF_X, PF_X, ORIG_W
    627    vmull.u8    q8, d22, d4
    628                                    PF subsge, PF_CTL, PF_CTL, #0x10
    629    vmull.u8    q9, d22, d5
    630                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    631    vmull.u8    q10, d22, d6
    632                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    633    vmull.u8    q11, d22, d7
    634 .endm
    635 
    636 generate_composite_function_single_scanline \
    637    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
    638    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    639    8, /* number of pixels, processed in a single block */ \
    640    default_init, \
    641    default_cleanup, \
    642    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
    643    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
    644    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
    645 
    646 /******************************************************************************/
    647 
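/*
 * OVER is src + dst * (1 - src.alpha) (with saturation), so the head macro
 * can be reused from OUT_REVERSE as-is and the tail only has to add the
 * source pixels from {q0, q1} with a saturating vqadd.u8.
 */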
    648 .macro pixman_composite_over_8888_8888_process_pixblock_head
    649    pixman_composite_out_reverse_8888_8888_process_pixblock_head
    650 .endm
    651 
    652 .macro pixman_composite_over_8888_8888_process_pixblock_tail
    653    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    654    vqadd.u8    q14, q0, q14
    655    vqadd.u8    q15, q1, q15
    656 .endm
    657 
    658 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
    659    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    660        vrshr.u16   q14, q8, #8
    661                                    PF add, PF_X, PF_X, #8
    662                                    PF tst, PF_CTL, #0xF
    663        vrshr.u16   q15, q9, #8
    664        vrshr.u16   q12, q10, #8
    665        vrshr.u16   q13, q11, #8
    666                                    PF addne, PF_X, PF_X, #8
    667                                    PF subne, PF_CTL, PF_CTL, #1
    668        vraddhn.u16 d28, q14, q8
    669        vraddhn.u16 d29, q15, q9
    670                                    PF cmp, PF_X, ORIG_W
    671        vraddhn.u16 d30, q12, q10
    672        vraddhn.u16 d31, q13, q11
    673        vqadd.u8    q14, q0, q14
    674        vqadd.u8    q15, q1, q15
    675    fetch_src_pixblock
    676                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    677    vmvn.8      d22, d3
    678                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    679        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    680                                    PF subge, PF_X, PF_X, ORIG_W
    681    vmull.u8    q8, d22, d4
    682                                    PF subsge, PF_CTL, PF_CTL, #0x10
    683    vmull.u8    q9, d22, d5
    684                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    685    vmull.u8    q10, d22, d6
    686                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    687    vmull.u8    q11, d22, d7
    688 .endm
    689 
    690 generate_composite_function \
    691    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
    692    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    693    8, /* number of pixels, processed in a single block */ \
    694    5, /* prefetch distance */ \
    695    default_init, \
    696    default_cleanup, \
    697    pixman_composite_over_8888_8888_process_pixblock_head, \
    698    pixman_composite_over_8888_8888_process_pixblock_tail, \
    699    pixman_composite_over_8888_8888_process_pixblock_tail_head
    700 
    701 generate_composite_function_single_scanline \
    702    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
    703    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    704    8, /* number of pixels, processed in a single block */ \
    705    default_init, \
    706    default_cleanup, \
    707    pixman_composite_over_8888_8888_process_pixblock_head, \
    708    pixman_composite_over_8888_8888_process_pixblock_tail, \
    709    pixman_composite_over_8888_8888_process_pixblock_tail_head
    710 
    711 /******************************************************************************/
    712 
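/*
 * Solid-source variant of OVER: the source color and its inverted alpha are
 * constants, splatted once in the init macro (b/g/r/a into d0-d3 and
 * 255 - alpha into d24), so the per-block work reduces to multiplying the
 * freshly loaded destination by the inverted alpha and adding the constant
 * source.
 */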
    713 .macro pixman_composite_over_n_8888_process_pixblock_head
    714    /* deinterleaved source pixels in {d0, d1, d2, d3} */
    715    /* inverted alpha in {d24} */
    716    /* destination pixels in {d4, d5, d6, d7} */
    717    vmull.u8    q8, d24, d4
    718    vmull.u8    q9, d24, d5
    719    vmull.u8    q10, d24, d6
    720    vmull.u8    q11, d24, d7
    721 .endm
    722 
    723 .macro pixman_composite_over_n_8888_process_pixblock_tail
    724    vrshr.u16   q14, q8, #8
    725    vrshr.u16   q15, q9, #8
    726    vrshr.u16   q2, q10, #8
    727    vrshr.u16   q3, q11, #8
    728    vraddhn.u16 d28, q14, q8
    729    vraddhn.u16 d29, q15, q9
    730    vraddhn.u16 d30, q2, q10
    731    vraddhn.u16 d31, q3, q11
    732    vqadd.u8    q14, q0, q14
    733    vqadd.u8    q15, q1, q15
    734 .endm
    735 
    736 .macro pixman_composite_over_n_8888_process_pixblock_tail_head
    737        vrshr.u16   q14, q8, #8
    738        vrshr.u16   q15, q9, #8
    739        vrshr.u16   q2, q10, #8
    740        vrshr.u16   q3, q11, #8
    741        vraddhn.u16 d28, q14, q8
    742        vraddhn.u16 d29, q15, q9
    743        vraddhn.u16 d30, q2, q10
    744        vraddhn.u16 d31, q3, q11
    745    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    746        vqadd.u8    q14, q0, q14
    747                                    PF add, PF_X, PF_X, #8
    748                                    PF tst, PF_CTL, #0x0F
    749                                    PF addne, PF_X, PF_X, #8
    750                                    PF subne, PF_CTL, PF_CTL, #1
    751        vqadd.u8    q15, q1, q15
    752                                    PF cmp, PF_X, ORIG_W
    753    vmull.u8    q8, d24, d4
    754                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    755    vmull.u8    q9, d24, d5
    756                                    PF subge, PF_X, PF_X, ORIG_W
    757    vmull.u8    q10, d24, d6
    758                                    PF subsge, PF_CTL, PF_CTL, #0x10
    759    vmull.u8    q11, d24, d7
    760                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    761        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    762 .endm
    763 
    764 .macro pixman_composite_over_n_8888_init
    765    add         DUMMY, sp, #ARGS_STACK_OFFSET
    766    vld1.32     {d3[0]}, [DUMMY]
    767    vdup.8      d0, d3[0]
    768    vdup.8      d1, d3[1]
    769    vdup.8      d2, d3[2]
    770    vdup.8      d3, d3[3]
    771    vmvn.8      d24, d3  /* get inverted alpha */
    772 .endm
    773 
    774 generate_composite_function \
    775    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
    776    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    777    8, /* number of pixels, processed in a single block */ \
    778    5, /* prefetch distance */ \
    779    pixman_composite_over_n_8888_init, \
    780    default_cleanup, \
    781    pixman_composite_over_8888_8888_process_pixblock_head, \
    782    pixman_composite_over_8888_8888_process_pixblock_tail, \
    783    pixman_composite_over_n_8888_process_pixblock_tail_head
    784 
    785 /******************************************************************************/
    786 
    787 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
    788        vrshr.u16   q14, q8, #8
    789                                    PF add, PF_X, PF_X, #8
    790                                    PF tst, PF_CTL, #0xF
    791        vrshr.u16   q15, q9, #8
    792        vrshr.u16   q12, q10, #8
    793        vrshr.u16   q13, q11, #8
    794                                    PF addne, PF_X, PF_X, #8
    795                                    PF subne, PF_CTL, PF_CTL, #1
    796        vraddhn.u16 d28, q14, q8
    797        vraddhn.u16 d29, q15, q9
    798                                    PF cmp, PF_X, ORIG_W
    799        vraddhn.u16 d30, q12, q10
    800        vraddhn.u16 d31, q13, q11
    801        vqadd.u8    q14, q0, q14
    802        vqadd.u8    q15, q1, q15
    803    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
    804    vmvn.8      d22, d3
    805                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    806        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    807                                    PF subge, PF_X, PF_X, ORIG_W
    808    vmull.u8    q8, d22, d4
    809                                    PF subsge, PF_CTL, PF_CTL, #0x10
    810    vmull.u8    q9, d22, d5
    811    vmull.u8    q10, d22, d6
    812                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    813    vmull.u8    q11, d22, d7
    814 .endm
    815 
    816 .macro pixman_composite_over_reverse_n_8888_init
    817    add         DUMMY, sp, #ARGS_STACK_OFFSET
    818    vld1.32     {d7[0]}, [DUMMY]
    819    vdup.8      d4, d7[0]
    820    vdup.8      d5, d7[1]
    821    vdup.8      d6, d7[2]
    822    vdup.8      d7, d7[3]
    823 .endm
    824 
    825 generate_composite_function \
    826    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
    827    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    828    8, /* number of pixels, processed in a single block */ \
    829    5, /* prefetch distance */ \
    830    pixman_composite_over_reverse_n_8888_init, \
    831    default_cleanup, \
    832    pixman_composite_over_8888_8888_process_pixblock_head, \
    833    pixman_composite_over_8888_8888_process_pixblock_tail, \
    834    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
    835    28, /* dst_w_basereg */ \
    836    0,  /* dst_r_basereg */ \
    837    4,  /* src_basereg   */ \
    838    24  /* mask_basereg  */
    839 
    840 /******************************************************************************/
    841 
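/*
 * OVER with an a8 mask onto an r5g6b5 destination: every source channel
 * (including alpha) is first multiplied by the mask from d24 (the "IN"
 * part), the destination is expanded from r5g6b5 to 8-bit channels, and
 * then the usual src + dst * (1 - alpha) blend is applied and packed back
 * to r5g6b5 in the tail.
 */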
    842 .macro pixman_composite_over_8888_8_0565_process_pixblock_head
    843    vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
    844    vmull.u8    q1,  d24, d9
    845    vmull.u8    q6,  d24, d10
    846    vmull.u8    q7,  d24, d11
    847        vshrn.u16   d6,  q2, #8 /* convert DST_R data to 32-bpp (part1) */
    848        vshrn.u16   d7,  q2, #3
    849        vsli.u16    q2,  q2, #5
    850    vrshr.u16   q8,  q0,  #8    /* IN for SRC pixels (part2) */
    851    vrshr.u16   q9,  q1,  #8
    852    vrshr.u16   q10, q6,  #8
    853    vrshr.u16   q11, q7,  #8
    854    vraddhn.u16 d0,  q0,  q8
    855    vraddhn.u16 d1,  q1,  q9
    856    vraddhn.u16 d2,  q6,  q10
    857    vraddhn.u16 d3,  q7,  q11
    858        vsri.u8     d6,  d6, #5 /* convert DST_R data to 32-bpp (part2) */
    859        vsri.u8     d7,  d7, #6
    860    vmvn.8      d3,  d3
    861        vshrn.u16   d30, q2, #2
    862    vmull.u8    q8,  d3, d6     /* now do alpha blending */
    863    vmull.u8    q9,  d3, d7
    864    vmull.u8    q10, d3, d30
    865 .endm
    866 
    867 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail
    868    /* 3 cycle bubble (after vmull.u8) */
    869    vrshr.u16   q13, q8,  #8
    870    vrshr.u16   q11, q9,  #8
    871    vrshr.u16   q15, q10, #8
    872    vraddhn.u16 d16, q8,  q13
    873    vraddhn.u16 d27, q9,  q11
    874    vraddhn.u16 d26, q10, q15
    875    vqadd.u8    d16, d2,  d16
    876    /* 1 cycle bubble */
    877    vqadd.u8    q9,  q0,  q13
    878    vshll.u8    q14, d16, #8    /* convert to 16bpp */
    879    vshll.u8    q8,  d19, #8
    880    vshll.u8    q9,  d18, #8
    881    vsri.u16    q14, q8,  #5
    882    /* 1 cycle bubble */
    883    vsri.u16    q14, q9,  #11
    884 .endm
    885 
    886 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
    887    vld1.16     {d4, d5}, [DST_R, :128]!
    888    vshrn.u16   d6,  q2,  #8
    889    fetch_mask_pixblock
    890    vshrn.u16   d7,  q2,  #3
    891    fetch_src_pixblock
    892    vmull.u8    q6,  d24, d10
    893        vrshr.u16   q13, q8,  #8
    894        vrshr.u16   q11, q9,  #8
    895        vrshr.u16   q15, q10, #8
    896        vraddhn.u16 d16, q8,  q13
    897        vraddhn.u16 d27, q9,  q11
    898        vraddhn.u16 d26, q10, q15
    899        vqadd.u8    d16, d2,  d16
    900    vmull.u8    q1,  d24, d9
    901        vqadd.u8    q9,  q0,  q13
    902        vshll.u8    q14, d16, #8
    903    vmull.u8    q0,  d24, d8
    904        vshll.u8    q8,  d19, #8
    905        vshll.u8    q9,  d18, #8
    906        vsri.u16    q14, q8,  #5
    907    vmull.u8    q7,  d24, d11
    908        vsri.u16    q14, q9,  #11
    909 
    910    cache_preload 8, 8
    911 
    912    vsli.u16    q2,  q2,  #5
    913    vrshr.u16   q8,  q0,  #8
    914    vrshr.u16   q9,  q1,  #8
    915    vrshr.u16   q10, q6,  #8
    916    vrshr.u16   q11, q7,  #8
    917    vraddhn.u16 d0,  q0,  q8
    918    vraddhn.u16 d1,  q1,  q9
    919    vraddhn.u16 d2,  q6,  q10
    920    vraddhn.u16 d3,  q7,  q11
    921    vsri.u8     d6,  d6,  #5
    922    vsri.u8     d7,  d7,  #6
    923    vmvn.8      d3,  d3
    924    vshrn.u16   d30, q2,  #2
    925    vst1.16     {d28, d29}, [DST_W, :128]!
    926    vmull.u8    q8,  d3,  d6
    927    vmull.u8    q9,  d3,  d7
    928    vmull.u8    q10, d3,  d30
    929 .endm
    930 
    931 generate_composite_function \
    932    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
    933    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    934    8, /* number of pixels, processed in a single block */ \
    935    5, /* prefetch distance */ \
    936    default_init_need_all_regs, \
    937    default_cleanup_need_all_regs, \
    938    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    939    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    940    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    941    28, /* dst_w_basereg */ \
    942    4,  /* dst_r_basereg */ \
    943    8,  /* src_basereg   */ \
    944    24  /* mask_basereg  */
    945 
    946 /******************************************************************************/
    947 
    948 /*
     949 * This function needs a special initialization of the solid source.
     950 * Solid source pixel data is fetched from the stack at offset
     951 * ARGS_STACK_OFFSET, split into color components and replicated into the
     952 * d8-d11 registers. Additionally, this function needs all the NEON registers,
     953 * so it has to save the d8-d15 registers, which are callee saved according
     954 * to the ABI. These registers are restored in the 'cleanup' macro. All the
     955 * other NEON registers are caller saved, so they can be clobbered freely
     956 * without introducing any problems.
    957 */
    958 .macro pixman_composite_over_n_8_0565_init
    959    add         DUMMY, sp, #ARGS_STACK_OFFSET
    960    vpush       {d8-d15}
    961    vld1.32     {d11[0]}, [DUMMY]
    962    vdup.8      d8, d11[0]
    963    vdup.8      d9, d11[1]
    964    vdup.8      d10, d11[2]
    965    vdup.8      d11, d11[3]
    966 .endm
    967 
    968 .macro pixman_composite_over_n_8_0565_cleanup
    969    vpop        {d8-d15}
    970 .endm
    971 
    972 generate_composite_function \
    973    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
    974    FLAG_DST_READWRITE, \
    975    8, /* number of pixels, processed in a single block */ \
    976    5, /* prefetch distance */ \
    977    pixman_composite_over_n_8_0565_init, \
    978    pixman_composite_over_n_8_0565_cleanup, \
    979    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    980    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    981    pixman_composite_over_8888_8_0565_process_pixblock_tail_head
    982 
    983 /******************************************************************************/
    984 
    985 .macro pixman_composite_over_8888_n_0565_init
    986    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    987    vpush       {d8-d15}
    988    vld1.32     {d24[0]}, [DUMMY]
    989    vdup.8      d24, d24[3]
    990 .endm
    991 
    992 .macro pixman_composite_over_8888_n_0565_cleanup
    993    vpop        {d8-d15}
    994 .endm
    995 
    996 generate_composite_function \
    997    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
    998    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    999    8, /* number of pixels, processed in a single block */ \
   1000    5, /* prefetch distance */ \
   1001    pixman_composite_over_8888_n_0565_init, \
   1002    pixman_composite_over_8888_n_0565_cleanup, \
   1003    pixman_composite_over_8888_8_0565_process_pixblock_head, \
   1004    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
   1005    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
   1006    28, /* dst_w_basereg */ \
   1007    4,  /* dst_r_basereg */ \
   1008    8,  /* src_basereg   */ \
   1009    24  /* mask_basereg  */
   1010 
   1011 /******************************************************************************/
   1012 
   1013 .macro pixman_composite_src_0565_0565_process_pixblock_head
   1014 .endm
   1015 
   1016 .macro pixman_composite_src_0565_0565_process_pixblock_tail
   1017 .endm
   1018 
   1019 .macro pixman_composite_src_0565_0565_process_pixblock_tail_head
   1020    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
   1021    fetch_src_pixblock
   1022    cache_preload 16, 16
   1023 .endm
   1024 
   1025 generate_composite_function \
   1026    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
   1027    FLAG_DST_WRITEONLY, \
   1028    16, /* number of pixels, processed in a single block */ \
   1029    10, /* prefetch distance */ \
   1030    default_init, \
   1031    default_cleanup, \
   1032    pixman_composite_src_0565_0565_process_pixblock_head, \
   1033    pixman_composite_src_0565_0565_process_pixblock_tail, \
   1034    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
   1035    0, /* dst_w_basereg */ \
   1036    0, /* dst_r_basereg */ \
   1037    0, /* src_basereg   */ \
   1038    0  /* mask_basereg  */
   1039 
   1040 /******************************************************************************/
   1041 
   1042 .macro pixman_composite_src_n_8_process_pixblock_head
   1043 .endm
   1044 
   1045 .macro pixman_composite_src_n_8_process_pixblock_tail
   1046 .endm
   1047 
   1048 .macro pixman_composite_src_n_8_process_pixblock_tail_head
   1049    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]!
   1050 .endm
   1051 
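/*
 * The init macro below loads the 32-bit solid value from the stack and
 * replicates its low byte across the whole register with a few vsli shifts,
 * then copies it to d1-d3 so that a full 32-pixel a8 block can be stored
 * per iteration.
 */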
   1052 .macro pixman_composite_src_n_8_init
   1053    add         DUMMY, sp, #ARGS_STACK_OFFSET
   1054    vld1.32     {d0[0]}, [DUMMY]
   1055    vsli.u64    d0, d0, #8
   1056    vsli.u64    d0, d0, #16
   1057    vsli.u64    d0, d0, #32
   1058    vorr        d1, d0, d0
   1059    vorr        q1, q0, q0
   1060 .endm
   1061 
   1062 .macro pixman_composite_src_n_8_cleanup
   1063 .endm
   1064 
   1065 generate_composite_function \
   1066    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
   1067    FLAG_DST_WRITEONLY, \
   1068    32, /* number of pixels, processed in a single block */ \
   1069    0,  /* prefetch distance */ \
   1070    pixman_composite_src_n_8_init, \
   1071    pixman_composite_src_n_8_cleanup, \
   1072    pixman_composite_src_n_8_process_pixblock_head, \
   1073    pixman_composite_src_n_8_process_pixblock_tail, \
   1074    pixman_composite_src_n_8_process_pixblock_tail_head, \
   1075    0, /* dst_w_basereg */ \
   1076    0, /* dst_r_basereg */ \
   1077    0, /* src_basereg   */ \
   1078    0  /* mask_basereg  */
   1079 
   1080 /******************************************************************************/
   1081 
   1082 .macro pixman_composite_src_n_0565_process_pixblock_head
   1083 .endm
   1084 
   1085 .macro pixman_composite_src_n_0565_process_pixblock_tail
   1086 .endm
   1087 
   1088 .macro pixman_composite_src_n_0565_process_pixblock_tail_head
   1089    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
   1090 .endm
   1091 
   1092 .macro pixman_composite_src_n_0565_init
   1093    add         DUMMY, sp, #ARGS_STACK_OFFSET
   1094    vld1.32     {d0[0]}, [DUMMY]
   1095    vsli.u64    d0, d0, #16
   1096    vsli.u64    d0, d0, #32
   1097    vorr        d1, d0, d0
   1098    vorr        q1, q0, q0
   1099 .endm
   1100 
   1101 .macro pixman_composite_src_n_0565_cleanup
   1102 .endm
   1103 
   1104 generate_composite_function \
   1105    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
   1106    FLAG_DST_WRITEONLY, \
   1107    16, /* number of pixels, processed in a single block */ \
   1108    0,  /* prefetch distance */ \
   1109    pixman_composite_src_n_0565_init, \
   1110    pixman_composite_src_n_0565_cleanup, \
   1111    pixman_composite_src_n_0565_process_pixblock_head, \
   1112    pixman_composite_src_n_0565_process_pixblock_tail, \
   1113    pixman_composite_src_n_0565_process_pixblock_tail_head, \
   1114    0, /* dst_w_basereg */ \
   1115    0, /* dst_r_basereg */ \
   1116    0, /* src_basereg   */ \
   1117    0  /* mask_basereg  */
   1118 
   1119 /******************************************************************************/
   1120 
   1121 .macro pixman_composite_src_n_8888_process_pixblock_head
   1122 .endm
   1123 
   1124 .macro pixman_composite_src_n_8888_process_pixblock_tail
   1125 .endm
   1126 
   1127 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
   1128    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
   1129 .endm
   1130 
   1131 .macro pixman_composite_src_n_8888_init
   1132    add         DUMMY, sp, #ARGS_STACK_OFFSET
   1133    vld1.32     {d0[0]}, [DUMMY]
   1134    vsli.u64    d0, d0, #32
   1135    vorr        d1, d0, d0
   1136    vorr        q1, q0, q0
   1137 .endm
   1138 
   1139 .macro pixman_composite_src_n_8888_cleanup
   1140 .endm
   1141 
   1142 generate_composite_function \
   1143    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
   1144    FLAG_DST_WRITEONLY, \
   1145    8, /* number of pixels, processed in a single block */ \
   1146    0, /* prefetch distance */ \
   1147    pixman_composite_src_n_8888_init, \
   1148    pixman_composite_src_n_8888_cleanup, \
   1149    pixman_composite_src_n_8888_process_pixblock_head, \
   1150    pixman_composite_src_n_8888_process_pixblock_tail, \
   1151    pixman_composite_src_n_8888_process_pixblock_tail_head, \
   1152    0, /* dst_w_basereg */ \
   1153    0, /* dst_r_basereg */ \
   1154    0, /* src_basereg   */ \
   1155    0  /* mask_basereg  */
   1156 
   1157 /******************************************************************************/
   1158 
   1159 .macro pixman_composite_src_8888_8888_process_pixblock_head
   1160 .endm
   1161 
   1162 .macro pixman_composite_src_8888_8888_process_pixblock_tail
   1163 .endm
   1164 
   1165 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
   1166    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
   1167    fetch_src_pixblock
   1168    cache_preload 8, 8
   1169 .endm
   1170 
   1171 generate_composite_function \
   1172    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
   1173    FLAG_DST_WRITEONLY, \
   1174    8, /* number of pixels, processed in a single block */ \
   1175    10, /* prefetch distance */ \
   1176    default_init, \
   1177    default_cleanup, \
   1178    pixman_composite_src_8888_8888_process_pixblock_head, \
   1179    pixman_composite_src_8888_8888_process_pixblock_tail, \
   1180    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
   1181    0, /* dst_w_basereg */ \
   1182    0, /* dst_r_basereg */ \
   1183    0, /* src_basereg   */ \
   1184    0  /* mask_basereg  */
   1185 
   1186 /******************************************************************************/
   1187 
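/*
 * x888 -> 8888 copy: the init macro puts 0xff000000 into every 32-bit lane
 * of q2 (vmov 0xff then shift left by 24), and the head/tail_head macros
 * simply OR it in, forcing the unused x8 byte to a fully opaque alpha.
 */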
   1188 .macro pixman_composite_src_x888_8888_process_pixblock_head
   1189    vorr     q0, q0, q2
   1190    vorr     q1, q1, q2
   1191 .endm
   1192 
   1193 .macro pixman_composite_src_x888_8888_process_pixblock_tail
   1194 .endm
   1195 
   1196 .macro pixman_composite_src_x888_8888_process_pixblock_tail_head
   1197    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
   1198    fetch_src_pixblock
   1199    vorr     q0, q0, q2
   1200    vorr     q1, q1, q2
   1201    cache_preload 8, 8
   1202 .endm
   1203 
   1204 .macro pixman_composite_src_x888_8888_init
   1205    vmov.u8  q2, #0xFF
   1206    vshl.u32 q2, q2, #24
   1207 .endm
   1208 
   1209 generate_composite_function \
   1210    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
   1211    FLAG_DST_WRITEONLY, \
   1212    8, /* number of pixels, processed in a single block */ \
   1213    10, /* prefetch distance */ \
   1214    pixman_composite_src_x888_8888_init, \
   1215    default_cleanup, \
   1216    pixman_composite_src_x888_8888_process_pixblock_head, \
   1217    pixman_composite_src_x888_8888_process_pixblock_tail, \
   1218    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
   1219    0, /* dst_w_basereg */ \
   1220    0, /* dst_r_basereg */ \
   1221    0, /* src_basereg   */ \
   1222    0  /* mask_basereg  */
   1223 
   1224 /******************************************************************************/
   1225 
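/*
 * src_n_8_8888 computes dst = solid * mask / 255 per channel. The division
 * by 255 is split across the two macros: the vrsra in the head adds
 * (t + 128) >> 8 to the 16-bit products, and the vrshrn in the tail does
 * the final rounding narrow by 8 bits.
 */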
   1226 .macro pixman_composite_src_n_8_8888_process_pixblock_head
   1227    /* expecting solid source in {d0, d1, d2, d3} */
   1228    /* mask is in d24 (d25, d26, d27 are unused) */
   1229 
   1230    /* in */
   1231    vmull.u8    q8, d24, d0
   1232    vmull.u8    q9, d24, d1
   1233    vmull.u8    q10, d24, d2
   1234    vmull.u8    q11, d24, d3
   1235    vrsra.u16   q8, q8, #8
   1236    vrsra.u16   q9, q9, #8
   1237    vrsra.u16   q10, q10, #8
   1238    vrsra.u16   q11, q11, #8
   1239 .endm
   1240 
   1241 .macro pixman_composite_src_n_8_8888_process_pixblock_tail
   1242    vrshrn.u16  d28, q8, #8
   1243    vrshrn.u16  d29, q9, #8
   1244    vrshrn.u16  d30, q10, #8
   1245    vrshrn.u16  d31, q11, #8
   1246 .endm
   1247 
   1248 .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
   1249    fetch_mask_pixblock
   1250                                    PF add, PF_X, PF_X, #8
   1251        vrshrn.u16  d28, q8, #8
   1252                                    PF tst, PF_CTL, #0x0F
   1253        vrshrn.u16  d29, q9, #8
   1254                                    PF addne, PF_X, PF_X, #8
   1255        vrshrn.u16  d30, q10, #8
   1256                                    PF subne, PF_CTL, PF_CTL, #1
   1257        vrshrn.u16  d31, q11, #8
   1258                                    PF cmp, PF_X, ORIG_W
   1259    vmull.u8    q8, d24, d0
   1260                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
   1261    vmull.u8    q9, d24, d1
   1262                                    PF subge, PF_X, PF_X, ORIG_W
   1263    vmull.u8    q10, d24, d2
   1264                                    PF subsge, PF_CTL, PF_CTL, #0x10
   1265    vmull.u8    q11, d24, d3
   1266                                    PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
   1267        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1268    vrsra.u16   q8, q8, #8
   1269    vrsra.u16   q9, q9, #8
   1270    vrsra.u16   q10, q10, #8
   1271    vrsra.u16   q11, q11, #8
   1272 .endm
   1273 
   1274 .macro pixman_composite_src_n_8_8888_init
   1275    add         DUMMY, sp, #ARGS_STACK_OFFSET
   1276    vld1.32     {d3[0]}, [DUMMY]
   1277    vdup.8      d0, d3[0]
   1278    vdup.8      d1, d3[1]
   1279    vdup.8      d2, d3[2]
   1280    vdup.8      d3, d3[3]
   1281 .endm
   1282 
   1283 .macro pixman_composite_src_n_8_8888_cleanup
   1284 .endm
   1285 
   1286 generate_composite_function \
   1287    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
   1288    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
   1289    8, /* number of pixels, processed in a single block */ \
   1290    5, /* prefetch distance */ \
   1291    pixman_composite_src_n_8_8888_init, \
   1292    pixman_composite_src_n_8_8888_cleanup, \
   1293    pixman_composite_src_n_8_8888_process_pixblock_head, \
   1294    pixman_composite_src_n_8_8888_process_pixblock_tail, \
    1295    pixman_composite_src_n_8_8888_process_pixblock_tail_head
   1296 
   1297 /******************************************************************************/
   1298 
   1299 .macro pixman_composite_src_n_8_8_process_pixblock_head
   1300    vmull.u8    q0, d24, d16
   1301    vmull.u8    q1, d25, d16
   1302    vmull.u8    q2, d26, d16
   1303    vmull.u8    q3, d27, d16
   1304    vrsra.u16   q0, q0,  #8
   1305    vrsra.u16   q1, q1,  #8
   1306    vrsra.u16   q2, q2,  #8
   1307    vrsra.u16   q3, q3,  #8
   1308 .endm
   1309 
   1310 .macro pixman_composite_src_n_8_8_process_pixblock_tail
   1311    vrshrn.u16  d28, q0, #8
   1312    vrshrn.u16  d29, q1, #8
   1313    vrshrn.u16  d30, q2, #8
   1314    vrshrn.u16  d31, q3, #8
   1315 .endm
   1316 
   1317 .macro pixman_composite_src_n_8_8_process_pixblock_tail_head
   1318    fetch_mask_pixblock
   1319                                    PF add, PF_X, PF_X, #8
   1320        vrshrn.u16  d28, q0, #8
   1321                                    PF tst, PF_CTL, #0x0F
   1322        vrshrn.u16  d29, q1, #8
   1323                                    PF addne, PF_X, PF_X, #8
   1324        vrshrn.u16  d30, q2, #8
   1325                                    PF subne, PF_CTL, PF_CTL, #1
   1326        vrshrn.u16  d31, q3, #8
   1327                                    PF cmp, PF_X, ORIG_W
   1328    vmull.u8    q0,  d24, d16
   1329                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
   1330    vmull.u8    q1,  d25, d16
   1331                                    PF subge, PF_X, PF_X, ORIG_W
   1332    vmull.u8    q2,  d26, d16
   1333                                    PF subsge, PF_CTL, PF_CTL, #0x10
   1334    vmull.u8    q3,  d27, d16
   1335                                    PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
   1336        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1337    vrsra.u16   q0, q0,  #8
   1338    vrsra.u16   q1, q1,  #8
   1339    vrsra.u16   q2, q2,  #8
   1340    vrsra.u16   q3, q3,  #8
   1341 .endm
   1342 
   1343 .macro pixman_composite_src_n_8_8_init
   1344    add         DUMMY, sp, #ARGS_STACK_OFFSET
   1345    vld1.32     {d16[0]}, [DUMMY]
   1346    vdup.8      d16, d16[3]
   1347 .endm
   1348 
   1349 .macro pixman_composite_src_n_8_8_cleanup
   1350 .endm
   1351 
   1352 generate_composite_function \
   1353    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
   1354    FLAG_DST_WRITEONLY, \
   1355    32, /* number of pixels, processed in a single block */ \
   1356    5, /* prefetch distance */ \
   1357    pixman_composite_src_n_8_8_init, \
   1358    pixman_composite_src_n_8_8_cleanup, \
   1359    pixman_composite_src_n_8_8_process_pixblock_head, \
   1360    pixman_composite_src_n_8_8_process_pixblock_tail, \
   1361    pixman_composite_src_n_8_8_process_pixblock_tail_head
   1362 
   1363 /******************************************************************************/
   1364 
   1365 .macro pixman_composite_over_n_8_8888_process_pixblock_head
   1366    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
   1367    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
   1368    /* and destination data in {d4, d5, d6, d7} */
   1369    /* mask is in d24 (d25, d26, d27 are unused) */
   1370 
   1371    /* in */
   1372    vmull.u8    q6, d24, d8
   1373    vmull.u8    q7, d24, d9
   1374    vmull.u8    q8, d24, d10
   1375    vmull.u8    q9, d24, d11
   1376    vrshr.u16   q10, q6, #8
   1377    vrshr.u16   q11, q7, #8
   1378    vrshr.u16   q12, q8, #8
   1379    vrshr.u16   q13, q9, #8
   1380    vraddhn.u16 d0, q6, q10
   1381    vraddhn.u16 d1, q7, q11
   1382    vraddhn.u16 d2, q8, q12
   1383    vraddhn.u16 d3, q9, q13
   1384    vmvn.8      d25, d3  /* get inverted alpha */
   1385    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
   1386    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
   1387    /* now do alpha blending */
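            /*
             * OVER with premultiplied alpha: result = src + (1 - src.a) * dst.
             * Each destination channel is scaled by the inverted alpha in d25;
             * the tail rounds the products and adds the masked source back in
             * with saturation (vqadd).
             */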
   1388    vmull.u8    q8, d25, d4
   1389    vmull.u8    q9, d25, d5
   1390    vmull.u8    q10, d25, d6
   1391    vmull.u8    q11, d25, d7
   1392 .endm
   1393 
   1394 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
   1395    vrshr.u16   q14, q8, #8
   1396    vrshr.u16   q15, q9, #8
   1397    vrshr.u16   q6, q10, #8
   1398    vrshr.u16   q7, q11, #8
   1399    vraddhn.u16 d28, q14, q8
   1400    vraddhn.u16 d29, q15, q9
   1401    vraddhn.u16 d30, q6, q10
   1402    vraddhn.u16 d31, q7, q11
   1403    vqadd.u8    q14, q0, q14
   1404    vqadd.u8    q15, q1, q15
   1405 .endm
   1406 
   1407 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
   1408        vrshr.u16   q14, q8, #8
   1409    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
   1410        vrshr.u16   q15, q9, #8
   1411    fetch_mask_pixblock
   1412        vrshr.u16   q6, q10, #8
   1413                                    PF add, PF_X, PF_X, #8
   1414        vrshr.u16   q7, q11, #8
   1415                                    PF tst, PF_CTL, #0x0F
   1416        vraddhn.u16 d28, q14, q8
   1417                                    PF addne, PF_X, PF_X, #8
   1418        vraddhn.u16 d29, q15, q9
   1419                                    PF subne, PF_CTL, PF_CTL, #1
   1420        vraddhn.u16 d30, q6, q10
   1421                                    PF cmp, PF_X, ORIG_W
   1422        vraddhn.u16 d31, q7, q11
   1423                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
   1424    vmull.u8    q6, d24, d8
   1425                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
   1426    vmull.u8    q7, d24, d9
   1427                                    PF subge, PF_X, PF_X, ORIG_W
   1428    vmull.u8    q8, d24, d10
   1429                                    PF subsge, PF_CTL, PF_CTL, #0x10
   1430    vmull.u8    q9, d24, d11
   1431                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
   1432        vqadd.u8    q14, q0, q14
   1433                                    PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
   1434        vqadd.u8    q15, q1, q15
   1435    vrshr.u16   q10, q6, #8
   1436    vrshr.u16   q11, q7, #8
   1437    vrshr.u16   q12, q8, #8
   1438    vrshr.u16   q13, q9, #8
   1439    vraddhn.u16 d0, q6, q10
   1440    vraddhn.u16 d1, q7, q11
   1441    vraddhn.u16 d2, q8, q12
   1442    vraddhn.u16 d3, q9, q13
   1443        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1444    vmvn.8      d25, d3
   1445    vmull.u8    q8, d25, d4
   1446    vmull.u8    q9, d25, d5
   1447    vmull.u8    q10, d25, d6
   1448    vmull.u8    q11, d25, d7
   1449 .endm
   1450 
   1451 .macro pixman_composite_over_n_8_8888_init
   1452    add         DUMMY, sp, #ARGS_STACK_OFFSET
   1453    vpush       {d8-d15}
   1454    vld1.32     {d11[0]}, [DUMMY]
   1455    vdup.8      d8, d11[0]
   1456    vdup.8      d9, d11[1]
   1457    vdup.8      d10, d11[2]
   1458    vdup.8      d11, d11[3]
   1459 .endm
   1460 
   1461 .macro pixman_composite_over_n_8_8888_cleanup
   1462    vpop        {d8-d15}
   1463 .endm
   1464 
   1465 generate_composite_function \
   1466    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
   1467    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   1468    8, /* number of pixels, processed in a single block */ \
   1469    5, /* prefetch distance */ \
   1470    pixman_composite_over_n_8_8888_init, \
   1471    pixman_composite_over_n_8_8888_cleanup, \
   1472    pixman_composite_over_n_8_8888_process_pixblock_head, \
   1473    pixman_composite_over_n_8_8888_process_pixblock_tail, \
   1474    pixman_composite_over_n_8_8888_process_pixblock_tail_head
   1475 
   1476 /******************************************************************************/
   1477 
   1478 .macro pixman_composite_over_n_8_8_process_pixblock_head
   1479    vmull.u8    q0,  d24, d8
   1480    vmull.u8    q1,  d25, d8
   1481    vmull.u8    q6,  d26, d8
   1482    vmull.u8    q7,  d27, d8
   1483    vrshr.u16   q10, q0,  #8
   1484    vrshr.u16   q11, q1,  #8
   1485    vrshr.u16   q12, q6,  #8
   1486    vrshr.u16   q13, q7,  #8
   1487    vraddhn.u16 d0,  q0,  q10
   1488    vraddhn.u16 d1,  q1,  q11
   1489    vraddhn.u16 d2,  q6,  q12
   1490    vraddhn.u16 d3,  q7,  q13
   1491    vmvn.8      q12, q0
   1492    vmvn.8      q13, q1
   1493    vmull.u8    q8,  d24, d4
   1494    vmull.u8    q9,  d25, d5
   1495    vmull.u8    q10, d26, d6
   1496    vmull.u8    q11, d27, d7
   1497 .endm
   1498 
   1499 .macro pixman_composite_over_n_8_8_process_pixblock_tail
   1500    vrshr.u16   q14, q8,  #8
   1501    vrshr.u16   q15, q9,  #8
   1502    vrshr.u16   q12, q10, #8
   1503    vrshr.u16   q13, q11, #8
   1504    vraddhn.u16 d28, q14, q8
   1505    vraddhn.u16 d29, q15, q9
   1506    vraddhn.u16 d30, q12, q10
   1507    vraddhn.u16 d31, q13, q11
   1508    vqadd.u8    q14, q0,  q14
   1509    vqadd.u8    q15, q1,  q15
   1510 .endm
   1511 
    1512 /* TODO: expand macros and do better instruction scheduling */
   1513 .macro pixman_composite_over_n_8_8_process_pixblock_tail_head
   1514    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
   1515    pixman_composite_over_n_8_8_process_pixblock_tail
   1516    fetch_mask_pixblock
   1517    cache_preload 32, 32
   1518    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1519    pixman_composite_over_n_8_8_process_pixblock_head
   1520 .endm
   1521 
   1522 .macro pixman_composite_over_n_8_8_init
   1523    add         DUMMY, sp, #ARGS_STACK_OFFSET
   1524    vpush       {d8-d15}
   1525    vld1.32     {d8[0]}, [DUMMY]
   1526    vdup.8      d8, d8[3]
   1527 .endm
   1528 
   1529 .macro pixman_composite_over_n_8_8_cleanup
   1530    vpop        {d8-d15}
   1531 .endm
   1532 
   1533 generate_composite_function \
   1534    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
   1535    FLAG_DST_READWRITE, \
   1536    32, /* number of pixels, processed in a single block */ \
   1537    5, /* prefetch distance */ \
   1538    pixman_composite_over_n_8_8_init, \
   1539    pixman_composite_over_n_8_8_cleanup, \
   1540    pixman_composite_over_n_8_8_process_pixblock_head, \
   1541    pixman_composite_over_n_8_8_process_pixblock_tail, \
   1542    pixman_composite_over_n_8_8_process_pixblock_tail_head
   1543 
   1544 /******************************************************************************/
   1545 
   1546 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
   1547    /*
   1548     * 'combine_mask_ca' replacement
   1549     *
   1550     * input:  solid src (n) in {d8,  d9,  d10, d11}
   1551     *         dest in          {d4,  d5,  d6,  d7 }
   1552     *         mask in          {d24, d25, d26, d27}
   1553     * output: updated src in   {d0,  d1,  d2,  d3 }
   1554     *         updated mask in  {d24, d25, d26, d3 }
   1555     */
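            /*
             * Component alpha: each mask channel scales the matching source
             * channel (updated src = s * m / 255), while the mask channels are
             * themselves scaled by the source alpha (updated mask = s.a * m / 255),
             * giving the per-channel "alpha" that the OVER step below inverts.
             */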
   1556    vmull.u8    q0,  d24, d8
   1557    vmull.u8    q1,  d25, d9
   1558    vmull.u8    q6,  d26, d10
   1559    vmull.u8    q7,  d27, d11
   1560    vmull.u8    q9,  d11, d25
   1561    vmull.u8    q12, d11, d24
   1562    vmull.u8    q13, d11, d26
   1563    vrshr.u16   q8,  q0,  #8
   1564    vrshr.u16   q10, q1,  #8
   1565    vrshr.u16   q11, q6,  #8
   1566    vraddhn.u16 d0,  q0,  q8
   1567    vraddhn.u16 d1,  q1,  q10
   1568    vraddhn.u16 d2,  q6,  q11
   1569    vrshr.u16   q11, q12, #8
   1570    vrshr.u16   q8,  q9,  #8
   1571    vrshr.u16   q6,  q13, #8
   1572    vrshr.u16   q10, q7,  #8
   1573    vraddhn.u16 d24, q12, q11
   1574    vraddhn.u16 d25, q9,  q8
   1575    vraddhn.u16 d26, q13, q6
   1576    vraddhn.u16 d3,  q7,  q10
   1577    /*
   1578     * 'combine_over_ca' replacement
   1579     *
   1580     * output: updated dest in {d28, d29, d30, d31}
   1581     */
   1582    vmvn.8      q12, q12
   1583    vmvn.8      d26, d26
   1584    vmull.u8    q8,  d24, d4
   1585    vmull.u8    q9,  d25, d5
   1586    vmvn.8      d27, d3
   1587    vmull.u8    q10, d26, d6
   1588    vmull.u8    q11, d27, d7
   1589 .endm
   1590 
   1591 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
   1592    /* ... continue 'combine_over_ca' replacement */
   1593    vrshr.u16   q14, q8,  #8
   1594    vrshr.u16   q15, q9,  #8
   1595    vrshr.u16   q6,  q10, #8
   1596    vrshr.u16   q7,  q11, #8
   1597    vraddhn.u16 d28, q14, q8
   1598    vraddhn.u16 d29, q15, q9
   1599    vraddhn.u16 d30, q6,  q10
   1600    vraddhn.u16 d31, q7,  q11
   1601    vqadd.u8    q14, q0,  q14
   1602    vqadd.u8    q15, q1,  q15
   1603 .endm
   1604 
   1605 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
   1606        vrshr.u16   q14, q8, #8
   1607        vrshr.u16   q15, q9, #8
   1608    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
   1609        vrshr.u16   q6, q10, #8
   1610        vrshr.u16   q7, q11, #8
   1611        vraddhn.u16 d28, q14, q8
   1612        vraddhn.u16 d29, q15, q9
   1613        vraddhn.u16 d30, q6, q10
   1614        vraddhn.u16 d31, q7, q11
   1615    fetch_mask_pixblock
   1616        vqadd.u8    q14, q0, q14
   1617        vqadd.u8    q15, q1, q15
   1618    cache_preload 8, 8
   1619    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
   1620    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1621 .endm
   1622 
   1623 .macro pixman_composite_over_n_8888_8888_ca_init
   1624    add         DUMMY, sp, #ARGS_STACK_OFFSET
   1625    vpush       {d8-d15}
   1626    vld1.32     {d11[0]}, [DUMMY]
   1627    vdup.8      d8, d11[0]
   1628    vdup.8      d9, d11[1]
   1629    vdup.8      d10, d11[2]
   1630    vdup.8      d11, d11[3]
   1631 .endm
   1632 
   1633 .macro pixman_composite_over_n_8888_8888_ca_cleanup
   1634    vpop        {d8-d15}
   1635 .endm
   1636 
   1637 generate_composite_function \
   1638    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
   1639    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   1640    8, /* number of pixels, processed in a single block */ \
   1641    5, /* prefetch distance */ \
   1642    pixman_composite_over_n_8888_8888_ca_init, \
   1643    pixman_composite_over_n_8888_8888_ca_cleanup, \
   1644    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
   1645    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
   1646    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
   1647 
   1648 /******************************************************************************/
   1649 
   1650 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
   1651    /*
   1652     * 'combine_mask_ca' replacement
   1653     *
   1654     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
   1655     *         mask in          {d24, d25, d26}       [B, G, R]
   1656     * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
   1657     *         updated mask in  {d24, d25, d26}       [B, G, R]
   1658     */
   1659    vmull.u8    q0,  d24, d8
   1660    vmull.u8    q1,  d25, d9
   1661    vmull.u8    q6,  d26, d10
   1662    vmull.u8    q9,  d11, d25
   1663    vmull.u8    q12, d11, d24
   1664    vmull.u8    q13, d11, d26
   1665    vrshr.u16   q8,  q0,  #8
   1666    vrshr.u16   q10, q1,  #8
   1667    vrshr.u16   q11, q6,  #8
   1668    vraddhn.u16 d0,  q0,  q8
   1669    vraddhn.u16 d1,  q1,  q10
   1670    vraddhn.u16 d2,  q6,  q11
   1671    vrshr.u16   q11, q12, #8
   1672    vrshr.u16   q8,  q9,  #8
   1673    vrshr.u16   q6,  q13, #8
   1674    vraddhn.u16 d24, q12, q11
   1675    vraddhn.u16 d25, q9,  q8
   1676    /*
   1677     * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
   1678     * and put data into d16 - blue, d17 - green, d18 - red
   1679     */
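            /*
             * Each 5/6-bit field is widened to 8 bits by moving it to the top
             * of a byte and replicating its high bits into the freed low bits,
             * e.g. green g5..g0 becomes g5 g4 g3 g2 g1 g0 g5 g4 (vshrn/vsli
             * extract the fields, vsri does the replication).
             */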
   1680       vshrn.u16   d17, q2,  #3
   1681       vshrn.u16   d18, q2,  #8
   1682    vraddhn.u16 d26, q13, q6
   1683       vsli.u16    q2,  q2,  #5
   1684       vsri.u8     d18, d18, #5
   1685       vsri.u8     d17, d17, #6
   1686    /*
   1687     * 'combine_over_ca' replacement
   1688     *
   1689     * output: updated dest in d16 - blue, d17 - green, d18 - red
   1690     */
   1691    vmvn.8      q12, q12
   1692       vshrn.u16   d16, q2,  #2
   1693    vmvn.8      d26, d26
   1694    vmull.u8    q6,  d16, d24
   1695    vmull.u8    q7,  d17, d25
   1696    vmull.u8    q11, d18, d26
   1697 .endm
   1698 
   1699 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
   1700    /* ... continue 'combine_over_ca' replacement */
   1701    vrshr.u16   q10, q6,  #8
   1702    vrshr.u16   q14, q7,  #8
   1703    vrshr.u16   q15, q11, #8
   1704    vraddhn.u16 d16, q10, q6
   1705    vraddhn.u16 d17, q14, q7
   1706    vraddhn.u16 d18, q15, q11
   1707    vqadd.u8    q8,  q0,  q8
   1708    vqadd.u8    d18, d2,  d18
   1709    /*
   1710     * convert the results in d16, d17, d18 to r5g6b5 and store
   1711     * them into {d28, d29}
   1712     */
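            /*
             * Packing mirrors the unpacking above: each channel is widened into
             * the top byte of a 16-bit lane (vshll #8), then green and blue are
             * shifted down and inserted below red with vsri #5 / #11, leaving
             * r5g6b5 pixels in q14.
             */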
   1713    vshll.u8    q14, d18, #8
   1714    vshll.u8    q10, d17, #8
   1715    vshll.u8    q15, d16, #8
   1716    vsri.u16    q14, q10, #5
   1717    vsri.u16    q14, q15, #11
   1718 .endm
   1719 
   1720 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
   1721    fetch_mask_pixblock
   1722        vrshr.u16   q10, q6, #8
   1723        vrshr.u16   q14, q7, #8
   1724    vld1.16     {d4, d5}, [DST_R, :128]!
   1725        vrshr.u16   q15, q11, #8
   1726        vraddhn.u16 d16, q10, q6
   1727        vraddhn.u16 d17, q14, q7
   1728        vraddhn.u16 d22, q15, q11
   1729            /* process_pixblock_head */
   1730            /*
   1731             * 'combine_mask_ca' replacement
   1732             *
   1733             * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
   1734             *         mask in          {d24, d25, d26}       [B, G, R]
   1735             * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
   1736             *         updated mask in  {d24, d25, d26}       [B, G, R]
   1737             */
   1738            vmull.u8    q6,  d26, d10
   1739        vqadd.u8    q8,  q0, q8
   1740            vmull.u8    q0,  d24, d8
   1741        vqadd.u8    d22, d2, d22
   1742            vmull.u8    q1,  d25, d9
   1743        /*
   1744         * convert the result in d16, d17, d22 to r5g6b5 and store
   1745         * it into {d28, d29}
   1746         */
   1747        vshll.u8    q14, d22, #8
   1748        vshll.u8    q10, d17, #8
   1749        vshll.u8    q15, d16, #8
   1750            vmull.u8    q9,  d11, d25
   1751        vsri.u16    q14, q10, #5
   1752            vmull.u8    q12, d11, d24
   1753            vmull.u8    q13, d11, d26
   1754        vsri.u16    q14, q15, #11
   1755    cache_preload 8, 8
   1756            vrshr.u16   q8,  q0,  #8
   1757            vrshr.u16   q10, q1,  #8
   1758            vrshr.u16   q11, q6,  #8
   1759            vraddhn.u16 d0,  q0,  q8
   1760            vraddhn.u16 d1,  q1,  q10
   1761            vraddhn.u16 d2,  q6,  q11
   1762            vrshr.u16   q11, q12, #8
   1763            vrshr.u16   q8,  q9,  #8
   1764            vrshr.u16   q6,  q13, #8
   1765            vraddhn.u16 d24, q12, q11
   1766            vraddhn.u16 d25, q9,  q8
   1767                /*
   1768                 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
    1769                 * 8-bit format and put data into d16 - blue, d17 - green,
    1770                 * d18 - red
   1771                 */
   1772                vshrn.u16   d17, q2,  #3
   1773                vshrn.u16   d18, q2,  #8
   1774            vraddhn.u16 d26, q13, q6
   1775                vsli.u16    q2,  q2,  #5
   1776                vsri.u8     d17, d17, #6
   1777                vsri.u8     d18, d18, #5
   1778            /*
   1779             * 'combine_over_ca' replacement
   1780             *
   1781             * output: updated dest in d16 - blue, d17 - green, d18 - red
   1782             */
   1783            vmvn.8      q12, q12
   1784                vshrn.u16   d16, q2,  #2
   1785            vmvn.8      d26, d26
   1786            vmull.u8    q7,  d17, d25
   1787            vmull.u8    q6,  d16, d24
   1788            vmull.u8    q11, d18, d26
   1789    vst1.16     {d28, d29}, [DST_W, :128]!
   1790 .endm
   1791 
   1792 .macro pixman_composite_over_n_8888_0565_ca_init
   1793    add         DUMMY, sp, #ARGS_STACK_OFFSET
   1794    vpush       {d8-d15}
   1795    vld1.32     {d11[0]}, [DUMMY]
   1796    vdup.8      d8, d11[0]
   1797    vdup.8      d9, d11[1]
   1798    vdup.8      d10, d11[2]
   1799    vdup.8      d11, d11[3]
   1800 .endm
   1801 
   1802 .macro pixman_composite_over_n_8888_0565_ca_cleanup
   1803    vpop        {d8-d15}
   1804 .endm
   1805 
   1806 generate_composite_function \
   1807    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
   1808    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   1809    8, /* number of pixels, processed in a single block */ \
   1810    5, /* prefetch distance */ \
   1811    pixman_composite_over_n_8888_0565_ca_init, \
   1812    pixman_composite_over_n_8888_0565_ca_cleanup, \
   1813    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
   1814    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
   1815    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
   1816 
   1817 /******************************************************************************/
   1818 
   1819 .macro pixman_composite_in_n_8_process_pixblock_head
   1820    /* expecting source data in {d0, d1, d2, d3} */
   1821    /* and destination data in {d4, d5, d6, d7} */
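            /*
             * IN with a solid source onto an a8 destination: only the source
             * alpha matters, so dest = dest * src.a / 255 (d3 holds the
             * replicated alpha), with the usual rounding done in the tail.
             */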
   1822    vmull.u8    q8,  d4,  d3
   1823    vmull.u8    q9,  d5,  d3
   1824    vmull.u8    q10, d6,  d3
   1825    vmull.u8    q11, d7,  d3
   1826 .endm
   1827 
   1828 .macro pixman_composite_in_n_8_process_pixblock_tail
   1829    vrshr.u16   q14, q8,  #8
   1830    vrshr.u16   q15, q9,  #8
   1831    vrshr.u16   q12, q10, #8
   1832    vrshr.u16   q13, q11, #8
   1833    vraddhn.u16 d28, q8,  q14
   1834    vraddhn.u16 d29, q9,  q15
   1835    vraddhn.u16 d30, q10, q12
   1836    vraddhn.u16 d31, q11, q13
   1837 .endm
   1838 
   1839 .macro pixman_composite_in_n_8_process_pixblock_tail_head
   1840    pixman_composite_in_n_8_process_pixblock_tail
   1841    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
   1842    cache_preload 32, 32
   1843    pixman_composite_in_n_8_process_pixblock_head
   1844    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1845 .endm
   1846 
   1847 .macro pixman_composite_in_n_8_init
   1848    add         DUMMY, sp, #ARGS_STACK_OFFSET
   1849    vld1.32     {d3[0]}, [DUMMY]
   1850    vdup.8      d3, d3[3]
   1851 .endm
   1852 
   1853 .macro pixman_composite_in_n_8_cleanup
   1854 .endm
   1855 
   1856 generate_composite_function \
   1857    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
   1858    FLAG_DST_READWRITE, \
   1859    32, /* number of pixels, processed in a single block */ \
   1860    5, /* prefetch distance */ \
   1861    pixman_composite_in_n_8_init, \
   1862    pixman_composite_in_n_8_cleanup, \
   1863    pixman_composite_in_n_8_process_pixblock_head, \
   1864    pixman_composite_in_n_8_process_pixblock_tail, \
   1865    pixman_composite_in_n_8_process_pixblock_tail_head, \
   1866    28, /* dst_w_basereg */ \
   1867    4,  /* dst_r_basereg */ \
   1868    0,  /* src_basereg   */ \
   1869    24  /* mask_basereg  */
   1870 
   1871 .macro pixman_composite_add_n_8_8_process_pixblock_head
   1872    /* expecting source data in {d8, d9, d10, d11} */
   1873    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
   1874    /* and destination data in {d4, d5, d6, d7} */
   1875    /* mask is in d24, d25, d26, d27 */
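            /*
             * ADD with a solid source and an a8 mask onto an a8 destination:
             * dest = saturate(dest + mask * src.a / 255); the vqadd at the end
             * performs the saturating add.
             */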
   1876    vmull.u8    q0, d24, d11
   1877    vmull.u8    q1, d25, d11
   1878    vmull.u8    q6, d26, d11
   1879    vmull.u8    q7, d27, d11
   1880    vrshr.u16   q10, q0, #8
   1881    vrshr.u16   q11, q1, #8
   1882    vrshr.u16   q12, q6, #8
   1883    vrshr.u16   q13, q7, #8
   1884    vraddhn.u16 d0, q0, q10
   1885    vraddhn.u16 d1, q1, q11
   1886    vraddhn.u16 d2, q6, q12
   1887    vraddhn.u16 d3, q7, q13
   1888    vqadd.u8    q14, q0, q2
   1889    vqadd.u8    q15, q1, q3
   1890 .endm
   1891 
   1892 .macro pixman_composite_add_n_8_8_process_pixblock_tail
   1893 .endm
   1894 
    1895 /* TODO: expand macros and do better instruction scheduling */
   1896 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
   1897    pixman_composite_add_n_8_8_process_pixblock_tail
   1898    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1899    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
   1900    fetch_mask_pixblock
   1901    cache_preload 32, 32
   1902    pixman_composite_add_n_8_8_process_pixblock_head
   1903 .endm
   1904 
   1905 .macro pixman_composite_add_n_8_8_init
   1906    add         DUMMY, sp, #ARGS_STACK_OFFSET
   1907    vpush       {d8-d15}
   1908    vld1.32     {d11[0]}, [DUMMY]
   1909    vdup.8      d11, d11[3]
   1910 .endm
   1911 
   1912 .macro pixman_composite_add_n_8_8_cleanup
   1913    vpop        {d8-d15}
   1914 .endm
   1915 
   1916 generate_composite_function \
   1917    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
   1918    FLAG_DST_READWRITE, \
   1919    32, /* number of pixels, processed in a single block */ \
   1920    5, /* prefetch distance */ \
   1921    pixman_composite_add_n_8_8_init, \
   1922    pixman_composite_add_n_8_8_cleanup, \
   1923    pixman_composite_add_n_8_8_process_pixblock_head, \
   1924    pixman_composite_add_n_8_8_process_pixblock_tail, \
   1925    pixman_composite_add_n_8_8_process_pixblock_tail_head
   1926 
   1927 /******************************************************************************/
   1928 
   1929 .macro pixman_composite_add_8_8_8_process_pixblock_head
   1930    /* expecting source data in {d0, d1, d2, d3} */
   1931    /* destination data in {d4, d5, d6, d7} */
   1932    /* mask in {d24, d25, d26, d27} */
   1933    vmull.u8    q8, d24, d0
   1934    vmull.u8    q9, d25, d1
   1935    vmull.u8    q10, d26, d2
   1936    vmull.u8    q11, d27, d3
   1937    vrshr.u16   q0, q8, #8
   1938    vrshr.u16   q1, q9, #8
   1939    vrshr.u16   q12, q10, #8
   1940    vrshr.u16   q13, q11, #8
   1941    vraddhn.u16 d0, q0, q8
   1942    vraddhn.u16 d1, q1, q9
   1943    vraddhn.u16 d2, q12, q10
   1944    vraddhn.u16 d3, q13, q11
   1945    vqadd.u8    q14, q0, q2
   1946    vqadd.u8    q15, q1, q3
   1947 .endm
   1948 
   1949 .macro pixman_composite_add_8_8_8_process_pixblock_tail
   1950 .endm
   1951 
    1952 /* TODO: expand macros and do better instruction scheduling */
   1953 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
   1954    pixman_composite_add_8_8_8_process_pixblock_tail
   1955    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1956    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
   1957    fetch_mask_pixblock
   1958    fetch_src_pixblock
   1959    cache_preload 32, 32
   1960    pixman_composite_add_8_8_8_process_pixblock_head
   1961 .endm
   1962 
   1963 .macro pixman_composite_add_8_8_8_init
   1964 .endm
   1965 
   1966 .macro pixman_composite_add_8_8_8_cleanup
   1967 .endm
   1968 
   1969 generate_composite_function \
   1970    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
   1971    FLAG_DST_READWRITE, \
   1972    32, /* number of pixels, processed in a single block */ \
   1973    5, /* prefetch distance */ \
   1974    pixman_composite_add_8_8_8_init, \
   1975    pixman_composite_add_8_8_8_cleanup, \
   1976    pixman_composite_add_8_8_8_process_pixblock_head, \
   1977    pixman_composite_add_8_8_8_process_pixblock_tail, \
   1978    pixman_composite_add_8_8_8_process_pixblock_tail_head
   1979 
   1980 /******************************************************************************/
   1981 
   1982 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
   1983    /* expecting source data in {d0, d1, d2, d3} */
   1984    /* destination data in {d4, d5, d6, d7} */
   1985    /* mask in {d24, d25, d26, d27} */
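            /*
             * Only the mask alpha in d27 is used: every source channel is
             * scaled by it (rounded division by 255 via vrsra/vrshrn) and the
             * result is added to the destination with saturation in the tail.
             */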
   1986    vmull.u8    q8,  d27, d0
   1987    vmull.u8    q9,  d27, d1
   1988    vmull.u8    q10, d27, d2
   1989    vmull.u8    q11, d27, d3
   1990    /* 1 cycle bubble */
   1991    vrsra.u16   q8,  q8,  #8
   1992    vrsra.u16   q9,  q9,  #8
   1993    vrsra.u16   q10, q10, #8
   1994    vrsra.u16   q11, q11, #8
   1995 .endm
   1996 
   1997 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
   1998    /* 2 cycle bubble */
   1999    vrshrn.u16  d28, q8,  #8
   2000    vrshrn.u16  d29, q9,  #8
   2001    vrshrn.u16  d30, q10, #8
   2002    vrshrn.u16  d31, q11, #8
   2003    vqadd.u8    q14, q2,  q14
   2004    /* 1 cycle bubble */
   2005    vqadd.u8    q15, q3,  q15
   2006 .endm
   2007 
   2008 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
   2009    fetch_src_pixblock
   2010        vrshrn.u16  d28, q8,  #8
   2011    fetch_mask_pixblock
   2012        vrshrn.u16  d29, q9,  #8
   2013    vmull.u8    q8,  d27, d0
   2014        vrshrn.u16  d30, q10, #8
   2015    vmull.u8    q9,  d27, d1
   2016        vrshrn.u16  d31, q11, #8
   2017    vmull.u8    q10, d27, d2
   2018        vqadd.u8    q14, q2,  q14
   2019    vmull.u8    q11, d27, d3
   2020        vqadd.u8    q15, q3,  q15
   2021    vrsra.u16   q8,  q8,  #8
   2022    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
   2023    vrsra.u16   q9,  q9,  #8
   2024        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
   2025    vrsra.u16   q10, q10, #8
   2026 
   2027    cache_preload 8, 8
   2028 
   2029    vrsra.u16   q11, q11, #8
   2030 .endm
   2031 
   2032 generate_composite_function \
   2033    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
   2034    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   2035    8, /* number of pixels, processed in a single block */ \
   2036    10, /* prefetch distance */ \
   2037    default_init, \
   2038    default_cleanup, \
   2039    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
   2040    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
   2041    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
   2042 
   2043 generate_composite_function_single_scanline \
   2044    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
   2045    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   2046    8, /* number of pixels, processed in a single block */ \
   2047    default_init, \
   2048    default_cleanup, \
   2049    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
   2050    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
   2051    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
   2052 
   2053 /******************************************************************************/
   2054 
   2055 generate_composite_function \
   2056    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
   2057    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   2058    8, /* number of pixels, processed in a single block */ \
   2059    5, /* prefetch distance */ \
   2060    default_init, \
   2061    default_cleanup, \
   2062    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
   2063    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
   2064    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
   2065    28, /* dst_w_basereg */ \
   2066    4,  /* dst_r_basereg */ \
   2067    0,  /* src_basereg   */ \
   2068    27  /* mask_basereg  */
   2069 
   2070 /******************************************************************************/
   2071 
   2072 .macro pixman_composite_add_n_8_8888_init
   2073    add         DUMMY, sp, #ARGS_STACK_OFFSET
   2074    vld1.32     {d3[0]}, [DUMMY]
   2075    vdup.8      d0, d3[0]
   2076    vdup.8      d1, d3[1]
   2077    vdup.8      d2, d3[2]
   2078    vdup.8      d3, d3[3]
   2079 .endm
   2080 
   2081 .macro pixman_composite_add_n_8_8888_cleanup
   2082 .endm
   2083 
   2084 generate_composite_function \
   2085    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
   2086    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   2087    8, /* number of pixels, processed in a single block */ \
   2088    5, /* prefetch distance */ \
   2089    pixman_composite_add_n_8_8888_init, \
   2090    pixman_composite_add_n_8_8888_cleanup, \
   2091    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
   2092    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
   2093    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
   2094    28, /* dst_w_basereg */ \
   2095    4,  /* dst_r_basereg */ \
   2096    0,  /* src_basereg   */ \
   2097    27  /* mask_basereg  */
   2098 
   2099 /******************************************************************************/
   2100 
   2101 .macro pixman_composite_add_8888_n_8888_init
   2102    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
   2103    vld1.32     {d27[0]}, [DUMMY]
   2104    vdup.8      d27, d27[3]
   2105 .endm
   2106 
   2107 .macro pixman_composite_add_8888_n_8888_cleanup
   2108 .endm
   2109 
   2110 generate_composite_function \
   2111    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
   2112    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   2113    8, /* number of pixels, processed in a single block */ \
   2114    5, /* prefetch distance */ \
   2115    pixman_composite_add_8888_n_8888_init, \
   2116    pixman_composite_add_8888_n_8888_cleanup, \
   2117    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
   2118    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
   2119    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
   2120    28, /* dst_w_basereg */ \
   2121    4,  /* dst_r_basereg */ \
   2122    0,  /* src_basereg   */ \
   2123    27  /* mask_basereg  */
   2124 
   2125 /******************************************************************************/
   2126 
   2127 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
   2128    /* expecting source data in {d0, d1, d2, d3} */
   2129    /* destination data in {d4, d5, d6, d7} */
   2130    /* solid mask is in d15 */
   2131 
   2132    /* 'in' */
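            /*
             * OUT_REVERSE: the source is scaled by the solid mask alpha in d15,
             * and the destination is then multiplied by the inverse of the
             * resulting source alpha, i.e. dest = dest * (255 - s.a * m / 255) / 255.
             * The OVER variants below reuse this head and just add the source
             * back in their tail.
             */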
   2133    vmull.u8    q8, d15, d3
   2134    vmull.u8    q6, d15, d2
   2135    vmull.u8    q5, d15, d1
   2136    vmull.u8    q4, d15, d0
   2137    vrshr.u16   q13, q8, #8
   2138    vrshr.u16   q12, q6, #8
   2139    vrshr.u16   q11, q5, #8
   2140    vrshr.u16   q10, q4, #8
   2141    vraddhn.u16 d3, q8, q13
   2142    vraddhn.u16 d2, q6, q12
   2143    vraddhn.u16 d1, q5, q11
   2144    vraddhn.u16 d0, q4, q10
   2145    vmvn.8      d24, d3  /* get inverted alpha */
   2146    /* now do alpha blending */
   2147    vmull.u8    q8, d24, d4
   2148    vmull.u8    q9, d24, d5
   2149    vmull.u8    q10, d24, d6
   2150    vmull.u8    q11, d24, d7
   2151 .endm
   2152 
   2153 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
   2154    vrshr.u16   q14, q8, #8
   2155    vrshr.u16   q15, q9, #8
   2156    vrshr.u16   q12, q10, #8
   2157    vrshr.u16   q13, q11, #8
   2158    vraddhn.u16 d28, q14, q8
   2159    vraddhn.u16 d29, q15, q9
   2160    vraddhn.u16 d30, q12, q10
   2161    vraddhn.u16 d31, q13, q11
   2162 .endm
   2163 
    2164 /* TODO: expand macros and do better instruction scheduling */
   2165 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
   2166    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
   2167    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
   2168    fetch_src_pixblock
   2169    cache_preload 8, 8
   2170    fetch_mask_pixblock
   2171    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
   2172    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
   2173 .endm
   2174 
   2175 generate_composite_function_single_scanline \
   2176    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
   2177    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   2178    8, /* number of pixels, processed in a single block */ \
   2179    default_init_need_all_regs, \
   2180    default_cleanup_need_all_regs, \
   2181    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
   2182    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
    2183    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
   2184    28, /* dst_w_basereg */ \
   2185    4,  /* dst_r_basereg */ \
   2186    0,  /* src_basereg   */ \
   2187    12  /* mask_basereg  */
   2188 
   2189 /******************************************************************************/
   2190 
   2191 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
   2192    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
   2193 .endm
   2194 
   2195 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
   2196    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
   2197    vqadd.u8    q14, q0, q14
   2198    vqadd.u8    q15, q1, q15
   2199 .endm
   2200 
    2201 /* TODO: expand macros and do better instruction scheduling */
   2202 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
   2203    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
   2204    pixman_composite_over_8888_n_8888_process_pixblock_tail
   2205    fetch_src_pixblock
   2206    cache_preload 8, 8
   2207    pixman_composite_over_8888_n_8888_process_pixblock_head
   2208    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
   2209 .endm
   2210 
   2211 .macro pixman_composite_over_8888_n_8888_init
   2212    add         DUMMY, sp, #48
   2213    vpush       {d8-d15}
   2214    vld1.32     {d15[0]}, [DUMMY]
   2215    vdup.8      d15, d15[3]
   2216 .endm
   2217 
   2218 .macro pixman_composite_over_8888_n_8888_cleanup
   2219    vpop        {d8-d15}
   2220 .endm
   2221 
   2222 generate_composite_function \
   2223    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
   2224    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   2225    8, /* number of pixels, processed in a single block */ \
   2226    5, /* prefetch distance */ \
   2227    pixman_composite_over_8888_n_8888_init, \
   2228    pixman_composite_over_8888_n_8888_cleanup, \
   2229    pixman_composite_over_8888_n_8888_process_pixblock_head, \
   2230    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
   2231    pixman_composite_over_8888_n_8888_process_pixblock_tail_head
   2232 
   2233 /******************************************************************************/
   2234 
    2235 /* TODO: expand macros and do better instruction scheduling */
   2236 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
   2237    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
   2238    pixman_composite_over_8888_n_8888_process_pixblock_tail
   2239    fetch_src_pixblock
   2240    cache_preload 8, 8
   2241    fetch_mask_pixblock
   2242    pixman_composite_over_8888_n_8888_process_pixblock_head
   2243    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
   2244 .endm
   2245 
   2246 generate_composite_function \
   2247    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
   2248    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   2249    8, /* number of pixels, processed in a single block */ \
   2250    5, /* prefetch distance */ \
   2251    default_init_need_all_regs, \
   2252    default_cleanup_need_all_regs, \
   2253    pixman_composite_over_8888_n_8888_process_pixblock_head, \
   2254    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    2255    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
   2256    28, /* dst_w_basereg */ \
   2257    4,  /* dst_r_basereg */ \
   2258    0,  /* src_basereg   */ \
   2259    12  /* mask_basereg  */
   2260 
   2261 generate_composite_function_single_scanline \
   2262    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
   2263    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   2264    8, /* number of pixels, processed in a single block */ \
   2265    default_init_need_all_regs, \
   2266    default_cleanup_need_all_regs, \
   2267    pixman_composite_over_8888_n_8888_process_pixblock_head, \
   2268    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    2269    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
   2270    28, /* dst_w_basereg */ \
   2271    4,  /* dst_r_basereg */ \
   2272    0,  /* src_basereg   */ \
   2273    12  /* mask_basereg  */
   2274 
   2275 /******************************************************************************/
   2276 
    2277 /* TODO: expand macros and do better instruction scheduling */
   2278 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
   2279    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
   2280    pixman_composite_over_8888_n_8888_process_pixblock_tail
   2281    fetch_src_pixblock
   2282    cache_preload 8, 8
   2283    fetch_mask_pixblock
   2284    pixman_composite_over_8888_n_8888_process_pixblock_head
   2285    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
   2286 .endm
   2287 
   2288 generate_composite_function \
   2289    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
   2290    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   2291    8, /* number of pixels, processed in a single block */ \
   2292    5, /* prefetch distance */ \
   2293    default_init_need_all_regs, \
   2294    default_cleanup_need_all_regs, \
   2295    pixman_composite_over_8888_n_8888_process_pixblock_head, \
   2296    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    2297    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
   2298    28, /* dst_w_basereg */ \
   2299    4,  /* dst_r_basereg */ \
   2300    0,  /* src_basereg   */ \
   2301    15  /* mask_basereg  */
   2302 
   2303 /******************************************************************************/
   2304 
   2305 .macro pixman_composite_src_0888_0888_process_pixblock_head
   2306 .endm
   2307 
   2308 .macro pixman_composite_src_0888_0888_process_pixblock_tail
   2309 .endm
   2310 
   2311 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
   2312    vst3.8 {d0, d1, d2}, [DST_W]!
   2313    fetch_src_pixblock
   2314    cache_preload 8, 8
   2315 .endm
   2316 
   2317 generate_composite_function \
   2318    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
   2319    FLAG_DST_WRITEONLY, \
   2320    8, /* number of pixels, processed in a single block */ \
   2321    10, /* prefetch distance */ \
   2322    default_init, \
   2323    default_cleanup, \
   2324    pixman_composite_src_0888_0888_process_pixblock_head, \
   2325    pixman_composite_src_0888_0888_process_pixblock_tail, \
   2326    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
   2327    0, /* dst_w_basereg */ \
   2328    0, /* dst_r_basereg */ \
   2329    0, /* src_basereg   */ \
   2330    0  /* mask_basereg  */
   2331 
   2332 /******************************************************************************/
   2333 
   2334 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head
   2335    vswp   d0, d2
   2336 .endm
   2337 
   2338 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
   2339 .endm
   2340 
   2341 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
   2342    vst4.8 {d0, d1, d2, d3}, [DST_W]!
   2343    fetch_src_pixblock
   2344    vswp   d0, d2
   2345    cache_preload 8, 8
   2346 .endm
   2347 
   2348 .macro pixman_composite_src_0888_8888_rev_init
   2349    veor   d3, d3, d3
   2350 .endm
   2351 
   2352 generate_composite_function \
   2353    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
   2354    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
   2355    8, /* number of pixels, processed in a single block */ \
   2356    10, /* prefetch distance */ \
   2357    pixman_composite_src_0888_8888_rev_init, \
   2358    default_cleanup, \
   2359    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
   2360    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
   2361    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
   2362    0, /* dst_w_basereg */ \
   2363    0, /* dst_r_basereg */ \
   2364    0, /* src_basereg   */ \
   2365    0  /* mask_basereg  */
   2366 
   2367 /******************************************************************************/
   2368 
   2369 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head
   2370    vshll.u8    q8, d1, #8
   2371    vshll.u8    q9, d2, #8
   2372 .endm
   2373 
   2374 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
   2375    vshll.u8    q14, d0, #8
   2376    vsri.u16    q14, q8, #5
   2377    vsri.u16    q14, q9, #11
   2378 .endm
   2379 
   2380 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
   2381        vshll.u8    q14, d0, #8
   2382    fetch_src_pixblock
   2383        vsri.u16    q14, q8, #5
   2384        vsri.u16    q14, q9, #11
   2385    vshll.u8    q8, d1, #8
   2386        vst1.16 {d28, d29}, [DST_W, :128]!
   2387    vshll.u8    q9, d2, #8
   2388 .endm
   2389 
   2390 generate_composite_function \
   2391    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
   2392    FLAG_DST_WRITEONLY, \
   2393    8, /* number of pixels, processed in a single block */ \
   2394    10, /* prefetch distance */ \
   2395    default_init, \
   2396    default_cleanup, \
   2397    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
   2398    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
   2399    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
   2400    28, /* dst_w_basereg */ \
   2401    0, /* dst_r_basereg */ \
   2402    0, /* src_basereg   */ \
   2403    0  /* mask_basereg  */
   2404 
   2405 /******************************************************************************/
   2406 
   2407 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head
   2408    vmull.u8    q8, d3, d0
   2409    vmull.u8    q9, d3, d1
   2410    vmull.u8    q10, d3, d2
   2411 .endm
   2412 
   2413 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
   2414    vrshr.u16   q11, q8, #8
   2415    vswp        d3, d31
   2416    vrshr.u16   q12, q9, #8
   2417    vrshr.u16   q13, q10, #8
   2418    vraddhn.u16 d30, q11, q8
   2419    vraddhn.u16 d29, q12, q9
   2420    vraddhn.u16 d28, q13, q10
   2421 .endm
   2422 
   2423 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
   2424        vrshr.u16   q11, q8, #8
   2425        vswp        d3, d31
   2426        vrshr.u16   q12, q9, #8
   2427        vrshr.u16   q13, q10, #8
   2428    fetch_src_pixblock
   2429        vraddhn.u16 d30, q11, q8
   2430                                    PF add, PF_X, PF_X, #8
   2431                                    PF tst, PF_CTL, #0xF
   2432                                    PF addne, PF_X, PF_X, #8
   2433                                    PF subne, PF_CTL, PF_CTL, #1
   2434        vraddhn.u16 d29, q12, q9
   2435        vraddhn.u16 d28, q13, q10
   2436    vmull.u8    q8, d3, d0
   2437    vmull.u8    q9, d3, d1
   2438    vmull.u8    q10, d3, d2
   2439        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
   2440                                    PF cmp, PF_X, ORIG_W
   2441                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
   2442                                    PF subge, PF_X, PF_X, ORIG_W
   2443                                    PF subsge, PF_CTL, PF_CTL, #0x10
   2444                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
   2445 .endm
   2446 
   2447 generate_composite_function \
   2448    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
   2449    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
   2450    8, /* number of pixels, processed in a single block */ \
   2451    10, /* prefetch distance */ \
   2452    default_init, \
   2453    default_cleanup, \
   2454    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
   2455    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
   2456    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
   2457    28, /* dst_w_basereg */ \
   2458    0, /* dst_r_basereg */ \
   2459    0, /* src_basereg   */ \
   2460    0  /* mask_basereg  */
   2461 
   2462 /******************************************************************************/
   2463 
   2464 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
   2465    vmull.u8    q8, d3, d0
   2466    vmull.u8    q9, d3, d1
   2467    vmull.u8    q10, d3, d2
   2468 .endm
   2469 
   2470 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
   2471    vrshr.u16   q11, q8, #8
   2472    vswp        d3, d31
   2473    vrshr.u16   q12, q9, #8
   2474    vrshr.u16   q13, q10, #8
   2475    vraddhn.u16 d28, q11, q8
   2476    vraddhn.u16 d29, q12, q9
   2477    vraddhn.u16 d30, q13, q10
   2478 .endm
   2479 
   2480 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
   2481        vrshr.u16   q11, q8, #8
   2482        vswp        d3, d31
   2483        vrshr.u16   q12, q9, #8
   2484        vrshr.u16   q13, q10, #8
   2485    fetch_src_pixblock
   2486        vraddhn.u16 d28, q11, q8
   2487                                    PF add, PF_X, PF_X, #8
   2488                                    PF tst, PF_CTL, #0xF
   2489                                    PF addne, PF_X, PF_X, #8
   2490                                    PF subne, PF_CTL, PF_CTL, #1
   2491        vraddhn.u16 d29, q12, q9
   2492        vraddhn.u16 d30, q13, q10
   2493    vmull.u8    q8, d3, d0
   2494    vmull.u8    q9, d3, d1
   2495    vmull.u8    q10, d3, d2
   2496        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
   2497                                    PF cmp, PF_X, ORIG_W
   2498                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
   2499                                    PF subge, PF_X, PF_X, ORIG_W
   2500                                    PF subsge, PF_CTL, PF_CTL, #0x10
   2501                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
   2502 .endm
   2503 
   2504 generate_composite_function \
   2505    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
   2506    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
   2507    8, /* number of pixels, processed in a single block */ \
   2508    10, /* prefetch distance */ \
   2509    default_init, \
   2510    default_cleanup, \
   2511    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
   2512    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
   2513    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
   2514    28, /* dst_w_basereg */ \
   2515    0, /* dst_r_basereg */ \
   2516    0, /* src_basereg   */ \
   2517    0  /* mask_basereg  */
   2518 
   2519 /******************************************************************************/
   2520 
   2521 .macro pixman_composite_over_0565_8_0565_process_pixblock_head
   2522    /* mask is in d15 */
   2523    convert_0565_to_x888 q4, d2, d1, d0
   2524    convert_0565_to_x888 q5, d6, d5, d4
   2525    /* source pixel data is in      {d0, d1, d2, XX} */
   2526    /* destination pixel data is in {d4, d5, d6, XX} */
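            /*
             * Both the 0565 source and destination were just expanded to planar
             * 8-bit channels; the blend is src * m + dst * (255 - m) per channel
             * (each product divided by 255 with rounding), and the tail repacks
             * the result to r5g6b5.
             */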
   2527    vmvn.8      d7,  d15
   2528    vmull.u8    q6,  d15, d2
   2529    vmull.u8    q5,  d15, d1
   2530    vmull.u8    q4,  d15, d0
   2531    vmull.u8    q8,  d7,  d4
   2532    vmull.u8    q9,  d7,  d5
   2533    vmull.u8    q13, d7,  d6
   2534    vrshr.u16   q12, q6,  #8
   2535    vrshr.u16   q11, q5,  #8
   2536    vrshr.u16   q10, q4,  #8
   2537    vraddhn.u16 d2,  q6,  q12
   2538    vraddhn.u16 d1,  q5,  q11
   2539    vraddhn.u16 d0,  q4,  q10
   2540 .endm
   2541 
   2542 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail
   2543    vrshr.u16   q14, q8,  #8
   2544    vrshr.u16   q15, q9,  #8
   2545    vrshr.u16   q12, q13, #8
   2546    vraddhn.u16 d28, q14, q8
   2547    vraddhn.u16 d29, q15, q9
   2548    vraddhn.u16 d30, q12, q13
   2549    vqadd.u8    q0,  q0,  q14
   2550    vqadd.u8    q1,  q1,  q15
   2551    /* 32bpp result is in {d0, d1, d2, XX} */
   2552    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
   2553 .endm
   2554 
    2555 /* TODO: expand macros and do better instruction scheduling */
   2556 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
   2557    fetch_mask_pixblock
   2558    pixman_composite_over_0565_8_0565_process_pixblock_tail
   2559    fetch_src_pixblock
   2560    vld1.16    {d10, d11}, [DST_R, :128]!
   2561    cache_preload 8, 8
   2562    pixman_composite_over_0565_8_0565_process_pixblock_head
   2563    vst1.16    {d28, d29}, [DST_W, :128]!
   2564 .endm
   2565 
   2566 generate_composite_function \
   2567    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
   2568    FLAG_DST_READWRITE, \
   2569    8, /* number of pixels, processed in a single block */ \
   2570    5, /* prefetch distance */ \
   2571    default_init_need_all_regs, \
   2572    default_cleanup_need_all_regs, \
   2573    pixman_composite_over_0565_8_0565_process_pixblock_head, \
   2574    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
   2575    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
   2576    28, /* dst_w_basereg */ \
   2577    10,  /* dst_r_basereg */ \
   2578    8,  /* src_basereg   */ \
   2579    15  /* mask_basereg  */
   2580 
   2581 /******************************************************************************/
   2582 
   2583 .macro pixman_composite_over_0565_n_0565_init
   2584    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
   2585    vpush       {d8-d15}
   2586    vld1.32     {d15[0]}, [DUMMY]
   2587    vdup.8      d15, d15[3]
   2588 .endm
   2589 
   2590 .macro pixman_composite_over_0565_n_0565_cleanup
   2591    vpop        {d8-d15}
   2592 .endm
   2593 
   2594 generate_composite_function \
   2595    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
   2596    FLAG_DST_READWRITE, \
   2597    8, /* number of pixels, processed in a single block */ \
   2598    5, /* prefetch distance */ \
   2599    pixman_composite_over_0565_n_0565_init, \
   2600    pixman_composite_over_0565_n_0565_cleanup, \
   2601    pixman_composite_over_0565_8_0565_process_pixblock_head, \
   2602    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
   2603    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
   2604    28, /* dst_w_basereg */ \
   2605    10, /* dst_r_basereg */ \
   2606    8,  /* src_basereg   */ \
   2607    15  /* mask_basereg  */
   2608 
   2609 /******************************************************************************/
   2610 
   2611 .macro pixman_composite_add_0565_8_0565_process_pixblock_head
   2612    /* mask is in d15 */
   2613    convert_0565_to_x888 q4, d2, d1, d0
   2614    convert_0565_to_x888 q5, d6, d5, d4
   2615    /* source pixel data is in      {d0, d1, d2, XX} */
   2616    /* destination pixel data is in {d4, d5, d6, XX} */
   2617    vmull.u8    q6,  d15, d2
   2618    vmull.u8    q5,  d15, d1
   2619    vmull.u8    q4,  d15, d0
   2620    vrshr.u16   q12, q6,  #8
   2621    vrshr.u16   q11, q5,  #8
   2622    vrshr.u16   q10, q4,  #8
   2623    vraddhn.u16 d2,  q6,  q12
   2624    vraddhn.u16 d1,  q5,  q11
   2625    vraddhn.u16 d0,  q4,  q10
   2626 .endm
   2627 
   2628 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail
   2629    vqadd.u8    q0,  q0,  q2
   2630    vqadd.u8    q1,  q1,  q3
   2631    /* 32bpp result is in {d0, d1, d2, XX} */
   2632    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
   2633 .endm
   2634 
   2635 /* TODO: expand macros and do better instruction scheduling */
   2636 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
   2637    fetch_mask_pixblock
   2638    pixman_composite_add_0565_8_0565_process_pixblock_tail
   2639    fetch_src_pixblock
   2640    vld1.16    {d10, d11}, [DST_R, :128]!
   2641    cache_preload 8, 8
   2642    pixman_composite_add_0565_8_0565_process_pixblock_head
   2643    vst1.16    {d28, d29}, [DST_W, :128]!
   2644 .endm
   2645 
   2646 generate_composite_function \
   2647    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
   2648    FLAG_DST_READWRITE, \
   2649    8, /* number of pixels, processed in a single block */ \
   2650    5, /* prefetch distance */ \
   2651    default_init_need_all_regs, \
   2652    default_cleanup_need_all_regs, \
   2653    pixman_composite_add_0565_8_0565_process_pixblock_head, \
   2654    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
   2655    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
   2656    28, /* dst_w_basereg */ \
   2657    10, /* dst_r_basereg */ \
   2658    8,  /* src_basereg   */ \
   2659    15  /* mask_basereg  */
   2660 
   2661 /******************************************************************************/
   2662 
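/*
* OUT_REVERSE with an a8 source on an r5g6b5 destination: the destination
* is expanded to 8-bit channels and scaled by the inverted source value,
* i.e. per channel d = div255 (d * (255 - s)), then packed back to r5g6b5.
*/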
   2663 .macro pixman_composite_out_reverse_8_0565_process_pixblock_head
   2664    /* src is in d15 */
   2665    convert_0565_to_x888 q5, d6, d5, d4
   2666    /* destination pixel data is in {d4, d5, d6, xx} */
   2667    vmvn.8      d24, d15 /* get inverted alpha */
   2668    /* now do alpha blending */
   2669    vmull.u8    q8, d24, d4
   2670    vmull.u8    q9, d24, d5
   2671    vmull.u8    q10, d24, d6
   2672 .endm
   2673 
   2674 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
   2675    vrshr.u16   q14, q8, #8
   2676    vrshr.u16   q15, q9, #8
   2677    vrshr.u16   q12, q10, #8
   2678    vraddhn.u16 d0, q14, q8
   2679    vraddhn.u16 d1, q15, q9
   2680    vraddhn.u16 d2, q12, q10
   2681    /* 32bpp result is in {d0, d1, d2, XX} */
   2682    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
   2683 .endm
   2684 
   2685 /* TODO: expand macros and do better instruction scheduling */
   2686 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
   2687    fetch_src_pixblock
   2688    pixman_composite_out_reverse_8_0565_process_pixblock_tail
   2689    vld1.16    {d10, d11}, [DST_R, :128]!
   2690    cache_preload 8, 8
   2691    pixman_composite_out_reverse_8_0565_process_pixblock_head
   2692    vst1.16    {d28, d29}, [DST_W, :128]!
   2693 .endm
   2694 
   2695 generate_composite_function \
   2696    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
   2697    FLAG_DST_READWRITE, \
   2698    8, /* number of pixels, processed in a single block */ \
   2699    5, /* prefetch distance */ \
   2700    default_init_need_all_regs, \
   2701    default_cleanup_need_all_regs, \
   2702    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
   2703    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
   2704    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
   2705    28, /* dst_w_basereg */ \
   2706    10, /* dst_r_basereg */ \
   2707    15, /* src_basereg   */ \
   2708    0   /* mask_basereg  */
   2709 
   2710 /******************************************************************************/
   2711 
   2712 .macro pixman_composite_out_reverse_8_8888_process_pixblock_head
   2713    /* src is in d0 */
   2714    /* destination pixel data is in {d4, d5, d6, d7} */
   2715    vmvn.8      d1, d0 /* get inverted alpha */
   2716    /* now do alpha blending */
   2717    vmull.u8    q8, d1, d4
   2718    vmull.u8    q9, d1, d5
   2719    vmull.u8    q10, d1, d6
   2720    vmull.u8    q11, d1, d7
   2721 .endm
   2722 
   2723 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
   2724    vrshr.u16   q14, q8, #8
   2725    vrshr.u16   q15, q9, #8
   2726    vrshr.u16   q12, q10, #8
   2727    vrshr.u16   q13, q11, #8
   2728    vraddhn.u16 d28, q14, q8
   2729    vraddhn.u16 d29, q15, q9
   2730    vraddhn.u16 d30, q12, q10
   2731    vraddhn.u16 d31, q13, q11
   2732    /* 32bpp result is in {d28, d29, d30, d31} */
   2733 .endm
   2734 
   2735 /* TODO: expand macros and do better instruction scheduling */
   2736 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
   2737    fetch_src_pixblock
   2738    pixman_composite_out_reverse_8_8888_process_pixblock_tail
   2739    vld4.8    {d4, d5, d6, d7}, [DST_R, :128]!
   2740    cache_preload 8, 8
   2741    pixman_composite_out_reverse_8_8888_process_pixblock_head
   2742    vst4.8    {d28, d29, d30, d31}, [DST_W, :128]!
   2743 .endm
   2744 
   2745 generate_composite_function \
   2746    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
   2747    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   2748    8, /* number of pixels, processed in a single block */ \
   2749    5, /* prefetch distance */ \
   2750    default_init, \
   2751    default_cleanup, \
   2752    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
   2753    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
   2754    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
   2755    28, /* dst_w_basereg */ \
   2756    4, /* dst_r_basereg */ \
   2757    0, /* src_basereg   */ \
   2758    0   /* mask_basereg  */
   2759 
   2760 /******************************************************************************/
   2761 
   2762 generate_composite_function_nearest_scanline \
   2763    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
   2764    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   2765    8, /* number of pixels, processed in a single block */ \
   2766    default_init, \
   2767    default_cleanup, \
   2768    pixman_composite_over_8888_8888_process_pixblock_head, \
   2769    pixman_composite_over_8888_8888_process_pixblock_tail, \
   2770    pixman_composite_over_8888_8888_process_pixblock_tail_head
   2771 
   2772 generate_composite_function_nearest_scanline \
   2773    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
   2774    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   2775    8, /* number of pixels, processed in a single block */ \
   2776    default_init, \
   2777    default_cleanup, \
   2778    pixman_composite_over_8888_0565_process_pixblock_head, \
   2779    pixman_composite_over_8888_0565_process_pixblock_tail, \
   2780    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
   2781    28, /* dst_w_basereg */ \
   2782    4,  /* dst_r_basereg */ \
   2783    0,  /* src_basereg   */ \
   2784    24  /* mask_basereg  */
   2785 
   2786 generate_composite_function_nearest_scanline \
   2787    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
   2788    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
   2789    8, /* number of pixels, processed in a single block */ \
   2790    default_init, \
   2791    default_cleanup, \
   2792    pixman_composite_src_8888_0565_process_pixblock_head, \
   2793    pixman_composite_src_8888_0565_process_pixblock_tail, \
   2794    pixman_composite_src_8888_0565_process_pixblock_tail_head
   2795 
   2796 generate_composite_function_nearest_scanline \
   2797    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
   2798    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
   2799    8, /* number of pixels, processed in a single block */ \
   2800    default_init, \
   2801    default_cleanup, \
   2802    pixman_composite_src_0565_8888_process_pixblock_head, \
   2803    pixman_composite_src_0565_8888_process_pixblock_tail, \
   2804    pixman_composite_src_0565_8888_process_pixblock_tail_head
   2805 
   2806 generate_composite_function_nearest_scanline \
   2807    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
   2808    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   2809    8, /* number of pixels, processed in a single block */ \
   2810    default_init_need_all_regs, \
   2811    default_cleanup_need_all_regs, \
   2812    pixman_composite_over_8888_8_0565_process_pixblock_head, \
   2813    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
   2814    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
   2815    28, /* dst_w_basereg */ \
   2816    4,  /* dst_r_basereg */ \
   2817    8,  /* src_basereg   */ \
   2818    24  /* mask_basereg  */
   2819 
   2820 generate_composite_function_nearest_scanline \
   2821    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
   2822    FLAG_DST_READWRITE, \
   2823    8, /* number of pixels, processed in a single block */ \
   2824    default_init_need_all_regs, \
   2825    default_cleanup_need_all_regs, \
   2826    pixman_composite_over_0565_8_0565_process_pixblock_head, \
   2827    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
   2828    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
   2829    28, /* dst_w_basereg */ \
   2830    10,  /* dst_r_basereg */ \
   2831    8,  /* src_basereg   */ \
   2832    15  /* mask_basereg  */
   2833 
   2834 /******************************************************************************/
   2835 
   2836 /*
   2837 * Bilinear scaling support code which tries to provide pixel fetching, color
   2838 * format conversion, and interpolation as separate macros which can be used
   2839 * as the basic building blocks for constructing bilinear scanline functions.
   2840 */
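/*
* For reference, a self-contained C sketch of the fixed-point math these
* macros implement (names are illustrative; the weight conventions are
* assumed to match pixman's generic bilinear code): wt and wb are the
* vertical weights with wt + wb == 1 << BILINEAR_INTERPOLATION_BITS, and
* the horizontal weight wx is taken from the top fractional bits of the
* 16.16 x coordinate.
*
*   #include <stdint.h>
*
*   #define BITS  7              // mirrors BILINEAR_INTERPOLATION_BITS
*   #define RANGE (1 << BITS)
*
*   // tl/tr/bl/br: one 8-bit channel of the four neighbouring source pixels
*   static uint8_t bilinear_channel (uint8_t tl, uint8_t tr,
*                                    uint8_t bl, uint8_t br,
*                                    uint32_t wt, uint32_t wb, uint32_t wx)
*   {
*       uint32_t l = tl * wt + bl * wb;   // vertical pass: vmull.u8 + vmlal.u8
*       uint32_t r = tr * wt + br * wb;
*       // horizontal pass: vshll/vmlsl/vmlal.u16, then vshrn.u32 + vmovn.u16
*       return (uint8_t) ((l * (RANGE - wx) + r * wx) >> (2 * BITS));
*   }
*/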
   2841 
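/*
* bilinear_load_{8888,0565} fetch the two vertically adjacent source pixels
* for the next output pixel: X is the 16.16 fixed-point source coordinate
* (advanced by UX per pixel), TOP points to the upper scanline and STRIDE
* holds the byte offset from the upper to the lower scanline. 0565 pixels
* are widened to x888 right after loading.
*/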
   2842 .macro bilinear_load_8888 reg1, reg2, tmp
   2843    mov       TMP1, X, asr #16
   2844    add       X, X, UX
   2845    add       TMP1, TOP, TMP1, asl #2
   2846    vld1.32   {\reg1}, [TMP1], STRIDE
   2847    vld1.32   {\reg2}, [TMP1]
   2848 .endm
   2849 
   2850 .macro bilinear_load_0565 reg1, reg2, tmp
   2851    mov       TMP1, X, asr #16
   2852    add       X, X, UX
   2853    add       TMP1, TOP, TMP1, asl #1
   2854    vld1.32   {\reg2[0]}, [TMP1], STRIDE
   2855    vld1.32   {\reg2[1]}, [TMP1]
   2856    convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
   2857 .endm
   2858 
   2859 .macro bilinear_load_and_vertical_interpolate_two_8888 \
   2860                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
   2861 
   2862    bilinear_load_8888 \reg1, \reg2, \tmp1
   2863    vmull.u8  \acc1, \reg1, d28
   2864    vmlal.u8  \acc1, \reg2, d29
   2865    bilinear_load_8888 \reg3, \reg4, \tmp2
   2866    vmull.u8  \acc2, \reg3, d28
   2867    vmlal.u8  \acc2, \reg4, d29
   2868 .endm
   2869 
   2870 .macro bilinear_load_and_vertical_interpolate_four_8888 \
   2871                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
   2872                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
   2873 
   2874    bilinear_load_and_vertical_interpolate_two_8888 \
   2875                \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
   2876    bilinear_load_and_vertical_interpolate_two_8888 \
   2877                \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
   2878 .endm
   2879 
   2880 .macro bilinear_load_and_vertical_interpolate_two_0565 \
   2881                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
   2882 
   2883    mov       TMP1, X, asr #16
   2884    add       X, X, UX
   2885    add       TMP1, TOP, TMP1, asl #1
   2886    mov       TMP2, X, asr #16
   2887    add       X, X, UX
   2888    add       TMP2, TOP, TMP2, asl #1
   2889    vld1.32   {\acc2lo[0]}, [TMP1], STRIDE
   2890    vld1.32   {\acc2hi[0]}, [TMP2], STRIDE
   2891    vld1.32   {\acc2lo[1]}, [TMP1]
   2892    vld1.32   {\acc2hi[1]}, [TMP2]
   2893    convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
   2894    vzip.u8   \reg1, \reg3
   2895    vzip.u8   \reg2, \reg4
   2896    vzip.u8   \reg3, \reg4
   2897    vzip.u8   \reg1, \reg2
   2898    vmull.u8  \acc1, \reg1, d28
   2899    vmlal.u8  \acc1, \reg2, d29
   2900    vmull.u8  \acc2, \reg3, d28
   2901    vmlal.u8  \acc2, \reg4, d29
   2902 .endm
   2903 
   2904 .macro bilinear_load_and_vertical_interpolate_four_0565 \
   2905                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
   2906                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
   2907 
   2908    mov       TMP1, X, asr #16
   2909    add       X, X, UX
   2910    add       TMP1, TOP, TMP1, asl #1
   2911    mov       TMP2, X, asr #16
   2912    add       X, X, UX
   2913    add       TMP2, TOP, TMP2, asl #1
   2914    vld1.32   {\xacc2lo[0]}, [TMP1], STRIDE
   2915    vld1.32   {\xacc2hi[0]}, [TMP2], STRIDE
   2916    vld1.32   {\xacc2lo[1]}, [TMP1]
   2917    vld1.32   {\xacc2hi[1]}, [TMP2]
   2918    convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
   2919    mov       TMP1, X, asr #16
   2920    add       X, X, UX
   2921    add       TMP1, TOP, TMP1, asl #1
   2922    mov       TMP2, X, asr #16
   2923    add       X, X, UX
   2924    add       TMP2, TOP, TMP2, asl #1
   2925    vld1.32   {\yacc2lo[0]}, [TMP1], STRIDE
   2926    vzip.u8   \xreg1, \xreg3
   2927    vld1.32   {\yacc2hi[0]}, [TMP2], STRIDE
   2928    vzip.u8   \xreg2, \xreg4
   2929    vld1.32   {\yacc2lo[1]}, [TMP1]
   2930    vzip.u8   \xreg3, \xreg4
   2931    vld1.32   {\yacc2hi[1]}, [TMP2]
   2932    vzip.u8   \xreg1, \xreg2
   2933    convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
   2934    vmull.u8  \xacc1, \xreg1, d28
   2935    vzip.u8   \yreg1, \yreg3
   2936    vmlal.u8  \xacc1, \xreg2, d29
   2937    vzip.u8   \yreg2, \yreg4
   2938    vmull.u8  \xacc2, \xreg3, d28
   2939    vzip.u8   \yreg3, \yreg4
   2940    vmlal.u8  \xacc2, \xreg4, d29
   2941    vzip.u8   \yreg1, \yreg2
   2942    vmull.u8  \yacc1, \yreg1, d28
   2943    vmlal.u8  \yacc1, \yreg2, d29
   2944    vmull.u8  \yacc2, \yreg3, d28
   2945    vmlal.u8  \yacc2, \yreg4, d29
   2946 .endm
   2947 
   2948 .macro bilinear_store_8888 numpix, tmp1, tmp2
   2949 .if \numpix == 4
   2950    vst1.32   {d0, d1}, [OUT, :128]!
   2951 .elseif \numpix == 2
   2952    vst1.32   {d0}, [OUT, :64]!
   2953 .elseif \numpix == 1
   2954    vst1.32   {d0[0]}, [OUT, :32]!
   2955 .else
   2956    .error bilinear_store_8888 \numpix is unsupported
   2957 .endif
   2958 .endm
   2959 
   2960 .macro bilinear_store_0565 numpix, tmp1, tmp2
   2961    vuzp.u8 d0, d1
   2962    vuzp.u8 d2, d3
   2963    vuzp.u8 d1, d3
   2964    vuzp.u8 d0, d2
   2965    convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
   2966 .if \numpix == 4
   2967    vst1.16   {d2}, [OUT, :64]!
   2968 .elseif \numpix == 2
   2969    vst1.32   {d2[0]}, [OUT, :32]!
   2970 .elseif \numpix == 1
   2971    vst1.16   {d2[0]}, [OUT, :16]!
   2972 .else
   2973    .error bilinear_store_0565 \numpix is unsupported
   2974 .endif
   2975 .endm
   2976 
   2977 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
   2978    bilinear_load_\()\src_fmt d0, d1, d2
   2979    vmull.u8  q1, d0, d28
   2980    vmlal.u8  q1, d1, d29
   2981    /* 5 cycles bubble */
   2982    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
   2983    vmlsl.u16 q0, d2, d30
   2984    vmlal.u16 q0, d3, d30
   2985    /* 5 cycles bubble */
   2986    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   2987    /* 3 cycles bubble */
   2988    vmovn.u16 d0, q0
   2989    /* 1 cycle bubble */
   2990    bilinear_store_\()\dst_fmt 1, q2, q3
   2991 .endm
   2992 
   2993 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
   2994    bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
   2995                q1, q11, d0, d1, d20, d21, d22, d23
   2996    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
   2997    vmlsl.u16 q0, d2, d30
   2998    vmlal.u16 q0, d3, d30
   2999    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
   3000    vmlsl.u16 q10, d22, d31
   3001    vmlal.u16 q10, d23, d31
   3002    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   3003    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
   3004    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3005    vadd.u16  q12, q12, q13
   3006    vmovn.u16 d0, q0
   3007    bilinear_store_\()\dst_fmt 2, q2, q3
   3008 .endm
   3009 
   3010 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
   3011    bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
   3012                q1, q11, d0, d1, d20, d21, d22, d23 \
   3013                q3, q9,  d4, d5, d16, d17, d18, d19
   3014    pld       [TMP1, PF_OFFS]
   3015    sub       TMP1, TMP1, STRIDE
   3016    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
   3017    vmlsl.u16 q0, d2, d30
   3018    vmlal.u16 q0, d3, d30
   3019    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
   3020    vmlsl.u16 q10, d22, d31
   3021    vmlal.u16 q10, d23, d31
   3022    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3023    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
   3024    vmlsl.u16 q2, d6, d30
   3025    vmlal.u16 q2, d7, d30
   3026    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
   3027    pld       [TMP2, PF_OFFS]
   3028    vmlsl.u16 q8, d18, d31
   3029    vmlal.u16 q8, d19, d31
   3030    vadd.u16  q12, q12, q13
   3031    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   3032    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
   3033    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   3034    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
   3035    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3036    vmovn.u16 d0, q0
   3037    vmovn.u16 d1, q2
   3038    vadd.u16  q12, q12, q13
   3039    bilinear_store_\()\dst_fmt 4, q2, q3
   3040 .endm
   3041 
   3042 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
   3043 .ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
   3044    bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
   3045 .else
   3046    bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
   3047 .endif
   3048 .endm
   3049 
   3050 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
   3051 .ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
   3052    bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
   3053 .endif
   3054 .endm
   3055 
   3056 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
   3057 .ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
   3058    bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
   3059 .else
   3060    bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
   3061 .endif
   3062 .endm
   3063 
   3064 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
   3065 .ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
   3066    bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
   3067 .else
   3068    bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
   3069    bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
   3070 .endif
   3071 .endm
   3072 
   3073 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
   3074 .ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
   3075    bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
   3076 .else
   3077    bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
   3078 .endif
   3079 .endm
   3080 
   3081 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
   3082 .ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
   3083    bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
   3084 .else
   3085    bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
   3086    bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
   3087 .endif
   3088 .endm
   3089 
   3090 .set BILINEAR_FLAG_UNROLL_4,          0
   3091 .set BILINEAR_FLAG_UNROLL_8,          1
   3092 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
   3093 
   3094 /*
   3095 * Main template macro for generating NEON optimized bilinear scanline functions.
   3096 *
   3097 * Bilinear scanline scaler macro template uses the following arguments:
   3098 *  fname             - name of the function to generate
   3099 *  src_fmt           - source color format (8888 or 0565)
   3100 *  dst_fmt           - destination color format (8888 or 0565)
   3101 *  src_bpp_shift     - (1 << src_bpp_shift) is the source pixel size in bytes
   3102 *  dst_bpp_shift     - (1 << dst_bpp_shift) is the destination pixel size in bytes
   3103 *  prefetch_distance - prefetch in the source image by that many pixels ahead
   3104 *  flags             - a combination of BILINEAR_FLAG_* values (unrolling, register usage)
   3105 */
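/*
* Judging from the register assignments below (and assuming the standard
* AAPCS calling convention), the generated functions are called from C
* roughly like this (a sketch only; argument names are illustrative):
*
*   void fname (dst_type       *out,     // r0
*               const src_type *top,     // r1, upper source scanline
*               const src_type *bottom,  // r2, lower source scanline
*               int             wt,      // r3, weight of the top scanline
*               int             wb,      // stack, weight of the bottom scanline
*               pixman_fixed_t  x,       // stack, 16.16 start coordinate
*               pixman_fixed_t  ux,      // stack, 16.16 coordinate increment
*               int             width);  // stack, number of pixels to produce
*/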
   3106 
   3107 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
   3108                                       src_bpp_shift, dst_bpp_shift, \
   3109                                       prefetch_distance, flags
   3110 
   3111 pixman_asm_function \fname
   3112    OUT       .req      r0
   3113    TOP       .req      r1
   3114    BOTTOM    .req      r2
   3115    WT        .req      r3
   3116    WB        .req      r4
   3117    X         .req      r5
   3118    UX        .req      r6
   3119    WIDTH     .req      ip
   3120    TMP1      .req      r3
   3121    TMP2      .req      r4
   3122    PF_OFFS   .req      r7
   3123    TMP3      .req      r8
   3124    TMP4      .req      r9
   3125    STRIDE    .req      r2
   3126 
   3127    mov       ip, sp
   3128    push      {r4, r5, r6, r7, r8, r9}
   3129    mov       PF_OFFS, #\prefetch_distance
   3130    ldmia     ip, {WB, X, UX, WIDTH}
   3131    mul       PF_OFFS, PF_OFFS, UX
   3132 
   3133 .if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
   3134    vpush     {d8-d15}
   3135 .endif
   3136 
   3137    sub       STRIDE, BOTTOM, TOP
   3138    .unreq    BOTTOM
   3139 
   3140    cmp       WIDTH, #0
   3141    ble       3f
   3142 
   3143    vdup.u16  q12, X
   3144    vdup.u16  q13, UX
   3145    vdup.u8   d28, WT
   3146    vdup.u8   d29, WB
   3147    vadd.u16  d25, d25, d26
   3148 
   3149    /* ensure good destination alignment  */
   3150    cmp       WIDTH, #1
   3151    blt       0f
   3152    tst       OUT, #(1 << \dst_bpp_shift)
   3153    beq       0f
   3154    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3155    vadd.u16  q12, q12, q13
   3156    bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
   3157    sub       WIDTH, WIDTH, #1
   3158 0:
   3159    vadd.u16  q13, q13, q13
   3160    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3161    vadd.u16  q12, q12, q13
   3162 
   3163    cmp       WIDTH, #2
   3164    blt       0f
   3165    tst       OUT, #(1 << (\dst_bpp_shift + 1))
   3166    beq       0f
   3167    bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
   3168    sub       WIDTH, WIDTH, #2
   3169 0:
   3170 .if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0
   3171 /*********** 8 pixels per iteration *****************/
   3172    cmp       WIDTH, #4
   3173    blt       0f
   3174    tst       OUT, #(1 << (\dst_bpp_shift + 2))
   3175    beq       0f
   3176    bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
   3177    sub       WIDTH, WIDTH, #4
   3178 0:
   3179    subs      WIDTH, WIDTH, #8
   3180    blt       1f
   3181    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
   3182    bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt
   3183    subs      WIDTH, WIDTH, #8
   3184    blt       5f
   3185 0:
   3186    bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt
   3187    subs      WIDTH, WIDTH, #8
   3188    bge       0b
   3189 5:
   3190    bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt
   3191 1:
   3192    tst       WIDTH, #4
   3193    beq       2f
   3194    bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
   3195 2:
   3196 .else
   3197 /*********** 4 pixels per iteration *****************/
   3198    subs      WIDTH, WIDTH, #4
   3199    blt       1f
   3200    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
   3201    bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
   3202    subs      WIDTH, WIDTH, #4
   3203    blt       5f
   3204 0:
   3205    bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
   3206    subs      WIDTH, WIDTH, #4
   3207    bge       0b
   3208 5:
   3209    bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
   3210 1:
   3211 /****************************************************/
   3212 .endif
   3213    /* handle the remaining trailing pixels */
   3214    tst       WIDTH, #2
   3215    beq       2f
   3216    bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
   3217 2:
   3218    tst       WIDTH, #1
   3219    beq       3f
   3220    bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
   3221 3:
   3222 .if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
   3223    vpop      {d8-d15}
   3224 .endif
   3225    pop       {r4, r5, r6, r7, r8, r9}
   3226    bx        lr
   3227 
   3228    .unreq    OUT
   3229    .unreq    TOP
   3230    .unreq    WT
   3231    .unreq    WB
   3232    .unreq    X
   3233    .unreq    UX
   3234    .unreq    WIDTH
   3235    .unreq    TMP1
   3236    .unreq    TMP2
   3237    .unreq    PF_OFFS
   3238    .unreq    TMP3
   3239    .unreq    TMP4
   3240    .unreq    STRIDE
   3241    pixman_end_asm_function
   3242 
   3243 .endm
   3244 
   3245 /*****************************************************************************/
   3246 
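/*
* Manually scheduled specialization for the 8888 -> 8888 case. Defining the
* have_* symbol below makes the bilinear_interpolate_four_pixels_* dispatch
* macros above select these head/tail/tail_head variants instead of the
* generic four-pixel code.
*/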
   3247 .set have_bilinear_interpolate_four_pixels_8888_8888, 1
   3248 
   3249 .macro bilinear_interpolate_four_pixels_8888_8888_head
   3250    mov       TMP1, X, asr #16
   3251    add       X, X, UX
   3252    add       TMP1, TOP, TMP1, asl #2
   3253    mov       TMP2, X, asr #16
   3254    add       X, X, UX
   3255    add       TMP2, TOP, TMP2, asl #2
   3256 
   3257    vld1.32   {d22}, [TMP1], STRIDE
   3258    vld1.32   {d23}, [TMP1]
   3259    mov       TMP3, X, asr #16
   3260    add       X, X, UX
   3261    add       TMP3, TOP, TMP3, asl #2
   3262    vmull.u8  q8, d22, d28
   3263    vmlal.u8  q8, d23, d29
   3264 
   3265    vld1.32   {d22}, [TMP2], STRIDE
   3266    vld1.32   {d23}, [TMP2]
   3267    mov       TMP4, X, asr #16
   3268    add       X, X, UX
   3269    add       TMP4, TOP, TMP4, asl #2
   3270    vmull.u8  q9, d22, d28
   3271    vmlal.u8  q9, d23, d29
   3272 
   3273    vld1.32   {d22}, [TMP3], STRIDE
   3274    vld1.32   {d23}, [TMP3]
   3275    vmull.u8  q10, d22, d28
   3276    vmlal.u8  q10, d23, d29
   3277 
   3278    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
   3279    vmlsl.u16 q0, d16, d30
   3280    vmlal.u16 q0, d17, d30
   3281 
   3282    pld       [TMP4, PF_OFFS]
   3283    vld1.32   {d16}, [TMP4], STRIDE
   3284    vld1.32   {d17}, [TMP4]
   3285    pld       [TMP4, PF_OFFS]
   3286    vmull.u8  q11, d16, d28
   3287    vmlal.u8  q11, d17, d29
   3288 
   3289    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
   3290    vmlsl.u16 q1, d18, d31
   3291 .endm
   3292 
   3293 .macro bilinear_interpolate_four_pixels_8888_8888_tail
   3294    vmlal.u16 q1, d19, d31
   3295    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3296    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
   3297    vmlsl.u16 q2, d20, d30
   3298    vmlal.u16 q2, d21, d30
   3299    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
   3300    vmlsl.u16 q3, d22, d31
   3301    vmlal.u16 q3, d23, d31
   3302    vadd.u16  q12, q12, q13
   3303    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   3304    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   3305    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   3306    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3307    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
   3308    vmovn.u16 d6, q0
   3309    vmovn.u16 d7, q2
   3310    vadd.u16  q12, q12, q13
   3311    vst1.32   {d6, d7}, [OUT, :128]!
   3312 .endm
   3313 
   3314 .macro bilinear_interpolate_four_pixels_8888_8888_tail_head
   3315    mov       TMP1, X, asr #16
   3316    add       X, X, UX
   3317    add       TMP1, TOP, TMP1, asl #2
   3318    mov       TMP2, X, asr #16
   3319    add       X, X, UX
   3320    add       TMP2, TOP, TMP2, asl #2
   3321        vmlal.u16 q1, d19, d31
   3322        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3323        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
   3324        vmlsl.u16 q2, d20, d30
   3325        vmlal.u16 q2, d21, d30
   3326        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
   3327    vld1.32   {d20}, [TMP1], STRIDE
   3328        vmlsl.u16 q3, d22, d31
   3329        vmlal.u16 q3, d23, d31
   3330    vld1.32   {d21}, [TMP1]
   3331    vmull.u8  q8, d20, d28
   3332    vmlal.u8  q8, d21, d29
   3333        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   3334        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   3335        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   3336    vld1.32   {d22}, [TMP2], STRIDE
   3337        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
   3338        vadd.u16  q12, q12, q13
   3339    vld1.32   {d23}, [TMP2]
   3340    vmull.u8  q9, d22, d28
   3341    mov       TMP3, X, asr #16
   3342    add       X, X, UX
   3343    add       TMP3, TOP, TMP3, asl #2
   3344    mov       TMP4, X, asr #16
   3345    add       X, X, UX
   3346    add       TMP4, TOP, TMP4, asl #2
   3347    vmlal.u8  q9, d23, d29
   3348    vld1.32   {d22}, [TMP3], STRIDE
   3349        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3350    vld1.32   {d23}, [TMP3]
   3351    vmull.u8  q10, d22, d28
   3352    vmlal.u8  q10, d23, d29
   3353        vmovn.u16 d6, q0
   3354    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
   3355        vmovn.u16 d7, q2
   3356    vmlsl.u16 q0, d16, d30
   3357    vmlal.u16 q0, d17, d30
   3358    pld       [TMP4, PF_OFFS]
   3359    vld1.32   {d16}, [TMP4], STRIDE
   3360        vadd.u16  q12, q12, q13
   3361    vld1.32   {d17}, [TMP4]
   3362    pld       [TMP4, PF_OFFS]
   3363    vmull.u8  q11, d16, d28
   3364    vmlal.u8  q11, d17, d29
   3365        vst1.32   {d6, d7}, [OUT, :128]!
   3366    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
   3367    vmlsl.u16 q1, d18, d31
   3368 .endm
   3369 
   3370 /*****************************************************************************/
   3371 
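/*
* Likewise, an eight-pixels-per-iteration specialization for the
* 8888 -> 0565 case, selected through the have_* symbol below.
*/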
   3372 .set have_bilinear_interpolate_eight_pixels_8888_0565, 1
   3373 
   3374 .macro bilinear_interpolate_eight_pixels_8888_0565_head
   3375    mov       TMP1, X, asr #16
   3376    add       X, X, UX
   3377    add       TMP1, TOP, TMP1, asl #2
   3378    mov       TMP2, X, asr #16
   3379    add       X, X, UX
   3380    add       TMP2, TOP, TMP2, asl #2
   3381    vld1.32   {d20}, [TMP1], STRIDE
   3382    vld1.32   {d21}, [TMP1]
   3383    vmull.u8  q8, d20, d28
   3384    vmlal.u8  q8, d21, d29
   3385    vld1.32   {d22}, [TMP2], STRIDE
   3386    vld1.32   {d23}, [TMP2]
   3387    vmull.u8  q9, d22, d28
   3388    mov       TMP3, X, asr #16
   3389    add       X, X, UX
   3390    add       TMP3, TOP, TMP3, asl #2
   3391    mov       TMP4, X, asr #16
   3392    add       X, X, UX
   3393    add       TMP4, TOP, TMP4, asl #2
   3394    vmlal.u8  q9, d23, d29
   3395    vld1.32   {d22}, [TMP3], STRIDE
   3396    vld1.32   {d23}, [TMP3]
   3397    vmull.u8  q10, d22, d28
   3398    vmlal.u8  q10, d23, d29
   3399    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
   3400    vmlsl.u16 q0, d16, d30
   3401    vmlal.u16 q0, d17, d30
   3402    pld       [TMP4, PF_OFFS]
   3403    vld1.32   {d16}, [TMP4], STRIDE
   3404    vld1.32   {d17}, [TMP4]
   3405    pld       [TMP4, PF_OFFS]
   3406    vmull.u8  q11, d16, d28
   3407    vmlal.u8  q11, d17, d29
   3408    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
   3409    vmlsl.u16 q1, d18, d31
   3410 
   3411    mov       TMP1, X, asr #16
   3412    add       X, X, UX
   3413    add       TMP1, TOP, TMP1, asl #2
   3414    mov       TMP2, X, asr #16
   3415    add       X, X, UX
   3416    add       TMP2, TOP, TMP2, asl #2
   3417        vmlal.u16 q1, d19, d31
   3418        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3419        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
   3420        vmlsl.u16 q2, d20, d30
   3421        vmlal.u16 q2, d21, d30
   3422        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
   3423    vld1.32   {d20}, [TMP1], STRIDE
   3424        vmlsl.u16 q3, d22, d31
   3425        vmlal.u16 q3, d23, d31
   3426    vld1.32   {d21}, [TMP1]
   3427    vmull.u8  q8, d20, d28
   3428    vmlal.u8  q8, d21, d29
   3429        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   3430        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   3431        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   3432    vld1.32   {d22}, [TMP2], STRIDE
   3433        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
   3434        vadd.u16  q12, q12, q13
   3435    vld1.32   {d23}, [TMP2]
   3436    vmull.u8  q9, d22, d28
   3437    mov       TMP3, X, asr #16
   3438    add       X, X, UX
   3439    add       TMP3, TOP, TMP3, asl #2
   3440    mov       TMP4, X, asr #16
   3441    add       X, X, UX
   3442    add       TMP4, TOP, TMP4, asl #2
   3443    vmlal.u8  q9, d23, d29
   3444    vld1.32   {d22}, [TMP3], STRIDE
   3445        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3446    vld1.32   {d23}, [TMP3]
   3447    vmull.u8  q10, d22, d28
   3448    vmlal.u8  q10, d23, d29
   3449        vmovn.u16 d8, q0
   3450    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
   3451        vmovn.u16 d9, q2
   3452    vmlsl.u16 q0, d16, d30
   3453    vmlal.u16 q0, d17, d30
   3454    pld       [TMP4, PF_OFFS]
   3455    vld1.32   {d16}, [TMP4], STRIDE
   3456        vadd.u16  q12, q12, q13
   3457    vld1.32   {d17}, [TMP4]
   3458    pld       [TMP4, PF_OFFS]
   3459    vmull.u8  q11, d16, d28
   3460    vmlal.u8  q11, d17, d29
   3461    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
   3462    vmlsl.u16 q1, d18, d31
   3463 .endm
   3464 
   3465 .macro bilinear_interpolate_eight_pixels_8888_0565_tail
   3466    vmlal.u16 q1, d19, d31
   3467    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3468    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
   3469    vmlsl.u16 q2, d20, d30
   3470    vmlal.u16 q2, d21, d30
   3471    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
   3472    vmlsl.u16 q3, d22, d31
   3473    vmlal.u16 q3, d23, d31
   3474    vadd.u16  q12, q12, q13
   3475    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   3476    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   3477    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   3478    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3479    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
   3480    vmovn.u16 d10, q0
   3481    vmovn.u16 d11, q2
   3482    vadd.u16  q12, q12, q13
   3483 
   3484    vuzp.u8   d8, d9
   3485    vuzp.u8   d10, d11
   3486    vuzp.u8   d9, d11
   3487    vuzp.u8   d8, d10
   3488    vshll.u8  q6, d9, #8
   3489    vshll.u8  q5, d10, #8
   3490    vshll.u8  q7, d8, #8
   3491    vsri.u16  q5, q6, #5
   3492    vsri.u16  q5, q7, #11
   3493    vst1.32   {d10, d11}, [OUT, :128]!
   3494 .endm
   3495 
   3496 .macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
   3497    mov       TMP1, X, asr #16
   3498    add       X, X, UX
   3499    add       TMP1, TOP, TMP1, asl #2
   3500    mov       TMP2, X, asr #16
   3501    add       X, X, UX
   3502    add       TMP2, TOP, TMP2, asl #2
   3503        vmlal.u16 q1, d19, d31
   3504        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3505            vuzp.u8 d8, d9
   3506        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
   3507        vmlsl.u16 q2, d20, d30
   3508        vmlal.u16 q2, d21, d30
   3509        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
   3510    vld1.32   {d20}, [TMP1], STRIDE
   3511        vmlsl.u16 q3, d22, d31
   3512        vmlal.u16 q3, d23, d31
   3513    vld1.32   {d21}, [TMP1]
   3514    vmull.u8  q8, d20, d28
   3515    vmlal.u8  q8, d21, d29
   3516        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   3517        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   3518        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   3519    vld1.32   {d22}, [TMP2], STRIDE
   3520        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
   3521        vadd.u16  q12, q12, q13
   3522    vld1.32   {d23}, [TMP2]
   3523    vmull.u8  q9, d22, d28
   3524    mov       TMP3, X, asr #16
   3525    add       X, X, UX
   3526    add       TMP3, TOP, TMP3, asl #2
   3527    mov       TMP4, X, asr #16
   3528    add       X, X, UX
   3529    add       TMP4, TOP, TMP4, asl #2
   3530    vmlal.u8  q9, d23, d29
   3531    vld1.32   {d22}, [TMP3], STRIDE
   3532        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3533    vld1.32   {d23}, [TMP3]
   3534    vmull.u8  q10, d22, d28
   3535    vmlal.u8  q10, d23, d29
   3536        vmovn.u16 d10, q0
   3537    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
   3538        vmovn.u16 d11, q2
   3539    vmlsl.u16 q0, d16, d30
   3540    vmlal.u16 q0, d17, d30
   3541    pld       [TMP4, PF_OFFS]
   3542    vld1.32   {d16}, [TMP4], STRIDE
   3543        vadd.u16  q12, q12, q13
   3544    vld1.32   {d17}, [TMP4]
   3545    pld       [TMP4, PF_OFFS]
   3546    vmull.u8  q11, d16, d28
   3547    vmlal.u8  q11, d17, d29
   3548            vuzp.u8 d10, d11
   3549    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
   3550    vmlsl.u16 q1, d18, d31
   3551 
   3552    mov       TMP1, X, asr #16
   3553    add       X, X, UX
   3554    add       TMP1, TOP, TMP1, asl #2
   3555    mov       TMP2, X, asr #16
   3556    add       X, X, UX
   3557    add       TMP2, TOP, TMP2, asl #2
   3558        vmlal.u16 q1, d19, d31
   3559            vuzp.u8 d9, d11
   3560        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3561        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
   3562            vuzp.u8 d8, d10
   3563        vmlsl.u16 q2, d20, d30
   3564        vmlal.u16 q2, d21, d30
   3565        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
   3566    vld1.32   {d20}, [TMP1], STRIDE
   3567        vmlsl.u16 q3, d22, d31
   3568        vmlal.u16 q3, d23, d31
   3569    vld1.32   {d21}, [TMP1]
   3570    vmull.u8  q8, d20, d28
   3571    vmlal.u8  q8, d21, d29
   3572            vshll.u8  q6, d9, #8
   3573            vshll.u8  q5, d10, #8
   3574            vshll.u8  q7, d8, #8
   3575        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   3576            vsri.u16  q5, q6, #5
   3577        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   3578            vsri.u16  q5, q7, #11
   3579        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   3580    vld1.32   {d22}, [TMP2], STRIDE
   3581        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
   3582        vadd.u16  q12, q12, q13
   3583    vld1.32   {d23}, [TMP2]
   3584    vmull.u8  q9, d22, d28
   3585    mov       TMP3, X, asr #16
   3586    add       X, X, UX
   3587    add       TMP3, TOP, TMP3, asl #2
   3588    mov       TMP4, X, asr #16
   3589    add       X, X, UX
   3590    add       TMP4, TOP, TMP4, asl #2
   3591    vmlal.u8  q9, d23, d29
   3592    vld1.32   {d22}, [TMP3], STRIDE
   3593        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3594    vld1.32   {d23}, [TMP3]
   3595    vmull.u8  q10, d22, d28
   3596    vmlal.u8  q10, d23, d29
   3597        vmovn.u16 d8, q0
   3598    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
   3599        vmovn.u16 d9, q2
   3600    vmlsl.u16 q0, d16, d30
   3601    vmlal.u16 q0, d17, d30
   3602    pld       [TMP4, PF_OFFS]
   3603    vld1.32   {d16}, [TMP4], STRIDE
   3604        vadd.u16  q12, q12, q13
   3605    vld1.32   {d17}, [TMP4]
   3606    pld       [TMP4, PF_OFFS]
   3607    vmull.u8  q11, d16, d28
   3608    vmlal.u8  q11, d17, d29
   3609    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
   3610            vst1.32   {d10, d11}, [OUT, :128]!
   3611    vmlsl.u16 q1, d18, d31
   3612 .endm
   3613 /*****************************************************************************/
   3614 
   3615 generate_bilinear_scanline_func \
   3616    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
   3617    2, 2, 28, BILINEAR_FLAG_UNROLL_4
   3618 
   3619 generate_bilinear_scanline_func \
   3620    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
   3621    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
   3622 
   3623 generate_bilinear_scanline_func \
   3624    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
   3625    1, 2, 28, BILINEAR_FLAG_UNROLL_4
   3626 
   3627 generate_bilinear_scanline_func \
   3628    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
   3629    1, 1, 28, BILINEAR_FLAG_UNROLL_4