tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

pixman-arm-simd-asm.S (44191B)


      1 /*
      2 * Copyright © 2012 Raspberry Pi Foundation
      3 * Copyright © 2012 RISC OS Open Ltd
      4 *
      5 * Permission to use, copy, modify, distribute, and sell this software and its
      6 * documentation for any purpose is hereby granted without fee, provided that
      7 * the above copyright notice appear in all copies and that both that
      8 * copyright notice and this permission notice appear in supporting
      9 * documentation, and that the name of the copyright holders not be used in
     10 * advertising or publicity pertaining to distribution of the software without
     11 * specific, written prior permission.  The copyright holders make no
     12 * representations about the suitability of this software for any purpose.  It
     13 * is provided "as is" without express or implied warranty.
     14 *
     15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
     16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
     17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
     18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
     20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
     21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
     22 * SOFTWARE.
     23 *
     24 * Author:  Ben Avison (bavison@riscosopen.org)
     25 *
     26 */
     27 
     28 /* Prevent the stack from becoming executable */
     29 #if defined(__linux__) && defined(__ELF__)
     30 .section .note.GNU-stack,"",%progbits
     31 #endif
     32 
     33 .text
        @ Emit ARMv6 instructions, but tag the object as ARMv4 so that
        @ ARMv6-unaware toolchains still accept it; presumably runtime CPU
        @ detection gates these entry points - TODO confirm against the
        @ C-side dispatch code.
     34 .arch armv6
     35 .object_arch armv4
     36 .arm
        @ .altmacro enables the %(expr) evaluated macro arguments used by the
        @ per-pixel helper macros throughout this file.
     37 .altmacro
     38 .p2align 2
     39 
     40 #include "pixman-arm-asm.h"
     41 #include "pixman-arm-simd-asm.h"
     42 
        @ Macro from the included headers: selects unified assembler syntax.
     43 pixman_syntax_unified
     44 
     45 /* A head macro should do all processing which results in an output of up to
     46 * 16 bytes, as far as the final load instruction. The corresponding tail macro
     47 * should complete the processing of the up-to-16 bytes. The calling macro will
     48 * sometimes choose to insert a preload or a decrement of X between them.
     49 *   cond           ARM condition code for code block
     50 *   numbytes       Number of output bytes that should be generated this time
     51 *   firstreg       First WK register in which to place output
     52 *   unaligned_src  Whether to use non-wordaligned loads of source image
     53 *   unaligned_mask Whether to use non-wordaligned loads of mask image
     54 *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
     55 */
     56 
        @ --- Straight-copy ("blit") helpers --------------------------------
        @ Shared by the src_8888_8888, src_0565_0565 and src_8_8 fast paths
        @ instantiated below.

        @ Ask the framework to save/restore STRIDE_D and STRIDE_S around each
        @ line (line_saved_regs is defined in pixman-arm-simd-asm.h), since
        @ blit_inner_loop re-aliases them as work registers.
     57 .macro blit_init
     58        line_saved_regs STRIDE_D, STRIDE_S
     59 .endm
     60 
        @ Head: just load up to 16 bytes from the source.  No per-pixel
        @ processing is needed for a plain copy, so the tail is nop_macro.
     61 .macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
     62        pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src
     63 .endm
     64 
        @ Unrolled inner loop for long runs: copy 32 bytes per iteration
        @ using all eight WK registers (the stride/mask registers are
        @ temporarily re-aliased as WK4-WK7), preloading a later source
        @ cache line each pass.  X counts remaining pixels, so the decrement
        @ is 32*8/src_bpp pixels per iteration.
     65 .macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
     66    WK4     .req    STRIDE_D
     67    WK5     .req    STRIDE_S
     68    WK6     .req    MASK
     69    WK7     .req    STRIDE_M
     70 110:    pixld   , 16, 0, SRC, \unaligned_src
     71        pixld   , 16, 4, SRC, \unaligned_src
     72        pld     [SRC, SCRATCH]
     73        pixst   , 16, 0, DST
     74        pixst   , 16, 4, DST
     75        subs    X, X, #32*8/src_bpp
     76        bhs     110b
     77    .unreq  WK4
     78    .unreq  WK5
     79    .unreq  WK6
     80    .unreq  WK7
     81 .endm
     82 
        @ Instantiate the copy fast paths.  generate_composite_function
        @ (defined in pixman-arm-simd-asm.h) appears to take: entry-point
        @ name, src_bpp, mask_bpp, dst_bpp, flags, prefetch distance, then
        @ the init/newline/cleanup/process_head/process_tail[/inner_loop]
        @ macros - TODO confirm the exact signature against the header.

        @ 32bpp -> 32bpp straight copy.
     83 generate_composite_function \
     84    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
     85    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
     86    4, /* prefetch distance */ \
     87    blit_init, \
     88    nop_macro, /* newline */ \
     89    nop_macro, /* cleanup */ \
     90    blit_process_head, \
     91    nop_macro, /* process tail */ \
     92    blit_inner_loop
     93 
        @ 16bpp -> 16bpp straight copy.
     94 generate_composite_function \
     95    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
     96    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
     97    4, /* prefetch distance */ \
     98    blit_init, \
     99    nop_macro, /* newline */ \
    100    nop_macro, /* cleanup */ \
    101    blit_process_head, \
    102    nop_macro, /* process tail */ \
    103    blit_inner_loop
    104 
        @ 8bpp -> 8bpp straight copy (note the shorter prefetch distance, 3).
    105 generate_composite_function \
    106    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
    107    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    108    3, /* prefetch distance */ \
    109    blit_init, \
    110    nop_macro, /* newline */ \
    111    nop_macro, /* cleanup */ \
    112    blit_process_head, \
    113    nop_macro, /* process tail */ \
    114    blit_inner_loop
    115 
    116 /******************************************************************************/
    117 
        @ --- Solid-fill ("src_n") helpers ----------------------------------
        @ Each init fetches the constant source pixel from the stacked
        @ arguments and replicates it across SRC, STRIDE_S, MASK and
        @ STRIDE_M, so fill_process_tail can store up to 16 bytes at a time
        @ from four identical registers.

        @ 32bpp fill: one word holds exactly one pixel, so just copy it.
    118 .macro src_n_8888_init
    119        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
    120        mov     STRIDE_S, SRC
    121        mov     MASK, SRC
    122        mov     STRIDE_M, SRC
    123 .endm
    124 
        @ 16bpp fill: duplicate the 16-bit colour into both halves of the
        @ word before replicating.
    125 .macro src_n_0565_init
    126        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
    127        orr     SRC, SRC, SRC, lsl #16
    128        mov     STRIDE_S, SRC
    129        mov     MASK, SRC
    130        mov     STRIDE_M, SRC
    131 .endm
    132 
        @ 8bpp fill: replicate the byte into all four byte lanes first.
    133 .macro src_n_8_init
    134        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
    135        orr     SRC, SRC, SRC, lsl #8
    136        orr     SRC, SRC, SRC, lsl #16
    137        mov     STRIDE_S, SRC
    138        mov     MASK, SRC
    139        mov     STRIDE_M, SRC
    140 .endm
    141 
        @ Store \numbytes of the replicated colour.  WK4-WK7 alias the four
        @ registers holding the colour; nothing is loaded, so there is no
        @ corresponding process-head (it is nop_macro in the instantiations).
    142 .macro fill_process_tail  cond, numbytes, firstreg
    143    WK4     .req    SRC
    144    WK5     .req    STRIDE_S
    145    WK6     .req    MASK
    146    WK7     .req    STRIDE_M
    147        pixst   \cond, \numbytes, 4, DST
    148    .unreq  WK4
    149    .unreq  WK5
    150    .unreq  WK6
    151    .unreq  WK7
    152 .endm
    153 
        @ Instantiate the solid fills (src_bpp 0 = constant source, no mask).
        @ Note: GAS macro arguments may be separated by whitespace as well as
        @ commas, which is why some separators below are omitted.

        @ Solid 32bpp fill.
    154 generate_composite_function \
    155    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
    156    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    157    0, /* prefetch distance doesn't apply */ \
    158    src_n_8888_init \
    159    nop_macro, /* newline */ \
    160    nop_macro /* cleanup */ \
    161    nop_macro /* process head */ \
    162    fill_process_tail
    163 
        @ Solid 16bpp fill.
    164 generate_composite_function \
    165    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
    166    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    167    0, /* prefetch distance doesn't apply */ \
    168    src_n_0565_init \
    169    nop_macro, /* newline */ \
    170    nop_macro /* cleanup */ \
    171    nop_macro /* process head */ \
    172    fill_process_tail
    173 
        @ Solid 8bpp fill.
    174 generate_composite_function \
    175    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
    176    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    177    0, /* prefetch distance doesn't apply */ \
    178    src_n_8_init \
    179    nop_macro, /* newline */ \
    180    nop_macro /* cleanup */ \
    181    nop_macro /* process head */ \
    182    fill_process_tail
    183 
    184 /******************************************************************************/
    185 
        @ --- x888 -> 8888: force the alpha byte to opaque -------------------

        @ Set the top (alpha) byte of one pixel to 0xFF, conditionally
        @ under \cond.
    186 .macro src_x888_8888_pixel, cond, reg
    187        orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
    188 .endm
    189 
        @ Head: plain load of up to 16 bytes of x888 source.
    190 .macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    191        pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src
    192 .endm
    193 
        @ Tail: set the alpha byte on each of the 1, 2 or 4 loaded pixels.
    194 .macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
    195        src_x888_8888_pixel \cond, %(\firstreg+0)
    196 .if \numbytes >= 8
    197        src_x888_8888_pixel \cond, %(\firstreg+1)
    198  .if \numbytes == 16
    199        src_x888_8888_pixel \cond, %(\firstreg+2)
    200        src_x888_8888_pixel \cond, %(\firstreg+3)
    201  .endif
    202 .endif
    203 .endm
    204 
        @ x8r8g8b8 -> a8r8g8b8 copy with alpha forced opaque.
    205 generate_composite_function \
    206    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
    207    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    208    3, /* prefetch distance */ \
    209    nop_macro, /* init */ \
    210    nop_macro, /* newline */ \
    211    nop_macro, /* cleanup */ \
    212    pixman_composite_src_x888_8888_process_head, \
    213    pixman_composite_src_x888_8888_process_tail
    214 
    215 /******************************************************************************/
    216 
        @ --- r5g6b5 -> a8r8g8b8 conversion ----------------------------------

        @ Loop invariants: MASK = green-channel mask for two 565 pixels,
        @ STRIDE_M = opaque alpha byte to OR into results.
    217 .macro src_0565_8888_init
    218        /* Hold loop invariants in MASK and STRIDE_M */
    219        ldr     MASK, =0x07E007E0
    220        mov     STRIDE_M, #0xFF000000
    221        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
    222        ldr     SCRATCH, =0x80008000
    223        uadd8   SCRATCH, SCRATCH, SCRATCH
    224 .endm
    225 
        @ Expand two 565 pixels packed in WK\reg1 (low half = first pixel)
        @ into full 8888 pixels in WK\reg1 and WK\reg2.  Each 5/6-bit channel
        @ is widened to 8 bits by replicating its top bits, and the opaque
        @ alpha (STRIDE_M) is ORed in last.  The right-hand comments track
        @ the exact bit layout of each register through the sequence.
    226 .macro src_0565_8888_2pixels, reg1, reg2
    227        and     SCRATCH, WK\()\reg1, MASK                   @ 00000GGGGGG0000000000gggggg00000
    228        bic     WK\()\reg2, WK\()\reg1, MASK                @ RRRRR000000BBBBBrrrrr000000bbbbb
    229        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6           @ 00000GGGGGGGGGGGG0000ggggggggggg
    230        mov     WK\()\reg1, WK\()\reg2, lsl #16             @ rrrrr000000bbbbb0000000000000000
    231        mov     SCRATCH, SCRATCH, ror #19                   @ GGGG0000ggggggggggg00000GGGGGGGG
    232        bic     WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
    233        orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
    234        orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
    235        pkhtb   WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
    236        sel     WK\()\reg1, WK\()\reg1, SCRATCH             @ rrrrrrrrggggggggbbbbbbbb--------
    237        mov     SCRATCH, SCRATCH, ror #16                   @ ggg00000GGGGGGGGGGGG0000gggggggg
    238        pkhtb   WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
    239        sel     WK\()\reg2, WK\()\reg2, SCRATCH             @ RRRRRRRRGGGGGGGGBBBBBBBB--------
    240        orr     WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8    @ 11111111rrrrrrrrggggggggbbbbbbbb
    241        orr     WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8    @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
    242 .endm
    243 
    244 /* This version doesn't need STRIDE_M, but is one instruction longer.
    245   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
    246        and     SCRATCH, WK\()\reg1, MASK                   @ 00000GGGGGG0000000000gggggg00000
    247        bic     WK\()\reg1, WK\()\reg1, MASK                @ RRRRR000000BBBBBrrrrr000000bbbbb
    248        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6           @ 00000GGGGGGGGGGGG0000ggggggggggg
    249        mov     WK\()\reg2, WK\()\reg1, lsr #16             @ 0000000000000000RRRRR000000BBBBB
    250        mov     SCRATCH, SCRATCH, ror #27                   @ GGGGGGGGGGGG0000ggggggggggg00000
    251        bic     WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
    252        mov     WK\()\reg2, WK\()\reg2, lsl #3              @ 0000000000000RRRRR000000BBBBB000
    253        mov     WK\()\reg1, WK\()\reg1, lsl #3              @ 0000000000000rrrrr000000bbbbb000
    254        orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
    255        orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
    256        pkhbt   WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
    257        pkhbt   WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
    258        sel     WK\()\reg2, SCRATCH, WK\()\reg2             @ --------RRRRRRRRGGGGGGGGBBBBBBBB
    259        sel     WK\()\reg1, SCRATCH, WK\()\reg1             @ --------rrrrrrrrggggggggbbbbbbbb
    260        orr     WK\()\reg2, WK\()\reg2, #0xFF000000         @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
    261        orr     WK\()\reg1, WK\()\reg1, #0xFF000000         @ 11111111rrrrrrrrggggggggbbbbbbbb
    262 */
    263 
        @ Expand a single 565 pixel in WK\reg to 8888 in place (used for the
        @ 4-byte remainder case).
    264 .macro src_0565_8888_1pixel, reg
    265        bic     SCRATCH, WK\()\reg, MASK                 @ 0000000000000000rrrrr000000bbbbb
    266        and     WK\()\reg, WK\()\reg, MASK               @ 000000000000000000000gggggg00000
    267        mov     SCRATCH, SCRATCH, lsl #3                 @ 0000000000000rrrrr000000bbbbb000
    268        mov     WK\()\reg, WK\()\reg, lsl #5             @ 0000000000000000gggggg0000000000
    269        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5        @ 0000000000000rrrrrrrrrr0bbbbbbbb
    270        orr     WK\()\reg, WK\()\reg, WK\()\reg, lsr #6  @ 000000000000000gggggggggggg00000
    271        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5        @ --------rrrrrrrr--------bbbbbbbb
    272        sel     WK\()\reg, WK\()\reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
    273        orr     WK\()\reg, WK\()\reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
    274 .endm
    275 
        @ Head: load half as many source bytes as will be written (source is
        @ 16bpp, destination 32bpp).
    276 .macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    277 .if \numbytes == 16
    278        pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src
    279 .elseif \numbytes == 8
    280        pixld   , 4, \firstreg, SRC, \unaligned_src
    281 .elseif \numbytes == 4
    282        pixld   , 2, \firstreg, SRC, \unaligned_src
    283 .endif
    284 .endm
    285 
        @ Tail: widen the loaded pixels pairwise, or singly for the 4-byte
        @ remainder.
    286 .macro src_0565_8888_process_tail   cond, numbytes, firstreg
    287 .if \numbytes == 16
    288        src_0565_8888_2pixels \firstreg, %(\firstreg+1)
    289        src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)
    290 .elseif \numbytes == 8
    291        src_0565_8888_2pixels \firstreg, %(\firstreg+1)
    292 .else
    293        src_0565_8888_1pixel \firstreg
    294 .endif
    295 .endm
    296 
        @ r5g6b5 -> a8r8g8b8 conversion entry point.
    297 generate_composite_function \
    298    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
    299    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    300    3, /* prefetch distance */ \
    301    src_0565_8888_init, \
    302    nop_macro, /* newline */ \
    303    nop_macro, /* cleanup */ \
    304    src_0565_8888_process_head, \
    305    src_0565_8888_process_tail
    306 
    307 /******************************************************************************/
    308 
        @ --- x8r8g8b8 -> r5g6b5 conversion ----------------------------------

        @ Loop invariant: MASK = 5-bit red/blue extraction mask for two
        @ half-word lanes.
    309 .macro src_x888_0565_init
    310        /* Hold loop invariant in MASK */
    311        ldr     MASK, =0x001F001F
    312        line_saved_regs  STRIDE_S, ORIG_W
    313 .endm
    314 
        @ Narrow one 888 pixel in WK\s to 565 in the low half of WK\d.
    315 .macro src_x888_0565_1pixel  s, d
    316        and     WK\()\d, MASK, WK\()\s, lsr #3           @ 00000000000rrrrr00000000000bbbbb
    317        and     STRIDE_S, WK\()\s, #0xFC00               @ 0000000000000000gggggg0000000000
    318        orr     WK\()\d, WK\()\d, WK\()\d, lsr #5        @ 00000000000-----rrrrr000000bbbbb
    319        orr     WK\()\d, WK\()\d, STRIDE_S, lsr #5       @ 00000000000-----rrrrrggggggbbbbb
    320        /* Top 16 bits are discarded during the following STRH */
    321 .endm
    322 
        @ Narrow two 888 pixels (WK\slo = first, WK\shi = second) into one
        @ word of two packed 565 pixels in WK\d, using WK\tmp as scratch.
    323 .macro src_x888_0565_2pixels  slo, shi, d, tmp
    324        and     SCRATCH, WK\()\shi, #0xFC00                 @ 0000000000000000GGGGGG0000000000
    325        and     WK\()\tmp, MASK, WK\()\shi, lsr #3          @ 00000000000RRRRR00000000000BBBBB
    326        and     WK\()\shi, MASK, WK\()\slo, lsr #3          @ 00000000000rrrrr00000000000bbbbb
    327        orr     WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5     @ 00000000000-----RRRRR000000BBBBB
    328        orr     WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5       @ 00000000000-----RRRRRGGGGGGBBBBB
    329        and     SCRATCH, WK\()\slo, #0xFC00                 @ 0000000000000000gggggg0000000000
    330        orr     WK\()\shi, WK\()\shi, WK\()\shi, lsr #5     @ 00000000000-----rrrrr000000bbbbb
    331        orr     WK\()\shi, WK\()\shi, SCRATCH, lsr #5       @ 00000000000-----rrrrrggggggbbbbb
    332        pkhbt   WK\()\d, WK\()\shi, WK\()\tmp, lsl #16      @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
    333 .endm
    334 
        @ Head: source is twice as wide as the destination, so \numbytes*2
        @ bytes are loaded.  Only seven spare registers exist (WK6 aliases
        @ WK3), so for the 16-byte case loads and conversions are interleaved
        @ here in the head to free registers - the remaining conversions
        @ happen in the tail.
    335 .macro src_x888_0565_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    336        WK4     .req    STRIDE_S
    337        WK5     .req    STRIDE_M
    338        WK6     .req    WK3
    339        WK7     .req    ORIG_W
    340 .if \numbytes == 16
    341        pixld   , 16, 4, SRC, 0
    342        src_x888_0565_2pixels  4, 5, 0, 0
    343        pixld   , 8, 4, SRC, 0
    344        src_x888_0565_2pixels  6, 7, 1, 1
    345        pixld   , 8, 6, SRC, 0
    346 .else
    347        pixld   , \numbytes*2, 4, SRC, 0
    348 .endif
    349 .endm
    350 
        @ Tail: finish the conversions begun in the head and store the
        @ packed 565 result.  Note the tail does its own store
        @ (FLAG_PROCESS_DOES_STORE below).
    351 .macro src_x888_0565_process_tail   cond, numbytes, firstreg
    352 .if \numbytes == 16
    353        src_x888_0565_2pixels  4, 5, 2, 2
    354        src_x888_0565_2pixels  6, 7, 3, 4
    355 .elseif \numbytes == 8
    356        src_x888_0565_2pixels  4, 5, 1, 1
    357        src_x888_0565_2pixels  6, 7, 2, 2
    358 .elseif \numbytes == 4
    359        src_x888_0565_2pixels  4, 5, 1, 1
    360 .else
    361        src_x888_0565_1pixel  4, 1
    362 .endif
    363 .if \numbytes == 16
    364        pixst   , \numbytes, 0, DST
    365 .else
    366        pixst   , \numbytes, 1, DST
    367 .endif
    368        .unreq  WK4
    369        .unreq  WK5
    370        .unreq  WK6
    371        .unreq  WK7
    372 .endm
    373 
        @ x8r8g8b8 -> r5g6b5 conversion entry point.
    374 generate_composite_function \
    375    pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \
    376    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    377    3, /* prefetch distance */ \
    378    src_x888_0565_init, \
    379    nop_macro, /* newline */ \
    380    nop_macro, /* cleanup */ \
    381    src_x888_0565_process_head, \
    382    src_x888_0565_process_tail
    383 
    384 /******************************************************************************/
    385 
        @ --- a8 + a8: per-byte saturating add -------------------------------

        @ Saturating-add 8 source bytes (held in MASK/STRIDE_M by the head)
        @ into 8 destination bytes.
    386 .macro add_8_8_8pixels  cond, dst1, dst2
    387        uqadd8\()\cond  WK\()\dst1, WK\()\dst1, MASK
    388        uqadd8\()\cond  WK\()\dst2, WK\()\dst2, STRIDE_M
    389 .endm
    390 
        @ Saturating-add 4 source bytes (in MASK) into 4 destination bytes.
    391 .macro add_8_8_4pixels  cond, dst
    392        uqadd8\()\cond  WK\()\dst, WK\()\dst, MASK
    393 .endm
    394 
        @ Head: load source into WK4/WK5 (aliasing MASK/STRIDE_M) and
        @ destination into the WK registers.  For 16 bytes the first half is
        @ combined here already, so that WK4/WK5 can be reloaded with the
        @ second half of the source before the tail runs.
    395 .macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    396    WK4     .req    MASK
    397    WK5     .req    STRIDE_M
    398 .if \numbytes == 16
    399        pixld   \cond, 8, 4, SRC, \unaligned_src
    400        pixld   \cond, 16, \firstreg, DST, 0
    401        add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
    402        pixld   \cond, 8, 4, SRC, \unaligned_src
    403 .else
    404        pixld   \cond, \numbytes, 4, SRC, \unaligned_src
    405        pixld   \cond, \numbytes, \firstreg, DST, 0
    406 .endif
    407    .unreq  WK4
    408    .unreq  WK5
    409 .endm
    410 
        @ Tail: combine whatever the head left unprocessed.
    411 .macro add_8_8_process_tail  cond, numbytes, firstreg
    412 .if \numbytes == 16
    413        add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)
    414 .elseif \numbytes == 8
    415        add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
    416 .else
    417        add_8_8_4pixels \cond, \firstreg
    418 .endif
    419 .endm
    420 
        @ a8 + a8 saturating-add compositing entry point.
    421 generate_composite_function \
    422    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
    423    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
    424    2, /* prefetch distance */ \
    425    nop_macro, /* init */ \
    426    nop_macro, /* newline */ \
    427    nop_macro, /* cleanup */ \
    428    add_8_8_process_head, \
    429    add_8_8_process_tail
    430 
    431 /******************************************************************************/
    432 
        @ --- OVER operator: a8r8g8b8 source over a8r8g8b8 destination -------

        @ Loop invariant: MASK = 0x00800080, the per-halfword rounding
        @ constant added by the MLAs below.  The uadd8 of 0x80+0x80 per byte
        @ also sets GE[3:0] = 0101, which SEL relies on later.
    433 .macro over_8888_8888_init
    434        /* Hold loop invariant in MASK */
    435        ldr     MASK, =0x00800080
    436        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
    437        uadd8   SCRATCH, MASK, MASK
    438        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
    439 .endm
    440 
        @ Head: source pixels go into WK4+, destination pixels into WK0+
        @ (the stride registers are re-aliased as WK4-WK7 for the duration).
    441 .macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    442    WK4     .req    STRIDE_D
    443    WK5     .req    STRIDE_S
    444    WK6     .req    STRIDE_M
    445    WK7     .req    ORIG_W
    446        pixld   , \numbytes, %(4+\firstreg), SRC, \unaligned_src
    447        pixld   , \numbytes, \firstreg, DST, 0
    448    .unreq  WK4
    449    .unreq  WK5
    450    .unreq  WK6
    451    .unreq  WK7
    452 .endm
    453 
        @ Set Z if every source word in the group is zero, so wholly
        @ transparent spans can skip all arithmetic and the store.
    454 .macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
    455        /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
    456        teq     WK\()\reg0, #0
    457 .if \numbytes > 4
    458        teqeq   WK\()\reg1, #0
    459  .if \numbytes > 8
    460        teqeq   WK\()\reg2, #0
    461        teqeq   WK\()\reg3, #0
    462  .endif
    463 .endif
    464 .endm
    465 
        @ Reduce the first source word to its alpha byte, priming the
        @ pipeline for over_8888_8888_1pixel (later pixels are prepared
        @ inside the previous pixel's stall slot).
    466 .macro over_8888_8888_prepare  next
    467        mov     WK\()\next, WK\()\next, lsr #24
    468 .endm
    469 
        @ Composite one pixel: dst = src + dst*(255-src_alpha)/255.
        @ On entry WK\src holds the source alpha only; \offset is this
        @ pixel's (negative) offset from the post-incremented SRC pointer,
        @ used to reload the full source word; WK\next is the following
        @ pixel, whose alpha extraction is hoisted into a stall slot.
        @ Instruction order is tuned around ARM11 result latencies - do not
        @ reorder.
    470 .macro over_8888_8888_1pixel src, dst, offset, next
    471        /* src = destination component multiplier */
    472        rsb     WK\()\src, WK\()\src, #255
    473        /* Split even/odd bytes of dst into SCRATCH/dst */
    474        uxtb16  SCRATCH, WK\()\dst
    475        uxtb16  WK\()\dst, WK\()\dst, ror #8
    476        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
    477        mla     SCRATCH, SCRATCH, WK\()\src, MASK
    478        mla     WK\()\dst, WK\()\dst, WK\()\src, MASK
    479        /* Where we would have had a stall between the result of the first MLA and the shifter input,
    480         * reload the complete source pixel */
    481        ldr     WK\()\src, [SRC, #\offset]
    482        /* Multiply by 257/256 to approximate 256/255 */
    483        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
    484        /* In this stall, start processing the next pixel */
    485 .if \offset < -4
    486        mov     WK\()\next, WK\()\next, lsr #24
    487 .endif
    488        uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
    489        /* Recombine even/odd bytes of multiplied destination */
    490        mov     SCRATCH, SCRATCH, ror #8
    491        sel     WK\()\dst, SCRATCH, WK\()\dst
    492        /* Saturated add of source to multiplied destination */
    493        uqadd8  WK\()\dst, WK\()\dst, WK\()\src
    494 .endm
    495 
        @ Tail: skip the whole group (including the store) when every source
        @ word is zero; otherwise composite each pixel in turn.  PROCESS_OFF
        @ walks backwards from -\numbytes to 0 because SRC has already been
        @ advanced past the group by the head's load.
    496 .macro over_8888_8888_process_tail  cond, numbytes, firstreg
    497    WK4     .req    STRIDE_D
    498    WK5     .req    STRIDE_S
    499    WK6     .req    STRIDE_M
    500    WK7     .req    ORIG_W
    501        over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
    502        beq     10f
    503        over_8888_8888_prepare  %(4+\firstreg)
    504 .set PROCESS_REG, \firstreg
    505 .set PROCESS_OFF, -\numbytes
    506 .rept \numbytes / 4
    507        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
    508  .set PROCESS_REG, PROCESS_REG+1
    509  .set PROCESS_OFF, PROCESS_OFF+4
    510 .endr
    511        pixst   , \numbytes, \firstreg, DST
    512 10:
    513    .unreq  WK4
    514    .unreq  WK5
    515    .unreq  WK6
    516    .unreq  WK7
    517 .endm
    518 
        @ OVER 8888-on-8888 entry point.
    519 generate_composite_function \
    520    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
    521    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    522    2, /* prefetch distance */ \
    523    over_8888_8888_init, \
    524    nop_macro, /* newline */ \
    525    nop_macro, /* cleanup */ \
    526    over_8888_8888_process_head, \
    527    over_8888_8888_process_tail
    528 
    528 
    529 /******************************************************************************/
    530 
    531 /* Multiply each byte of a word by a byte.
    532 * Useful when there aren't any obvious ways to fill the stalls with other instructions.
    533 * word  Register containing 4 bytes
    534 * byte  Register containing byte multiplier (bits 8-31 must be 0)
    535 * tmp   Scratch register
    536 * half  Register containing the constant 0x00800080
    537 * GE[3:0] bits must contain 0101
    538 */
        @ Computes, per byte lane: word = word * byte / 255 (approximately,
        @ with rounding): the MLA adds 0x80 to each 16-bit product and the
        @ UXTAB16 adds product>>8, i.e. multiplies by 257/256, which
        @ together approximate division by 255.
    539 .macro mul_8888_8  word, byte, tmp, half
    540        /* Split even/odd bytes of word apart */
    541        uxtb16  \tmp, \word
    542        uxtb16  \word, \word, ror #8
    543        /* Multiply bytes together with rounding, then by 257/256 */
    544        mla     \tmp, \tmp, \byte, \half
    545        mla     \word, \word, \byte, \half /* 1 stall follows */
    546        uxtab16 \tmp, \tmp, \tmp, ror #8  /* 1 stall follows */
    547        uxtab16 \word, \word, \word, ror #8
    548        /* Recombine bytes */
    549        mov     \tmp, \tmp, ror #8
    550        sel     \word, \tmp, \word
    551 .endm
    552 
    553 /******************************************************************************/
    554 
        @ --- OVER with constant mask: (src IN mask_alpha) OVER dst ----------

        @ MASK = alpha byte of the constant mask; STRIDE_M = 0x00800080
        @ rounding constant for mul_8888_8; the uadd8 sets GE[3:0] = 0101
        @ for the SELs inside mul_8888_8.
    555 .macro over_8888_n_8888_init
    556        /* Mask is constant */
    557        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
    558        /* Hold loop invariant in STRIDE_M */
    559        ldr     STRIDE_M, =0x00800080
    560        /* We only want the alpha bits of the constant mask */
    561        mov     MASK, MASK, lsr #24
    562        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
    563        uadd8   SCRATCH, STRIDE_M, STRIDE_M
    564        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
    565 .endm
    566 
        @ Head: source pixels start at WK4 or WK5 depending on \firstreg
        @ parity (4+(\firstreg%2)); destination pixels go into WK\firstreg+.
    567 .macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    568    WK4     .req    Y
    569    WK5     .req    STRIDE_D
    570    WK6     .req    STRIDE_S
    571    WK7     .req    ORIG_W
    572        pixld   , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src
    573        pixld   , \numbytes, \firstreg, DST, 0
    574    .unreq  WK4
    575    .unreq  WK5
    576    .unreq  WK6
    577    .unreq  WK7
    578 .endm
    579 
        @ One pixel: scale the source by the constant mask alpha, then
        @ dst = scaled_src + dst*(255 - scaled_src_alpha)/255.  Relies on
        @ WK6 holding 255 (set in the tail) and uses WK7 as scratch.
    580 .macro over_8888_n_8888_1pixel src, dst
    581        mul_8888_8  WK\()\src, MASK, SCRATCH, STRIDE_M
    582        sub     WK7, WK6, WK\()\src, lsr #24
    583        mul_8888_8  WK\()\dst, WK7, SCRATCH, STRIDE_M
    584        uqadd8  WK\()\dst, WK\()\dst, WK\()\src
    585 .endm
    586 
        @ Tail: skip fully-transparent source groups, otherwise process each
        @ pixel; for the 16-byte case the second pair of source pixels is
        @ reloaded into WK4/WK5 half way through because WK6/WK7 are in use
        @ as temporaries.
    587 .macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
    588    WK4     .req    Y
    589    WK5     .req    STRIDE_D
    590    WK6     .req    STRIDE_S
    591    WK7     .req    ORIG_W
    592        over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)
    593        beq     10f
    594        mov     WK6, #255
    595 .set PROCESS_REG, \firstreg
    596 .rept \numbytes / 4
    597  .if \numbytes == 16 && PROCESS_REG == 2
    598        /* We're using WK6 and WK7 as temporaries, so half way through
    599         * 4 pixels, reload the second two source pixels but this time
    600         * into WK4 and WK5 */
    601        ldmdb   SRC, {WK4, WK5}
    602  .endif
    603        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
    604  .set PROCESS_REG, PROCESS_REG+1
    605 .endr
    606        pixst   , \numbytes, \firstreg, DST
    607 10:
    608    .unreq  WK4
    609    .unreq  WK5
    610    .unreq  WK6
    611    .unreq  WK7
    612 .endm
    613 
        @ OVER 8888-with-constant-mask entry point.
    614 generate_composite_function \
    615    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
    616    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    617    2, /* prefetch distance */ \
    618    over_8888_n_8888_init, \
    619    nop_macro, /* newline */ \
    620    nop_macro, /* cleanup */ \
    621    over_8888_n_8888_process_head, \
    622    over_8888_n_8888_process_tail
    623 
    624 /******************************************************************************/
    625 
        @ --- OVER with constant source and a8 mask: (n IN mask) OVER dst ----

        @ Split the constant source once into even bytes (STRIDE_S) and odd
        @ bytes (SRC), the form mul-style code needs; GE[3:0] is set to 0101
        @ for later SELs.  The 0x00800080 rounding constant itself cannot be
        @ kept in a register here - it is reloaded per line (see newline).
    626 .macro over_n_8_8888_init
    627        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
    628        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
    629        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
    630        ldr     SCRATCH, =0x00800080
    631        uxtb16  STRIDE_S, SRC
    632        uxtb16  SRC, SRC, ror #8
    633        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
    634        uadd8   SCRATCH, SCRATCH, SCRATCH
    635        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
    636 .endm
    637 
        @ Per-line: reload the rounding constant into STRIDE_D.  The .ltorg
        @ (branched over) flushes the literal pool here so the ldr= stays
        @ within addressing range of its literal.
    638 .macro over_n_8_8888_newline
    639        ldr     STRIDE_D, =0x00800080
    640        b       1f
    641 .ltorg
    642 1:
    643 .endm
    644 
        @ Head: load \numbytes/4 mask bytes (one per pixel) into WK4, and the
        @ destination pixels into WK\firstreg+.
    645 .macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    646    WK4     .req    STRIDE_M
    647        pixld   , \numbytes/4, 4, MASK, \unaligned_mask
    648        pixld   , \numbytes, \firstreg, DST, 0
    649    .unreq  WK4
    650 .endm
    651 
        @ One pixel: \src is the byte index (0-3) of this pixel's mask value
        @ within WK4.  Y = source scaled by the mask byte; then
        @ dst = Y + dst*(255 - Y_alpha)/255 via mul_8888_8.
    652 .macro over_n_8_8888_1pixel src, dst
    653        uxtb    Y, WK4, ror #\src*8
    654        /* Trailing part of multiplication of source */
    655        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
    656        mla     Y, SRC, Y, STRIDE_D
    657        mov     ORIG_W, #255
    658        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
    659        uxtab16 Y, Y, Y, ror #8
    660        mov     SCRATCH, SCRATCH, ror #8
    661        sub     ORIG_W, ORIG_W, Y, lsr #24
    662        sel     Y, SCRATCH, Y
    663        /* Then multiply the destination */
    664        mul_8888_8  WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
    665        uqadd8  WK\()\dst, WK\()\dst, Y
    666 .endm
    667 
        @ Tail: if all (up to four) mask bytes are zero the group is a no-op
        @ and the store is skipped; otherwise composite pixel by pixel.
    668 .macro over_n_8_8888_process_tail  cond, numbytes, firstreg
    669    WK4     .req    STRIDE_M
    670        teq     WK4, #0
    671        beq     10f
    672 .set PROCESS_REG, \firstreg
    673 .rept \numbytes / 4
    674        over_n_8_8888_1pixel  %(PROCESS_REG-\firstreg), %(PROCESS_REG)
    675  .set PROCESS_REG, PROCESS_REG+1
    676 .endr
    677        pixst   , \numbytes, \firstreg, DST
    678 10:
    679    .unreq  WK4
    680 .endm
    681 
        @ OVER constant-source-with-a8-mask entry point (src_bpp 0 =
        @ constant source, mask_bpp 8).
    682 generate_composite_function \
    683    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
    684    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    685    2, /* prefetch distance */ \
    686    over_n_8_8888_init, \
    687    over_n_8_8888_newline, \
    688    nop_macro, /* cleanup */ \
    689    over_n_8_8888_process_head, \
    690    over_n_8_8888_process_tail
    691 
    692 /******************************************************************************/
    693 
/* over_reverse_n_8888: destination OVER a solid source colour
 * (result = dest + src * (255 - dest_alpha) / 255). */

/* One-time setup: fetch the solid source colour, split it into its
 * red/blue and alpha/green halfword pairs, and prime the GE flags. */
.macro over_reverse_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]  /* the solid source colour */
        ldr     MASK, =0x00800080              /* per-halfword rounding constant */
        /* Split source pixel into RB/AG parts */
        uxtb16  STRIDE_S, SRC
        uxtb16  STRIDE_M, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs  STRIDE_D, ORIG_W
.endm

/* STRIDE_D holds constant 0xFF for alpha extraction; reload it each
 * scanline because it is among the spilled line variables. */
.macro over_reverse_n_8888_newline
        mov     STRIDE_D, #0xFF
.endm

/* Only the destination needs loading; the source is a constant. */
.macro over_reverse_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   , \numbytes, \firstreg, DST, 0
.endm

/* Composite one destination pixel (in WK\d) over the solid source.
 * \is_only is 1 when this is the block's sole pixel, in which case an
 * unchanged pixel can skip the store entirely (branch to 49f). */
.macro over_reverse_n_8888_1pixel  d, is_only
        teq     WK\()\d, #0
        beq     8f       /* replace with source */
        bics    ORIG_W, STRIDE_D, WK\()\d, lsr #24  /* ORIG_W = 255 - dest alpha; Z if dest opaque */
.if \is_only == 1
        beq     49f      /* skip store */
.else
        beq     9f       /* write same value back */
.endif
        mla     SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
        mla     ORIG_W, STRIDE_M, ORIG_W, MASK  /* alpha/green */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8  /* x += x>>8 per byte: /255 approximation */
        uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sel     ORIG_W, SCRATCH, ORIG_W         /* recombine scaled source pixel */
        uqadd8  WK\()\d, WK\()\d, ORIG_W        /* dest += scaled source, saturating */
        b       9f
8:      mov     WK\()\d, SRC
9:
.endm

/* Block tail: AND the alpha bytes of all pixels in the block first —
 * if every destination pixel is already opaque the whole store can be
 * skipped (a common case for this operator). */
.macro over_reverse_n_8888_tail  numbytes, reg1, reg2, reg3, reg4
.if \numbytes == 4
        over_reverse_n_8888_1pixel  \reg1, 1
.else
        and     SCRATCH, WK\()\reg1, WK\()\reg2
 .if \numbytes == 16
        and     SCRATCH, SCRATCH, WK\()\reg3
        and     SCRATCH, SCRATCH, WK\()\reg4
 .endif
        mvns    SCRATCH, SCRATCH, asr #24       /* Z iff all alpha bytes are 0xFF */
        beq     49f /* skip store if all opaque */
        over_reverse_n_8888_1pixel  \reg1, 0
        over_reverse_n_8888_1pixel  \reg2, 0
 .if \numbytes == 16
        over_reverse_n_8888_1pixel  \reg3, 0
        over_reverse_n_8888_1pixel  \reg4, 0
 .endif
.endif
        pixst   , \numbytes, \reg1, DST
49:
.endm

.macro over_reverse_n_8888_process_tail  cond, numbytes, firstreg
        over_reverse_n_8888_tail  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm

generate_composite_function \
   pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 \
   FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
   3, /* prefetch distance */ \
   over_reverse_n_8888_init, \
   over_reverse_n_8888_newline, \
   nop_macro, /* cleanup */ \
   over_reverse_n_8888_process_head, \
   over_reverse_n_8888_process_tail
    769 
    770 /******************************************************************************/
    771 
/* over_white_8888_8888_ca: component-alpha OVER with an opaque white
 * source. With src = white, result = mask + dest * (255 - mask) / 255
 * per channel, so the mask pixel itself doubles as the source term. */

.macro over_white_8888_8888_ca_init
        HALF    .req    SRC
        TMP0    .req    STRIDE_D
        TMP1    .req    STRIDE_S
        TMP2    .req    STRIDE_M
        TMP3    .req    ORIG_W
        WK4     .req    SCRATCH
        line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
        ldr     SCRATCH, =0x800080
        mov     HALF, #0x80            /* rounding constant for smlaxx sums */
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        .set DST_PRELOAD_BIAS, 8
.endm

/* Undo the register aliases and preload bias set up by _init. */
.macro over_white_8888_8888_ca_cleanup
        .set DST_PRELOAD_BIAS, 0
        .unreq  HALF
        .unreq  TMP0
        .unreq  TMP1
        .unreq  TMP2
        .unreq  TMP3
        .unreq  WK4
.endm

/* General-case combine for one pixel: on entry TMP0 = ~mask (set by
 * the caller), \m = mask, \d = dest. Computes per channel
 * \d = \m + \d * (255 - mask_channel) / 255 using 16x16 multiplies. */
.macro over_white_8888_8888_ca_combine  m, d
        uxtb16  TMP1, TMP0                /* rb_notmask */
        uxtb16  TMP2, \d                  /* rb_dest; 1 stall follows */
        smlatt  TMP3, TMP2, TMP1, HALF    /* red */
        smlabb  TMP2, TMP2, TMP1, HALF    /* blue */
        uxtb16  TMP0, TMP0, ror #8        /* ag_notmask */
        uxtb16  TMP1, \d, ror #8          /* ag_dest; 1 stall follows */
        smlatt  \d, TMP1, TMP0, HALF      /* alpha */
        smlabb  TMP1, TMP1, TMP0, HALF    /* green */
        pkhbt   TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
        pkhbt   TMP1, TMP1, \d, lsl #16   /* ag */
        uxtab16 TMP0, TMP0, TMP0, ror #8  /* x += x>>8 per byte: /255 approximation */
        uxtab16 TMP1, TMP1, TMP1, ror #8
        mov     TMP0, TMP0, ror #8
        sel     \d, TMP0, TMP1            /* recombine rb/ag halves */
        uqadd8  \d, \d, \m                 /* d is a late result */
.endm

/* Load one mask pixel into WK1 and one dest pixel into WK3. */
.macro over_white_8888_8888_ca_1pixel_head
        pixld   , 4, 1, MASK, 0
        pixld   , 4, 3, DST, 0
.endm

/* Finish one pixel. "teq WK1, WK1, asr #32" sets Z iff the mask is
 * all-0s or all-1s, with shifter carry = mask bit 31 distinguishing
 * the two: NE -> general combine; Z,C -> opaque mask, result is the
 * mask itself (source is white); Z,!C -> zero mask, skip the store. */
.macro over_white_8888_8888_ca_1pixel_tail
        mvn     TMP0, WK1                 /* ~mask, ready for the combine */
        teq     WK1, WK1, asr #32
        bne     1f
        bcc     3f                        /* mask == 0: dest unchanged, no store */
        mov     WK3, WK1                  /* mask == ~0: white * mask = mask */
        b       2f
1:      over_white_8888_8888_ca_combine WK1, WK3
2:      pixst   , 4, 3, DST
3:
.endm

/* Load two mask pixels into WK1-WK2 (dest is loaded in the tail). */
.macro over_white_8888_8888_ca_2pixels_head
        pixld   , 8, 1, MASK, 0
.endm

/* Finish two pixels, applying the same all-0/all-1 shortcuts per pixel
 * as the 1-pixel tail; both results are stored with one stm. */
.macro over_white_8888_8888_ca_2pixels_tail
        pixld   , 8, 3, DST
        mvn     TMP0, WK1
        teq     WK1, WK1, asr #32
        bne     1f
        movcs   WK3, WK1                  /* first mask opaque: dest = mask */
        bcs     2f
        teq     WK2, #0                   /* first mask 0: skip store iff second also 0 */
        beq     5f
        b       2f
1:      over_white_8888_8888_ca_combine WK1, WK3
2:      mvn     TMP0, WK2
        teq     WK2, WK2, asr #32
        bne     3f
        movcs   WK4, WK2                  /* second mask opaque: dest = mask */
        b       4f
3:      over_white_8888_8888_ca_combine WK2, WK4
4:      pixst   , 8, 3, DST
5:
.endm

/* Blocks are processed one or two pixels at a time; a 16-byte block is
 * an extra head+tail pair followed by the final 2-pixel head. */
.macro over_white_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
.if \numbytes == 4
        over_white_8888_8888_ca_1pixel_head
.else
 .if \numbytes == 16
        over_white_8888_8888_ca_2pixels_head
        over_white_8888_8888_ca_2pixels_tail
 .endif
        over_white_8888_8888_ca_2pixels_head
.endif
.endm

.macro over_white_8888_8888_ca_process_tail  cond, numbytes, firstreg
.if \numbytes == 4
        over_white_8888_8888_ca_1pixel_tail
.else
        over_white_8888_8888_ca_2pixels_tail
.endif
.endm

generate_composite_function \
   pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \
   FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH \
   2, /* prefetch distance */ \
   over_white_8888_8888_ca_init, \
   nop_macro, /* newline */ \
   over_white_8888_8888_ca_cleanup, \
   over_white_8888_8888_ca_process_head, \
   over_white_8888_8888_ca_process_tail
    886 
    887 
/* over_n_8888_8888_ca: component-alpha OVER of an arbitrary solid
 * source colour through an 8888 mask. Registers are scarce, so several
 * constants live in a 16-byte local stack frame and are re-fetched
 * with ldrd as needed. */
.macro over_n_8888_8888_ca_init
        /* Set up constants. RB_SRC and AG_SRC are in registers;
         * RB_FLDS, A_SRC, and the two HALF values need to go on the
         * stack (and the full SRC value is already there) */
        ldr     SCRATCH, [sp, #ARGS_STACK_OFFSET]
        mov     WK0, #0x00FF0000
        orr     WK0, WK0, #0xFF        /* RB_FLDS (0x00FF00FF) */
        mov     WK1, #0x80             /* HALF default value */
        mov     WK2, SCRATCH, lsr #24  /* A_SRC */
        orr     WK3, WK1, WK1, lsl #16 /* HALF alternate value (0x00800080) */
        push    {WK0-WK3}
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
        uxtb16  SRC, SCRATCH           /* RB_SRC: red/blue halves of source */
        uxtb16  STRIDE_S, SCRATCH, ror #8  /* AG_SRC: alpha/green halves */

        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, WK3, WK3

        /* Rebind the working registers around the constants above */
        .unreq  WK0
        .unreq  WK1
        .unreq  WK2
        .unreq  WK3
        WK0     .req    Y
        WK1     .req    STRIDE_D
        RB_SRC  .req    SRC
        AG_SRC  .req    STRIDE_S
        WK2     .req    STRIDE_M
        RB_FLDS .req    r8       /* the reloaded constants have to be at consecutive registers starting at an even one */
        A_SRC   .req    r8
        HALF    .req    r9
        WK3     .req    r10
        WK4     .req    r11
        WK5     .req    SCRATCH
        WK6     .req    ORIG_W

        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm

/* Pop the local constants frame and restore the default WK bindings. */
.macro over_n_8888_8888_ca_cleanup
        add     sp, sp, #16
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16

        .unreq  WK0
        .unreq  WK1
        .unreq  RB_SRC
        .unreq  AG_SRC
        .unreq  WK2
        .unreq  RB_FLDS
        .unreq  A_SRC
        .unreq  HALF
        .unreq  WK3
        .unreq  WK4
        .unreq  WK5
        .unreq  WK6
        WK0     .req    r8
        WK1     .req    r9
        WK2     .req    r10
        WK3     .req    r11
.endm

/* Load one mask pixel into WK6 and one dest pixel into WK0. */
.macro over_n_8888_8888_ca_1pixel_head
        pixld   , 4, 6, MASK, 0
        pixld   , 4, 0, DST, 0
.endm

/* Finish one pixel. "teq WK6, WK6, asr #32" gives Z iff the mask is
 * all-0s or all-1s, with shifter carry = mask bit 31:
 *   NE      -> general component-alpha path (20f);
 *   Z,!C    -> mask 0: dest untouched, skip the store (40f);
 *   Z,C     -> mask opaque: reduces to src_8888_8888 (if the source is
 *              opaque too) or plain over_8888_8888 (10f). */
.macro over_n_8888_8888_ca_1pixel_tail
        ldrd    A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]  /* A_SRC, alt HALF (0x00800080) */
        uxtb16  WK1, WK6                 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
        teq     WK6, WK6, asr #32        /* Zc if transparent, ZC if opaque */
        bne     20f
        bcc     40f
        /* Mask is fully opaque (all channels) */
        ldr     WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
        eors    A_SRC, A_SRC, #0xFF
        bne     10f
        /* Source is also opaque - same as src_8888_8888 */
        mov     WK0, WK6
        b       30f
10:     /* Same as over_8888_8888 */
        mul_8888_8 WK0, A_SRC, WK5, HALF /* dst *= (255 - src alpha) / 255 */
        uqadd8  WK0, WK0, WK6
        b       30f
20:     /* No simplifications possible - do it the hard way */
        uxtb16  WK2, WK6, ror #8         /* ag_mask */
        mla     WK3, WK1, A_SRC, HALF    /* rb_mul; 2 cycles */
        mla     WK4, WK2, A_SRC, HALF    /* ag_mul; 2 cycles */
        ldrd    RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]  /* 0x00FF00FF, default HALF (0x80) */
        uxtb16  WK5, WK0                 /* rb_dest */
        uxtab16 WK3, WK3, WK3, ror #8
        uxtb16  WK6, WK0, ror #8         /* ag_dest */
        uxtab16 WK4, WK4, WK4, ror #8
        smlatt  WK0, RB_SRC, WK1, HALF   /* red1 */
        smlabb  WK1, RB_SRC, WK1, HALF   /* blue1 */
        bic     WK3, RB_FLDS, WK3, lsr #8  /* 255 - src_alpha*mask, per rb channel */
        bic     WK4, RB_FLDS, WK4, lsr #8  /* ditto, per ag channel */
        pkhbt   WK1, WK1, WK0, lsl #16   /* rb1 */
        smlatt  WK0, WK5, WK3, HALF      /* red2 */
        smlabb  WK3, WK5, WK3, HALF      /* blue2 */
        uxtab16 WK1, WK1, WK1, ror #8
        smlatt  WK5, AG_SRC, WK2, HALF   /* alpha1 */
        pkhbt   WK3, WK3, WK0, lsl #16   /* rb2 */
        smlabb  WK0, AG_SRC, WK2, HALF   /* green1 */
        smlatt  WK2, WK6, WK4, HALF      /* alpha2 */
        smlabb  WK4, WK6, WK4, HALF      /* green2 */
        pkhbt   WK0, WK0, WK5, lsl #16   /* ag1 */
        uxtab16 WK3, WK3, WK3, ror #8
        pkhbt   WK4, WK4, WK2, lsl #16   /* ag2 */
        uxtab16 WK0, WK0, WK0, ror #8
        uxtab16 WK4, WK4, WK4, ror #8
        mov     WK1, WK1, ror #8
        mov     WK3, WK3, ror #8
        sel     WK2, WK1, WK0            /* recombine source*mask */
        sel     WK1, WK3, WK4            /* recombine dest*(1-source_alpha*mask) */
        uqadd8  WK0, WK1, WK2            /* followed by 1 stall */
30:     /* The destination buffer is already in the L1 cache, so
         * there's little point in amalgamating writes */
        pixst   , 4, 0, DST
40:
.endm

/* Pixels are processed strictly one at a time (register pressure). */
.macro over_n_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
.rept (\numbytes / 4) - 1
        over_n_8888_8888_ca_1pixel_head
        over_n_8888_8888_ca_1pixel_tail
.endr
        over_n_8888_8888_ca_1pixel_head
.endm

.macro over_n_8888_8888_ca_process_tail  cond, numbytes, firstreg
        over_n_8888_8888_ca_1pixel_tail
.endm

/* Dispatcher: if the stacked solid-colour argument is 0xFFFFFFFF
 * (opaque white), jump to the specialised white fast path; otherwise
 * fall through into the general helper, which must be emitted
 * immediately below for the drop-through to work. */
pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
        ldr     ip, [sp]
        cmp     ip, #-1
        beq     pixman_composite_over_white_8888_8888_ca_asm_armv6
        /* else drop through... */
pixman_end_asm_function
generate_composite_function \
   pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
   FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
   2, /* prefetch distance */ \
   over_n_8888_8888_ca_init, \
   nop_macro, /* newline */ \
   over_n_8888_8888_ca_cleanup, \
   over_n_8888_8888_ca_process_head, \
   over_n_8888_8888_ca_process_tail
   1035 
   1036 /******************************************************************************/
   1037 
/* in_reverse_8888_8888: result = dest * src_alpha / 255 per channel.
 * Only the source alpha bytes are needed, so SRC is advanced by 3 and
 * read one byte per pixel. */
.macro in_reverse_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        /* Offset the source pointer: we only need the alpha bytes */
        add     SRC, SRC, #3
        line_saved_regs  ORIG_W
.endm

/* Load the alpha byte of each source pixel: the first into ORIG_W, the
 * rest into WK registers. DST is pre-advanced past the whole block so
 * the tail can address it with negative offsets / stmdb. */
.macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3
        ldrb    ORIG_W, [SRC], #4
.if \numbytes >= 8
        ldrb    WK\()\reg1, [SRC], #4
 .if \numbytes == 16
        ldrb    WK\()\reg2, [SRC], #4
        ldrb    WK\()\reg3, [SRC], #4
 .endif
.endif
        add     DST, DST, #\numbytes
.endm

.macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        in_reverse_8888_8888_head  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)
.endm

/* Scale one dest pixel \d by the alpha in \s. \offset is the (negative)
 * offset of the NEXT pixel's alpha byte, prefetched into ORIG_W to hide
 * the load latency; \is_only==1 means single-pixel block (alpha already
 * in \s, no shortcuts needed here). The oddly-placed label 48 marks the
 * final "mov \d, #0" of the set: the tail branches here when all source
 * alphas are zero — the other dest registers already hold those zero
 * alpha bytes from the head, so only the last register needs zeroing
 * before the block store writes all zeros. */
.macro in_reverse_8888_8888_1pixel  s, d, offset, is_only
.if \is_only != 1
        movs    \s, ORIG_W                  /* take this pixel's alpha; Z if transparent */
 .if \offset != 0
        ldrb    ORIG_W, [SRC, #\offset]     /* prefetch next pixel's alpha */
 .endif
        beq     1f                          /* alpha 0: dest becomes 0 */
        teq     STRIDE_M, #0xFF
        beq     2f                          /* alpha 255: dest unchanged */
.endif
        uxtb16  SCRATCH, \d                 /* rb_dest */
        uxtb16  \d, \d, ror #8               /* ag_dest */
        mla     SCRATCH, SCRATCH, \s, MASK  /* x = dest_half*alpha + 0x00800080 */
        mla     \d, \d, \s, MASK
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8  /* x += x>>8 per byte: /255 approximation */
        uxtab16 \d, \d, \d, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sel     \d, SCRATCH, \d             /* recombine rb/ag halves */
        b       2f
.if \offset == 0
48:     /* Last mov d,#0 of the set - used as part of shortcut for
         * source values all 0 */
.endif
1:      mov     \d, #0
2:
.endm

/* Block tail. First test whether every source alpha is identical AND is
 * 0 or 255 (teq chain + asr #32): Z,C -> all 255, dest untouched, skip
 * everything; Z,!C -> all 0, jump to 48f to complete zeroing and store;
 * NE -> load the dest pixels (skipped by ldm..ne otherwise) and do the
 * per-pixel multiplies. cmnne clears C in the NE case so the later
 * bcs/beq only fire on the genuine shortcuts. */
.macro in_reverse_8888_8888_tail  numbytes, reg1, reg2, reg3, reg4
.if \numbytes == 4
        teq     ORIG_W, ORIG_W, asr #32
        ldrne   WK\()\reg1, [DST, #-4]
.elseif \numbytes == 8
        teq     ORIG_W, WK\()\reg1
        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
        ldmdbne DST, {WK\()\reg1-WK\()\reg2}
.else
        teq     ORIG_W, WK\()\reg1
        teqeq   ORIG_W, WK\()\reg2
        teqeq   ORIG_W, WK\()\reg3
        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
        ldmdbne DST, {WK\()\reg1-WK\()\reg4}
.endif
        cmnne   DST, #0   /* clear C if NE */
        bcs     49f       /* no writes to dest if source all -1 */
        beq     48f       /* set dest to all 0 if source all 0 */
.if \numbytes == 4
        in_reverse_8888_8888_1pixel  ORIG_W, WK\()\reg1, 0, 1
        str     WK\()\reg1, [DST, #-4]
.elseif \numbytes == 8
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg1, -4, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg2, 0, 0
        stmdb   DST, {WK\()\reg1-WK\()\reg2}
.else
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg1, -12, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg2, -8, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg3, -4, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg4, 0, 0
        stmdb   DST, {WK\()\reg1-WK\()\reg4}
.endif
49:
.endm

.macro in_reverse_8888_8888_process_tail  cond, numbytes, firstreg
        in_reverse_8888_8888_tail  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm

generate_composite_function \
   pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \
   FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \
   2, /* prefetch distance */ \
   in_reverse_8888_8888_init, \
   nop_macro, /* newline */ \
   nop_macro, /* cleanup */ \
   in_reverse_8888_8888_process_head, \
   in_reverse_8888_8888_process_tail
   1139 
   1140 /******************************************************************************/
   1141 
/* over_n_8888: solid source OVER destination, no mask:
 * result = src + dest * (255 - src_alpha) / 255. Everything about the
 * source is loop-invariant, so it is all computed once in init. */
.macro over_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]  /* the solid source colour */
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Hold multiplier for destination in STRIDE_M */
        mov     STRIDE_M, #255
        sub     STRIDE_M, STRIDE_M, SRC, lsr #24  /* 255 - source alpha */
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
.endm

/* Only the destination needs loading; the source is a constant. */
.macro over_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   , \numbytes, \firstreg, DST, 0
.endm

/* One pixel: scale dest by (255 - src alpha), then add the source
 * with per-byte saturation. */
.macro over_n_8888_1pixel dst
        mul_8888_8  WK\()\dst, STRIDE_M, SCRATCH, MASK
        uqadd8  WK\()\dst, WK\()\dst, SRC
.endm

/* Composite every pixel of the block in turn, then store. */
.macro over_n_8888_process_tail  cond, numbytes, firstreg
.set PROCESS_REG, \firstreg
.rept \numbytes / 4
        over_n_8888_1pixel %(PROCESS_REG)
 .set PROCESS_REG, PROCESS_REG+1
.endr
        pixst   , \numbytes, \firstreg, DST
.endm

generate_composite_function \
   pixman_composite_over_n_8888_asm_armv6, 0, 0, 32 \
   FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE \
   2, /* prefetch distance */ \
   over_n_8888_init, \
   nop_macro, /* newline */ \
   nop_macro, /* cleanup */ \
   over_n_8888_process_head, \
   over_n_8888_process_tail
   1180 
   1181 /******************************************************************************/