/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission. The copyright holders make no
 * representations about the suitability of this software for any purpose. It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Ben Avison (bavison@riscosopen.org)
 *
 */

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

	.text
	.arch armv6
	.object_arch armv4
	.arm
	.altmacro
	.p2align 2

#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"

	pixman_syntax_unified

/* A head macro should do all processing which results in an output of up to
 * 16 bytes, as far as the final load instruction. The corresponding tail macro
 * should complete the processing of the up-to-16 bytes. The calling macro will
 * sometimes choose to insert a preload or a decrement of X between them.
 * cond           ARM condition code for code block
 * numbytes       Number of output bytes that should be generated this time
 * firstreg       First WK register in which to place output
 * unaligned_src  Whether to use non-wordaligned loads of source image
 * unaligned_mask Whether to use non-wordaligned loads of mask image
 * preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
 */

/* Plain blits: pixels are loaded from SRC and stored to DST unmodified.
 * STRIDE_D/STRIDE_S (and MASK/STRIDE_M in the inner loop) are reused as
 * extra work registers WK4-WK7 so 32 bytes can be in flight at once. */

.macro blit_init
	line_saved_regs STRIDE_D, STRIDE_S
.endm

.macro blit_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
	pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src
.endm

.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
110:	pixld   , 16, 0, SRC, \unaligned_src
	pixld   , 16, 4, SRC, \unaligned_src
	pld     [SRC, SCRATCH]
	pixst   , 16, 0, DST
	pixst   , 16, 4, DST
	subs    X, X, #32*8/src_bpp
	bhs     110b
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

/******************************************************************************/

/* Solid fills: the constant source colour is replicated into SRC, STRIDE_S,
 * MASK and STRIDE_M, which fill_process_tail aliases as WK4-WK7 so that a
 * single pixst can write up to 16 bytes of the colour per iteration. */

.macro src_n_8888_init
	ldr     SRC, [sp, #ARGS_STACK_OFFSET]
	mov     STRIDE_S, SRC
	mov     MASK, SRC
	mov     STRIDE_M, SRC
.endm

.macro src_n_0565_init
	ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
	orr     SRC, SRC, SRC, lsl #16
	mov     STRIDE_S, SRC
	mov     MASK, SRC
	mov     STRIDE_M, SRC
.endm

.macro src_n_8_init
	ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
	orr     SRC, SRC, SRC, lsl #8
	orr     SRC, SRC, SRC, lsl #16
	mov     STRIDE_S, SRC
	mov     MASK, SRC
	mov     STRIDE_M, SRC
.endm

.macro fill_process_tail  cond, numbytes, firstreg
    WK4     .req    SRC
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
	pixst   \cond, \numbytes, 4, DST
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

/* NOTE: several of the argument lists below separate macro arguments with
 * whitespace rather than commas; gas accepts either, so this is intentional. */

generate_composite_function \
    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_8888_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_0565_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_8_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

/******************************************************************************/

/* x888 -> 8888 conversion: simply force the alpha byte to 0xFF. */

.macro src_x888_8888_pixel, cond, reg
	orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
.endm

.macro pixman_composite_src_x888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
	pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src
.endm

.macro pixman_composite_src_x888_8888_process_tail  cond, numbytes, firstreg
	src_x888_8888_pixel \cond, %(\firstreg+0)
 .if \numbytes >= 8
	src_x888_8888_pixel \cond, %(\firstreg+1)
  .if \numbytes == 16
	src_x888_8888_pixel \cond, %(\firstreg+2)
	src_x888_8888_pixel \cond, %(\firstreg+3)
  .endif
 .endif
.endm

generate_composite_function \
    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    pixman_composite_src_x888_8888_process_head, \
    pixman_composite_src_x888_8888_process_tail

/******************************************************************************/

/* 0565 -> 8888 conversion, widening each 5/6-bit channel to 8 bits by
 * replicating its top bits, two pixels at a time where possible. */

.macro src_0565_8888_init
	/* Hold loop invariants in MASK and STRIDE_M */
	ldr     MASK, =0x07E007E0
	mov     STRIDE_M, #0xFF000000
	/* Set GE[3:0] to 1010 so SEL instructions do what we want */
	ldr     SCRATCH, =0x80008000
	uadd8   SCRATCH, SCRATCH, SCRATCH
.endm

.macro src_0565_8888_2pixels, reg1, reg2
	and     SCRATCH, WK\()\reg1, MASK               @ 00000GGGGGG0000000000gggggg00000
	bic     WK\()\reg2, WK\()\reg1, MASK            @ RRRRR000000BBBBBrrrrr000000bbbbb
	orr     SCRATCH, SCRATCH, SCRATCH, lsr #6       @ 00000GGGGGGGGGGGG0000ggggggggggg
	mov     WK\()\reg1, WK\()\reg2, lsl #16         @ rrrrr000000bbbbb0000000000000000
	mov     SCRATCH, SCRATCH, ror #19               @ GGGG0000ggggggggggg00000GGGGGGGG
	bic     WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16  @ RRRRR000000BBBBB0000000000000000
	orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5   @ rrrrrrrrrr0bbbbbbbbbb00000000000
	orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5   @ RRRRRRRRRR0BBBBBBBBBB00000000000
	pkhtb   WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5   @ rrrrrrrr--------bbbbbbbb--------
	sel     WK\()\reg1, WK\()\reg1, SCRATCH         @ rrrrrrrrggggggggbbbbbbbb--------
	mov     SCRATCH, SCRATCH, ror #16               @ ggg00000GGGGGGGGGGGG0000gggggggg
	pkhtb   WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5   @ RRRRRRRR--------BBBBBBBB--------
	sel     WK\()\reg2, WK\()\reg2, SCRATCH         @ RRRRRRRRGGGGGGGGBBBBBBBB--------
	orr     WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8     @ 11111111rrrrrrrrggggggggbbbbbbbb
	orr     WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8     @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm

/* This version doesn't need STRIDE_M, but is one instruction longer.
   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
	and     SCRATCH, WK\()\reg1, MASK               @ 00000GGGGGG0000000000gggggg00000
	bic     WK\()\reg1, WK\()\reg1, MASK            @ RRRRR000000BBBBBrrrrr000000bbbbb
	orr     SCRATCH, SCRATCH, SCRATCH, lsr #6       @ 00000GGGGGGGGGGGG0000ggggggggggg
	mov     WK\()\reg2, WK\()\reg1, lsr #16         @ 0000000000000000RRRRR000000BBBBB
	mov     SCRATCH, SCRATCH, ror #27               @ GGGGGGGGGGGG0000ggggggggggg00000
	bic     WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16  @ 0000000000000000rrrrr000000bbbbb
	mov     WK\()\reg2, WK\()\reg2, lsl #3          @ 0000000000000RRRRR000000BBBBB000
	mov     WK\()\reg1, WK\()\reg1, lsl #3          @ 0000000000000rrrrr000000bbbbb000
	orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5   @ 0000000000000RRRRRRRRRR0BBBBBBBB
	orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5   @ 0000000000000rrrrrrrrrr0bbbbbbbb
	pkhbt   WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5   @ --------RRRRRRRR--------BBBBBBBB
	pkhbt   WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5   @ --------rrrrrrrr--------bbbbbbbb
	sel     WK\()\reg2, SCRATCH, WK\()\reg2         @ --------RRRRRRRRGGGGGGGGBBBBBBBB
	sel     WK\()\reg1, SCRATCH, WK\()\reg1         @ --------rrrrrrrrggggggggbbbbbbbb
	orr     WK\()\reg2, WK\()\reg2, #0xFF000000     @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
	orr     WK\()\reg1, WK\()\reg1, #0xFF000000     @ 11111111rrrrrrrrggggggggbbbbbbbb
*/

.macro src_0565_8888_1pixel, reg
	bic     SCRATCH, WK\()\reg, MASK                @ 0000000000000000rrrrr000000bbbbb
	and     WK\()\reg, WK\()\reg, MASK              @ 000000000000000000000gggggg00000
	mov     SCRATCH, SCRATCH, lsl #3                @ 0000000000000rrrrr000000bbbbb000
	mov     WK\()\reg, WK\()\reg, lsl #5            @ 0000000000000000gggggg0000000000
	orr     SCRATCH, SCRATCH, SCRATCH, lsr #5       @ 0000000000000rrrrrrrrrr0bbbbbbbb
	orr     WK\()\reg, WK\()\reg, WK\()\reg, lsr #6 @ 000000000000000gggggggggggg00000
	pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5       @ --------rrrrrrrr--------bbbbbbbb
	sel     WK\()\reg, WK\()\reg, SCRATCH           @ --------rrrrrrrrggggggggbbbbbbbb
	orr     WK\()\reg, WK\()\reg, #0xFF000000       @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm

.macro src_0565_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
	/* Source is 16bpp, so only half \numbytes needs loading */
 .if \numbytes == 16
	pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src
 .elseif \numbytes == 8
	pixld   , 4, \firstreg, SRC, \unaligned_src
 .elseif \numbytes == 4
	pixld   , 2, \firstreg, SRC, \unaligned_src
 .endif
.endm

.macro src_0565_8888_process_tail  cond, numbytes, firstreg
 .if \numbytes == 16
	src_0565_8888_2pixels \firstreg, %(\firstreg+1)
	src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)
 .elseif \numbytes == 8
	src_0565_8888_2pixels \firstreg, %(\firstreg+1)
 .else
	src_0565_8888_1pixel \firstreg
 .endif
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    3, /* prefetch distance */ \
    src_0565_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_0565_8888_process_head, \
    src_0565_8888_process_tail

/******************************************************************************/

/* x888 -> 0565 conversion: truncate each channel to 5/6 bits and pack. */

.macro src_x888_0565_init
	/* Hold loop invariant in MASK */
	ldr     MASK, =0x001F001F
	line_saved_regs  STRIDE_S, ORIG_W
.endm

.macro src_x888_0565_1pixel  s, d
	and     WK\()\d, MASK, WK\()\s, lsr #3          @ 00000000000rrrrr00000000000bbbbb
	and     STRIDE_S, WK\()\s, #0xFC00              @ 0000000000000000gggggg0000000000
	orr     WK\()\d, WK\()\d, WK\()\d, lsr #5       @ 00000000000-----rrrrr000000bbbbb
	orr     WK\()\d, WK\()\d, STRIDE_S, lsr #5      @ 00000000000-----rrrrrggggggbbbbb
	/* Top 16 bits are discarded during the following STRH */
.endm

.macro src_x888_0565_2pixels  slo, shi, d, tmp
	and     SCRATCH, WK\()\shi, #0xFC00             @ 0000000000000000GGGGGG0000000000
	and     WK\()\tmp, MASK, WK\()\shi, lsr #3      @ 00000000000RRRRR00000000000BBBBB
	and     WK\()\shi, MASK, WK\()\slo, lsr #3      @ 00000000000rrrrr00000000000bbbbb
	orr     WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
	orr     WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5   @ 00000000000-----RRRRRGGGGGGBBBBB
	and     SCRATCH, WK\()\slo, #0xFC00             @ 0000000000000000gggggg0000000000
	orr     WK\()\shi, WK\()\shi, WK\()\shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
	orr     WK\()\shi, WK\()\shi, SCRATCH, lsr #5   @ 00000000000-----rrrrrggggggbbbbb
	pkhbt   WK\()\d, WK\()\shi, WK\()\tmp, lsl #16  @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
.endm

.macro src_x888_0565_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_S
    WK5     .req    STRIDE_M
    WK6     .req    WK3
    WK7     .req    ORIG_W
 .if \numbytes == 16
	pixld   , 16, 4, SRC, 0
	src_x888_0565_2pixels  4, 5, 0, 0
	pixld   , 8, 4, SRC, 0
	src_x888_0565_2pixels  6, 7, 1, 1
	pixld   , 8, 6, SRC, 0
 .else
	pixld   , \numbytes*2, 4, SRC, 0
 .endif
.endm

.macro src_x888_0565_process_tail  cond, numbytes, firstreg
 .if \numbytes == 16
	src_x888_0565_2pixels  4, 5, 2, 2
	src_x888_0565_2pixels  6, 7, 3, 4
 .elseif \numbytes == 8
	src_x888_0565_2pixels  4, 5, 1, 1
	src_x888_0565_2pixels  6, 7, 2, 2
 .elseif \numbytes == 4
	src_x888_0565_2pixels  4, 5, 1, 1
 .else
	src_x888_0565_1pixel  4, 1
 .endif
 .if \numbytes == 16
	pixst   , \numbytes, 0, DST
 .else
	pixst   , \numbytes, 1, DST
 .endif
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    3, /* prefetch distance */ \
    src_x888_0565_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_x888_0565_process_head, \
    src_x888_0565_process_tail

/******************************************************************************/

/* add_8_8: per-byte saturating add of source onto destination (UQADD8). */

.macro add_8_8_8pixels  cond, dst1, dst2
	uqadd8\()\cond  WK\()\dst1, WK\()\dst1, MASK
	uqadd8\()\cond  WK\()\dst2, WK\()\dst2, STRIDE_M
.endm

.macro add_8_8_4pixels  cond, dst
	uqadd8\()\cond  WK\()\dst, WK\()\dst, MASK
.endm

.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    MASK
    WK5     .req    STRIDE_M
 .if \numbytes == 16
	pixld   \cond, 8, 4, SRC, \unaligned_src
	pixld   \cond, 16, \firstreg, DST, 0
	add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
	pixld   \cond, 8, 4, SRC, \unaligned_src
 .else
	pixld   \cond, \numbytes, 4, SRC, \unaligned_src
	pixld   \cond, \numbytes, \firstreg, DST, 0
 .endif
    .unreq  WK4
    .unreq  WK5
.endm

.macro add_8_8_process_tail  cond, numbytes, firstreg
 .if \numbytes == 16
	add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)
 .elseif \numbytes == 8
	add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
 .else
	add_8_8_4pixels \cond, \firstreg
 .endif
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    add_8_8_process_head, \
    add_8_8_process_tail

/******************************************************************************/

/* over_8888_8888: Porter-Duff OVER of a premultiplied a8r8g8b8 source onto
 * an a8r8g8b8 destination. Fully-transparent runs are skipped entirely. */

.macro over_8888_8888_init
	/* Hold loop invariant in MASK */
	ldr     MASK, =0x00800080
	/* Set GE[3:0] to 0101 so SEL instructions do what we want */
	uadd8   SCRATCH, MASK, MASK
	line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
.endm

.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
	pixld   , \numbytes, %(4+\firstreg), SRC, \unaligned_src
	pixld   , \numbytes, \firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
	/* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
	teq     WK\()\reg0, #0
 .if \numbytes > 4
	teqeq   WK\()\reg1, #0
  .if \numbytes > 8
	teqeq   WK\()\reg2, #0
	teqeq   WK\()\reg3, #0
  .endif
 .endif
.endm

.macro over_8888_8888_prepare  next
	mov     WK\()\next, WK\()\next, lsr #24
.endm

.macro over_8888_8888_1pixel src, dst, offset, next
	/* src = destination component multiplier */
	rsb     WK\()\src, WK\()\src, #255
	/* Split even/odd bytes of dst into SCRATCH/dst */
	uxtb16  SCRATCH, WK\()\dst
	uxtb16  WK\()\dst, WK\()\dst, ror #8
	/* Multiply through, adding 0.5 to the upper byte of result for rounding */
	mla     SCRATCH, SCRATCH, WK\()\src, MASK
	mla     WK\()\dst, WK\()\dst, WK\()\src, MASK
	/* Where we would have had a stall between the result of the first MLA and the shifter input,
	 * reload the complete source pixel */
	ldr     WK\()\src, [SRC, #\offset]
	/* Multiply by 257/256 to approximate 256/255 */
	uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
	/* In this stall, start processing the next pixel */
 .if \offset < -4
	mov     WK\()\next, WK\()\next, lsr #24
 .endif
	uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
	/* Recombine even/odd bytes of multiplied destination */
	mov     SCRATCH, SCRATCH, ror #8
	sel     WK\()\dst, SCRATCH, WK\()\dst
	/* Saturated add of source to multiplied destination */
	uqadd8  WK\()\dst, WK\()\dst, WK\()\src
.endm

.macro over_8888_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
	over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
	beq     10f
	over_8888_8888_prepare %(4+\firstreg)
 .set PROCESS_REG, \firstreg
 .set PROCESS_OFF, -\numbytes
 .rept \numbytes / 4
	over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
  .set PROCESS_OFF, PROCESS_OFF+4
 .endr
	pixst   , \numbytes, \firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_8888_process_head, \
    over_8888_8888_process_tail

/******************************************************************************/

/* Multiply each byte of a word by a byte.
 * Useful when there aren't any obvious ways to fill the stalls with other instructions.
 * word  Register containing 4 bytes
 * byte  Register containing byte multiplier (bits 8-31 must be 0)
 * tmp   Scratch register
 * half  Register containing the constant 0x00800080
 * GE[3:0] bits must contain 0101
 */
.macro mul_8888_8  word, byte, tmp, half
	/* Split even/odd bytes of word apart */
	uxtb16  \tmp, \word
	uxtb16  \word, \word, ror #8
	/* Multiply bytes together with rounding, then by 257/256 */
	mla     \tmp, \tmp, \byte, \half
	mla     \word, \word, \byte, \half  /* 1 stall follows */
	uxtab16 \tmp, \tmp, \tmp, ror #8    /* 1 stall follows */
	uxtab16 \word, \word, \word, ror #8
	/* Recombine bytes */
	mov     \tmp, \tmp, ror #8
	sel     \word, \tmp, \word
.endm

/******************************************************************************/

/* over_8888_n_8888: OVER with a constant alpha applied to the source. */

.macro over_8888_n_8888_init
	/* Mask is constant */
	ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
	/* Hold loop invariant in STRIDE_M */
	ldr     STRIDE_M, =0x00800080
	/* We only want the alpha bits of the constant mask */
	mov     MASK, MASK, lsr #24
	/* Set GE[3:0] to 0101 so SEL instructions do what we want */
	uadd8   SCRATCH, STRIDE_M, STRIDE_M
	line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
.endm

.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
	pixld   , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src
	pixld   , \numbytes, \firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

.macro over_8888_n_8888_1pixel src, dst
	mul_8888_8  WK\()\src, MASK, SCRATCH, STRIDE_M
	sub     WK7, WK6, WK\()\src, lsr #24
	mul_8888_8  WK\()\dst, WK7, SCRATCH, STRIDE_M
	uqadd8  WK\()\dst, WK\()\dst, WK\()\src
.endm

.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
	over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)
	beq     10f
	mov     WK6, #255
 .set PROCESS_REG, \firstreg
 .rept \numbytes / 4
  .if \numbytes == 16 && PROCESS_REG == 2
	/* We're using WK6 and WK7 as temporaries, so half way through
	 * 4 pixels, reload the second two source pixels but this time
	 * into WK4 and WK5 */
	ldmdb   SRC, {WK4, WK5}
  .endif
	over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
	pixst   , \numbytes, \firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_8888_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_n_8888_process_head, \
    over_8888_n_8888_process_tail

/******************************************************************************/

/* over_n_8_8888: constant source, a8 mask, a8r8g8b8 destination. */

.macro over_n_8_8888_init
	/* Source is constant, but splitting it into even/odd bytes is a loop invariant */
	ldr     SRC, [sp, #ARGS_STACK_OFFSET]
	/* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
	ldr     SCRATCH, =0x00800080
	uxtb16  STRIDE_S, SRC
	uxtb16  SRC, SRC, ror #8
	/* Set GE[3:0] to 0101 so SEL instructions do what we want */
	uadd8   SCRATCH, SCRATCH, SCRATCH
	line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm

.macro over_n_8_8888_newline
	ldr     STRIDE_D, =0x00800080
	b       1f
 .ltorg
1:
.endm

.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_M
	pixld   , \numbytes/4, 4, MASK, \unaligned_mask
	pixld   , \numbytes, \firstreg, DST, 0
    .unreq  WK4
.endm

.macro over_n_8_8888_1pixel src, dst
	uxtb    Y, WK4, ror #\src*8
	/* Trailing part of multiplication of source */
	mla     SCRATCH, STRIDE_S, Y, STRIDE_D
	mla     Y, SRC, Y, STRIDE_D
	mov     ORIG_W, #255
	uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
	uxtab16 Y, Y, Y, ror #8
	mov     SCRATCH, SCRATCH, ror #8
	sub     ORIG_W, ORIG_W, Y, lsr #24
	sel     Y, SCRATCH, Y
	/* Then multiply the destination */
	mul_8888_8  WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
	uqadd8  WK\()\dst, WK\()\dst, Y
.endm

.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_M
	teq     WK4, #0
	beq     10f
 .set PROCESS_REG, \firstreg
 .rept \numbytes / 4
	over_n_8_8888_1pixel  %(PROCESS_REG-\firstreg), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
	pixst   , \numbytes, \firstreg, DST
10:
    .unreq  WK4
.endm

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_n_8_8888_init, \
    over_n_8_8888_newline, \
    nop_macro, /* cleanup */ \
    over_n_8_8888_process_head, \
    over_n_8_8888_process_tail

/******************************************************************************/

/* over_reverse_n_8888: constant source composited UNDER the destination
 * (dest OVER source); opaque destination runs skip the store entirely. */

.macro over_reverse_n_8888_init
	ldr     SRC, [sp, #ARGS_STACK_OFFSET]
	ldr     MASK, =0x00800080
	/* Split source pixel into RB/AG parts */
	uxtb16  STRIDE_S, SRC
	uxtb16  STRIDE_M, SRC, ror #8
	/* Set GE[3:0] to 0101 so SEL instructions do what we want */
	uadd8   SCRATCH, MASK, MASK
	line_saved_regs STRIDE_D, ORIG_W
.endm

.macro over_reverse_n_8888_newline
	mov     STRIDE_D, #0xFF
.endm

.macro over_reverse_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
	pixld   , \numbytes, \firstreg, DST, 0
.endm

.macro over_reverse_n_8888_1pixel  d, is_only
	teq     WK\()\d, #0
	beq     8f       /* replace with source */
	bics    ORIG_W, STRIDE_D, WK\()\d, lsr #24
 .if \is_only == 1
	beq     49f      /* skip store */
 .else
	beq     9f       /* write same value back */
 .endif
	mla     SCRATCH, STRIDE_S, ORIG_W, MASK  /* red/blue */
	mla     ORIG_W, STRIDE_M, ORIG_W, MASK   /* alpha/green */
	uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
	uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
	mov     SCRATCH, SCRATCH, ror #8
	sel     ORIG_W, SCRATCH, ORIG_W
	uqadd8  WK\()\d, WK\()\d, ORIG_W
	b       9f
8:	mov     WK\()\d, SRC
9:
.endm

.macro over_reverse_n_8888_tail  numbytes, reg1, reg2, reg3, reg4
 .if \numbytes == 4
	over_reverse_n_8888_1pixel \reg1, 1
 .else
	and     SCRATCH, WK\()\reg1, WK\()\reg2
  .if \numbytes == 16
	and     SCRATCH, SCRATCH, WK\()\reg3
	and     SCRATCH, SCRATCH, WK\()\reg4
  .endif
	mvns    SCRATCH, SCRATCH, asr #24
	beq     49f      /* skip store if all opaque */
	over_reverse_n_8888_1pixel \reg1, 0
	over_reverse_n_8888_1pixel \reg2, 0
  .if \numbytes == 16
	over_reverse_n_8888_1pixel \reg3, 0
	over_reverse_n_8888_1pixel \reg4, 0
  .endif
 .endif
	pixst   , \numbytes, \reg1, DST
49:
.endm

.macro over_reverse_n_8888_process_tail  cond, numbytes, firstreg
	over_reverse_n_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm

generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    3, /* prefetch distance */ \
    over_reverse_n_8888_init, \
    over_reverse_n_8888_newline, \
    nop_macro, /* cleanup */ \
    over_reverse_n_8888_process_head, \
    over_reverse_n_8888_process_tail

/******************************************************************************/

/* over_white_8888_8888_ca: component-alpha OVER with an opaque white source,
 * so the per-channel mask acts directly as the source contribution. */

.macro over_white_8888_8888_ca_init
	HALF    .req    SRC
	TMP0    .req    STRIDE_D
	TMP1    .req    STRIDE_S
	TMP2    .req    STRIDE_M
	TMP3    .req    ORIG_W
	WK4     .req    SCRATCH
	line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
	ldr     SCRATCH, =0x800080
	mov     HALF, #0x80
	/* Set GE[3:0] to 0101 so SEL instructions do what we want */
	uadd8   SCRATCH, SCRATCH, SCRATCH
	.set DST_PRELOAD_BIAS, 8
.endm

.macro over_white_8888_8888_ca_cleanup
	.set DST_PRELOAD_BIAS, 0
	.unreq  HALF
	.unreq  TMP0
	.unreq  TMP1
	.unreq  TMP2
	.unreq  TMP3
	.unreq  WK4
.endm

.macro over_white_8888_8888_ca_combine  m, d
	uxtb16  TMP1, TMP0                /* rb_notmask */
	uxtb16  TMP2, \d                  /* rb_dest; 1 stall follows */
	smlatt  TMP3, TMP2, TMP1, HALF    /* red */
	smlabb  TMP2, TMP2, TMP1, HALF    /* blue */
	uxtb16  TMP0, TMP0, ror #8        /* ag_notmask */
	uxtb16  TMP1, \d, ror #8          /* ag_dest; 1 stall follows */
	smlatt  \d, TMP1, TMP0, HALF      /* alpha */
	smlabb  TMP1, TMP1, TMP0, HALF    /* green */
	pkhbt   TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
	pkhbt   TMP1, TMP1, \d, lsl #16   /* ag */
	uxtab16 TMP0, TMP0, TMP0, ror #8
	uxtab16 TMP1, TMP1, TMP1, ror #8
	mov     TMP0, TMP0, ror #8
	sel     \d, TMP0, TMP1
	uqadd8  \d, \d, \m                /* d is a late result */
.endm

.macro over_white_8888_8888_ca_1pixel_head
	pixld   , 4, 1, MASK, 0
	pixld   , 4, 3, DST, 0
.endm

.macro over_white_8888_8888_ca_1pixel_tail
	mvn     TMP0, WK1
	teq     WK1, WK1, asr #32
	bne     1f
	bcc     3f
	mov     WK3, WK1
	b       2f
1:	over_white_8888_8888_ca_combine WK1, WK3
2:	pixst   , 4, 3, DST
3:
.endm

.macro over_white_8888_8888_ca_2pixels_head
	pixld   , 8, 1, MASK, 0
.endm

.macro over_white_8888_8888_ca_2pixels_tail
	pixld   , 8, 3, DST
	mvn     TMP0, WK1
	teq     WK1, WK1, asr #32
	bne     1f
	movcs   WK3, WK1
	bcs     2f
	teq     WK2, #0
	beq     5f
	b       2f
1:	over_white_8888_8888_ca_combine WK1, WK3
2:	mvn     TMP0, WK2
	teq     WK2, WK2, asr #32
	bne     3f
	movcs   WK4, WK2
	b       4f
3:	over_white_8888_8888_ca_combine WK2, WK4
4:	pixst   , 8, 3, DST
5:
.endm

.macro over_white_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if \numbytes == 4
	over_white_8888_8888_ca_1pixel_head
 .else
  .if \numbytes == 16
	over_white_8888_8888_ca_2pixels_head
	over_white_8888_8888_ca_2pixels_tail
  .endif
	over_white_8888_8888_ca_2pixels_head
 .endif
.endm

.macro over_white_8888_8888_ca_process_tail  cond, numbytes, firstreg
 .if \numbytes == 4
	over_white_8888_8888_ca_1pixel_tail
 .else
	over_white_8888_8888_ca_2pixels_tail
 .endif
.endm

generate_composite_function \
    pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH \
    2, /* prefetch distance */ \
    over_white_8888_8888_ca_init, \
    nop_macro, /* newline */ \
    over_white_8888_8888_ca_cleanup, \
    over_white_8888_8888_ca_process_head, \
    over_white_8888_8888_ca_process_tail


/* over_n_8888_8888_ca: general component-alpha OVER of a constant source. */

.macro over_n_8888_8888_ca_init
	/* Set up constants. RB_SRC and AG_SRC are in registers;
	 * RB_FLDS, A_SRC, and the two HALF values need to go on the
	 * stack (and the full SRC value is already there) */
	ldr     SCRATCH, [sp, #ARGS_STACK_OFFSET]
	mov     WK0, #0x00FF0000
	orr     WK0, WK0, #0xFF           /* RB_FLDS (0x00FF00FF) */
	mov     WK1, #0x80                /* HALF default value */
	mov     WK2, SCRATCH, lsr #24     /* A_SRC */
	orr     WK3, WK1, WK1, lsl #16    /* HALF alternate value (0x00800080) */
	push    {WK0-WK3}
 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
	uxtb16  SRC, SCRATCH
	uxtb16  STRIDE_S, SCRATCH, ror #8

	/* Set GE[3:0] to 0101 so SEL instructions do what we want */
	uadd8   SCRATCH, WK3, WK3

	.unreq  WK0
	.unreq  WK1
	.unreq  WK2
	.unreq  WK3
	WK0     .req    Y
	WK1     .req    STRIDE_D
	RB_SRC  .req    SRC
	AG_SRC  .req    STRIDE_S
	WK2     .req    STRIDE_M
	RB_FLDS .req    r8   /* the reloaded constants have to be at consecutive registers starting at an even one */
	A_SRC   .req    r8
	HALF    .req    r9
	WK3     .req    r10
	WK4     .req    r11
	WK5     .req    SCRATCH
	WK6     .req    ORIG_W

	line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm

.macro over_n_8888_8888_ca_cleanup
	add     sp, sp, #16
 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16

	.unreq  WK0
	.unreq  WK1
	.unreq  RB_SRC
	.unreq  AG_SRC
	.unreq  WK2
	.unreq  RB_FLDS
	.unreq  A_SRC
	.unreq  HALF
	.unreq  WK3
	.unreq  WK4
	.unreq  WK5
	.unreq  WK6
	WK0     .req    r8
	WK1     .req    r9
	WK2     .req    r10
	WK3     .req    r11
.endm

.macro over_n_8888_8888_ca_1pixel_head
	pixld   , 4, 6, MASK, 0
	pixld   , 4, 0, DST, 0
.endm

.macro over_n_8888_8888_ca_1pixel_tail
	ldrd    A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]
	uxtb16  WK1, WK6                  /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
	teq     WK6, WK6, asr #32         /* Zc if transparent, ZC if opaque */
	bne     20f
	bcc     40f
	/* Mask is fully opaque (all channels) */
	ldr     WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
	eors    A_SRC, A_SRC, #0xFF
	bne     10f
	/* Source is also opaque - same as src_8888_8888 */
	mov     WK0, WK6
	b       30f
10:	/* Same as over_8888_8888 */
	mul_8888_8  WK0, A_SRC, WK5, HALF
	uqadd8  WK0, WK0, WK6
	b       30f
20:	/* No simplifications possible - do it the hard way */
	uxtb16  WK2, WK6, ror #8          /* ag_mask */
	mla     WK3, WK1, A_SRC, HALF     /* rb_mul; 2 cycles */
	mla     WK4, WK2, A_SRC, HALF     /* ag_mul; 2 cycles */
	ldrd    RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]
	uxtb16  WK5, WK0                  /* rb_dest */
	uxtab16 WK3, WK3, WK3, ror #8
	uxtb16  WK6, WK0, ror #8          /* ag_dest */
	uxtab16 WK4, WK4, WK4, ror #8
	smlatt  WK0, RB_SRC, WK1, HALF    /* red1 */
	smlabb  WK1, RB_SRC, WK1, HALF    /* blue1 */
	bic     WK3, RB_FLDS, WK3, lsr #8
	bic     WK4, RB_FLDS, WK4, lsr #8
	pkhbt   WK1, WK1, WK0, lsl #16    /* rb1 */
	smlatt  WK0, WK5, WK3, HALF       /* red2 */
	smlabb  WK3, WK5, WK3, HALF       /* blue2 */
	uxtab16 WK1, WK1, WK1, ror #8
	smlatt  WK5, AG_SRC, WK2, HALF    /* alpha1 */
	pkhbt   WK3, WK3, WK0, lsl #16    /* rb2 */
	smlabb  WK0, AG_SRC, WK2, HALF    /* green1 */
	smlatt  WK2, WK6, WK4, HALF       /* alpha2 */
	smlabb  WK4, WK6, WK4, HALF       /* green2 */
	pkhbt   WK0, WK0, WK5, lsl #16    /* ag1 */
	uxtab16 WK3, WK3, WK3, ror #8
	pkhbt   WK4, WK4, WK2, lsl #16    /* ag2 */
	uxtab16 WK0, WK0, WK0, ror #8
	uxtab16 WK4, WK4, WK4, ror #8
	mov     WK1, WK1, ror #8
	mov     WK3, WK3, ror #8
	sel     WK2, WK1, WK0             /* recombine source*mask */
	sel     WK1, WK3, WK4             /* recombine dest*(1-source_alpha*mask) */
	uqadd8  WK0, WK1, WK2             /* followed by 1 stall */
30:	/* The destination buffer is already in the L1 cache, so
	 * there's little point in amalgamating writes */
	pixst   , 4, 0, DST
40:
.endm

.macro over_n_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .rept (\numbytes / 4) - 1
	over_n_8888_8888_ca_1pixel_head
	over_n_8888_8888_ca_1pixel_tail
 .endr
	over_n_8888_8888_ca_1pixel_head
.endm

.macro over_n_8888_8888_ca_process_tail  cond, numbytes, firstreg
	over_n_8888_8888_ca_1pixel_tail
.endm

/* Dispatcher: if the constant source (5th stack argument) is 0xFFFFFFFF
 * (opaque white), delegate to the specialised white routine; otherwise fall
 * through into the general helper generated below. */
pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
	ldr     ip, [sp]
	cmp     ip, #-1
	beq     pixman_composite_over_white_8888_8888_ca_asm_armv6
	/* else drop through... */
pixman_end_asm_function
generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
    2, /* prefetch distance */ \
    over_n_8888_8888_ca_init, \
    nop_macro, /* newline */ \
    over_n_8888_8888_ca_cleanup, \
    over_n_8888_8888_ca_process_head, \
    over_n_8888_8888_ca_process_tail

/******************************************************************************/

/* in_reverse_8888_8888: only the alpha bytes of the source are needed, so
 * SRC is pre-offset by 3 and read with byte loads striding 4. */

.macro in_reverse_8888_8888_init
	/* Hold loop invariant in MASK */
	ldr     MASK, =0x00800080
	/* Set GE[3:0] to 0101 so SEL instructions do what we want */
	uadd8   SCRATCH, MASK, MASK
	/* Offset the source pointer: we only need the alpha bytes */
	add     SRC, SRC, #3
	line_saved_regs  ORIG_W
.endm

.macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3
	ldrb    ORIG_W, [SRC], #4
 .if \numbytes >= 8
	ldrb    WK\()\reg1, [SRC], #4
  .if \numbytes == 16
	ldrb    WK\()\reg2, [SRC], #4
	ldrb    WK\()\reg3, [SRC], #4
  .endif
 .endif
	add     DST, DST, #\numbytes
.endm

.macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
in_reverse_8888_8888_head \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2) 1062 .endm 1063 1064 .macro in_reverse_8888_8888_1pixel s, d, offset, is_only 1065 .if \is_only != 1 1066 movs \s, ORIG_W 1067 .if \offset != 0 1068 ldrb ORIG_W, [SRC, #\offset] 1069 .endif 1070 beq 1f 1071 teq STRIDE_M, #0xFF 1072 beq 2f 1073 .endif 1074 uxtb16 SCRATCH, \d /* rb_dest */ 1075 uxtb16 \d, \d, ror #8 /* ag_dest */ 1076 mla SCRATCH, SCRATCH, \s, MASK 1077 mla \d, \d, \s, MASK 1078 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 1079 uxtab16 \d, \d, \d, ror #8 1080 mov SCRATCH, SCRATCH, ror #8 1081 sel \d, SCRATCH, \d 1082 b 2f 1083 .if \offset == 0 1084 48: /* Last mov d,#0 of the set - used as part of shortcut for 1085 * source values all 0 */ 1086 .endif 1087 1: mov \d, #0 1088 2: 1089 .endm 1090 1091 .macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4 1092 .if \numbytes == 4 1093 teq ORIG_W, ORIG_W, asr #32 1094 ldrne WK\()\reg1, [DST, #-4] 1095 .elseif \numbytes == 8 1096 teq ORIG_W, WK\()\reg1 1097 teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */ 1098 ldmdbne DST, {WK\()\reg1-WK\()\reg2} 1099 .else 1100 teq ORIG_W, WK\()\reg1 1101 teqeq ORIG_W, WK\()\reg2 1102 teqeq ORIG_W, WK\()\reg3 1103 teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? 
*/ 1104 ldmdbne DST, {WK\()\reg1-WK\()\reg4} 1105 .endif 1106 cmnne DST, #0 /* clear C if NE */ 1107 bcs 49f /* no writes to dest if source all -1 */ 1108 beq 48f /* set dest to all 0 if source all 0 */ 1109 .if \numbytes == 4 1110 in_reverse_8888_8888_1pixel ORIG_W, WK\()\reg1, 0, 1 1111 str WK\()\reg1, [DST, #-4] 1112 .elseif \numbytes == 8 1113 in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -4, 0 1114 in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, 0, 0 1115 stmdb DST, {WK\()\reg1-WK\()\reg2} 1116 .else 1117 in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -12, 0 1118 in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, -8, 0 1119 in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg3, -4, 0 1120 in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg4, 0, 0 1121 stmdb DST, {WK\()\reg1-WK\()\reg4} 1122 .endif 1123 49: 1124 .endm 1125 1126 .macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg 1127 in_reverse_8888_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3) 1128 .endm 1129 1130 generate_composite_function \ 1131 pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \ 1132 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \ 1133 2, /* prefetch distance */ \ 1134 in_reverse_8888_8888_init, \ 1135 nop_macro, /* newline */ \ 1136 nop_macro, /* cleanup */ \ 1137 in_reverse_8888_8888_process_head, \ 1138 in_reverse_8888_8888_process_tail 1139 1140 /******************************************************************************/ 1141 1142 .macro over_n_8888_init 1143 ldr SRC, [sp, #ARGS_STACK_OFFSET] 1144 /* Hold loop invariant in MASK */ 1145 ldr MASK, =0x00800080 1146 /* Hold multiplier for destination in STRIDE_M */ 1147 mov STRIDE_M, #255 1148 sub STRIDE_M, STRIDE_M, SRC, lsr #24 1149 /* Set GE[3:0] to 0101 so SEL instructions do what we want */ 1150 uadd8 SCRATCH, MASK, MASK 1151 .endm 1152 1153 .macro 
over_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload 1154 pixld , \numbytes, \firstreg, DST, 0 1155 .endm 1156 1157 .macro over_n_8888_1pixel dst 1158 mul_8888_8 WK\()\dst, STRIDE_M, SCRATCH, MASK 1159 uqadd8 WK\()\dst, WK\()\dst, SRC 1160 .endm 1161 1162 .macro over_n_8888_process_tail cond, numbytes, firstreg 1163 .set PROCESS_REG, \firstreg 1164 .rept \numbytes / 4 1165 over_n_8888_1pixel %(PROCESS_REG) 1166 .set PROCESS_REG, PROCESS_REG+1 1167 .endr 1168 pixst , \numbytes, \firstreg, DST 1169 .endm 1170 1171 generate_composite_function \ 1172 pixman_composite_over_n_8888_asm_armv6, 0, 0, 32 \ 1173 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE \ 1174 2, /* prefetch distance */ \ 1175 over_n_8888_init, \ 1176 nop_macro, /* newline */ \ 1177 nop_macro, /* cleanup */ \ 1178 over_n_8888_process_head, \ 1179 over_n_8888_process_tail 1180 1181 /******************************************************************************/