pixman-arm-simd-asm.h (34714B)
/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission. The copyright holders make no
 * representations about the suitability of this software for any purpose. It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Ben Avison (bavison@riscosopen.org)
 *
 */

/*
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *    processed, another cacheline is preloaded (the exact distance ahead is
 *    determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *    cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *    are preloaded to deal with data (if any) that hangs off the end of the
 *    last iteration of the inner loop, plus any trailing bytes that were not
 *    enough to make up one whole iteration of the inner loop
 *
 * There are (in general) three distinct code paths, selected between
 * depending upon how long the pixel row is. If it is long enough that there
 * is at least one iteration of the inner loop (as described above) then
 * this is described as the "wide" case. If it is shorter than that, but
 * there are still enough bytes output that there is at least one 16-byte-
 * long, 16-byte-aligned write to the destination (the optimum type of
 * write), then this is the "medium" case. If it is not even this long, then
 * this is the "narrow" case, and there is no attempt to align writes to
 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
 * cachelines containing data from the pixel row are prefetched up-front.
 */

/*
 * Determine whether we put the arguments on the stack for debugging.
 */
#undef DEBUG_PARAMS

/*
 * Bit flags for 'generate_composite_function' macro which are used
 * to tune generated functions behavior.
 */
.set FLAG_DST_WRITEONLY,             0
.set FLAG_DST_READWRITE,             1
.set FLAG_COND_EXEC,                 0
.set FLAG_BRANCH_OVER,               2
.set FLAG_PROCESS_PRESERVES_PSR,     0
.set FLAG_PROCESS_CORRUPTS_PSR,      4
.set FLAG_PROCESS_DOESNT_STORE,      0
.set FLAG_PROCESS_DOES_STORE,        8 /* usually because it needs to conditionally skip it */
.set FLAG_NO_SPILL_LINE_VARS,        0
.set FLAG_SPILL_LINE_VARS_WIDE,      16
.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
.set FLAG_SPILL_LINE_VARS,           48
.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
.set FLAG_PROCESS_PRESERVES_WK0,     0
.set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
.set FLAG_PRELOAD_DST,               0
.set FLAG_NO_PRELOAD_DST,            256

/*
 * Number of bytes by which to adjust preload offset of destination
 * buffer (allows preload instruction to be moved before the load(s))
 */
.set DST_PRELOAD_BIAS, 0

/*
 * Offset into stack where mask and source pointer/stride can be accessed.
 */
#ifdef DEBUG_PARAMS
.set ARGS_STACK_OFFSET,   (9*4+9*4)
#else
.set ARGS_STACK_OFFSET,   (9*4)
#endif

/*
 * Offset into stack where space allocated during init macro can be accessed.
 */
.set LOCALS_STACK_OFFSET, 0

/*
 * Constants for selecting preferable prefetch type.
 */
.set PREFETCH_TYPE_NONE,     0
.set PREFETCH_TYPE_STANDARD, 1

/*
 * Definitions of macros for load/store of pixel data.
112 */ 113 114 .macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0 115 .if \numbytes == 16 116 .if \unaligned == 1 117 \op\()r\()\cond WK\()\reg0, [\base], #4 118 \op\()r\()\cond WK\()\reg1, [\base], #4 119 \op\()r\()\cond WK\()\reg2, [\base], #4 120 \op\()r\()\cond WK\()\reg3, [\base], #4 121 .else 122 \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3} 123 .endif 124 .elseif \numbytes == 8 125 .if \unaligned == 1 126 \op\()r\()\cond WK\()\reg0, [\base], #4 127 \op\()r\()\cond WK\()\reg1, [\base], #4 128 .else 129 \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1} 130 .endif 131 .elseif \numbytes == 4 132 \op\()r\()\cond WK\()\reg0, [\base], #4 133 .elseif \numbytes == 2 134 \op\()rh\()\cond WK\()\reg0, [\base], #2 135 .elseif \numbytes == 1 136 \op\()rb\()\cond WK\()\reg0, [\base], #1 137 .else 138 .error "unsupported size: \numbytes" 139 .endif 140 .endm 141 142 .macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base 143 .if \numbytes == 16 144 stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3} 145 .elseif \numbytes == 8 146 stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1} 147 .elseif \numbytes == 4 148 str\()\cond WK\()\reg0, [\base, #-4] 149 .elseif \numbytes == 2 150 strh\()\cond WK\()\reg0, [\base, #-2] 151 .elseif \numbytes == 1 152 strb\()\cond WK\()\reg0, [\base, #-1] 153 .else 154 .error "unsupported size: \numbytes" 155 .endif 156 .endm 157 158 .macro pixld cond, numbytes, firstreg, base, unaligned 159 pixldst ld, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base, \unaligned 160 .endm 161 162 .macro pixst cond, numbytes, firstreg, base 163 .if (flags) & FLAG_DST_READWRITE 164 pixst_baseupdated \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base 165 .else 166 pixldst st, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base 167 .endif 168 .endm 169 170 .macro PF a, x:vararg 
171 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD) 172 \a \x 173 .endif 174 .endm 175 176 177 .macro preload_leading_step1 bpp, ptr, base 178 /* If the destination is already 16-byte aligned, then we need to preload 179 * between 0 and prefetch_distance (inclusive) cache lines ahead so there 180 * are no gaps when the inner loop starts. 181 */ 182 .if \bpp > 0 183 PF bic, \ptr, \base, #31 184 .set OFFSET, 0 185 .rept prefetch_distance+1 186 PF pld, [\ptr, #OFFSET] 187 .set OFFSET, OFFSET+32 188 .endr 189 .endif 190 .endm 191 192 .macro preload_leading_step2 bpp, bpp_shift, ptr, base 193 /* However, if the destination is not 16-byte aligned, we may need to 194 * preload more cache lines than that. The question we need to ask is: 195 * are the bytes corresponding to the leading pixels more than the amount 196 * by which the source pointer will be rounded down for preloading, and if 197 * so, by how many cache lines? Effectively, we want to calculate 198 * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp 199 * inner_loop_offset = (src+leading_bytes)&31 200 * extra_needed = leading_bytes - inner_loop_offset 201 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only 202 * possible when there are 4 src bytes for every 1 dst byte). 
203 */ 204 .if \bpp > 0 205 .ifc \base,DST 206 /* The test can be simplified further when preloading the destination */ 207 PF tst, \base, #16 208 PF beq, 61f 209 .else 210 .if \bpp/dst_w_bpp == 4 211 PF add, SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift 212 PF and, SCRATCH, SCRATCH, #31 213 PF rsb, SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift 214 PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */ 215 PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */ 216 PF bcs, 61f 217 PF bpl, 60f 218 PF pld, [ptr, #32*(prefetch_distance+2)] 219 .else 220 PF mov, SCRATCH, \base, lsl #32-5 221 PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift 222 PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift 223 PF bls, 61f 224 .endif 225 .endif 226 60: PF pld, [\ptr, #32*(prefetch_distance+1)] 227 61: 228 .endif 229 .endm 230 231 #define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2)) 232 .macro preload_middle bpp, base, scratch_holds_offset 233 .if \bpp > 0 234 /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */ 235 .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/\bpp) 236 .if \scratch_holds_offset 237 PF pld, [\base, SCRATCH] 238 .else 239 PF bic, SCRATCH, \base, #31 240 PF pld, [SCRATCH, #32*prefetch_distance] 241 .endif 242 .endif 243 .endif 244 .endm 245 246 .macro preload_trailing bpp, bpp_shift, base 247 .if \bpp > 0 248 .if \bpp*pix_per_block > 256 249 /* Calculations are more complex if more than one fetch per block */ 250 PF and, WK1, \base, #31 251 PF add, WK1, WK1, WK0, lsl #\bpp_shift 252 PF add, WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1) 253 PF bic, SCRATCH, \base, #31 254 80: PF pld, [SCRATCH, #32*(prefetch_distance+1)] 255 PF add, SCRATCH, SCRATCH, #32 256 PF subs, WK1, WK1, #32 257 PF bhi, 80b 258 .else 259 /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */ 260 PF mov, SCRATCH, \base, lsl 
#32-5 261 PF adds, SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift 262 PF adcseq, SCRATCH, SCRATCH, #0 263 /* The instruction above has two effects: ensures Z is only 264 * set if C was clear (so Z indicates that both shifted quantities 265 * were 0), and clears C if Z was set (so C indicates that the sum 266 * of the shifted quantities was greater and not equal to 32) */ 267 PF beq, 82f 268 PF bic, SCRATCH, \base, #31 269 PF bcc, 81f 270 PF pld, [SCRATCH, #32*(prefetch_distance+2)] 271 81: PF pld, [SCRATCH, #32*(prefetch_distance+1)] 272 82: 273 .endif 274 .endif 275 .endm 276 277 278 .macro preload_line narrow_case, bpp, bpp_shift, base 279 /* "narrow_case" - just means that the macro was invoked from the "narrow" 280 * code path rather than the "medium" one - because in the narrow case, 281 * the row of pixels is known to output no more than 30 bytes, then 282 * (assuming the source pixels are no wider than the the destination 283 * pixels) they cannot possibly straddle more than 2 32-byte cachelines, 284 * meaning there's no need for a loop. 
285 * "bpp" - number of bits per pixel in the channel (source, mask or 286 * destination) that's being preloaded, or 0 if this channel is not used 287 * for reading 288 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course) 289 * "base" - base address register of channel to preload (SRC, MASK or DST) 290 */ 291 .if \bpp > 0 292 .if \narrow_case && (\bpp <= dst_w_bpp) 293 /* In these cases, each line for each channel is in either 1 or 2 cache lines */ 294 PF bic, WK0, \base, #31 295 PF pld, [WK0] 296 PF add, WK1, \base, X, LSL #\bpp_shift 297 PF sub, WK1, WK1, #1 298 PF bic, WK1, WK1, #31 299 PF cmp, WK1, WK0 300 PF beq, 90f 301 PF pld, [WK1] 302 90: 303 .else 304 PF bic, WK0, \base, #31 305 PF pld, [WK0] 306 PF add, WK1, \base, X, lsl #\bpp_shift 307 PF sub, WK1, WK1, #1 308 PF bic, WK1, WK1, #31 309 PF cmp, WK1, WK0 310 PF beq, 92f 311 91: PF add, WK0, WK0, #32 312 PF cmp, WK0, WK1 313 PF pld, [WK0] 314 PF bne, 91b 315 92: 316 .endif 317 .endif 318 .endm 319 320 321 .macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx 322 \process_head \cond, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, 0 323 .if \decrementx 324 sub\()\cond X, X, #8*\numbytes/dst_w_bpp 325 .endif 326 \process_tail \cond, \numbytes, \firstreg 327 .if !((flags) & FLAG_PROCESS_DOES_STORE) 328 pixst \cond, \numbytes, \firstreg, DST 329 .endif 330 .endm 331 332 .macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx 333 .if (flags) & FLAG_BRANCH_OVER 334 .ifc \cond,mi 335 bpl 100f 336 .endif 337 .ifc \cond,cs 338 bcc 100f 339 .endif 340 .ifc \cond,ne 341 beq 100f 342 .endif 343 conditional_process1_helper , \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx 344 100: 345 .else 346 conditional_process1_helper \cond, \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, 
\unaligned_mask, \decrementx 347 .endif 348 .endm 349 350 .macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx 351 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE) 352 /* Can't interleave reads and writes */ 353 \test 354 conditional_process1 \cond1, \process_head, \process_tail, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, \decrementx 355 .if (flags) & FLAG_PROCESS_CORRUPTS_PSR 356 \test 357 .endif 358 conditional_process1 \cond2, \process_head, \process_tail, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, \decrementx 359 .else 360 /* Can interleave reads and writes for better scheduling */ 361 \test 362 \process_head \cond1, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, 0 363 \process_head \cond2, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, 0 364 .if \decrementx 365 sub\()\cond1 X, X, #8*\numbytes1/dst_w_bpp 366 sub\()\cond2 X, X, #8*\numbytes2/dst_w_bpp 367 .endif 368 \process_tail \cond1, \numbytes1, \firstreg1 369 \process_tail \cond2, \numbytes2, \firstreg2 370 pixst \cond1, \numbytes1, \firstreg1, DST 371 pixst \cond2, \numbytes2, \firstreg2, DST 372 .endif 373 .endm 374 375 376 .macro test_bits_1_0_ptr 377 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 378 movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */ 379 .else 380 movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */ 381 .endif 382 .endm 383 384 .macro test_bits_3_2_ptr 385 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 386 movs SCRATCH, X, lsl #32-3 /* C,N = bits 3, 2 of DST */ 387 .else 388 movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */ 389 .endif 390 .endm 391 392 .macro leading_15bytes process_head, process_tail 393 /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */ 394 .set DECREMENT_X, 1 395 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 396 .set 
DECREMENT_X, 0 397 sub X, X, WK0, lsr #dst_bpp_shift 398 str X, [sp, #LINE_SAVED_REG_COUNT*4] 399 mov X, WK0 400 .endif 401 /* Use unaligned loads in all cases for simplicity */ 402 .if dst_w_bpp == 8 403 conditional_process2 test_bits_1_0_ptr, mi, cs, \process_head, \process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X 404 .elseif dst_w_bpp == 16 405 test_bits_1_0_ptr 406 conditional_process1 cs, \process_head, \process_tail, 2, 2, 1, 1, DECREMENT_X 407 .endif 408 conditional_process2 test_bits_3_2_ptr, mi, cs, \process_head, \process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X 409 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 410 ldr X, [sp, #LINE_SAVED_REG_COUNT*4] 411 .endif 412 .endm 413 414 .macro test_bits_3_2_pix 415 movs SCRATCH, X, lsl #dst_bpp_shift+32-3 416 .endm 417 418 .macro test_bits_1_0_pix 419 .if dst_w_bpp == 8 420 movs SCRATCH, X, lsl #dst_bpp_shift+32-1 421 .else 422 movs SCRATCH, X, lsr #1 423 .endif 424 .endm 425 426 .macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask 427 conditional_process2 test_bits_3_2_pix, cs, mi, \process_head, \process_tail, 8, 4, 0, 2, \unaligned_src, \unaligned_mask, 0 428 .if dst_w_bpp == 16 429 test_bits_1_0_pix 430 conditional_process1 cs, \process_head, \process_tail, 2, 0, \unaligned_src, \unaligned_mask, 0 431 .elseif dst_w_bpp == 8 432 conditional_process2 test_bits_1_0_pix, cs, mi, \process_head, \process_tail, 2, 1, 0, 1, \unaligned_src, \unaligned_mask, 0 433 .endif 434 .endm 435 436 437 .macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment 438 110: 439 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */ 440 .rept pix_per_block*dst_w_bpp/128 441 \process_head , 16, 0, \unaligned_src, \unaligned_mask, 1 442 .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) 443 preload_middle src_bpp, SRC, 1 444 .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) 445 
preload_middle mask_bpp, MASK, 1 446 .else 447 preload_middle src_bpp, SRC, 0 448 preload_middle mask_bpp, MASK, 0 449 .endif 450 .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0) 451 /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that 452 * destination prefetches are 32-byte aligned. It's also the easiest channel to offset 453 * preloads for, to achieve staggered prefetches for multiple channels, because there are 454 * always two STMs per prefetch, so there is always an opposite STM on which to put the 455 * preload. Note, no need to BIC the base register here */ 456 PF pld, [DST, #32*prefetch_distance - \dst_alignment] 457 .endif 458 \process_tail , 16, 0 459 .if !((flags) & FLAG_PROCESS_DOES_STORE) 460 pixst , 16, 0, DST 461 .endif 462 .set SUBBLOCK, SUBBLOCK+1 463 .endr 464 subs X, X, #pix_per_block 465 bhs 110b 466 .endm 467 468 .macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask 469 /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */ 470 .if dst_r_bpp > 0 471 tst DST, #16 472 bne 111f 473 \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS 474 b 112f 475 111: 476 .endif 477 \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS 478 112: 479 /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ 480 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256) 481 PF and, WK0, X, #pix_per_block-1 482 .endif 483 preload_trailing src_bpp, src_bpp_shift, SRC 484 preload_trailing mask_bpp, mask_bpp_shift, MASK 485 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 486 preload_trailing dst_r_bpp, dst_bpp_shift, DST 487 .endif 488 add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp 489 
/* The remainder of the line is handled identically to the medium case */ 490 medium_case_inner_loop_and_trailing_pixels \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask 491 .endm 492 493 .macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask 494 120: 495 \process_head , 16, 0, \unaligned_src, \unaligned_mask, 0 496 \process_tail , 16, 0 497 .if !((flags) & FLAG_PROCESS_DOES_STORE) 498 pixst , 16, 0, DST 499 .endif 500 subs X, X, #128/dst_w_bpp 501 bhs 120b 502 /* Trailing pixels */ 503 tst X, #128/dst_w_bpp - 1 504 beq \exit_label 505 trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask 506 .endm 507 508 .macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask 509 tst X, #16*8/dst_w_bpp 510 conditional_process1 ne, \process_head, \process_tail, 16, 0, \unaligned_src, \unaligned_mask, 0 511 /* Trailing pixels */ 512 /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */ 513 trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask 514 .endm 515 516 .macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label 517 /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */ 518 .if mask_bpp == 8 || mask_bpp == 16 519 tst MASK, #3 520 bne 141f 521 .endif 522 .if src_bpp == 8 || src_bpp == 16 523 tst SRC, #3 524 bne 140f 525 .endif 526 \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 0 527 .if src_bpp == 8 || src_bpp == 16 528 b \exit_label 529 140: 530 \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 0 531 .endif 532 .if mask_bpp == 8 || mask_bpp == 16 533 b \exit_label 534 141: 535 .if src_bpp == 8 || src_bpp == 16 536 tst SRC, #3 537 bne 142f 538 .endif 539 \action \process_head, 
\process_tail, \process_inner_loop, \exit_label, 0, 1 540 .if src_bpp == 8 || src_bpp == 16 541 b \exit_label 542 142: 543 \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 1 544 .endif 545 .endif 546 .endm 547 548 549 .macro end_of_line restore_x, vars_spilled, loop_label, last_one 550 .if \vars_spilled 551 /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */ 552 /* This is ldmia sp,{} */ 553 .word 0xE89D0000 | LINE_SAVED_REGS 554 .endif 555 subs Y, Y, #1 556 .if \vars_spilled 557 .if (LINE_SAVED_REGS) & (1<<1) 558 str Y, [sp] 559 .endif 560 .endif 561 add DST, DST, STRIDE_D 562 .if src_bpp > 0 563 add SRC, SRC, STRIDE_S 564 .endif 565 .if mask_bpp > 0 566 add MASK, MASK, STRIDE_M 567 .endif 568 .if \restore_x 569 mov X, ORIG_W 570 .endif 571 bhs \loop_label 572 .ifc "\last_one","" 573 .if \vars_spilled 574 b 197f 575 .else 576 b 198f 577 .endif 578 .else 579 .if (!\vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS) 580 b 198f 581 .endif 582 .endif 583 .endm 584 585 586 .macro generate_composite_function fname, \ 587 src_bpp_, \ 588 mask_bpp_, \ 589 dst_w_bpp_, \ 590 flags_, \ 591 prefetch_distance_, \ 592 init, \ 593 newline, \ 594 cleanup, \ 595 process_head, \ 596 process_tail, \ 597 process_inner_loop 598 599 pixman_asm_function \fname 600 601 /* 602 * Make some macro arguments globally visible and accessible 603 * from other macros 604 */ 605 .set src_bpp, \src_bpp_ 606 .set mask_bpp, \mask_bpp_ 607 .set dst_w_bpp, \dst_w_bpp_ 608 .set flags, \flags_ 609 .set prefetch_distance, \prefetch_distance_ 610 611 /* 612 * Select prefetch type for this function. 
613 */ 614 .if prefetch_distance == 0 615 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE 616 .else 617 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD 618 .endif 619 620 .if src_bpp == 32 621 .set src_bpp_shift, 2 622 .elseif src_bpp == 24 623 .set src_bpp_shift, 0 624 .elseif src_bpp == 16 625 .set src_bpp_shift, 1 626 .elseif src_bpp == 8 627 .set src_bpp_shift, 0 628 .elseif src_bpp == 0 629 .set src_bpp_shift, -1 630 .else 631 .error "requested src bpp (src_bpp) is not supported" 632 .endif 633 634 .if mask_bpp == 32 635 .set mask_bpp_shift, 2 636 .elseif mask_bpp == 24 637 .set mask_bpp_shift, 0 638 .elseif mask_bpp == 8 639 .set mask_bpp_shift, 0 640 .elseif mask_bpp == 0 641 .set mask_bpp_shift, -1 642 .else 643 .error "requested mask bpp (mask_bpp) is not supported" 644 .endif 645 646 .if dst_w_bpp == 32 647 .set dst_bpp_shift, 2 648 .elseif dst_w_bpp == 24 649 .set dst_bpp_shift, 0 650 .elseif dst_w_bpp == 16 651 .set dst_bpp_shift, 1 652 .elseif dst_w_bpp == 8 653 .set dst_bpp_shift, 0 654 .else 655 .error "requested dst bpp (dst_w_bpp) is not supported" 656 .endif 657 658 .if (((flags) & FLAG_DST_READWRITE) != 0) 659 .set dst_r_bpp, dst_w_bpp 660 .else 661 .set dst_r_bpp, 0 662 .endif 663 664 .set pix_per_block, 16*8/dst_w_bpp 665 .if src_bpp != 0 666 .if 32*8/src_bpp > pix_per_block 667 .set pix_per_block, 32*8/src_bpp 668 .endif 669 .endif 670 .if mask_bpp != 0 671 .if 32*8/mask_bpp > pix_per_block 672 .set pix_per_block, 32*8/mask_bpp 673 .endif 674 .endif 675 .if dst_r_bpp != 0 676 .if 32*8/dst_r_bpp > pix_per_block 677 .set pix_per_block, 32*8/dst_r_bpp 678 .endif 679 .endif 680 681 /* The standard entry conditions set up by pixman-arm-common.h are: 682 * r0 = width (pixels) 683 * r1 = height (rows) 684 * r2 = pointer to top-left pixel of destination 685 * r3 = destination stride (pixels) 686 * [sp] = source pixel value, or pointer to top-left pixel of source 687 * [sp,#4] = 0 or source stride (pixels) 688 * The following arguments are unused for 
non-mask operations 689 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask 690 * [sp,#12] = 0 or mask stride (pixels) 691 */ 692 693 /* 694 * Assign symbolic names to registers 695 */ 696 X .req r0 /* pixels to go on this line */ 697 Y .req r1 /* lines to go */ 698 DST .req r2 /* destination pixel pointer */ 699 STRIDE_D .req r3 /* destination stride (bytes, minus width) */ 700 SRC .req r4 /* source pixel pointer */ 701 STRIDE_S .req r5 /* source stride (bytes, minus width) */ 702 MASK .req r6 /* mask pixel pointer (if applicable) */ 703 STRIDE_M .req r7 /* mask stride (bytes, minus width) */ 704 WK0 .req r8 /* pixel data registers */ 705 WK1 .req r9 706 WK2 .req r10 707 WK3 .req r11 708 SCRATCH .req r12 709 ORIG_W .req r14 /* width (pixels) */ 710 711 push {r4-r11, lr} /* save all registers */ 712 713 subs Y, Y, #1 714 blo 199f 715 716 #ifdef DEBUG_PARAMS 717 sub sp, sp, #9*4 718 #endif 719 720 .if src_bpp > 0 721 ldr SRC, [sp, #ARGS_STACK_OFFSET] 722 ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4] 723 .endif 724 .if mask_bpp > 0 725 ldr MASK, [sp, #ARGS_STACK_OFFSET+8] 726 ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12] 727 .endif 728 729 #ifdef DEBUG_PARAMS 730 add Y, Y, #1 731 stmia sp, {r0-r7,pc} 732 sub Y, Y, #1 733 #endif 734 735 \init 736 737 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 738 /* Reserve a word in which to store X during leading pixels */ 739 sub sp, sp, #4 740 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4 741 .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4 742 .endif 743 744 lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */ 745 sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift 746 .if src_bpp > 0 747 lsl STRIDE_S, #src_bpp_shift 748 sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift 749 .endif 750 .if mask_bpp > 0 751 lsl STRIDE_M, #mask_bpp_shift 752 sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift 753 .endif 754 755 /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? 
*/ 756 cmp X, #2*16*8/dst_w_bpp - 1 757 blo 170f 758 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */ 759 /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */ 760 cmp X, #(prefetch_distance+3)*pix_per_block - 1 761 blo 160f 762 763 /* Wide case */ 764 /* Adjust X so that the decrement instruction can also test for 765 * inner loop termination. We want it to stop when there are 766 * (prefetch_distance+1) complete blocks to go. */ 767 sub X, X, #(prefetch_distance+2)*pix_per_block 768 mov ORIG_W, X 769 .if (flags) & FLAG_SPILL_LINE_VARS_WIDE 770 /* This is stmdb sp!,{} */ 771 .word 0xE92D0000 | LINE_SAVED_REGS 772 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 773 .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 774 .endif 775 151: /* New line */ 776 \newline 777 preload_leading_step1 src_bpp, WK1, SRC 778 preload_leading_step1 mask_bpp, WK2, MASK 779 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 780 preload_leading_step1 dst_r_bpp, WK3, DST 781 .endif 782 783 ands WK0, DST, #15 784 beq 154f 785 rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */ 786 787 preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC 788 preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK 789 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 790 preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST 791 .endif 792 793 leading_15bytes \process_head, \process_tail 794 795 154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */ 796 .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) 797 and SCRATCH, SRC, #31 798 rsb SCRATCH, SCRATCH, #32*prefetch_distance 799 .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) 800 and SCRATCH, MASK, #31 801 rsb SCRATCH, SCRATCH, #32*prefetch_distance 802 
.endif 803 .ifc "\process_inner_loop","" 804 switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f 805 .else 806 switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f 807 .endif 808 809 157: /* Check for another line */ 810 end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b 811 .if (flags) & FLAG_SPILL_LINE_VARS_WIDE 812 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 813 .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 814 .endif 815 .endif 816 817 .ltorg 818 819 160: /* Medium case */ 820 mov ORIG_W, X 821 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE 822 /* This is stmdb sp!,{} */ 823 .word 0xE92D0000 | LINE_SAVED_REGS 824 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 825 .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 826 .endif 827 161: /* New line */ 828 \newline 829 preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ 830 preload_line 0, mask_bpp, mask_bpp_shift, MASK 831 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 832 preload_line 0, dst_r_bpp, dst_bpp_shift, DST 833 .endif 834 835 sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */ 836 ands WK0, DST, #15 837 beq 164f 838 rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */ 839 840 leading_15bytes \process_head, \process_tail 841 842 164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */ 843 switch_on_alignment medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f 844 845 167: /* Check for another line */ 846 end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b 847 848 .ltorg 849 850 170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */ 851 .if dst_w_bpp < 32 852 mov ORIG_W, X 853 .endif 854 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE 
855 /* This is stmdb sp!,{} */ 856 .word 0xE92D0000 | LINE_SAVED_REGS 857 .endif 858 171: /* New line */ 859 \newline 860 preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ 861 preload_line 1, mask_bpp, mask_bpp_shift, MASK 862 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 863 preload_line 1, dst_r_bpp, dst_bpp_shift, DST 864 .endif 865 866 .if dst_w_bpp == 8 867 tst DST, #3 868 beq 174f 869 172: subs X, X, #1 870 blo 177f 871 \process_head , 1, 0, 1, 1, 0 872 \process_tail , 1, 0 873 .if !((flags) & FLAG_PROCESS_DOES_STORE) 874 pixst , 1, 0, DST 875 .endif 876 tst DST, #3 877 bne 172b 878 .elseif dst_w_bpp == 16 879 tst DST, #2 880 beq 174f 881 subs X, X, #1 882 blo 177f 883 \process_head , 2, 0, 1, 1, 0 884 \process_tail , 2, 0 885 .if !((flags) & FLAG_PROCESS_DOES_STORE) 886 pixst , 2, 0, DST 887 .endif 888 .endif 889 890 174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */ 891 switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f 892 893 177: /* Check for another line */ 894 end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one 895 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE 896 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 897 .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 898 .endif 899 900 197: 901 .if (flags) & FLAG_SPILL_LINE_VARS 902 add sp, sp, #LINE_SAVED_REG_COUNT*4 903 .endif 904 198: 905 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 906 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4 907 .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4 908 add sp, sp, #4 909 .endif 910 911 \cleanup 912 913 #ifdef DEBUG_PARAMS 914 add sp, sp, #9*4 /* junk the debug copy of arguments */ 915 #endif 916 199: 917 pop {r4-r11, pc} /* exit */ 918 919 .ltorg 920 921 .unreq X 922 .unreq Y 923 .unreq DST 924 .unreq STRIDE_D 925 .unreq SRC 926 .unreq STRIDE_S 927 .unreq MASK 928 .unreq STRIDE_M 929 .unreq WK0 930 
.unreq WK1 931 .unreq WK2 932 .unreq WK3 933 .unreq SCRATCH 934 .unreq ORIG_W 935 pixman_end_asm_function 936 .endm 937 938 .macro line_saved_regs x:vararg 939 .set LINE_SAVED_REGS, 0 940 .set LINE_SAVED_REG_COUNT, 0 941 .irp SAVED_REG,\x 942 .ifc "SAVED_REG","Y" 943 .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1) 944 .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 945 .endif 946 .ifc "SAVED_REG","STRIDE_D" 947 .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3) 948 .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 949 .endif 950 .ifc "SAVED_REG","STRIDE_S" 951 .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5) 952 .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 953 .endif 954 .ifc "SAVED_REG","STRIDE_M" 955 .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7) 956 .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 957 .endif 958 .ifc "SAVED_REG","ORIG_W" 959 .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14) 960 .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 961 .endif 962 .endr 963 .endm 964 965 .macro nop_macro x:vararg 966 .endm