pixman-arma64-neon-asm.h (45160B)
1 /* 2 * Copyright © 2009 Nokia Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 * DEALINGS IN THE SOFTWARE. 22 * 23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) 24 */ 25 26 /* 27 * This file contains a macro ('generate_composite_function') which can 28 * construct 2D image processing functions, based on a common template. 29 * Any combinations of source, destination and mask images with 8bpp, 30 * 16bpp, 24bpp, 32bpp color formats are supported. 31 * 32 * This macro takes care of: 33 * - handling of leading and trailing unaligned pixels 34 * - doing most of the work related to L2 cache preload 35 * - encourages the use of software pipelining for better instructions 36 * scheduling 37 * 38 * The user of this macro has to provide some configuration parameters 39 * (bit depths for the images, prefetch distance, etc.) 
and a set of
 * macros, which should implement basic code chunks responsible for
 * pixels processing. See 'pixman-armv8-neon-asm.S' file for the usage
 * examples.
 *
 * TODO:
 *  - try overlapped pixel method (from Ian Rickards) when processing
 *    exactly two blocks of pixels
 *  - maybe add an option to do reverse scanline processing
 */

#include "pixman-arm-asm.h"

/*
 * Bit flags for 'generate_composite_function' macro which are used
 * to tune generated functions behavior.
 */
.set FLAG_DST_WRITEONLY,      0
.set FLAG_DST_READWRITE,      1
.set FLAG_DEINTERLEAVE_32BPP, 2

/*
 * Constants for selecting preferable prefetch type.
 */
.set PREFETCH_TYPE_NONE,      0 /* No prefetch at all */
.set PREFETCH_TYPE_SIMPLE,    1 /* A simple, fixed-distance-ahead prefetch */
.set PREFETCH_TYPE_ADVANCED,  2 /* Advanced fine-grained prefetch */

/*
 * Prefetch mode used by the PRFM instructions emitted below.
 * Available modes are:
 *   pldl1keep
 *   pldl1strm
 *   pldl2keep
 *   pldl2strm
 *   pldl3keep
 *   pldl3strm
 */
#define PREFETCH_MODE pldl1keep

/*
 * Definitions of supplementary pixld/pixst macros (for partial load/store of
 * pixel data).
 */

/* Post-incrementing load/store of 8 bytes (one 64-bit vector register). */
.macro pixldst1 op, elem_size, reg1, mem_operand, abits
    \op         {v\()\reg1\().\()\elem_size}, [\()\mem_operand\()], #8
.endm

/* Post-incrementing load/store of 16 bytes (two 64-bit vector registers). */
.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
    \op         {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size}, [\()\mem_operand\()], #16
.endm

/* Post-incrementing load/store of 32 bytes (four 64-bit vector registers). */
.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
    \op         {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size, v\()\reg4\().\()\elem_size}, [\()\mem_operand\()], #32
.endm

/* Single-lane load/store at lane 'idx', advancing the pointer by 'bytes'. */
.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes
    \op         {v\()\reg1\().\()\elem_size}[\idx], [\()\mem_operand\()], #\()\bytes\()
.endm

/* Three-register load/store (24 bytes), used for 24bpp pixel data. */
.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
    \op         {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}, [\()\mem_operand\()], #24
.endm

/* Single-lane three-register load/store (one 24bpp pixel at lane 'idx'). */
.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
    \op         {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}[\idx], [\()\mem_operand\()], #3
.endm

/*
 * Dispatch a partial load/store of 'numbytes' bytes to the appropriate
 * helper above. For small transfers (4, 2, 1 bytes) specific lanes are
 * selected so that the data lands in the expected 'slots' of the
 * destination vector register.
 */
.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
.if \numbytes == 32
.if \elem_size==32
    pixldst4    \op, 2s, %(\basereg+4), %(\basereg+5), \
                         %(\basereg+6), %(\basereg+7), \mem_operand, \abits
.elseif \elem_size==16
    pixldst4    \op, 4h, %(\basereg+4), %(\basereg+5), \
                         %(\basereg+6), %(\basereg+7), \mem_operand, \abits
.else
    pixldst4    \op, 8b, %(\basereg+4), %(\basereg+5), \
                         %(\basereg+6), %(\basereg+7), \mem_operand, \abits
.endif
.elseif \numbytes == 16
.if \elem_size==32
    pixldst2    \op, 2s, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
.elseif \elem_size==16
    pixldst2    \op, 4h, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
.else
    pixldst2    \op, 8b, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
.endif
.elseif \numbytes == 8
.if \elem_size==32
    pixldst1    \op, 2s, %(\basereg+1), \mem_operand, \abits
.elseif \elem_size==16
    pixldst1    \op, 4h, %(\basereg+1), \mem_operand, \abits
.else
    pixldst1    \op, 8b, %(\basereg+1), \mem_operand, \abits
.endif
.elseif \numbytes == 4
    /* With strict alignment, a 4-byte transfer may need to be split
       into smaller accesses when elements are narrower than 32 bits. */
.if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)
    pixldst0    \op, s, %(\basereg+0), 1, \mem_operand, \abits, 4
.elseif \elem_size == 16
    pixldst0    \op, h, %(\basereg+0), 2, \mem_operand, \abits, 2
    pixldst0    \op, h, %(\basereg+0), 3, \mem_operand, \abits, 2
.else
    pixldst0    \op, b, %(\basereg+0), 4, \mem_operand, \abits, 1
    pixldst0    \op, b, %(\basereg+0), 5, \mem_operand, \abits, 1
    pixldst0    \op, b, %(\basereg+0), 6, \mem_operand, \abits, 1
    pixldst0    \op, b, %(\basereg+0), 7, \mem_operand, \abits, 1
.endif
.elseif \numbytes == 2
.if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)
    pixldst0    \op, h, %(\basereg+0), 1, \mem_operand, \abits, 2
.else
    pixldst0    \op, b, %(\basereg+0), 2, \mem_operand, \abits, 1
    pixldst0    \op, b, %(\basereg+0), 3, \mem_operand, \abits, 1
.endif
.elseif \numbytes == 1
    pixldst0    \op, b, %(\basereg+0), 1, \mem_operand, \abits, 1
.else
    .error "unsupported size: \numbytes"
.endif
.endm

/*
 * Load 'numpix' pixels of 'bpp' bits each into vector registers starting
 * at 'basereg'. 32bpp data may be loaded deinterleaved (ld4), 24bpp data
 * uses 3-register / 3-byte-lane forms.
 */
.macro pixld numpix, bpp, basereg, mem_operand, abits=0
.if \bpp > 0
.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4    ld4, 8b, %(\basereg+4), %(\basereg+5), \
                         %(\basereg+6), %(\basereg+7), \mem_operand, \abits
.elseif (\bpp == 24) && (\numpix == 8)
    pixldst3    ld3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
.elseif (\bpp == 24) && (\numpix == 4)
    pixldst30   ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
    pixldst30   ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
    pixldst30   ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
    pixldst30   ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
.elseif (\bpp == 24) && (\numpix == 2)
    pixldst30   ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
    pixldst30   ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
.elseif (\bpp == 24) && (\numpix == 1)
    pixldst30   ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
.else
    pixldst     %(\numpix * \bpp / 8), ld1, %(\bpp), \basereg, \mem_operand, \abits
.endif
.endif
.endm

/*
 * Store counterpart of 'pixld' (same register layout conventions).
 */
.macro pixst numpix, bpp, basereg, mem_operand, abits=0
.if \bpp > 0
.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4    st4, 8b, %(\basereg+4), %(\basereg+5), \
                         %(\basereg+6), %(\basereg+7), \mem_operand, \abits
.elseif (\bpp == 24) && (\numpix == 8)
    pixldst3    st3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
.elseif (\bpp == 24) && (\numpix == 4)
    pixldst30   st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
    pixldst30   st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
    pixldst30   st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
    pixldst30   st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
.elseif (\bpp == 24) && (\numpix == 2)
    pixldst30   st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
    pixldst30   st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
.elseif (\bpp == 24) && (\numpix == 1)
    pixldst30   st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
.elseif \numpix * \bpp == 32 && \abits == 32
    pixldst     4, st1, 32, \basereg, \mem_operand, \abits
.elseif \numpix * \bpp == 16 && \abits == 16
    pixldst     2, st1, 16, \basereg, \mem_operand, \abits
.else
    pixldst     %(\numpix * \bpp / 8), st1, %(\bpp), \basereg, \mem_operand, \abits
.endif
.endif
.endm

/* Aligned pixel load: pass transfer size (capped at 128) as alignment hint. */
.macro pixld_a numpix, bpp, basereg, mem_operand
.if (\bpp * \numpix) <= 128
    pixld       \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
.else
    pixld       \numpix, \bpp, \basereg, \mem_operand, 128
.endif
.endm

/* Aligned pixel store: pass transfer size (capped at 128) as alignment hint. */
.macro pixst_a numpix, bpp, basereg, mem_operand
.if (\bpp * \numpix) <= 128
    pixst       \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
.else
    pixst       \numpix, \bpp, \basereg, \mem_operand, 128
.endif
.endm

/*
 * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
 * aliases to be defined). Each fetched pixel advances VX by UNIT_X and
 * wraps it modulo SRC_WIDTH_FIXED via the 5:/55: local-label loops.
 */
.macro pixld1_s elem_size, reg1, mem_operand
.if \elem_size == 16
    asr         TMP1, VX, #16
    adds        VX, VX, UNIT_X
    bmi         55f
5:
    subs        VX, VX, SRC_WIDTH_FIXED
    bpl         5b
55:
    add         TMP1, \mem_operand, TMP1, lsl #1
    asr         TMP2, VX, #16
    adds        VX, VX, UNIT_X
    bmi         55f
5:
    subs        VX, VX, SRC_WIDTH_FIXED
    bpl         5b
55:
    add         TMP2, \mem_operand, TMP2, lsl #1
    ld1         {v\()\reg1\().h}[0], [TMP1]
    asr         TMP1, VX, #16
    adds        VX, VX, UNIT_X
    bmi         55f
5:
    subs        VX, VX, SRC_WIDTH_FIXED
    bpl         5b
55:
    add         TMP1, \mem_operand, TMP1, lsl #1
    ld1         {v\()\reg1\().h}[1], [TMP2]
    asr         TMP2, VX, #16
    adds        VX, VX, UNIT_X
    bmi         55f
5:
    subs        VX, VX, SRC_WIDTH_FIXED
    bpl         5b
55:
    add         TMP2, \mem_operand, TMP2, lsl #1
    ld1         {v\()\reg1\().h}[2], [TMP1]
    ld1         {v\()\reg1\().h}[3], [TMP2]
.elseif \elem_size == 32
    asr         TMP1, VX, #16
    adds        VX, VX, UNIT_X
    bmi         55f
5:
    subs        VX, VX, SRC_WIDTH_FIXED
    bpl         5b
55:
    add         TMP1, \mem_operand, TMP1, lsl #2
    asr         TMP2, VX, #16
    adds        VX, VX, UNIT_X
    bmi         55f
5:
    subs        VX, VX, SRC_WIDTH_FIXED
    bpl         5b
55:
    add         TMP2, \mem_operand, TMP2, lsl #2
    ld1         {v\()\reg1\().s}[0], [TMP1]
    ld1         {v\()\reg1\().s}[1], [TMP2]
.else
    .error "unsupported"
.endif
.endm

/* Fetch two registers worth of nearest-scaled pixels. */
.macro pixld2_s elem_size, reg1, reg2, mem_operand
.if 0 /* \elem_size == 32 */
    /* Disabled legacy (32-bit ARM style) variant, kept for reference. */
    mov         TMP1, VX, asr #16
    add         VX, VX, UNIT_X, asl #1
    add         TMP1, \mem_operand, TMP1, asl #2
    mov         TMP2, VX, asr #16
    sub         VX, VX, UNIT_X
    add         TMP2, \mem_operand, TMP2, asl #2
    ld1         {v\()\reg1\().s}[0], [TMP1]
    mov         TMP1, VX, asr #16
    add         VX, VX, UNIT_X, asl #1
    add         TMP1, \mem_operand, TMP1, asl #2
    ld1         {v\()\reg2\().s}[0], [TMP2, :32]
    mov         TMP2, VX, asr #16
    add         VX, VX, UNIT_X
    add         TMP2, \mem_operand, TMP2, asl #2
    ld1         {v\()\reg1\().s}[1], [TMP1]
    ld1         {v\()\reg2\().s}[1], [TMP2]
.else
    pixld1_s    \elem_size, \reg1, \mem_operand
    pixld1_s    \elem_size, \reg2, \mem_operand
.endif
.endm

/* Fetch a single nearest-scaled pixel into lane 'idx'. */
.macro pixld0_s elem_size, reg1, idx, mem_operand
.if \elem_size == 16
    asr         TMP1, VX, #16
    adds        VX, VX, UNIT_X
    bmi         55f
5:
    subs        VX, VX, SRC_WIDTH_FIXED
    bpl         5b
55:
    add         TMP1, \mem_operand, TMP1, lsl #1
    ld1         {v\()\reg1\().h}[\idx], [TMP1]
.elseif \elem_size == 32
    asr         DUMMY, VX, #16
    mov         TMP1, DUMMY
    adds        VX, VX, UNIT_X
    bmi         55f
5:
    subs        VX, VX, SRC_WIDTH_FIXED
    bpl         5b
55:
    add         TMP1, \mem_operand, TMP1, lsl #2
    ld1         {v\()\reg1\().s}[\idx], [TMP1]
.endif
.endm

/*
 * Dispatch a nearest-scaled load of 'numbytes' bytes (same slot layout
 * conventions as 'pixldst').
 */
.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
.if \numbytes == 32
    pixld2_s    \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand
    pixld2_s    \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand
    pixdeinterleave \elem_size, %(\basereg+4)
.elseif \numbytes == 16
    pixld2_s    \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand
.elseif \numbytes == 8
    pixld1_s    \elem_size, %(\basereg+1), \mem_operand
.elseif \numbytes == 4
.if \elem_size == 32
    pixld0_s    \elem_size, %(\basereg+0), 1, \mem_operand
.elseif \elem_size == 16
    pixld0_s    \elem_size, %(\basereg+0), 2, \mem_operand
    pixld0_s    \elem_size, %(\basereg+0), 3, \mem_operand
.else
    pixld0_s    \elem_size, %(\basereg+0), 4, \mem_operand
    pixld0_s    \elem_size, %(\basereg+0), 5, \mem_operand
    pixld0_s    \elem_size, %(\basereg+0), 6, \mem_operand
    pixld0_s    \elem_size, %(\basereg+0), 7, \mem_operand
.endif
.elseif \numbytes == 2
.if \elem_size == 16
    pixld0_s    \elem_size, %(\basereg+0), 1, \mem_operand
.else
    pixld0_s    \elem_size, %(\basereg+0), 2, \mem_operand
    pixld0_s    \elem_size, %(\basereg+0), 3, \mem_operand
.endif
.elseif \numbytes == 1
    pixld0_s    \elem_size, %(\basereg+0), 1, \mem_operand
.else
    .error "unsupported size: \numbytes"
.endif
.endm

/* Nearest-scaled counterpart of 'pixld'. */
.macro pixld_s numpix, bpp, basereg, mem_operand
.if \bpp > 0
    pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand
.endif
.endm

/* Unzip two 8-byte registers; v16 is used as scratch and restored via DUMMY. */
.macro vuzp8 reg1, reg2
    umov        DUMMY, v16.d[0]
    uzp1        v16.8b, v\()\reg1\().8b, v\()\reg2\().8b
    uzp2        v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b
    mov         v\()\reg1\().8b, v16.8b
    mov         v16.d[0], DUMMY
.endm

/* Zip two 8-byte registers; v16 is used as scratch and restored via DUMMY. */
.macro vzip8 reg1, reg2
    umov        DUMMY, v16.d[0]
    zip1        v16.8b, v\()\reg1\().8b, v\()\reg2\().8b
    zip2        v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b
    mov         v\()\reg1\().8b, v16.8b
    mov         v16.d[0], DUMMY
.endm

/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixdeinterleave bpp, basereg
.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vuzp8       %(\basereg+0), %(\basereg+1)
    vuzp8       %(\basereg+2), %(\basereg+3)
    vuzp8       %(\basereg+1), %(\basereg+3)
    vuzp8       %(\basereg+0), %(\basereg+2)
.endif
.endm

/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixinterleave bpp, basereg
.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vzip8       %(\basereg+0), %(\basereg+2)
    vzip8       %(\basereg+1), %(\basereg+3)
    vzip8       %(\basereg+2), %(\basereg+3)
    vzip8       %(\basereg+0), %(\basereg+1)
.endif
.endm

/*
 * This is a macro for implementing cache preload. The main idea is that
 * cache preload logic is mostly independent from the rest of pixels
 * processing code.
It starts at the top left pixel and moves forward 424 * across pixels and can jump across scanlines. Prefetch distance is 425 * handled in an 'incremental' way: it starts from 0 and advances to the 426 * optimal distance over time. After reaching optimal prefetch distance, 427 * it is kept constant. There are some checks which prevent prefetching 428 * unneeded pixel lines below the image (but it still can prefetch a bit 429 * more data on the right side of the image - not a big issue and may 430 * be actually helpful when rendering text glyphs). Additional trick is 431 * the use of LDR instruction for prefetch instead of PLD when moving to 432 * the next line, the point is that we have a high chance of getting TLB 433 * miss in this case, and PLD would be useless. 434 * 435 * This sounds like it may introduce a noticeable overhead (when working with 436 * fully cached data). But in reality, due to having a separate pipeline and 437 * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can 438 * execute simultaneously with NEON and be completely shadowed by it. Thus 439 * we get no performance overhead at all (*). This looks like a very nice 440 * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher, 441 * but still can implement some rather advanced prefetch logic in software 442 * for almost zero cost! 443 * 444 * (*) The overhead of the prefetcher is visible when running some trivial 445 * pixels processing like simple copy. Anyway, having prefetch is a must 446 * when working with the graphics data. 
 */

/* Emit the given statement only when advanced prefetch is enabled. */
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
    \a \x
.endif
.endm

/*
 * Advance the prefetch position by 'std_increment' pixels (plus an extra
 * 'boost_increment' while the distance is still ramping up), issue PRFM
 * for src/dst/mask at the new position, and step to the next scanline
 * (using a plain load, which also primes the TLB) when the position
 * crosses the end of the current line.
 */
.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if \std_increment != 0
    PF add,     PF_X, PF_X, #\std_increment
.endif
    PF tst,     PF_CTL, #0xF
    PF beq,     71f
    PF add,     PF_X, PF_X, #\boost_increment
    PF sub,     PF_CTL, PF_CTL, #1
71:
    PF cmp,     PF_X, ORIG_W
.if src_bpp_shift >= 0
    PF lsl,     DUMMY, PF_X, #src_bpp_shift
    PF prfm,    PREFETCH_MODE, [PF_SRC, DUMMY]
.endif
.if dst_r_bpp != 0
    PF lsl,     DUMMY, PF_X, #dst_bpp_shift
    PF prfm,    PREFETCH_MODE, [PF_DST, DUMMY]
.endif
.if mask_bpp_shift >= 0
    PF lsl,     DUMMY, PF_X, #mask_bpp_shift
    PF prfm,    PREFETCH_MODE, [PF_MASK, DUMMY]
.endif
    PF ble,     72f
    PF sub,     PF_X, PF_X, ORIG_W
    PF subs,    PF_CTL, PF_CTL, #0x10
    PF ble,     72f
.if src_bpp_shift >= 0
    PF add,     PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
    PF ldrsb,   DUMMY, [PF_SRC]
.endif
.if dst_r_bpp != 0
    PF add,     PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
    PF ldrsb,   DUMMY, [PF_DST]
.endif
.if mask_bpp_shift >= 0
    PF add,     PF_MASK, PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift
    PF ldrsb,   DUMMY, [PF_MASK]
.endif
72:
.endif
.endm

/* Fixed-distance-ahead prefetch for src/dst/mask (SIMPLE prefetch type). */
.macro cache_preload_simple
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
.if src_bpp > 0
    prfm        PREFETCH_MODE, [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
.endif
.if dst_r_bpp > 0
    prfm        PREFETCH_MODE, [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
.endif
.if mask_bpp > 0
    prfm        PREFETCH_MODE, [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
.endif
.endif
.endm

/* Load one full pixel block worth of mask data. */
.macro fetch_mask_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
.endm

/*
 * Macro which is used to process leading pixels until destination
 * pointer is properly aligned (at 16 bytes boundary). When destination
 * buffer uses 16bpp format, this is unnecessary, or even pointless.
 */
.macro ensure_destination_ptr_alignment process_pixblock_head, \
                                        process_pixblock_tail, \
                                        process_pixblock_tail_head
.if dst_w_bpp != 24
    tst         DST_R, #0xF
    beq         52f

.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
.irp lowbit, 1, 2, 4, 8, 16

.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
.if \lowbit < 16 /* we don't need more than 16-byte alignment */
    tst         DST_R, #\lowbit
    beq         51f
.endif
    pixld_src   (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
    pixld       (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
.if dst_r_bpp > 0
    pixld_a     (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
.else
    add         DST_R, DST_R, #\lowbit
.endif
    PF add,     PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp)
    sub         W, W, #(\lowbit * 8 / dst_w_bpp)
51:
.endif
.endr
.endif
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    \process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    \process_pixblock_tail

    pixinterleave dst_w_bpp, dst_w_basereg

.irp lowbit, 1, 2, 4, 8, 16
.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
.if \lowbit < 16 /* we don't need more than 16-byte alignment */
    tst         DST_W, #\lowbit
    beq         51f
.endif
.if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0
    sub         W, W, #(\lowbit * 8 / dst_w_bpp)
.endif
    pixst_a     (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
51:
.endif
.endr
.endif
52:
.endm

/*
 * Special code for processing up to (pixblock_size - 1) remaining
 * trailing pixels.
As SIMD processing performs operation on 580 * pixblock_size pixels, anything smaller than this has to be loaded 581 * and stored in a special way. Loading and storing of pixel data is 582 * performed in such a way that we fill some 'slots' in the NEON 583 * registers (some slots naturally are unused), then perform compositing 584 * operation as usual. In the end, the data is taken from these 'slots' 585 * and saved to memory. 586 * 587 * cache_preload_flag - allows to suppress prefetch if 588 * set to 0 589 * dst_aligned_flag - selects whether destination buffer 590 * is aligned 591 */ 592 .macro process_trailing_pixels cache_preload_flag, \ 593 dst_aligned_flag, \ 594 process_pixblock_head, \ 595 process_pixblock_tail, \ 596 process_pixblock_tail_head 597 tst W, #(pixblock_size - 1) 598 beq 52f 599 .if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0 600 .irp chunk_size, 16, 8, 4, 2, 1 601 .if pixblock_size > \chunk_size 602 tst W, #\chunk_size 603 beq 51f 604 pixld_src \chunk_size, src_bpp, src_basereg, SRC 605 pixld \chunk_size, mask_bpp, mask_basereg, MASK 606 .if \dst_aligned_flag != 0 607 pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R 608 .else 609 pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R 610 .endif 611 .if \cache_preload_flag != 0 612 PF add, PF_X, PF_X, #\chunk_size 613 .endif 614 51: 615 .endif 616 .endr 617 .endif 618 pixdeinterleave src_bpp, src_basereg 619 pixdeinterleave mask_bpp, mask_basereg 620 pixdeinterleave dst_r_bpp, dst_r_basereg 621 622 \process_pixblock_head 623 .if \cache_preload_flag != 0 624 cache_preload 0, pixblock_size 625 cache_preload_simple 626 .endif 627 \process_pixblock_tail 628 pixinterleave dst_w_bpp, dst_w_basereg 629 .irp chunk_size, 16, 8, 4, 2, 1 630 .if pixblock_size > \chunk_size 631 tst W, #\chunk_size 632 beq 51f 633 .if \dst_aligned_flag != 0 634 pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W 635 .else 636 pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W 637 .endif 638 51: 639 .endif 640 .endr 641 
52: 642 .endm 643 644 /* 645 * Macro, which performs all the needed operations to switch to the next 646 * scanline and start the next loop iteration unless all the scanlines 647 * are already processed. 648 */ 649 .macro advance_to_next_scanline start_of_loop_label 650 mov W, ORIG_W 651 add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift 652 .if src_bpp != 0 653 add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift 654 .endif 655 .if mask_bpp != 0 656 add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift 657 .endif 658 .if (dst_w_bpp != 24) 659 sub DST_W, DST_W, W, lsl #dst_bpp_shift 660 .endif 661 .if (src_bpp != 24) && (src_bpp != 0) 662 sub SRC, SRC, W, lsl #src_bpp_shift 663 .endif 664 .if (mask_bpp != 24) && (mask_bpp != 0) 665 sub MASK, MASK, W, lsl #mask_bpp_shift 666 .endif 667 subs H, H, #1 668 mov DST_R, DST_W 669 bge \start_of_loop_label 670 .endm 671 672 /* 673 * Registers are allocated in the following way by default: 674 * v0, v1, v2, v3 - reserved for loading source pixel data 675 * v4, v5, v6, v7 - reserved for loading destination pixel data 676 * v24, v25, v26, v27 - reserved for loading mask pixel data 677 * v28, v29, v30, v31 - final destination pixel data for writeback to memory 678 */ 679 .macro generate_composite_function fname, \ 680 src_bpp_, \ 681 mask_bpp_, \ 682 dst_w_bpp_, \ 683 flags, \ 684 pixblock_size_, \ 685 prefetch_distance, \ 686 init, \ 687 cleanup, \ 688 process_pixblock_head, \ 689 process_pixblock_tail, \ 690 process_pixblock_tail_head, \ 691 dst_w_basereg_ = 28, \ 692 dst_r_basereg_ = 4, \ 693 src_basereg_ = 0, \ 694 mask_basereg_ = 24 695 696 pixman_asm_function \fname 697 stp x29, x30, [sp, -16]! 
698 mov x29, sp 699 sub sp, sp, 232 /* push all registers */ 700 sub x29, x29, 64 701 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 702 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 703 stp x8, x9, [x29, -80] 704 stp x10, x11, [x29, -96] 705 stp x12, x13, [x29, -112] 706 stp x14, x15, [x29, -128] 707 stp x16, x17, [x29, -144] 708 stp x18, x19, [x29, -160] 709 stp x20, x21, [x29, -176] 710 stp x22, x23, [x29, -192] 711 stp x24, x25, [x29, -208] 712 stp x26, x27, [x29, -224] 713 str x28, [x29, -232] 714 715 /* 716 * Select prefetch type for this function. If prefetch distance is 717 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch 718 * has to be used instead of ADVANCED. 719 */ 720 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT 721 .if \prefetch_distance == 0 722 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE 723 .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \ 724 ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24)) 725 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE 726 .endif 727 728 /* 729 * Make some macro arguments globally visible and accessible 730 * from other macros 731 */ 732 .set src_bpp, \src_bpp_ 733 .set mask_bpp, \mask_bpp_ 734 .set dst_w_bpp, \dst_w_bpp_ 735 .set pixblock_size, \pixblock_size_ 736 .set dst_w_basereg, \dst_w_basereg_ 737 .set dst_r_basereg, \dst_r_basereg_ 738 .set src_basereg, \src_basereg_ 739 .set mask_basereg, \mask_basereg_ 740 741 .macro pixld_src x:vararg 742 pixld \x 743 .endm 744 .macro fetch_src_pixblock 745 pixld_src pixblock_size, src_bpp, \ 746 (src_basereg - pixblock_size * src_bpp / 64), SRC 747 .endm 748 /* 749 * Assign symbolic names to registers 750 */ 751 W .req x0 /* width (is updated during processing) */ 752 H .req x1 /* height (is updated during processing) */ 753 DST_W .req x2 /* destination buffer pointer for writes */ 754 DST_STRIDE .req x3 /* destination image stride */ 755 SRC .req x4 /* source buffer pointer */ 756 SRC_STRIDE .req x5 /* source image stride */ 
757 MASK .req x6 /* mask pointer */ 758 MASK_STRIDE .req x7 /* mask stride */ 759 760 DST_R .req x8 /* destination buffer pointer for reads */ 761 762 PF_CTL .req x9 /* combined lines counter and prefetch */ 763 /* distance increment counter */ 764 PF_X .req x10 /* pixel index in a scanline for current */ 765 /* pretetch position */ 766 PF_SRC .req x11 /* pointer to source scanline start */ 767 /* for prefetch purposes */ 768 PF_DST .req x12 /* pointer to destination scanline start */ 769 /* for prefetch purposes */ 770 PF_MASK .req x13 /* pointer to mask scanline start */ 771 /* for prefetch purposes */ 772 773 ORIG_W .req x14 /* saved original width */ 774 DUMMY .req x15 /* temporary register */ 775 776 sxtw x0, w0 777 sxtw x1, w1 778 sxtw x3, w3 779 sxtw x5, w5 780 sxtw x7, w7 781 782 .set mask_bpp_shift, -1 783 .if src_bpp == 32 784 .set src_bpp_shift, 2 785 .elseif src_bpp == 24 786 .set src_bpp_shift, 0 787 .elseif src_bpp == 16 788 .set src_bpp_shift, 1 789 .elseif src_bpp == 8 790 .set src_bpp_shift, 0 791 .elseif src_bpp == 0 792 .set src_bpp_shift, -1 793 .else 794 .error "requested src bpp (src_bpp) is not supported" 795 .endif 796 .if mask_bpp == 32 797 .set mask_bpp_shift, 2 798 .elseif mask_bpp == 24 799 .set mask_bpp_shift, 0 800 .elseif mask_bpp == 8 801 .set mask_bpp_shift, 0 802 .elseif mask_bpp == 0 803 .set mask_bpp_shift, -1 804 .else 805 .error "requested mask bpp (mask_bpp) is not supported" 806 .endif 807 .if dst_w_bpp == 32 808 .set dst_bpp_shift, 2 809 .elseif dst_w_bpp == 24 810 .set dst_bpp_shift, 0 811 .elseif dst_w_bpp == 16 812 .set dst_bpp_shift, 1 813 .elseif dst_w_bpp == 8 814 .set dst_bpp_shift, 0 815 .else 816 .error "requested dst bpp (dst_w_bpp) is not supported" 817 .endif 818 819 .if (((\flags) & FLAG_DST_READWRITE) != 0) 820 .set dst_r_bpp, dst_w_bpp 821 .else 822 .set dst_r_bpp, 0 823 .endif 824 .if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0) 825 .set DEINTERLEAVE_32BPP_ENABLED, 1 826 .else 827 .set 
DEINTERLEAVE_32BPP_ENABLED, 0 828 .endif 829 830 .if \prefetch_distance < 0 || \prefetch_distance > 15 831 .error "invalid prefetch distance (\prefetch_distance)" 832 .endif 833 834 PF mov, PF_X, #0 835 mov DST_R, DST_W 836 837 .if src_bpp == 24 838 sub SRC_STRIDE, SRC_STRIDE, W 839 sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 840 .endif 841 .if mask_bpp == 24 842 sub MASK_STRIDE, MASK_STRIDE, W 843 sub MASK_STRIDE, MASK_STRIDE, W, lsl #1 844 .endif 845 .if dst_w_bpp == 24 846 sub DST_STRIDE, DST_STRIDE, W 847 sub DST_STRIDE, DST_STRIDE, W, lsl #1 848 .endif 849 850 /* 851 * Setup advanced prefetcher initial state 852 */ 853 PF mov, PF_SRC, SRC 854 PF mov, PF_DST, DST_R 855 PF mov, PF_MASK, MASK 856 /* PF_CTL = \prefetch_distance | ((h - 1) << 4) */ 857 PF lsl, PF_CTL, H, #4 858 PF add, PF_CTL, PF_CTL, #(\prefetch_distance - 0x10) 859 860 \init 861 subs H, H, #1 862 mov ORIG_W, W 863 blt 9f 864 cmp W, #(pixblock_size * 2) 865 blt 800f 866 /* 867 * This is the start of the pipelined loop, which if optimized for 868 * long scanlines 869 */ 870 0: 871 ensure_destination_ptr_alignment \process_pixblock_head, \ 872 \process_pixblock_tail, \ 873 \process_pixblock_tail_head 874 875 /* Implement "head (tail_head) ... 
(tail_head) tail" loop pattern */ 876 pixld_a pixblock_size, dst_r_bpp, \ 877 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R 878 fetch_src_pixblock 879 pixld pixblock_size, mask_bpp, \ 880 (mask_basereg - pixblock_size * mask_bpp / 64), MASK 881 PF add, PF_X, PF_X, #pixblock_size 882 \process_pixblock_head 883 cache_preload 0, pixblock_size 884 cache_preload_simple 885 subs W, W, #(pixblock_size * 2) 886 blt 200f 887 888 100: 889 \process_pixblock_tail_head 890 cache_preload_simple 891 subs W, W, #pixblock_size 892 bge 100b 893 894 200: 895 \process_pixblock_tail 896 pixst_a pixblock_size, dst_w_bpp, \ 897 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 898 899 /* Process the remaining trailing pixels in the scanline */ 900 process_trailing_pixels 1, 1, \ 901 \process_pixblock_head, \ 902 \process_pixblock_tail, \ 903 \process_pixblock_tail_head 904 advance_to_next_scanline 0b 905 906 \cleanup 907 1000: 908 /* pop all registers */ 909 sub x29, x29, 64 910 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 911 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 912 ldp x8, x9, [x29, -80] 913 ldp x10, x11, [x29, -96] 914 ldp x12, x13, [x29, -112] 915 ldp x14, x15, [x29, -128] 916 ldp x16, x17, [x29, -144] 917 ldp x18, x19, [x29, -160] 918 ldp x20, x21, [x29, -176] 919 ldp x22, x23, [x29, -192] 920 ldp x24, x25, [x29, -208] 921 ldp x26, x27, [x29, -224] 922 ldr x28, [x29, -232] 923 mov sp, x29 924 ldp x29, x30, [sp], 16 925 VERIFY_LR 926 ret /* exit */ 927 /* 928 * This is the start of the loop, designed to process images with small width 929 * (less than pixblock_size * 2 pixels). In this case neither pipelining 930 * nor prefetch are used. 
931 */ 932 800: 933 .if src_bpp_shift >= 0 934 PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift 935 PF prfm, PREFETCH_MODE, [SRC, DUMMY] 936 .endif 937 .if dst_r_bpp != 0 938 PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift 939 PF prfm, PREFETCH_MODE, [DST_R, DUMMY] 940 .endif 941 .if mask_bpp_shift >= 0 942 PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift 943 PF prfm, PREFETCH_MODE, [MASK, DUMMY] 944 .endif 945 /* Process exactly pixblock_size pixels if needed */ 946 tst W, #pixblock_size 947 beq 100f 948 pixld pixblock_size, dst_r_bpp, \ 949 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R 950 fetch_src_pixblock 951 pixld pixblock_size, mask_bpp, \ 952 (mask_basereg - pixblock_size * mask_bpp / 64), MASK 953 \process_pixblock_head 954 \process_pixblock_tail 955 pixst pixblock_size, dst_w_bpp, \ 956 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 957 100: 958 /* Process the remaining trailing pixels in the scanline */ 959 process_trailing_pixels 0, 0, \ 960 \process_pixblock_head, \ 961 \process_pixblock_tail, \ 962 \process_pixblock_tail_head 963 advance_to_next_scanline 800b 964 9: 965 \cleanup 966 /* pop all registers */ 967 sub x29, x29, 64 968 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 969 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 970 ldp x8, x9, [x29, -80] 971 ldp x10, x11, [x29, -96] 972 ldp x12, x13, [x29, -112] 973 ldp x14, x15, [x29, -128] 974 ldp x16, x17, [x29, -144] 975 ldp x18, x19, [x29, -160] 976 ldp x20, x21, [x29, -176] 977 ldp x22, x23, [x29, -192] 978 ldp x24, x25, [x29, -208] 979 ldp x26, x27, [x29, -224] 980 ldr x28, [x29, -232] 981 mov sp, x29 982 ldp x29, x30, [sp], 16 983 VERIFY_LR 984 ret /* exit */ 985 986 .purgem fetch_src_pixblock 987 .purgem pixld_src 988 989 .unreq SRC 990 .unreq MASK 991 .unreq DST_R 992 .unreq DST_W 993 .unreq ORIG_W 994 .unreq W 995 .unreq H 996 .unreq SRC_STRIDE 997 .unreq DST_STRIDE 998 .unreq MASK_STRIDE 999 .unreq PF_CTL 1000 .unreq PF_X 1001 .unreq PF_SRC 1002 .unreq PF_DST 1003 .unreq PF_MASK 1004 
.unreq DUMMY
    pixman_end_asm_function
.endm

/*
 * A simplified variant of function generation template for a single
 * scanline processing (for implementing pixman combine functions)
 */
.macro generate_composite_function_scanline use_nearest_scaling, \
                                            fname, \
                                            src_bpp_, \
                                            mask_bpp_, \
                                            dst_w_bpp_, \
                                            flags, \
                                            pixblock_size_, \
                                            init, \
                                            cleanup, \
                                            process_pixblock_head, \
                                            process_pixblock_tail, \
                                            process_pixblock_tail_head, \
                                            dst_w_basereg_ = 28, \
                                            dst_r_basereg_ = 4, \
                                            src_basereg_ = 0, \
                                            mask_basereg_ = 24

    pixman_asm_function \fname
    /* Single-scanline functions do not use the scanline prefetcher */
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, \src_bpp_
    .set mask_bpp, \mask_bpp_
    .set dst_w_bpp, \dst_w_bpp_
    .set pixblock_size, \pixblock_size_
    .set dst_w_basereg, \dst_w_basereg_
    .set dst_r_basereg, \dst_r_basereg_
    .set src_basereg, \src_basereg_
    .set mask_basereg, \mask_basereg_

.if \use_nearest_scaling != 0
    /*
     * Assign symbolic names to registers (nearest-scaling variant)
     */
    W               .req x0   /* width (is updated during processing) */
    DST_W           .req x1   /* destination buffer pointer for writes */
    SRC             .req x2   /* source buffer pointer */
    VX              .req x3   /* current source x coordinate, fixed-point
                               * (presumably 16.16 — TODO confirm) */
    UNIT_X          .req x4   /* fixed-point x step per destination pixel */
    SRC_WIDTH_FIXED .req x5   /* source width in the same fixed-point format */
    MASK            .req x6   /* mask pointer */
    TMP1            .req x8   /* scratch */
    TMP2            .req x9   /* scratch */
    DST_R           .req x10  /* destination buffer pointer for reads */
    DUMMY           .req x30  /* scratch alias of LR (x30 is saved/restored
                               * by the prologue/epilogue) */

    /* In this variant the source is fetched with the nearest-scaling loader */
    .macro pixld_src x:vararg
        pixld_s \x
    .endm

    /* Sign-extend the 32-bit integer arguments to 64 bits */
    sxtw x0, w0
    sxtw x3, w3
    sxtw x4, w4
    sxtw x5, w5

    stp  x29, x30, [sp, -16]!
1071 mov x29, sp 1072 sub sp, sp, 88 1073 sub x29, x29, 64 1074 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 1075 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 1076 stp x8, x9, [x29, -80] 1077 str x10, [x29, -88] 1078 .else 1079 /* 1080 * Assign symbolic names to registers 1081 */ 1082 W .req x0 /* width (is updated during processing) */ 1083 DST_W .req x1 /* destination buffer pointer for writes */ 1084 SRC .req x2 /* source buffer pointer */ 1085 MASK .req x3 /* mask pointer */ 1086 DST_R .req x4 /* destination buffer pointer for reads */ 1087 DUMMY .req x30 1088 1089 .macro pixld_src x:vararg 1090 pixld \x 1091 .endm 1092 1093 sxtw x0, w0 1094 1095 stp x29, x30, [sp, -16]! 1096 mov x29, sp 1097 sub sp, sp, 64 1098 sub x29, x29, 64 1099 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 1100 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 1101 .endif 1102 1103 .if (((\flags) & FLAG_DST_READWRITE) != 0) 1104 .set dst_r_bpp, dst_w_bpp 1105 .else 1106 .set dst_r_bpp, 0 1107 .endif 1108 .if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0) 1109 .set DEINTERLEAVE_32BPP_ENABLED, 1 1110 .else 1111 .set DEINTERLEAVE_32BPP_ENABLED, 0 1112 .endif 1113 1114 .macro fetch_src_pixblock 1115 pixld_src pixblock_size, src_bpp, \ 1116 (src_basereg - pixblock_size * src_bpp / 64), SRC 1117 .endm 1118 1119 \init 1120 mov DST_R, DST_W 1121 1122 cmp W, #pixblock_size 1123 blt 800f 1124 1125 ensure_destination_ptr_alignment \process_pixblock_head, \ 1126 \process_pixblock_tail, \ 1127 \process_pixblock_tail_head 1128 1129 subs W, W, #pixblock_size 1130 blt 700f 1131 1132 /* Implement "head (tail_head) ... 
(tail_head) tail" loop pattern */ 1133 pixld_a pixblock_size, dst_r_bpp, \ 1134 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R 1135 fetch_src_pixblock 1136 pixld pixblock_size, mask_bpp, \ 1137 (mask_basereg - pixblock_size * mask_bpp / 64), MASK 1138 \process_pixblock_head 1139 subs W, W, #pixblock_size 1140 blt 200f 1141 100: 1142 \process_pixblock_tail_head 1143 subs W, W, #pixblock_size 1144 bge 100b 1145 200: 1146 \process_pixblock_tail 1147 pixst_a pixblock_size, dst_w_bpp, \ 1148 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 1149 700: 1150 /* Process the remaining trailing pixels in the scanline (dst aligned) */ 1151 process_trailing_pixels 0, 1, \ 1152 \process_pixblock_head, \ 1153 \process_pixblock_tail, \ 1154 \process_pixblock_tail_head 1155 1156 \cleanup 1157 .if \use_nearest_scaling != 0 1158 sub x29, x29, 64 1159 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 1160 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 1161 ldp x8, x9, [x29, -80] 1162 ldr x10, [x29, -96] 1163 mov sp, x29 1164 ldp x29, x30, [sp], 16 1165 VERIFY_LR 1166 ret /* exit */ 1167 .else 1168 sub x29, x29, 64 1169 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 1170 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 1171 mov sp, x29 1172 ldp x29, x30, [sp], 16 1173 VERIFY_LR 1174 ret /* exit */ 1175 .endif 1176 800: 1177 /* Process the remaining trailing pixels in the scanline (dst unaligned) */ 1178 process_trailing_pixels 0, 0, \ 1179 \process_pixblock_head, \ 1180 \process_pixblock_tail, \ 1181 \process_pixblock_tail_head 1182 1183 \cleanup 1184 .if \use_nearest_scaling != 0 1185 sub x29, x29, 64 1186 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 1187 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 1188 ldp x8, x9, [x29, -80] 1189 ldr x10, [x29, -88] 1190 mov sp, x29 1191 ldp x29, x30, [sp], 16 1192 VERIFY_LR 1193 ret /* exit */ 1194 1195 .unreq DUMMY 1196 .unreq DST_R 1197 .unreq SRC 1198 .unreq W 1199 .unreq VX 1200 .unreq UNIT_X 1201 .unreq TMP1 1202 .unreq TMP2 1203 
    .unreq DST_W
    .unreq MASK
    .unreq SRC_WIDTH_FIXED

.else
    sub  x29, x29, 64
    ld1  {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    ld1  {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    mov  sp, x29
    ldp  x29, x30, [sp], 16
    VERIFY_LR
    ret  /* exit */

    .unreq DUMMY
    .unreq SRC
    .unreq MASK
    .unreq DST_R
    .unreq DST_W
    .unreq W
.endif

.purgem fetch_src_pixblock
.purgem pixld_src

    pixman_end_asm_function
.endm

/* Convenience wrappers selecting the scanline template variant */

.macro generate_composite_function_single_scanline x:vararg
    generate_composite_function_scanline 0, \x
.endm

.macro generate_composite_function_nearest_scanline x:vararg
    generate_composite_function_scanline 1, \x
.endm

/* Default prologue/epilogue, nothing special needs to be done */

.macro default_init
.endm

.macro default_cleanup
.endm

/*
 * Prologue/epilogue variant which additionally saves/restores v8-v15
 * registers (they need to be saved/restored by callee according to ABI).
 * This is required if the code needs to use all the NEON registers.
 *
 * NOTE(review): in this AArch64 port the generated prologue/epilogue
 * already saves and restores v8-v15 unconditionally (see the st1/ld1
 * pairs above), so these two macros are intentionally empty.
 */

.macro default_init_need_all_regs
.endm

.macro default_cleanup_need_all_regs
.endm

/******************************************************************************/

/*
 * Conversion of 8 r5g6b5 pixels packed in 128-bit register (in)
 * into a planar a8r8g8b8 format (with a, r, g, b color components
 * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
 *
 * Warning: the conversion is destructive and the original
 * value (in) is lost
1267 */ 1268 .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b 1269 shrn \()\out_r\().8b, \()\in\().8h, #8 1270 shrn \()\out_g\().8b, \()\in\().8h, #3 1271 sli \()\in\().8h, \()\in\().8h, #5 1272 movi \()\out_a\().8b, #255 1273 sri \()\out_r\().8b, \()\out_r\().8b, #5 1274 sri \()\out_g\().8b, \()\out_g\().8b, #6 1275 shrn \()\out_b\().8b, \()\in\().8h, #2 1276 .endm 1277 1278 .macro convert_0565_to_x888 in, out_r, out_g, out_b 1279 shrn \()\out_r\().8b, \()\in\().8h, #8 1280 shrn \()\out_g\().8b, \()\in\().8h, #3 1281 sli \()\in\().8h, \()\in\().8h, #5 1282 sri \()\out_r\().8b, \()\out_r\().8b, #5 1283 sri \()\out_g\().8b, \()\out_g\().8b, #6 1284 shrn \()\out_b\().8b, \()\in\().8h, #2 1285 .endm 1286 1287 /* 1288 * Conversion from planar a8r8g8b8 format (with a, r, g, b color components 1289 * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6 1290 * pixels packed in 128-bit register (out). Requires two temporary 128-bit 1291 * registers (tmp1, tmp2) 1292 */ 1293 .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2 1294 ushll \()\tmp1\().8h, \()\in_g\().8b, #7 1295 shl \()\tmp1\().8h, \()\tmp1\().8h, #1 1296 ushll \()\out\().8h, \()\in_r\().8b, #7 1297 shl \()\out\().8h, \()\out\().8h, #1 1298 ushll \()\tmp2\().8h, \()\in_b\().8b, #7 1299 shl \()\tmp2\().8h, \()\tmp2\().8h, #1 1300 sri \()\out\().8h, \()\tmp1\().8h, #5 1301 sri \()\out\().8h, \()\tmp2\().8h, #11 1302 .endm 1303 1304 /* 1305 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels 1306 * returned in (out0, out1) registers pair. Requires one temporary 1307 * 64-bit register (tmp). 
 * 'out1' and 'in' may overlap, the original
 * value from 'in' is lost
 */
.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
    shl  \()\out0\().4h, \()\in\().4h, #5     /* G top 6 bits */
    shl  \()\tmp\().4h, \()\in\().4h, #11     /* B top 5 bits */
    sri  \()\in\().4h, \()\in\().4h, #5       /* R is ready in top bits */
    sri  \()\out0\().4h, \()\out0\().4h, #6   /* G is ready in top bits */
    sri  \()\tmp\().4h, \()\tmp\().4h, #5     /* B is ready in top bits */
    ushr \()\out1\().4h, \()\in\().4h, #8     /* R is in place */
    sri  \()\out0\().4h, \()\tmp\().4h, #8    /* G and B are in place */
    /* Interleave the G/B halfwords with the R halfwords to form
     * the final 32-bit x8r8g8b8 pixels */
    zip1 \()\tmp\().4h, \()\out0\().4h, \()\out1\().4h  /* low two pixels */
    zip2 \()\out1\().4h, \()\out0\().4h, \()\out1\().4h /* high two pixels */
    mov  \()\out0\().d[0], \()\tmp\().d[0]    /* move low pixels into out0 */
.endm